Fix maxSlotsPerGpu selection behaviour for the shared engine pool:

Condition               maxSlotsPerGpu   Behavior
---------------------   --------------   -----------------------------------
OptimizeModelStr        0                Bypass: non-shared temporary engine
1 GPU                   1                Single slot, no round-robin
>1 GPU, VRAM < 24 GB    1                Round-robin: 1 slot per GPU
>1 GPU, VRAM >= 24 GB   -1               Elastic: on-demand slot growth
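For illustration, the table reads as a small pure function; the name expectedSlots and the optimizeModelStr flag below are hypothetical stand-ins, not code from this commit:

#include <cstddef>

// Sketch of the slot-selection table above; hypothetical helper, not part of the commit.
inline int expectedSlots(bool optimizeModelStr, int gpuCount, std::size_t vramBytes) {
    constexpr std::size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024; // 24 GB
    if (optimizeModelStr) return 0;                  // bypass: non-shared temporary engine
    if (gpuCount <= 1)    return 1;                  // single slot, no round-robin
    return (vramBytes >= kLargeVramBytes) ? -1 : 1;  // elastic vs. round-robin
}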
2026-03-30 09:59:09 +11:00
parent 01eabf76bd
commit c1b919ec47
9 changed files with 123 additions and 6 deletions

View File

@@ -40,7 +40,7 @@ bool RTOCRClassifier::Initialize(const std::string& onnxPath, int gpuId,
                        options.maxBatchSize };
     m_engine = EnginePoolManager<float>::instance().acquire(
         m_poolKey, options, onnxPath,
-        kClsSubVals, kClsDivVals, true, -1);
+        kClsSubVals, kClsDivVals, true, getPoolMaxSlotsPerGpu());
     m_usingSharedPool = (m_engine != nullptr);
     if (!m_engine) {

View File

@@ -57,7 +57,7 @@ bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
                        options.maxBatchSize };
     m_engine = EnginePoolManager<float>::instance().acquire(
         m_poolKey, options, onnxPath,
-        kDetSubVals, kDetDivVals, true, -1);
+        kDetSubVals, kDetDivVals, true, getPoolMaxSlotsPerGpu());
     m_usingSharedPool = (m_engine != nullptr);
     if (!m_engine) {

View File

@@ -52,7 +52,7 @@ bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string&
                        options.maxBatchSize };
     m_engine = EnginePoolManager<float>::instance().acquire(
         m_poolKey, options, onnxPath,
-        kRecSubVals, kRecDivVals, true, -1);
+        kRecSubVals, kRecDivVals, true, getPoolMaxSlotsPerGpu());
     m_usingSharedPool = (m_engine != nullptr);
     if (!m_engine) {

View File

@@ -7,6 +7,10 @@
 #include <algorithm>
 #include <numeric>
 #include <cmath>
+#include <climits>
+#include <mutex>
+#include <iostream>
 #include <cuda_runtime.h>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
@@ -95,6 +99,38 @@ struct OCRPredictResult {
 // Utility functions
 // ============================================================================
+// Determine maxSlotsPerGpu based on GPU topology:
+//   1 GPU                 → 1  (single slot, no round-robin needed)
+//   >1 GPU, VRAM < 24 GB  → 1  (round-robin: 1 slot per GPU)
+//   >1 GPU, VRAM ≥ 24 GB  → -1 (elastic: on-demand slot growth)
+// The result is cached after the first query.
+inline int getPoolMaxSlotsPerGpu() {
+    static int s_result = INT_MIN; // INT_MIN acts as the "not yet computed" sentinel
+    static std::mutex s_mutex;
+    std::lock_guard<std::mutex> lk(s_mutex);
+    if (s_result != INT_MIN) return s_result;
+    int gpuCount = 0;
+    cudaGetDeviceCount(&gpuCount);
+    if (gpuCount <= 1) {
+        s_result = 1;
+        std::cout << "Info [OCR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
+        return s_result;
+    }
+    // Multiple GPUs: assumed identical, so device 0 is representative.
+    // cudaGetDeviceProperties reads total VRAM without switching the calling
+    // thread's current device (cudaSetDevice + cudaMemGetInfo would leave device 0 selected).
+    constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024; // 24 GB
+    cudaDeviceProp prop{};
+    cudaGetDeviceProperties(&prop, 0);
+    if (prop.totalGlobalMem >= kLargeVramBytes) {
+        s_result = -1;
+        std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
+    } else {
+        s_result = 1;
+        std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
+    }
+    return s_result;
+}
 // Load character dictionary from file
 inline std::vector<std::string> LoadDict(const std::string& dictPath) {
     std::vector<std::string> keys;