diff --git a/modules/ANSFR/ANSFR.cpp b/modules/ANSFR/ANSFR.cpp index 7f2e899..5ba1719 100644 --- a/modules/ANSFR/ANSFR.cpp +++ b/modules/ANSFR/ANSFR.cpp @@ -757,6 +757,7 @@ namespace ANSCENTER { else { _detector = std::make_unique(); } + _detector->SetMaxSlotsPerGpu(m_maxSlotsPerGpu); // LOCK DURING INITIALIZATION bool initSuccess; @@ -796,6 +797,7 @@ namespace ANSCENTER { try { // Create recognizer instance _recognizer = std::make_unique(); + _recognizer->SetMaxSlotsPerGpu(m_maxSlotsPerGpu); // Configure model ModelConfig recognizerConfig; diff --git a/modules/ANSFR/ANSFR.h b/modules/ANSFR/ANSFR.h index 6b30a3a..7b683de 100644 --- a/modules/ANSFR/ANSFR.h +++ b/modules/ANSFR/ANSFR.h @@ -121,7 +121,9 @@ namespace ANSCENTER ~ANSFacialRecognition() noexcept; void UnloadEngine(); void Destroy(); + void SetMaxSlotsPerGpu(int n) { m_maxSlotsPerGpu = n; } private: + int m_maxSlotsPerGpu{ 1 }; // set by dllmain based on GPU topology int GetUser(int userId, UserRecord& userRecord); int GetUser(int userId, const std::string& userCode,const std::string& userName, UserRecord& userRecord); int GetUsers(std::vector& userRecords, std::vector& userIds); diff --git a/modules/ANSFR/ANSFaceRecognizer.h b/modules/ANSFR/ANSFaceRecognizer.h index 8f205d0..b3f2240 100644 --- a/modules/ANSFR/ANSFaceRecognizer.h +++ b/modules/ANSFR/ANSFaceRecognizer.h @@ -120,7 +120,7 @@ namespace ANSCENTER { std::shared_ptr> m_trtEngine = nullptr; // NVIDIA TensorRT EnginePoolManager::PoolKey m_poolKey; bool m_usingSharedPool = false; - int m_maxSlotsPerGpu{ -1 }; // -1 = elastic mode (on-demand slots, auto-cleanup) + int m_maxSlotsPerGpu{ 1 }; // 1 = single slot (default); set by dllmain based on GPU topology void SetMaxSlotsPerGpu(int n) override { m_maxSlotsPerGpu = n; } std::shared_ptr faiss_index; std::shared_ptr m_gpuResources; diff --git a/modules/ANSFR/dllmain.cpp b/modules/ANSFR/dllmain.cpp index fc9c8e6..4e40909 100644 --- a/modules/ANSFR/dllmain.cpp +++ b/modules/ANSFR/dllmain.cpp @@ -9,7 +9,10 
@@
 #include "FaceNet.h"
 #include "ANSFaceRecognizer.h"
 #include "ANSLibsLoader.h"
+#include "engine/TRTEngineCache.h"
+#include "engine/EnginePoolManager.h"
 #include
+#include
 #include
 #include
 #include
@@ -93,6 +96,36 @@ public:
     FRHandleGuard& operator=(const FRHandleGuard&) = delete;
 };
 
+// Determine maxSlotsPerGpu based on GPU topology (computed once, then cached under a mutex):
+//   1 GPU (or CUDA query failed) → 1  (single slot, no round-robin needed)
+//   >1 GPU, VRAM<24GB            → 1  (round-robin: 1 slot per GPU)
+//   >1 GPU, VRAM≥24GB            → -1 (elastic: on-demand slot growth); VRAM is read from device 0 only
+static int GetPoolMaxSlotsPerGpu() {
+    static int s_result = INT_MIN; // INT_MIN marks "not computed yet"
+    static std::mutex s_mutex;
+    std::lock_guard<std::mutex> lk(s_mutex); // serialises the first query against concurrent callers
+    if (s_result != INT_MIN) return s_result;
+    int gpuCount = 0;
+    if (cudaGetDeviceCount(&gpuCount) != cudaSuccess) gpuCount = 0; // query failed: fall back to the single-slot default
+    if (gpuCount <= 1) {
+        s_result = 1;
+        std::cout << "Info [FR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
+        return s_result;
+    }
+    // Decimal GB on purpose: nominal "24 GB" boards report slightly under 24 GiB, so a GiB threshold never matches them.
+    constexpr size_t kLargeVramBytes = 24ULL * 1000 * 1000 * 1000; // 24 GB
+    cudaDeviceProp prop0{}; // property query instead of cudaSetDevice(0)+cudaMemGetInfo: the caller's current device is not changed
+    const size_t totalMem = (cudaGetDeviceProperties(&prop0, 0) == cudaSuccess) ? prop0.totalGlobalMem : 0; // 0 on failure → round-robin
+    if (totalMem >= kLargeVramBytes) {
+        s_result = -1;
+        std::cout << "Info [FR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
+    } else {
+        s_result = 1;
+        std::cout << "Info [FR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
+    }
+    return s_result;
+}
+
 BOOL APIENTRY DllMain( HMODULE hModule,
                        DWORD ul_reason_for_call,
                        LPVOID lpReserved
@@ -117,8 +150,14 @@ BOOL APIENTRY DllMain( HMODULE hModule,
     case DLL_THREAD_DETACH:
         break;
     case DLL_PROCESS_DETACH:
-        // Clean up any handles that LabVIEW didn't release before closing.
-        // Without this, idle-timer threads keep the process alive indefinitely.
+        // ExitProcess: OS killed worker threads, CUDA context is dead.
+        // Set flag so Engine/Pool destructors skip CUDA cleanup.
+ if (lpReserved != nullptr) { + g_processExiting().store(true, std::memory_order_relaxed); + break; + } + + // Dynamic FreeLibrary — threads are still alive, safe to clean up. try { std::vector leakedHandles; { @@ -130,6 +169,8 @@ BOOL APIENTRY DllMain( HMODULE hModule, for (auto* h : leakedHandles) { try { h->Destroy(); delete h; } catch (...) {} } + try { EnginePoolManager::instance().clearAll(); } catch (...) {} + try { TRTEngineCache::instance().clearAll(); } catch (...) {} } catch (...) {} break; } @@ -185,6 +226,8 @@ extern "C" ANSFR_API int CreateANSRFHandle(ANSCENTER::ANSFacialRecognition** const bool _enableFaceLiveness = (enableFaceLiveness == 1); const bool _enableAntiSpoofing = (enableAntiSpoofing == 1); + ptr->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu()); + int result = ptr->Initialize(licenseKey, configFilePath, databaseFilePath, diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRClassifier.cpp b/modules/ANSOCR/ANSRTOCR/RTOCRClassifier.cpp index 9faa622..2da68da 100644 --- a/modules/ANSOCR/ANSRTOCR/RTOCRClassifier.cpp +++ b/modules/ANSOCR/ANSRTOCR/RTOCRClassifier.cpp @@ -40,7 +40,7 @@ bool RTOCRClassifier::Initialize(const std::string& onnxPath, int gpuId, options.maxBatchSize }; m_engine = EnginePoolManager::instance().acquire( m_poolKey, options, onnxPath, - kClsSubVals, kClsDivVals, true, -1); + kClsSubVals, kClsDivVals, true, getPoolMaxSlotsPerGpu()); m_usingSharedPool = (m_engine != nullptr); if (!m_engine) { diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp b/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp index 9d4951f..41c290c 100644 --- a/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp +++ b/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp @@ -57,7 +57,7 @@ bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId, options.maxBatchSize }; m_engine = EnginePoolManager::instance().acquire( m_poolKey, options, onnxPath, - kDetSubVals, kDetDivVals, true, -1); + kDetSubVals, kDetDivVals, true, getPoolMaxSlotsPerGpu()); m_usingSharedPool = (m_engine != 
nullptr); if (!m_engine) { diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp b/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp index 551440b..758963d 100644 --- a/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp +++ b/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp @@ -52,7 +52,7 @@ bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string& options.maxBatchSize }; m_engine = EnginePoolManager::instance().acquire( m_poolKey, options, onnxPath, - kRecSubVals, kRecDivVals, true, -1); + kRecSubVals, kRecDivVals, true, getPoolMaxSlotsPerGpu()); m_usingSharedPool = (m_engine != nullptr); if (!m_engine) { diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRTypes.h b/modules/ANSOCR/ANSRTOCR/RTOCRTypes.h index 1ae3bc6..5395ded 100644 --- a/modules/ANSOCR/ANSRTOCR/RTOCRTypes.h +++ b/modules/ANSOCR/ANSRTOCR/RTOCRTypes.h @@ -7,6 +7,10 @@ #include #include #include +#include +#include +#include +#include #include #include @@ -95,6 +99,38 @@ struct OCRPredictResult { // Utility functions // ============================================================================ +// Determine maxSlotsPerGpu based on GPU topology: +// 1 GPU → 1 (single slot, no round-robin needed) +// >1 GPU, VRAM<24GB → 1 (round-robin: 1 slot per GPU) +// >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth) +// Result is cached after the first query. 
+inline int getPoolMaxSlotsPerGpu() {
+    static int s_result = INT_MIN; // INT_MIN marks "not computed yet"
+    static std::mutex s_mutex;
+    std::lock_guard<std::mutex> lk(s_mutex); // serialises the first query against concurrent callers
+    if (s_result != INT_MIN) return s_result;
+    int gpuCount = 0;
+    if (cudaGetDeviceCount(&gpuCount) != cudaSuccess) gpuCount = 0; // query failed: fall back to the single-slot default
+    if (gpuCount <= 1) {
+        s_result = 1;
+        std::cout << "Info [OCR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
+        return s_result;
+    }
+    // Multiple GPUs — check VRAM of device 0 only (GPUs are assumed same spec).
+    // Decimal GB on purpose: nominal "24 GB" boards report slightly under 24 GiB, so a GiB threshold never matches them.
+    constexpr size_t kLargeVramBytes = 24ULL * 1000 * 1000 * 1000; // 24 GB
+    cudaDeviceProp prop0{}; // property query instead of cudaSetDevice(0)+cudaMemGetInfo: the caller's current device is not changed
+    const size_t totalMem = (cudaGetDeviceProperties(&prop0, 0) == cudaSuccess) ? prop0.totalGlobalMem : 0; // 0 on failure → round-robin
+    if (totalMem >= kLargeVramBytes) {
+        s_result = -1;
+        std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
+    } else {
+        s_result = 1;
+        std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
+    }
+    return s_result;
+}
+
 // Load character dictionary from file
 inline std::vector LoadDict(const std::string& dictPath) {
     std::vector keys;
diff --git a/modules/ANSODEngine/dllmain.cpp b/modules/ANSODEngine/dllmain.cpp
index 7bf72d9..56dada8 100644
--- a/modules/ANSODEngine/dllmain.cpp
+++ b/modules/ANSODEngine/dllmain.cpp
@@ -5,6 +5,7 @@
 #include "ANSGpuFrameRegistry.h" // gpu_frame_lookup(cv::Mat*)
 #include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH
 #include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
+#include <climits> // INT_MIN
 
 // Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
 // Defined here, declared extern in EngineBuildLoadNetwork.inl.
@@ -96,6 +97,37 @@ static int GetNumGPUs() {
     return g_numGPUs;
 }
 
+// Determine maxSlotsPerGpu based on GPU topology:
+// 1 GPU → 1 (single slot, no round-robin needed)
+// >1 GPU, VRAM<24GB → 1 (round-robin: 1 slot per GPU)
+// >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
+// Result is cached after the first query.
+static int GetPoolMaxSlotsPerGpu() {
+    static int s_result = INT_MIN; // INT_MIN marks "not computed yet"
+    static std::mutex s_mutex;
+    std::lock_guard<std::mutex> lk(s_mutex); // serialises the first query against concurrent callers
+    if (s_result != INT_MIN) return s_result;
+    const int n = GetNumGPUs();
+    if (n <= 1) {
+        s_result = 1;
+        std::cout << "Info [GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
+        return s_result;
+    }
+    // Multiple GPUs — check VRAM of device 0 only (GPUs are assumed same spec).
+    // Decimal GB on purpose: nominal "24 GB" boards report slightly under 24 GiB, so a GiB threshold never matches them.
+    constexpr size_t kLargeVramBytes = 24ULL * 1000 * 1000 * 1000; // 24 GB
+    cudaDeviceProp prop0{}; // property query instead of cudaSetDevice(0)+cudaMemGetInfo: the GPU already assigned to this handle stays current
+    const size_t totalMem = (cudaGetDeviceProperties(&prop0, 0) == cudaSuccess) ? prop0.totalGlobalMem : 0; // 0 on failure → round-robin
+    if (totalMem >= kLargeVramBytes) {
+        s_result = -1;
+        std::cout << "Info [GPU]: " << n << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
+    } else {
+        s_result = 1;
+        std::cout << "Info [GPU]: " << n << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
+    }
+    return s_result;
+}
+
 // Returns the next GPU index in round-robin order.
 // Thread-safe: uses atomic fetch_add.
 static int AssignNextGPU() {
@@ -588,6 +620,7 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
     CheckGPUVRAM(assignedGPU);
     RegisterODHandle(*Handle);
+    (*Handle)->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
     (*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
     bool loadResult = (*Handle)->Initialize(licenseKey, modelConfig, modelFilePath, modelFileZipPassword, labelMap);
     return labelMap;
@@ -894,6 +927,7 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
     CheckGPUVRAM(assignedGPU);
     RegisterODHandle(*Handle);
+    (*Handle)->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
     (*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
     bool result = (*Handle)->LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
     if (result) return 1;