Fix how maxSlotsPerGpu (engine-pool slots per GPU) is chosen from the GPU topology:

Condition	maxSlotsPerGpu	Behavior
OptimizeModelStr	0	Bypass: non-shared temporary engine
1 GPU	1	Single slot, no round-robin
>1 GPU, VRAM < 24 GB	1	Round-robin: 1 slot per GPU
>1 GPU, VRAM >= 24 GB	-1	Elastic: on-demand slot growth
This commit is contained in:
2026-03-30 09:59:09 +11:00
parent 01eabf76bd
commit c1b919ec47
9 changed files with 123 additions and 6 deletions

View File

@@ -757,6 +757,7 @@ namespace ANSCENTER {
else { else {
_detector = std::make_unique<ANSOVFD>(); _detector = std::make_unique<ANSOVFD>();
} }
_detector->SetMaxSlotsPerGpu(m_maxSlotsPerGpu);
// LOCK DURING INITIALIZATION // LOCK DURING INITIALIZATION
bool initSuccess; bool initSuccess;
@@ -796,6 +797,7 @@ namespace ANSCENTER {
try { try {
// Create recognizer instance // Create recognizer instance
_recognizer = std::make_unique<ANSFaceRecognizer>(); _recognizer = std::make_unique<ANSFaceRecognizer>();
_recognizer->SetMaxSlotsPerGpu(m_maxSlotsPerGpu);
// Configure model // Configure model
ModelConfig recognizerConfig; ModelConfig recognizerConfig;

View File

@@ -121,7 +121,9 @@ namespace ANSCENTER
~ANSFacialRecognition() noexcept; ~ANSFacialRecognition() noexcept;
void UnloadEngine(); void UnloadEngine();
void Destroy(); void Destroy();
// Stores the engine-pool slot limit in m_maxSlotsPerGpu.
// NOTE(review): the stored value appears to be forwarded to the detector and
// recognizer when they are created, so this presumably has to be called
// before Initialize() — confirm against the callers.
void SetMaxSlotsPerGpu(int n) { m_maxSlotsPerGpu = n; }
private: private:
int m_maxSlotsPerGpu{ 1 }; // set by dllmain based on GPU topology
int GetUser(int userId, UserRecord& userRecord); int GetUser(int userId, UserRecord& userRecord);
int GetUser(int userId, const std::string& userCode,const std::string& userName, UserRecord& userRecord); int GetUser(int userId, const std::string& userCode,const std::string& userName, UserRecord& userRecord);
int GetUsers(std::vector<UserRecord>& userRecords, std::vector<int>& userIds); int GetUsers(std::vector<UserRecord>& userRecords, std::vector<int>& userIds);

View File

@@ -120,7 +120,7 @@ namespace ANSCENTER {
std::shared_ptr<Engine<float>> m_trtEngine = nullptr; // NVIDIA TensorRT std::shared_ptr<Engine<float>> m_trtEngine = nullptr; // NVIDIA TensorRT
EnginePoolManager<float>::PoolKey m_poolKey; EnginePoolManager<float>::PoolKey m_poolKey;
bool m_usingSharedPool = false; bool m_usingSharedPool = false;
int m_maxSlotsPerGpu{ -1 }; // -1 = elastic mode (on-demand slots, auto-cleanup) int m_maxSlotsPerGpu{ 1 }; // 1 = single slot (default); set by dllmain based on GPU topology
void SetMaxSlotsPerGpu(int n) override { m_maxSlotsPerGpu = n; } void SetMaxSlotsPerGpu(int n) override { m_maxSlotsPerGpu = n; }
std::shared_ptr<faiss::IndexIDMap> faiss_index; std::shared_ptr<faiss::IndexIDMap> faiss_index;
std::shared_ptr<faiss::gpu::StandardGpuResources> m_gpuResources; std::shared_ptr<faiss::gpu::StandardGpuResources> m_gpuResources;

View File

@@ -9,7 +9,10 @@
#include "FaceNet.h" #include "FaceNet.h"
#include "ANSFaceRecognizer.h" #include "ANSFaceRecognizer.h"
#include "ANSLibsLoader.h" #include "ANSLibsLoader.h"
#include "engine/TRTEngineCache.h"
#include "engine/EnginePoolManager.h"
#include <memory> #include <memory>
#include <climits>
#include <unordered_map> #include <unordered_map>
#include <condition_variable> #include <condition_variable>
#include <cstdint> #include <cstdint>
@@ -93,6 +96,36 @@ public:
FRHandleGuard& operator=(const FRHandleGuard&) = delete; FRHandleGuard& operator=(const FRHandleGuard&) = delete;
}; };
// Determine maxSlotsPerGpu based on GPU topology:
// 1 GPU → 1 (single slot, no round-robin needed)
// >1 GPU, VRAM<24GB → 1 (round-robin: 1 slot per GPU)
// >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
static int GetPoolMaxSlotsPerGpu() {
static int s_result = INT_MIN;
static std::mutex s_mutex;
std::lock_guard<std::mutex> lk(s_mutex);
if (s_result != INT_MIN) return s_result;
int gpuCount = 0;
cudaGetDeviceCount(&gpuCount);
if (gpuCount <= 1) {
s_result = 1;
std::cout << "Info [FR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
return s_result;
}
constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024; // 24 GB
size_t totalMem = 0, freeMem = 0;
cudaSetDevice(0);
cudaMemGetInfo(&freeMem, &totalMem);
if (totalMem >= kLargeVramBytes) {
s_result = -1;
std::cout << "Info [FR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
} else {
s_result = 1;
std::cout << "Info [FR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
}
return s_result;
}
BOOL APIENTRY DllMain( HMODULE hModule, BOOL APIENTRY DllMain( HMODULE hModule,
DWORD ul_reason_for_call, DWORD ul_reason_for_call,
LPVOID lpReserved LPVOID lpReserved
@@ -117,8 +150,14 @@ BOOL APIENTRY DllMain( HMODULE hModule,
case DLL_THREAD_DETACH: case DLL_THREAD_DETACH:
break; break;
case DLL_PROCESS_DETACH: case DLL_PROCESS_DETACH:
// Clean up any handles that LabVIEW didn't release before closing. // ExitProcess: OS killed worker threads, CUDA context is dead.
// Without this, idle-timer threads keep the process alive indefinitely. // Set flag so Engine/Pool destructors skip CUDA cleanup.
if (lpReserved != nullptr) {
g_processExiting().store(true, std::memory_order_relaxed);
break;
}
// Dynamic FreeLibrary — threads are still alive, safe to clean up.
try { try {
std::vector<ANSCENTER::ANSFacialRecognition*> leakedHandles; std::vector<ANSCENTER::ANSFacialRecognition*> leakedHandles;
{ {
@@ -130,6 +169,8 @@ BOOL APIENTRY DllMain( HMODULE hModule,
for (auto* h : leakedHandles) { for (auto* h : leakedHandles) {
try { h->Destroy(); delete h; } catch (...) {} try { h->Destroy(); delete h; } catch (...) {}
} }
try { EnginePoolManager<float>::instance().clearAll(); } catch (...) {}
try { TRTEngineCache::instance().clearAll(); } catch (...) {}
} catch (...) {} } catch (...) {}
break; break;
} }
@@ -185,6 +226,8 @@ extern "C" ANSFR_API int CreateANSRFHandle(ANSCENTER::ANSFacialRecognition**
const bool _enableFaceLiveness = (enableFaceLiveness == 1); const bool _enableFaceLiveness = (enableFaceLiveness == 1);
const bool _enableAntiSpoofing = (enableAntiSpoofing == 1); const bool _enableAntiSpoofing = (enableAntiSpoofing == 1);
ptr->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
int result = ptr->Initialize(licenseKey, int result = ptr->Initialize(licenseKey,
configFilePath, configFilePath,
databaseFilePath, databaseFilePath,

View File

@@ -40,7 +40,7 @@ bool RTOCRClassifier::Initialize(const std::string& onnxPath, int gpuId,
options.maxBatchSize }; options.maxBatchSize };
m_engine = EnginePoolManager<float>::instance().acquire( m_engine = EnginePoolManager<float>::instance().acquire(
m_poolKey, options, onnxPath, m_poolKey, options, onnxPath,
kClsSubVals, kClsDivVals, true, -1); kClsSubVals, kClsDivVals, true, getPoolMaxSlotsPerGpu());
m_usingSharedPool = (m_engine != nullptr); m_usingSharedPool = (m_engine != nullptr);
if (!m_engine) { if (!m_engine) {

View File

@@ -57,7 +57,7 @@ bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
options.maxBatchSize }; options.maxBatchSize };
m_engine = EnginePoolManager<float>::instance().acquire( m_engine = EnginePoolManager<float>::instance().acquire(
m_poolKey, options, onnxPath, m_poolKey, options, onnxPath,
kDetSubVals, kDetDivVals, true, -1); kDetSubVals, kDetDivVals, true, getPoolMaxSlotsPerGpu());
m_usingSharedPool = (m_engine != nullptr); m_usingSharedPool = (m_engine != nullptr);
if (!m_engine) { if (!m_engine) {

View File

@@ -52,7 +52,7 @@ bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string&
options.maxBatchSize }; options.maxBatchSize };
m_engine = EnginePoolManager<float>::instance().acquire( m_engine = EnginePoolManager<float>::instance().acquire(
m_poolKey, options, onnxPath, m_poolKey, options, onnxPath,
kRecSubVals, kRecDivVals, true, -1); kRecSubVals, kRecDivVals, true, getPoolMaxSlotsPerGpu());
m_usingSharedPool = (m_engine != nullptr); m_usingSharedPool = (m_engine != nullptr);
if (!m_engine) { if (!m_engine) {

View File

@@ -7,6 +7,10 @@
#include <algorithm> #include <algorithm>
#include <numeric> #include <numeric>
#include <cmath> #include <cmath>
#include <climits>
#include <mutex>
#include <iostream>
#include <cuda_runtime.h>
#include <opencv2/core.hpp> #include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp> #include <opencv2/imgproc.hpp>
@@ -95,6 +99,38 @@ struct OCRPredictResult {
// Utility functions // Utility functions
// ============================================================================ // ============================================================================
// Determine maxSlotsPerGpu based on GPU topology:
// 1 GPU → 1 (single slot, no round-robin needed)
// >1 GPU, VRAM<24GB → 1 (round-robin: 1 slot per GPU)
// >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
// Result is cached after the first query.
inline int getPoolMaxSlotsPerGpu() {
static int s_result = INT_MIN;
static std::mutex s_mutex;
std::lock_guard<std::mutex> lk(s_mutex);
if (s_result != INT_MIN) return s_result;
int gpuCount = 0;
cudaGetDeviceCount(&gpuCount);
if (gpuCount <= 1) {
s_result = 1;
std::cout << "Info [OCR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
return s_result;
}
// Multiple GPUs — check VRAM (GPUs are assumed same spec)
constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024; // 24 GB
size_t totalMem = 0, freeMem = 0;
cudaSetDevice(0);
cudaMemGetInfo(&freeMem, &totalMem);
if (totalMem >= kLargeVramBytes) {
s_result = -1;
std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
} else {
s_result = 1;
std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
}
return s_result;
}
// Load character dictionary from file // Load character dictionary from file
inline std::vector<std::string> LoadDict(const std::string& dictPath) { inline std::vector<std::string> LoadDict(const std::string& dictPath) {
std::vector<std::string> keys; std::vector<std::string> keys;

View File

@@ -5,6 +5,7 @@
#include "ANSGpuFrameRegistry.h" // gpu_frame_lookup(cv::Mat*) #include "ANSGpuFrameRegistry.h" // gpu_frame_lookup(cv::Mat*)
#include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH #include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH #include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
#include <climits> // INT_MIN
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers). // Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
// Defined here, declared extern in EngineBuildLoadNetwork.inl. // Defined here, declared extern in EngineBuildLoadNetwork.inl.
@@ -96,6 +97,37 @@ static int GetNumGPUs() {
return g_numGPUs; return g_numGPUs;
} }
// Determine maxSlotsPerGpu based on GPU topology:
// 1 GPU → 1 (single slot, no round-robin needed)
// >1 GPU, VRAM<24GB → 1 (round-robin: 1 slot per GPU)
// >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
// Result is cached after the first query.
static int GetPoolMaxSlotsPerGpu() {
static int s_result = INT_MIN;
static std::mutex s_mutex;
std::lock_guard<std::mutex> lk(s_mutex);
if (s_result != INT_MIN) return s_result;
const int n = GetNumGPUs();
if (n <= 1) {
s_result = 1;
std::cout << "Info [GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
return s_result;
}
// Multiple GPUs — check VRAM (GPUs are assumed same spec)
constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024; // 24 GB
size_t totalMem = 0, freeMem = 0;
cudaSetDevice(0);
cudaMemGetInfo(&freeMem, &totalMem);
if (totalMem >= kLargeVramBytes) {
s_result = -1;
std::cout << "Info [GPU]: " << n << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
} else {
s_result = 1;
std::cout << "Info [GPU]: " << n << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
}
return s_result;
}
// Returns the next GPU index in round-robin order. // Returns the next GPU index in round-robin order.
// Thread-safe: uses atomic fetch_add. // Thread-safe: uses atomic fetch_add.
static int AssignNextGPU() { static int AssignNextGPU() {
@@ -588,6 +620,7 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
CheckGPUVRAM(assignedGPU); CheckGPUVRAM(assignedGPU);
RegisterODHandle(*Handle); RegisterODHandle(*Handle);
(*Handle)->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
(*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately (*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
bool loadResult = (*Handle)->Initialize(licenseKey, modelConfig, modelFilePath, modelFileZipPassword, labelMap); bool loadResult = (*Handle)->Initialize(licenseKey, modelConfig, modelFilePath, modelFileZipPassword, labelMap);
return labelMap; return labelMap;
@@ -894,6 +927,7 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
CheckGPUVRAM(assignedGPU); CheckGPUVRAM(assignedGPU);
RegisterODHandle(*Handle); RegisterODHandle(*Handle);
(*Handle)->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
(*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately (*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
bool result = (*Handle)->LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap); bool result = (*Handle)->LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
if (result) return 1; if (result) return 1;