Fix GPU pool-slot selection behaviour:

| Condition              | maxSlotsPerGpu | Behavior                            |
|------------------------|----------------|-------------------------------------|
| OptimizeModelStr       | 0              | Bypass: non-shared temporary engine |
| 1 GPU                  | 1              | Single slot, no round-robin         |
| >1 GPU, VRAM < 24 GB   | 1              | Round-robin: 1 slot per GPU         |
| >1 GPU, VRAM >= 24 GB  | -1             | Elastic: on-demand slot growth      |
This commit is contained in:
@@ -757,6 +757,7 @@ namespace ANSCENTER {
|
|||||||
else {
|
else {
|
||||||
_detector = std::make_unique<ANSOVFD>();
|
_detector = std::make_unique<ANSOVFD>();
|
||||||
}
|
}
|
||||||
|
_detector->SetMaxSlotsPerGpu(m_maxSlotsPerGpu);
|
||||||
|
|
||||||
// LOCK DURING INITIALIZATION
|
// LOCK DURING INITIALIZATION
|
||||||
bool initSuccess;
|
bool initSuccess;
|
||||||
@@ -796,6 +797,7 @@ namespace ANSCENTER {
|
|||||||
try {
|
try {
|
||||||
// Create recognizer instance
|
// Create recognizer instance
|
||||||
_recognizer = std::make_unique<ANSFaceRecognizer>();
|
_recognizer = std::make_unique<ANSFaceRecognizer>();
|
||||||
|
_recognizer->SetMaxSlotsPerGpu(m_maxSlotsPerGpu);
|
||||||
|
|
||||||
// Configure model
|
// Configure model
|
||||||
ModelConfig recognizerConfig;
|
ModelConfig recognizerConfig;
|
||||||
|
|||||||
@@ -121,7 +121,9 @@ namespace ANSCENTER
|
|||||||
~ANSFacialRecognition() noexcept;
|
~ANSFacialRecognition() noexcept;
|
||||||
void UnloadEngine();
|
void UnloadEngine();
|
||||||
void Destroy();
|
void Destroy();
|
||||||
|
void SetMaxSlotsPerGpu(int n) { m_maxSlotsPerGpu = n; }
|
||||||
private:
|
private:
|
||||||
|
int m_maxSlotsPerGpu{ 1 }; // set by dllmain based on GPU topology
|
||||||
int GetUser(int userId, UserRecord& userRecord);
|
int GetUser(int userId, UserRecord& userRecord);
|
||||||
int GetUser(int userId, const std::string& userCode,const std::string& userName, UserRecord& userRecord);
|
int GetUser(int userId, const std::string& userCode,const std::string& userName, UserRecord& userRecord);
|
||||||
int GetUsers(std::vector<UserRecord>& userRecords, std::vector<int>& userIds);
|
int GetUsers(std::vector<UserRecord>& userRecords, std::vector<int>& userIds);
|
||||||
|
|||||||
@@ -120,7 +120,7 @@ namespace ANSCENTER {
|
|||||||
std::shared_ptr<Engine<float>> m_trtEngine = nullptr; // NVIDIA TensorRT
|
std::shared_ptr<Engine<float>> m_trtEngine = nullptr; // NVIDIA TensorRT
|
||||||
EnginePoolManager<float>::PoolKey m_poolKey;
|
EnginePoolManager<float>::PoolKey m_poolKey;
|
||||||
bool m_usingSharedPool = false;
|
bool m_usingSharedPool = false;
|
||||||
int m_maxSlotsPerGpu{ -1 }; // -1 = elastic mode (on-demand slots, auto-cleanup)
|
int m_maxSlotsPerGpu{ 1 }; // 1 = single slot (default); set by dllmain based on GPU topology
|
||||||
void SetMaxSlotsPerGpu(int n) override { m_maxSlotsPerGpu = n; }
|
void SetMaxSlotsPerGpu(int n) override { m_maxSlotsPerGpu = n; }
|
||||||
std::shared_ptr<faiss::IndexIDMap> faiss_index;
|
std::shared_ptr<faiss::IndexIDMap> faiss_index;
|
||||||
std::shared_ptr<faiss::gpu::StandardGpuResources> m_gpuResources;
|
std::shared_ptr<faiss::gpu::StandardGpuResources> m_gpuResources;
|
||||||
|
|||||||
@@ -9,7 +9,10 @@
|
|||||||
#include "FaceNet.h"
|
#include "FaceNet.h"
|
||||||
#include "ANSFaceRecognizer.h"
|
#include "ANSFaceRecognizer.h"
|
||||||
#include "ANSLibsLoader.h"
|
#include "ANSLibsLoader.h"
|
||||||
|
#include "engine/TRTEngineCache.h"
|
||||||
|
#include "engine/EnginePoolManager.h"
|
||||||
#include <memory>
|
#include <memory>
|
||||||
|
#include <climits>
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <condition_variable>
|
#include <condition_variable>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@@ -93,6 +96,36 @@ public:
|
|||||||
FRHandleGuard& operator=(const FRHandleGuard&) = delete;
|
FRHandleGuard& operator=(const FRHandleGuard&) = delete;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Determine maxSlotsPerGpu based on GPU topology:
|
||||||
|
// 1 GPU → 1 (single slot, no round-robin needed)
|
||||||
|
// >1 GPU, VRAM<24GB → 1 (round-robin: 1 slot per GPU)
|
||||||
|
// >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
|
||||||
|
static int GetPoolMaxSlotsPerGpu() {
|
||||||
|
static int s_result = INT_MIN;
|
||||||
|
static std::mutex s_mutex;
|
||||||
|
std::lock_guard<std::mutex> lk(s_mutex);
|
||||||
|
if (s_result != INT_MIN) return s_result;
|
||||||
|
int gpuCount = 0;
|
||||||
|
cudaGetDeviceCount(&gpuCount);
|
||||||
|
if (gpuCount <= 1) {
|
||||||
|
s_result = 1;
|
||||||
|
std::cout << "Info [FR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
|
||||||
|
return s_result;
|
||||||
|
}
|
||||||
|
constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024; // 24 GB
|
||||||
|
size_t totalMem = 0, freeMem = 0;
|
||||||
|
cudaSetDevice(0);
|
||||||
|
cudaMemGetInfo(&freeMem, &totalMem);
|
||||||
|
if (totalMem >= kLargeVramBytes) {
|
||||||
|
s_result = -1;
|
||||||
|
std::cout << "Info [FR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
|
||||||
|
} else {
|
||||||
|
s_result = 1;
|
||||||
|
std::cout << "Info [FR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
|
||||||
|
}
|
||||||
|
return s_result;
|
||||||
|
}
|
||||||
|
|
||||||
BOOL APIENTRY DllMain( HMODULE hModule,
|
BOOL APIENTRY DllMain( HMODULE hModule,
|
||||||
DWORD ul_reason_for_call,
|
DWORD ul_reason_for_call,
|
||||||
LPVOID lpReserved
|
LPVOID lpReserved
|
||||||
@@ -117,8 +150,14 @@ BOOL APIENTRY DllMain( HMODULE hModule,
|
|||||||
case DLL_THREAD_DETACH:
|
case DLL_THREAD_DETACH:
|
||||||
break;
|
break;
|
||||||
case DLL_PROCESS_DETACH:
|
case DLL_PROCESS_DETACH:
|
||||||
// Clean up any handles that LabVIEW didn't release before closing.
|
// ExitProcess: OS killed worker threads, CUDA context is dead.
|
||||||
// Without this, idle-timer threads keep the process alive indefinitely.
|
// Set flag so Engine/Pool destructors skip CUDA cleanup.
|
||||||
|
if (lpReserved != nullptr) {
|
||||||
|
g_processExiting().store(true, std::memory_order_relaxed);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dynamic FreeLibrary — threads are still alive, safe to clean up.
|
||||||
try {
|
try {
|
||||||
std::vector<ANSCENTER::ANSFacialRecognition*> leakedHandles;
|
std::vector<ANSCENTER::ANSFacialRecognition*> leakedHandles;
|
||||||
{
|
{
|
||||||
@@ -130,6 +169,8 @@ BOOL APIENTRY DllMain( HMODULE hModule,
|
|||||||
for (auto* h : leakedHandles) {
|
for (auto* h : leakedHandles) {
|
||||||
try { h->Destroy(); delete h; } catch (...) {}
|
try { h->Destroy(); delete h; } catch (...) {}
|
||||||
}
|
}
|
||||||
|
try { EnginePoolManager<float>::instance().clearAll(); } catch (...) {}
|
||||||
|
try { TRTEngineCache::instance().clearAll(); } catch (...) {}
|
||||||
} catch (...) {}
|
} catch (...) {}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -185,6 +226,8 @@ extern "C" ANSFR_API int CreateANSRFHandle(ANSCENTER::ANSFacialRecognition**
|
|||||||
const bool _enableFaceLiveness = (enableFaceLiveness == 1);
|
const bool _enableFaceLiveness = (enableFaceLiveness == 1);
|
||||||
const bool _enableAntiSpoofing = (enableAntiSpoofing == 1);
|
const bool _enableAntiSpoofing = (enableAntiSpoofing == 1);
|
||||||
|
|
||||||
|
ptr->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
|
||||||
|
|
||||||
int result = ptr->Initialize(licenseKey,
|
int result = ptr->Initialize(licenseKey,
|
||||||
configFilePath,
|
configFilePath,
|
||||||
databaseFilePath,
|
databaseFilePath,
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ bool RTOCRClassifier::Initialize(const std::string& onnxPath, int gpuId,
|
|||||||
options.maxBatchSize };
|
options.maxBatchSize };
|
||||||
m_engine = EnginePoolManager<float>::instance().acquire(
|
m_engine = EnginePoolManager<float>::instance().acquire(
|
||||||
m_poolKey, options, onnxPath,
|
m_poolKey, options, onnxPath,
|
||||||
kClsSubVals, kClsDivVals, true, -1);
|
kClsSubVals, kClsDivVals, true, getPoolMaxSlotsPerGpu());
|
||||||
m_usingSharedPool = (m_engine != nullptr);
|
m_usingSharedPool = (m_engine != nullptr);
|
||||||
|
|
||||||
if (!m_engine) {
|
if (!m_engine) {
|
||||||
|
|||||||
@@ -57,7 +57,7 @@ bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
|
|||||||
options.maxBatchSize };
|
options.maxBatchSize };
|
||||||
m_engine = EnginePoolManager<float>::instance().acquire(
|
m_engine = EnginePoolManager<float>::instance().acquire(
|
||||||
m_poolKey, options, onnxPath,
|
m_poolKey, options, onnxPath,
|
||||||
kDetSubVals, kDetDivVals, true, -1);
|
kDetSubVals, kDetDivVals, true, getPoolMaxSlotsPerGpu());
|
||||||
m_usingSharedPool = (m_engine != nullptr);
|
m_usingSharedPool = (m_engine != nullptr);
|
||||||
|
|
||||||
if (!m_engine) {
|
if (!m_engine) {
|
||||||
|
|||||||
@@ -52,7 +52,7 @@ bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string&
|
|||||||
options.maxBatchSize };
|
options.maxBatchSize };
|
||||||
m_engine = EnginePoolManager<float>::instance().acquire(
|
m_engine = EnginePoolManager<float>::instance().acquire(
|
||||||
m_poolKey, options, onnxPath,
|
m_poolKey, options, onnxPath,
|
||||||
kRecSubVals, kRecDivVals, true, -1);
|
kRecSubVals, kRecDivVals, true, getPoolMaxSlotsPerGpu());
|
||||||
m_usingSharedPool = (m_engine != nullptr);
|
m_usingSharedPool = (m_engine != nullptr);
|
||||||
|
|
||||||
if (!m_engine) {
|
if (!m_engine) {
|
||||||
|
|||||||
@@ -7,6 +7,10 @@
|
|||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <numeric>
|
#include <numeric>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
|
#include <climits>
|
||||||
|
#include <mutex>
|
||||||
|
#include <iostream>
|
||||||
|
#include <cuda_runtime.h>
|
||||||
#include <opencv2/core.hpp>
|
#include <opencv2/core.hpp>
|
||||||
#include <opencv2/imgproc.hpp>
|
#include <opencv2/imgproc.hpp>
|
||||||
|
|
||||||
@@ -95,6 +99,38 @@ struct OCRPredictResult {
|
|||||||
// Utility functions
|
// Utility functions
|
||||||
// ============================================================================
|
// ============================================================================
|
||||||
|
|
||||||
|
// Determine maxSlotsPerGpu based on GPU topology:
|
||||||
|
// 1 GPU → 1 (single slot, no round-robin needed)
|
||||||
|
// >1 GPU, VRAM<24GB → 1 (round-robin: 1 slot per GPU)
|
||||||
|
// >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
|
||||||
|
// Result is cached after the first query.
|
||||||
|
inline int getPoolMaxSlotsPerGpu() {
|
||||||
|
static int s_result = INT_MIN;
|
||||||
|
static std::mutex s_mutex;
|
||||||
|
std::lock_guard<std::mutex> lk(s_mutex);
|
||||||
|
if (s_result != INT_MIN) return s_result;
|
||||||
|
int gpuCount = 0;
|
||||||
|
cudaGetDeviceCount(&gpuCount);
|
||||||
|
if (gpuCount <= 1) {
|
||||||
|
s_result = 1;
|
||||||
|
std::cout << "Info [OCR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
|
||||||
|
return s_result;
|
||||||
|
}
|
||||||
|
// Multiple GPUs — check VRAM (GPUs are assumed same spec)
|
||||||
|
constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024; // 24 GB
|
||||||
|
size_t totalMem = 0, freeMem = 0;
|
||||||
|
cudaSetDevice(0);
|
||||||
|
cudaMemGetInfo(&freeMem, &totalMem);
|
||||||
|
if (totalMem >= kLargeVramBytes) {
|
||||||
|
s_result = -1;
|
||||||
|
std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
|
||||||
|
} else {
|
||||||
|
s_result = 1;
|
||||||
|
std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
|
||||||
|
}
|
||||||
|
return s_result;
|
||||||
|
}
|
||||||
|
|
||||||
// Load character dictionary from file
|
// Load character dictionary from file
|
||||||
inline std::vector<std::string> LoadDict(const std::string& dictPath) {
|
inline std::vector<std::string> LoadDict(const std::string& dictPath) {
|
||||||
std::vector<std::string> keys;
|
std::vector<std::string> keys;
|
||||||
|
|||||||
@@ -5,6 +5,7 @@
|
|||||||
#include "ANSGpuFrameRegistry.h" // gpu_frame_lookup(cv::Mat*)
|
#include "ANSGpuFrameRegistry.h" // gpu_frame_lookup(cv::Mat*)
|
||||||
#include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH
|
#include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH
|
||||||
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
|
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
|
||||||
|
#include <climits> // INT_MIN
|
||||||
|
|
||||||
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
|
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
|
||||||
// Defined here, declared extern in EngineBuildLoadNetwork.inl.
|
// Defined here, declared extern in EngineBuildLoadNetwork.inl.
|
||||||
@@ -96,6 +97,37 @@ static int GetNumGPUs() {
|
|||||||
return g_numGPUs;
|
return g_numGPUs;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Determine maxSlotsPerGpu based on GPU topology:
|
||||||
|
// 1 GPU → 1 (single slot, no round-robin needed)
|
||||||
|
// >1 GPU, VRAM<24GB → 1 (round-robin: 1 slot per GPU)
|
||||||
|
// >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
|
||||||
|
// Result is cached after the first query.
|
||||||
|
static int GetPoolMaxSlotsPerGpu() {
|
||||||
|
static int s_result = INT_MIN;
|
||||||
|
static std::mutex s_mutex;
|
||||||
|
std::lock_guard<std::mutex> lk(s_mutex);
|
||||||
|
if (s_result != INT_MIN) return s_result;
|
||||||
|
const int n = GetNumGPUs();
|
||||||
|
if (n <= 1) {
|
||||||
|
s_result = 1;
|
||||||
|
std::cout << "Info [GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
|
||||||
|
return s_result;
|
||||||
|
}
|
||||||
|
// Multiple GPUs — check VRAM (GPUs are assumed same spec)
|
||||||
|
constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024; // 24 GB
|
||||||
|
size_t totalMem = 0, freeMem = 0;
|
||||||
|
cudaSetDevice(0);
|
||||||
|
cudaMemGetInfo(&freeMem, &totalMem);
|
||||||
|
if (totalMem >= kLargeVramBytes) {
|
||||||
|
s_result = -1;
|
||||||
|
std::cout << "Info [GPU]: " << n << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
|
||||||
|
} else {
|
||||||
|
s_result = 1;
|
||||||
|
std::cout << "Info [GPU]: " << n << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
|
||||||
|
}
|
||||||
|
return s_result;
|
||||||
|
}
|
||||||
|
|
||||||
// Returns the next GPU index in round-robin order.
|
// Returns the next GPU index in round-robin order.
|
||||||
// Thread-safe: uses atomic fetch_add.
|
// Thread-safe: uses atomic fetch_add.
|
||||||
static int AssignNextGPU() {
|
static int AssignNextGPU() {
|
||||||
@@ -588,6 +620,7 @@ extern "C" ANSODENGINE_API std::string CreateANSODHandle(ANSCENTER::ANSODBase**
|
|||||||
CheckGPUVRAM(assignedGPU);
|
CheckGPUVRAM(assignedGPU);
|
||||||
|
|
||||||
RegisterODHandle(*Handle);
|
RegisterODHandle(*Handle);
|
||||||
|
(*Handle)->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
|
||||||
(*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
|
(*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
|
||||||
bool loadResult = (*Handle)->Initialize(licenseKey, modelConfig, modelFilePath, modelFileZipPassword, labelMap);
|
bool loadResult = (*Handle)->Initialize(licenseKey, modelConfig, modelFilePath, modelFileZipPassword, labelMap);
|
||||||
return labelMap;
|
return labelMap;
|
||||||
@@ -894,6 +927,7 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
|
|||||||
CheckGPUVRAM(assignedGPU);
|
CheckGPUVRAM(assignedGPU);
|
||||||
|
|
||||||
RegisterODHandle(*Handle);
|
RegisterODHandle(*Handle);
|
||||||
|
(*Handle)->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
|
||||||
(*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
|
(*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
|
||||||
bool result = (*Handle)->LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
|
bool result = (*Handle)->LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
|
||||||
if (result) return 1;
|
if (result) return 1;
|
||||||
|
|||||||
Reference in New Issue
Block a user