diff --git a/modules/ANSFR/ANSFR.cpp b/modules/ANSFR/ANSFR.cpp
index 7f2e899..5ba1719 100644
--- a/modules/ANSFR/ANSFR.cpp
+++ b/modules/ANSFR/ANSFR.cpp
@@ -757,6 +757,7 @@ namespace ANSCENTER {
             else {
                 _detector = std::make_unique<ANSOVFD>();
             }
+            _detector->SetMaxSlotsPerGpu(m_maxSlotsPerGpu);
 
             // LOCK DURING INITIALIZATION
             bool initSuccess;
@@ -796,6 +797,7 @@ namespace ANSCENTER {
         try {
             // Create recognizer instance
             _recognizer = std::make_unique<ANSFaceRecognizer>();
+            _recognizer->SetMaxSlotsPerGpu(m_maxSlotsPerGpu);
 
             // Configure model
             ModelConfig recognizerConfig;
diff --git a/modules/ANSFR/ANSFR.h b/modules/ANSFR/ANSFR.h
index 6b30a3a..7b683de 100644
--- a/modules/ANSFR/ANSFR.h
+++ b/modules/ANSFR/ANSFR.h
@@ -121,7 +121,9 @@ namespace ANSCENTER
 		~ANSFacialRecognition() noexcept;
 		void UnloadEngine();
 		void Destroy();
+		void SetMaxSlotsPerGpu(int n) { m_maxSlotsPerGpu = n; }  // stores the per-GPU pool slot limit (1 = single slot, -1 = elastic) for later use
 	private:
+		int m_maxSlotsPerGpu{ 1 };  // set by dllmain based on GPU topology
 		int  GetUser(int userId, UserRecord& userRecord);
 		int  GetUser(int userId, const std::string& userCode,const std::string& userName, UserRecord& userRecord);
 		int  GetUsers(std::vector<UserRecord>& userRecords, std::vector<int>& userIds);
diff --git a/modules/ANSFR/ANSFaceRecognizer.h b/modules/ANSFR/ANSFaceRecognizer.h
index 8f205d0..b3f2240 100644
--- a/modules/ANSFR/ANSFaceRecognizer.h
+++ b/modules/ANSFR/ANSFaceRecognizer.h
@@ -120,7 +120,7 @@ namespace ANSCENTER {
         std::shared_ptr<Engine<float>> m_trtEngine = nullptr; // NVIDIA TensorRT
         EnginePoolManager<float>::PoolKey m_poolKey;
         bool m_usingSharedPool = false;
-        int m_maxSlotsPerGpu{ -1 };  // -1 = elastic mode (on-demand slots, auto-cleanup)
+        int m_maxSlotsPerGpu{ 1 };   // 1 = single slot (default); set by dllmain based on GPU topology
         void SetMaxSlotsPerGpu(int n) override { m_maxSlotsPerGpu = n; }
         std::shared_ptr<faiss::IndexIDMap> faiss_index;
         std::shared_ptr<faiss::gpu::StandardGpuResources> m_gpuResources;
diff --git a/modules/ANSFR/dllmain.cpp b/modules/ANSFR/dllmain.cpp
index fc9c8e6..4e40909 100644
--- a/modules/ANSFR/dllmain.cpp
+++ b/modules/ANSFR/dllmain.cpp
@@ -9,7 +9,10 @@
 #include "FaceNet.h"
 #include "ANSFaceRecognizer.h"
 #include "ANSLibsLoader.h"
+#include "engine/TRTEngineCache.h"
+#include "engine/EnginePoolManager.h"
 #include <memory>
+#include <climits>
 #include <unordered_map>
 #include <condition_variable>
 #include <cstdint>
@@ -93,6 +96,36 @@ public:
     FRHandleGuard& operator=(const FRHandleGuard&) = delete;
 };
 
+// Determine maxSlotsPerGpu from the GPU topology (result is cached after the first query):
+//   1 GPU (or none)   → 1  (single slot, no round-robin needed)
+//   >1 GPU, VRAM<24GB → 1  (round-robin: 1 slot per GPU)
+//   >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth; VRAM is read from device 0 only)
+static int GetPoolMaxSlotsPerGpu() {
+    static int s_result = INT_MIN;  // INT_MIN = not computed yet
+    static std::mutex s_mutex;
+    std::lock_guard<std::mutex> lk(s_mutex);
+    if (s_result != INT_MIN) return s_result;
+    int gpuCount = 0;
+    if (cudaGetDeviceCount(&gpuCount) != cudaSuccess) gpuCount = 0;  // failed query: fall back to the 1-slot mode
+    if (gpuCount <= 1) {
+        s_result = 1;
+        std::cout << "Info [FR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
+        return s_result;
+    }
+    constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024;  // 24 GiB; NOTE(review): cards sold as "24 GB" may report slightly less — confirm
+    // Read device 0's properties instead of cudaSetDevice(0)+cudaMemGetInfo: the caller's current device is not changed.
+    cudaDeviceProp prop{};
+    const size_t totalMem = (cudaGetDeviceProperties(&prop, 0) == cudaSuccess) ? prop.totalGlobalMem : 0;
+    if (totalMem >= kLargeVramBytes) {
+        s_result = -1;
+        std::cout << "Info [FR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
+    } else {
+        s_result = 1;
+        std::cout << "Info [FR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
+    }
+    return s_result;
+}
+
 BOOL APIENTRY DllMain( HMODULE hModule,
                        DWORD  ul_reason_for_call,
                        LPVOID lpReserved
@@ -117,8 +150,14 @@ BOOL APIENTRY DllMain( HMODULE hModule,
     case DLL_THREAD_DETACH:
         break;
     case DLL_PROCESS_DETACH:
-        // Clean up any handles that LabVIEW didn't release before closing.
-        // Without this, idle-timer threads keep the process alive indefinitely.
+        // ExitProcess: OS killed worker threads, CUDA context is dead.
+        // Set flag so Engine/Pool destructors skip CUDA cleanup.
+        if (lpReserved != nullptr) {
+            g_processExiting().store(true, std::memory_order_relaxed);
+            break;
+        }
+
+        // Dynamic FreeLibrary — threads are still alive, safe to clean up.
         try {
             std::vector<ANSCENTER::ANSFacialRecognition*> leakedHandles;
             {
@@ -130,6 +169,8 @@ BOOL APIENTRY DllMain( HMODULE hModule,
             for (auto* h : leakedHandles) {
                 try { h->Destroy(); delete h; } catch (...) {}
             }
+            try { EnginePoolManager<float>::instance().clearAll(); } catch (...) {}
+            try { TRTEngineCache::instance().clearAll(); } catch (...) {}
         } catch (...) {}
         break;
     }
@@ -185,6 +226,8 @@ extern "C" ANSFR_API int		   CreateANSRFHandle(ANSCENTER::ANSFacialRecognition**
         const bool _enableFaceLiveness = (enableFaceLiveness == 1);
         const bool _enableAntiSpoofing = (enableAntiSpoofing == 1);
 
+        ptr->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
+
         int result = ptr->Initialize(licenseKey,
             configFilePath,
             databaseFilePath,
diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRClassifier.cpp b/modules/ANSOCR/ANSRTOCR/RTOCRClassifier.cpp
index 9faa622..2da68da 100644
--- a/modules/ANSOCR/ANSRTOCR/RTOCRClassifier.cpp
+++ b/modules/ANSOCR/ANSRTOCR/RTOCRClassifier.cpp
@@ -40,7 +40,7 @@ bool RTOCRClassifier::Initialize(const std::string& onnxPath, int gpuId,
             options.maxBatchSize };
         m_engine = EnginePoolManager<float>::instance().acquire(
             m_poolKey, options, onnxPath,
-            kClsSubVals, kClsDivVals, true, -1);
+            kClsSubVals, kClsDivVals, true, getPoolMaxSlotsPerGpu());
         m_usingSharedPool = (m_engine != nullptr);
 
         if (!m_engine) {
diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp b/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp
index 9d4951f..41c290c 100644
--- a/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp
+++ b/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp
@@ -57,7 +57,7 @@ bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
             options.maxBatchSize };
         m_engine = EnginePoolManager<float>::instance().acquire(
             m_poolKey, options, onnxPath,
-            kDetSubVals, kDetDivVals, true, -1);
+            kDetSubVals, kDetDivVals, true, getPoolMaxSlotsPerGpu());
         m_usingSharedPool = (m_engine != nullptr);
 
         if (!m_engine) {
diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp b/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
index 551440b..758963d 100644
--- a/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
+++ b/modules/ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
@@ -52,7 +52,7 @@ bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string&
             options.maxBatchSize };
         m_engine = EnginePoolManager<float>::instance().acquire(
             m_poolKey, options, onnxPath,
-            kRecSubVals, kRecDivVals, true, -1);
+            kRecSubVals, kRecDivVals, true, getPoolMaxSlotsPerGpu());
         m_usingSharedPool = (m_engine != nullptr);
 
         if (!m_engine) {
diff --git a/modules/ANSOCR/ANSRTOCR/RTOCRTypes.h b/modules/ANSOCR/ANSRTOCR/RTOCRTypes.h
index 1ae3bc6..5395ded 100644
--- a/modules/ANSOCR/ANSRTOCR/RTOCRTypes.h
+++ b/modules/ANSOCR/ANSRTOCR/RTOCRTypes.h
@@ -7,6 +7,10 @@
 #include <algorithm>
 #include <numeric>
 #include <cmath>
+#include <climits>
+#include <mutex>
+#include <iostream>
+#include <cuda_runtime.h>
 #include <opencv2/core.hpp>
 #include <opencv2/imgproc.hpp>
 
@@ -95,6 +99,38 @@ struct OCRPredictResult {
 // Utility functions
 // ============================================================================
 
+// Determine maxSlotsPerGpu based on GPU topology:
+//   1 GPU (or none)   → 1  (single slot, no round-robin needed)
+//   >1 GPU, VRAM<24GB → 1  (round-robin: 1 slot per GPU)
+//   >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
+// Result is cached after the first query; only device 0's VRAM is inspected.
+inline int getPoolMaxSlotsPerGpu() {
+    static int s_result = INT_MIN;  // INT_MIN = not computed yet
+    static std::mutex s_mutex;
+    std::lock_guard<std::mutex> lk(s_mutex);
+    if (s_result != INT_MIN) return s_result;
+    int gpuCount = 0;
+    if (cudaGetDeviceCount(&gpuCount) != cudaSuccess) gpuCount = 0;  // failed query: fall back to the 1-slot mode
+    if (gpuCount <= 1) {
+        s_result = 1;
+        std::cout << "Info [OCR GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
+        return s_result;
+    }
+    // Multiple GPUs — check VRAM (GPUs are assumed same spec). Device 0 is queried via its
+    // properties rather than cudaSetDevice(0)+cudaMemGetInfo, so the caller's current device is kept.
+    constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024;  // 24 GiB; NOTE(review): cards sold as "24 GB" may report slightly less — confirm
+    cudaDeviceProp prop{};
+    const size_t totalMem = (cudaGetDeviceProperties(&prop, 0) == cudaSuccess) ? prop.totalGlobalMem : 0;
+    if (totalMem >= kLargeVramBytes) {
+        s_result = -1;
+        std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
+    } else {
+        s_result = 1;
+        std::cout << "Info [OCR GPU]: " << gpuCount << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
+    }
+    return s_result;
+}
+
 // Load character dictionary from file
 inline std::vector<std::string> LoadDict(const std::string& dictPath) {
     std::vector<std::string> keys;
diff --git a/modules/ANSODEngine/dllmain.cpp b/modules/ANSODEngine/dllmain.cpp
index 7bf72d9..56dada8 100644
--- a/modules/ANSODEngine/dllmain.cpp
+++ b/modules/ANSODEngine/dllmain.cpp
@@ -5,6 +5,7 @@
 #include "ANSGpuFrameRegistry.h"    // gpu_frame_lookup(cv::Mat*)
 #include "engine/TRTEngineCache.h"   // clearAll() on DLL_PROCESS_DETACH
 #include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
+#include <climits>                    // INT_MIN
 
 // Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
 // Defined here, declared extern in EngineBuildLoadNetwork.inl.
@@ -96,6 +97,37 @@ static int GetNumGPUs() {
 	return g_numGPUs;
 }
 
+// Determine maxSlotsPerGpu based on GPU topology:
+//   1 GPU (or fewer)  → 1  (single slot, no round-robin needed)
+//   >1 GPU, VRAM<24GB → 1  (round-robin: 1 slot per GPU)
+//   >1 GPU, VRAM≥24GB → -1 (elastic: on-demand slot growth)
+// Result is cached after the first query; only device 0's VRAM is inspected.
+static int GetPoolMaxSlotsPerGpu() {
+	static int s_result = INT_MIN;  // INT_MIN = not computed yet
+	static std::mutex s_mutex;
+	std::lock_guard<std::mutex> lk(s_mutex);
+	if (s_result != INT_MIN) return s_result;
+	const int n = GetNumGPUs();
+	if (n <= 1) {
+		s_result = 1;
+		std::cout << "Info [GPU]: Single GPU — pool mode: 1 slot, no round-robin" << std::endl;
+		return s_result;
+	}
+	// Multiple GPUs — check VRAM (GPUs are assumed same spec). Device 0 is queried via its
+	// properties rather than cudaSetDevice(0)+cudaMemGetInfo, so the caller's current device is kept.
+	constexpr size_t kLargeVramBytes = 24ULL * 1024 * 1024 * 1024;  // 24 GiB; NOTE(review): cards sold as "24 GB" may report slightly less — confirm
+	cudaDeviceProp prop{};
+	const size_t totalMem = (cudaGetDeviceProperties(&prop, 0) == cudaSuccess) ? prop.totalGlobalMem : 0;
+	if (totalMem >= kLargeVramBytes) {
+		s_result = -1;
+		std::cout << "Info [GPU]: " << n << " GPUs, VRAM >= 24 GB — pool mode: elastic" << std::endl;
+	} else {
+		s_result = 1;
+		std::cout << "Info [GPU]: " << n << " GPUs, VRAM < 24 GB — pool mode: round-robin" << std::endl;
+	}
+	return s_result;
+}
+
 // Returns the next GPU index in round-robin order.
 // Thread-safe: uses atomic fetch_add.
 static int AssignNextGPU() {
@@ -588,6 +620,7 @@ extern "C" ANSODENGINE_API std::string  CreateANSODHandle(ANSCENTER::ANSODBase**
 		CheckGPUVRAM(assignedGPU);
 
 		RegisterODHandle(*Handle);
+		(*Handle)->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
 		(*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
 		bool loadResult = (*Handle)->Initialize(licenseKey, modelConfig, modelFilePath, modelFileZipPassword, labelMap);
 		return labelMap;
@@ -894,6 +927,7 @@ extern "C" __declspec(dllexport) int LoadModelFromFolder(ANSCENTER::ANSODBase**
 			CheckGPUVRAM(assignedGPU);
 
 			RegisterODHandle(*Handle);
+			(*Handle)->SetMaxSlotsPerGpu(GetPoolMaxSlotsPerGpu());
 			(*Handle)->SetLoadEngineOnCreation(_loadEngineOnCreation); //Set force to load the engine immediately
 			bool result = (*Handle)->LoadModelFromFolder(licenseKey, modelConfig, modelName, className, modelFolder, labelMap);
 			if (result) return 1;