Fix AMD by changing from GetTensorData<T>() to GetTensorMutableData<T>()

This commit is contained in:
2026-04-28 13:25:02 +10:00
parent f4b74c837e
commit dcf974c35c
18 changed files with 359 additions and 48 deletions

View File

@@ -292,15 +292,15 @@ int CustomModel_StressTest_FilePlayer() {
// Video files (one per stream)
const std::string videoFiles[NUM_STREAMS] = {
"E:\\Programs\\DemoAssets\\Videos\\Helmet\\HM1.mp4",
"E:\\Programs\\DemoAssets\\Videos\\Helmet\\HM2.mp4",
"C:\\ProgramData\\ANSCENTER\\Shared\\HM1.mp4",
"C:\\ProgramData\\ANSCENTER\\Shared\\HM2.mp4",
};
// Which stream each task uses
const int taskStreamMap[NUM_TASKS] = { 0, 0, 1, 1 };
// Model config — EDIT for your custom model
const std::string modelFolder = "C:\\Projects\\ANSVIS\\Models\\ANS_Helmet_v2.0.zip";
const std::string modelFolder = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\Models\\B-IN_ANS_Helmet_v2.0_102728911.zip";
//const char* modelName = "detector";
//const char* className = "detector.names";
const int modelType = 16; // 16 = CustomDetector, 31 = RTYOLO, 30 = ONNXYOLO
@@ -550,3 +550,237 @@ int CustomModel_StressTest_FilePlayer() {
return 0;
}
// =============================================================================
// CustomModel_SingleStream_FilePlayer
//
// ISOLATION TEST — 1 camera, 1 OD handle, 1 worker thread. No concurrent
// inference whatsoever. Same flow as CustomModel_StressTest_FilePlayer
// (FilePlayer → CloneImage → RunInferenceComplete_CPP → ReleaseImage), just
// without the multi-stream / multi-task fan-out.
//
// Use this to determine whether a hang is per-instance (will still hang here)
// or only triggered by cross-session DML contention (will NOT hang here).
// If THIS test runs cleanly for an extended period but the multi-stream
// stress test hangs after a few inferences, the issue is concurrent DML
// submissions on the AMD iGPU — not a bug in the engine code itself.
//
// Reuses helpers from CustomModel_StressTest_FilePlayer:
// LoadANSCV / UnloadANSCV, ODWorkerThread, GetPerGpuFreeMiB.
// =============================================================================
int CustomModel_SingleStream_FilePlayer() {
printf("\n");
printf("============================================================\n");
printf(" Custom Model SINGLE-STREAM Isolation Test (FilePlayer)\n");
printf(" 1 camera + 1 model + 1 worker thread\n");
printf(" Press ESC to stop\n");
printf("============================================================\n\n");
// --- Load ANSCV.dll at runtime (same helper as stress test) ---
if (!LoadANSCV()) return -1;
if (pInitCameraNetwork) pInitCameraNetwork();
// =====================================================================
// CONFIGURATION — EDIT THESE FOR YOUR TEST
// =====================================================================
const std::string videoFile =
"C:\\ProgramData\\ANSCENTER\\Shared\\HM1.mp4";
const std::string modelFolder =
"C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\Models\\B-IN_ANS_Helmet_v2.0_102728911.zip";
const int modelType = 16; // 16 = CustomDetector (same as stress test)
const int detectorType = 1; // Detection
const float scoreThresh = 0.5f;
const float confThresh = 0.5f;
const float nmsThresh = 0.45f;
// =====================================================================
// Reset shared run flag (it's a static at file scope shared with stress test)
g_stressRunning.store(true);
std::cout << "\n--- Single-stream isolation test (no concurrency) ---\n" << std::endl;
// NOTE: deliberately NOT calling OptimizeModelStr here. OptimizeModelStr
// creates a separate "warmup" ANSCUSTOM instance whose detector and
// classifier sessions stay loaded for the lifetime of the process — even
// though that instance never runs inference, its 2 DML sessions hold AMD
// GPU resources and were suspected of contributing to a hang in the
// active session's GetTensorData<float>. Skipping it here leaves exactly
// 1 ANSCUSTOM = 2 DML sessions (detector + classifier) in the process,
// for the cleanest possible single-session isolation.
(void)detectorType; // unused without the OptimizeModelStr call
// --- Per-task state (just one) ---
StressTaskState taskState;
// --- Create FilePlayer (single stream) ---
void* fpClient = nullptr;
{
printf("[Stream0] Creating FilePlayer: %s\n", videoFile.c_str());
int result = pCreateFilePlayer(&fpClient, "", videoFile.c_str());
if (result != 1 || !fpClient) {
printf("[Stream0] FAILED to create FilePlayer (result=%d)\n", result);
UnloadANSCV();
return -2;
}
if (pSetFilePlayerDisplayRes) {
pSetFilePlayerDisplayRes(&fpClient, 1920, 1080);
}
printf("[Stream0] FilePlayer created (display: 1920x1080)\n");
}
// --- Create OD handle (single instance) ---
ANSCENTER::ANSODBase* odHandle = nullptr;
{
printf("[Task0] Creating OD handle (modelType=%d)...\n", modelType);
auto loadStart = std::chrono::steady_clock::now();
auto vramBefore = GetPerGpuFreeMiB();
std::string labelMap = CreateANSODHandle(
&odHandle,
"", // licenseKey
modelFolder.c_str(), // modelFilePath (zip or folder)
"", // modelZipFilePassword
scoreThresh,
confThresh,
nmsThresh,
1, // autoDetectEngine
modelType,
1, // detectionType (1 = Detection)
1); // loadEngineOnCreation
auto loadEnd = std::chrono::steady_clock::now();
double loadMs = std::chrono::duration<double, std::milli>(loadEnd - loadStart).count();
if (!odHandle) {
printf("[Task0] FAILED to create OD handle\n");
pStopFilePlayer(&fpClient);
pReleaseFilePlayer(&fpClient);
UnloadANSCV();
return -3;
}
auto vramAfter = GetPerGpuFreeMiB();
int bestGpu = 0;
size_t maxDelta = 0;
for (size_t g = 0; g < vramBefore.size() && g < vramAfter.size(); g++) {
size_t delta = (vramBefore[g] > vramAfter[g]) ? vramBefore[g] - vramAfter[g] : 0;
if (delta > maxDelta) { maxDelta = delta; bestGpu = (int)g; }
}
printf("[Task0] Model loaded in %.0f ms | GPU[%d] | VRAM: %zu MiB | Labels: %s\n",
loadMs, bestGpu, maxDelta,
labelMap.empty() ? "(none)" : labelMap.substr(0, 80).c_str());
std::lock_guard<std::mutex> lk(taskState.mtx);
taskState.engineLoaded = true;
taskState.statusMsg = "Running";
taskState.gpuDeviceId = bestGpu;
taskState.vramUsedMiB = maxDelta;
}
// --- Start playback ---
pStartFilePlayer(&fpClient);
printf("[Stream0] Playback started\n");
std::this_thread::sleep_for(std::chrono::milliseconds(500));
// --- Single worker thread (reuse ODWorkerThread from stress test) ---
std::thread worker(ODWorkerThread, /*taskId=*/0, fpClient, odHandle, std::ref(taskState));
// --- Display loop (single cell) ---
const int cellW = 1280, cellH = 720;
const char* windowName = "Custom Model — Single Stream Isolation";
cv::namedWindow(windowName, cv::WINDOW_NORMAL);
cv::resizeWindow(windowName, cellW, cellH + 40);
auto testStart = std::chrono::steady_clock::now();
while (g_stressRunning.load()) {
cv::Mat canvas(cellH + 40, cellW, CV_8UC3, cv::Scalar(30, 30, 30));
cv::Mat cell;
double fps = 0, infMs = 0, grabMs = 0;
int fCount = 0, dCount = 0, gpuId = -1;
std::string statusMsg, lastDet;
bool engineLoaded = false;
{
std::lock_guard<std::mutex> lk(taskState.mtx);
if (!taskState.displayFrame.empty()) {
cv::resize(taskState.displayFrame, cell, cv::Size(cellW, cellH));
}
fps = taskState.fps;
infMs = taskState.inferenceMs;
grabMs = taskState.grabMs;
fCount = taskState.frameCount;
dCount = taskState.detectionCount;
gpuId = taskState.gpuDeviceId;
statusMsg = taskState.statusMsg;
lastDet = taskState.lastDetection;
engineLoaded = taskState.engineLoaded;
}
if (cell.empty()) {
cell = cv::Mat(cellH, cellW, CV_8UC3, cv::Scalar(40, 40, 40));
cv::putText(cell, "Task 0: " + statusMsg,
cv::Point(20, cellH / 2),
cv::FONT_HERSHEY_SIMPLEX, 0.8, cv::Scalar(100, 100, 255), 2);
}
cv::rectangle(cell, cv::Rect(0, cellH - 45, cellW, 45),
cv::Scalar(0, 0, 0), cv::FILLED);
char bar1[256], bar2[128];
snprintf(bar1, sizeof(bar1),
"%.1f FPS | inf:%.0fms grab:%.0fms | Frames:%d | Det:%d",
fps, infMs, grabMs, fCount, dCount);
snprintf(bar2, sizeof(bar2), "GPU[%d] | last:%s",
gpuId, lastDet.empty() ? "-" : lastDet.c_str());
cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
cv::putText(cell, bar1, cv::Point(5, cellH - 25),
cv::FONT_HERSHEY_SIMPLEX, 0.5, barColor, 1);
cv::putText(cell, bar2, cv::Point(5, cellH - 5),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 200, 255), 1);
cell.copyTo(canvas(cv::Rect(0, 0, cellW, cellH)));
double elapsed = std::chrono::duration<double>(
std::chrono::steady_clock::now() - testStart).count();
char bottomBar[256];
snprintf(bottomBar, sizeof(bottomBar),
"Single-stream | Elapsed: %.0fs | %.1f FPS | Press ESC to stop",
elapsed, fps);
cv::putText(canvas, bottomBar, cv::Point(10, cellH + 25),
cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 200), 1);
cv::imshow(windowName, canvas);
int key = cv::waitKey(30);
if (key == 27) {
printf("\nESC pressed - stopping...\n");
g_stressRunning.store(false);
}
}
printf("Waiting for worker thread...\n");
if (worker.joinable()) worker.join();
double totalElapsed = std::chrono::duration<double>(
std::chrono::steady_clock::now() - testStart).count();
printf("\n============================================================\n");
printf(" SINGLE-STREAM SUMMARY (runtime: %.0fs)\n", totalElapsed);
printf("============================================================\n");
printf(" GPU[%d] | %d frames | %d detections | %.1f FPS | Inf: %.0fms\n",
taskState.gpuDeviceId, taskState.frameCount, taskState.detectionCount,
taskState.fps, taskState.inferenceMs);
printf("============================================================\n");
if (odHandle) ReleaseANSODHandle(&odHandle);
if (fpClient) {
pStopFilePlayer(&fpClient);
pReleaseFilePlayer(&fpClient);
}
cv::destroyAllWindows();
if (pDeinitCameraNetwork) pDeinitCameraNetwork();
UnloadANSCV();
return 0;
}