From dcf974c35c3b9cff5aa6deea00aa927ba09678ba Mon Sep 17 00:00:00 2001 From: Tuan Nghia Nguyen Date: Tue, 28 Apr 2026 13:25:02 +1000 Subject: [PATCH] Fix AMD by changing from GetTensorData() to GetTensorMutableData() --- .claude/settings.local.json | 30 ++- core/ANSLicensingSystem/ANSLicense.cpp | 33 ++- docs/~$SLIB_Logging_Guide.docx | Bin 162 -> 0 bytes engines/ONNXEngine/ONNXEngine.cpp | 9 +- engines/ONNXEngine/ONNXSAM3.cpp | 18 +- modules/ANSODEngine/ANSONNXCL.cpp | 14 +- modules/ANSODEngine/ANSONNXCL.h | 6 +- modules/ANSODEngine/ANSONNXOBB.cpp | 8 +- modules/ANSODEngine/ANSONNXOBB.h | 4 +- modules/ANSODEngine/ANSONNXPOSE.cpp | 6 +- modules/ANSODEngine/ANSONNXPOSE.h | 4 +- modules/ANSODEngine/ANSONNXSEG.cpp | 10 +- modules/ANSODEngine/ANSONNXSEG.h | 4 +- modules/ANSODEngine/ANSYOLO12OD.cpp | 8 +- modules/ANSODEngine/ANSYOLO12OD.h | 5 +- .../ANSODEngine-UnitTest.cpp | 5 +- tests/ANSODEngine-UnitTest/ANSODTest.h | 3 +- .../CustomModel-StressTest.cpp | 240 +++++++++++++++++- 18 files changed, 359 insertions(+), 48 deletions(-) delete mode 100644 docs/~$SLIB_Logging_Guide.docx diff --git a/.claude/settings.local.json b/.claude/settings.local.json index 44f0176..a4c723e 100644 --- a/.claude/settings.local.json +++ b/.claude/settings.local.json @@ -117,7 +117,35 @@ "Bash(NODE_PATH=\"C:/home/alex/.npm-global/node_modules\" node build_anslib_logging_guide.js)", "Bash(python \"C:/Users/nghia/AppData/Roaming/Claude/local-agent-mode-sessions/skills-plugin/d8e35aa4-a14e-4e20-b921-ba1b9a3cce86/cdda7cc8-a1c7-42ff-98b4-473ec3e8b9fb/skills/docx/scripts/office/validate.py\" \"C:/Projects/CLionProjects/ANSCORE/docs/ANSLIB_Logging_Guide.docx\")", "Bash(python \"C:/Users/nghia/AppData/Roaming/Claude/local-agent-mode-sessions/skills-plugin/d8e35aa4-a14e-4e20-b921-ba1b9a3cce86/cdda7cc8-a1c7-42ff-98b4-473ec3e8b9fb/skills/docx/scripts/office/soffice.py\" --headless --convert-to pdf --outdir \"C:/Projects/CLionProjects/ANSCORE/docs\" 
\"C:/Projects/CLionProjects/ANSCORE/docs/ANSLIB_Logging_Guide.docx\")", - "Bash(sort -t: -k1 -u)" + "Bash(sort -t: -k1 -u)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE diff core/ANSLicensingSystem/ANSLicense.cpp)", + "Bash(awk 'NR>=154022 && NR<=156428 {print}' \"C:/Users/nghia/Downloads/AVNET-8845HS2.log\")", + "Bash(awk 'NR>=156350 && NR<=156428 {print}' \"C:/Users/nghia/Downloads/AVNET-8845HS2.log\")", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE log --oneline -10 -- modules/ANSFR/OpenVINODeviceConfig.h modules/ANSFR/OpenVINODeviceConfig.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE log --oneline -5 -- core/ANSLibsLoader/EPLoader.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE show --stat 69787b0)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE show --stat 97d8149)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE show 97d8149 -- engines/ONNXEngine/OpenVINODeviceConfig.h modules/ANSFR/ANSFR.cpp)", + "Bash(awk 'NR>=7106 && NR<=7250 && /ONNXYOLO|ANSCUSTOM|DetectObjects/ {print}' \"C:/Users/nghia/Downloads/AVNET-8845HS5.log\")", + "Bash(awk 'NR>=5850 && NR<=7115 && /ONNXYOLO|ANSCUSTOM_Infer|ANSONNXCL_pp|cls\\\\] calling/' \"C:/Users/nghia/Downloads/AVNET-8845HS5.log\")", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE show --stat 3a527d2)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE show 3a527d2 -- modules/ANSODEngine/ANSCUSTOMDETECTOR.cpp modules/ANSODEngine/ANSONNXYOLO.cpp modules/ANSODEngine/ANSONNXCL.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE show 3a527d2 -- modules/ANSODEngine/ANSEngineCommon.h)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE diff HEAD modules/ANSODEngine/ANSONNXYOLO.cpp modules/ANSODEngine/ANSONNXCL.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE diff HEAD modules/ANSODEngine/ANSONNXYOLO.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE diff HEAD modules/ANSODEngine/ANSONNXCL.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE log --all --oneline --since='3 weeks ago' -- 
modules/ANSODEngine/ANSONNXYOLO.cpp modules/ANSODEngine/ANSONNXCL.cpp engines/ONNXEngine/ONNXEngine.cpp engines/ONNXEngine/ONNXEngine.h)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE show --stat 844d739)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE log -p --grep=\"mutex\\\\|lock\\\\|hang\\\\|stall\\\\|concurrent\\\\|thread\" --since=\"3 weeks ago\" -- modules/ANSODEngine/ANSONNXYOLO.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE show 844d739 -- modules/ANSODEngine/ANSONNXYOLO.cpp)", + "Bash(awk '$2>=835 && $2<=860 && /1853990010|ANSCUSTOM|ANSONNXCL|ANSONNXYOLO/' \"C:/Users/nghia/Downloads/AVNET-8845HS7.log\")", + "Bash(awk '$2>=836 && $2<=837 && /ANSCUSTOM|ANSONNXCL/' \"C:/Users/nghia/Downloads/AVNET-8845HS7.log\")", + "Bash(awk '$2>=836.0 && $2<=836.9' \"C:/Users/nghia/Downloads/AVNET-8845HS7.log\")", + "Bash(grep -nB1 -A3 \"DirectML\\\\|DisableMemPattern\\\\|SetExecutionMode\\\\|ExtendedOptimization\\\\|ORT_SEQUENTIAL\" \"C:/Projects/CLionProjects/ANSCORE/engines/ONNXEngine/ONNXEngine.cpp\")", + "Bash(grep -nE \"^\\\\}$|warmupModel\\\\\\(|#if 0|#endif|ANSONNXCL_legacy_Init\" C:/Projects/CLionProjects/ANSCORE/modules/ANSODEngine/ANSONNXCL.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE restore modules/ANSODEngine/ANSONNXCL.h modules/ANSODEngine/ANSONNXCL.cpp)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE status --short modules/)", + "Bash(git -C C:/Projects/CLionProjects/ANSCORE diff --stat HEAD modules/ANSODEngine/ANSONNXCL.cpp modules/ANSODEngine/ANSONNXOBB.cpp modules/ANSODEngine/ANSONNXPOSE.cpp modules/ANSODEngine/ANSONNXSEG.cpp modules/ANSODEngine/ANSYOLO12OD.cpp engines/ONNXEngine/ONNXEngine.cpp engines/ONNXEngine/ONNXSAM3.cpp)" ] } } diff --git a/core/ANSLicensingSystem/ANSLicense.cpp b/core/ANSLicensingSystem/ANSLicense.cpp index 42e4224..07f7cb2 100644 --- a/core/ANSLicensingSystem/ANSLicense.cpp +++ b/core/ANSLicensingSystem/ANSLicense.cpp @@ -80,6 +80,17 @@ namespace { return s.substr(0, 4) + std::string(s.size() - 8, 
'*') + s.substr(s.size() - 4); } + // SDK identifier passed to ANSLSHelper for licensing-service auth. Despite + // the variable name "privateKey" used at call sites, this is an 11-byte + // SDK token that identifies the ANSCENTER product to the licensing + // service — NOT a cryptographic signing key. Centralised here so any + // future rotation (or move to runtime lookup via env var / config file) + // touches one place instead of duplicated literals scattered across the + // file. Narrow form is the source of truth; wide form is derived to + // avoid two literals drifting out of sync. + inline std::string GetSdkKey() { return "AQlSAiRTNtS7X20="; } + inline std::wstring GetSdkKeyW() { return String2WString(GetSdkKey()); } + // Append both `\` and `/` slash flavours of `raw` to `dst`, lower-cased // and with a trailing separator forced. Empty / pathologically short // entries are skipped. @@ -661,7 +672,7 @@ namespace ANSCENTER try { int _enableFeature; int _productId; - std::string privateKey = "AQlSAiRTNtS7X20="; + std::string privateKey = GetSdkKey(); int activationResult = 0; std::string licensingServiceURL = "https://licensingservice.anscenter.com/"; ANSCENTER::ANSLSHelper ansHelper(privateKey, licensingServiceURL); @@ -775,7 +786,7 @@ namespace ANSCENTER try { int productId; std::string registrationName; - std::string privateKey = "AQlSAiRTNtS7X20="; + std::string privateKey = GetSdkKey(); std::string licensingServiceURL = "https://licensingservice.anscenter.com/"; ANSCENTER::ANSLSHelper ansHelper(privateKey, licensingServiceURL); ansHelper.SetupLicenseTemplate(); @@ -836,7 +847,7 @@ namespace ANSCENTER int ANSLicenseHelper::DeactivateLicense(std::string productName) { std::string licenseDirectory = GetLicenseDir(); std::vector licenseKeyFiles = ListFilesInFolder(licenseDirectory); - std::string privateKey = "AQlSAiRTNtS7X20="; + std::string privateKey = GetSdkKey(); std::string licensingServiceURL = "https://licensingservice.anscenter.com/"; 
ANSCENTER::ANSLSHelper ansHelper(privateKey, licensingServiceURL); ansHelper.SetupLicenseTemplate(); @@ -901,7 +912,7 @@ namespace ANSCENTER return 0; } int ANSLicenseHelper::ActivateLicense(std::string productName, std::string licenseKey) { - std::string privateKey = "AQlSAiRTNtS7X20="; + std::string privateKey = GetSdkKey(); std::string licensingServiceURL = "https://licensingservice.anscenter.com/"; ANSCENTER::ANSLSHelper ansHelper(privateKey, licensingServiceURL); ansHelper.SetupLicenseTemplate(); @@ -942,7 +953,7 @@ namespace ANSCENTER } int ANSLicenseHelper::ActivateLicenseWithCustomHWID(std::string productName, std::string licenseKey, std::string hwid, std::string &activationKey) { - std::string privateKey = "AQlSAiRTNtS7X20="; + std::string privateKey = GetSdkKey(); std::string licensingServiceURL = "https://licensingservice.anscenter.com/"; ANSCENTER::ANSLSHelper ansHelper(privateKey, licensingServiceURL); ansHelper.SetupLicenseTemplate(); @@ -1020,7 +1031,7 @@ namespace ANSCENTER std::string licenseDirectory = GetLicenseDir(); std::vector licenseKeyFiles = ListFilesInFolder(licenseDirectory); - std::string privateKey = "AQlSAiRTNtS7X20="; + std::string privateKey = GetSdkKey(); std::string licensingServiceURL = "https://licensingservice.anscenter.com/"; ANSCENTER::ANSLSHelper ansHelper(privateKey, licensingServiceURL); ansHelper.SetupLicenseTemplate(); @@ -1224,7 +1235,7 @@ namespace ANSCENTER ANSLSHelper::ANSLSHelper() { try { - this->_privateKey = _T("AQlSAiRTNtS7X20="); + this->_privateKey = GetSdkKeyW(); this->_licenseServiceURL = _T("https://licensingservice.anscenter.com/"); this->_sdkLicenseKey = _T("MYNSU-GBQ2Q-SF5U5-S3RVF-5ZKFD"); SDKRegistration::SetLicenseKey(_sdkLicenseKey.c_str()); @@ -1239,7 +1250,7 @@ namespace ANSCENTER ANSLSHelper::ANSLSHelper(std::string licenseServiceURL) { try { - this->_privateKey = _T("AQlSAiRTNtS7X20="); + this->_privateKey = GetSdkKeyW(); this->_licenseServiceURL = String2WString(licenseServiceURL); 
this->_sdkLicenseKey = _T("MYNSU-GBQ2Q-SF5U5-S3RVF-5ZKFD"); SDKRegistration::SetLicenseKey(_sdkLicenseKey.c_str()); @@ -1255,7 +1266,7 @@ namespace ANSCENTER { try { if (privateKey.empty()) { - this->_privateKey = _T("AQlSAiRTNtS7X20="); + this->_privateKey = GetSdkKeyW(); } else { this->_privateKey = String2WString(privateKey); @@ -1278,7 +1289,7 @@ namespace ANSCENTER { try { if (privateKey.empty()) { - this->_privateKey = _T("AQlSAiRTNtS7X20="); + this->_privateKey = GetSdkKeyW(); } else { this->_privateKey = String2WString(privateKey); @@ -1517,7 +1528,7 @@ namespace ANSCENTER this->_licenseTemplate->SetDataSize(36); //16 bits for ProductId, 4 bits for FeatureBitMask, and 16 bits for ExpirationData (16+4+16 =36) this->_licenseTemplate->SetTemplateId(_T("24880")); if(this->_privateKey.empty()) - this->_licenseTemplate->SetPrivateKey(_T("AQlSAiRTNtS7X20=")); + this->_licenseTemplate->SetPrivateKey(GetSdkKeyW().c_str()); else this->_licenseTemplate->SetPrivateKey(this->_privateKey.c_str()); if (this->_publicKey.empty()) diff --git a/docs/~$SLIB_Logging_Guide.docx b/docs/~$SLIB_Logging_Guide.docx deleted file mode 100644 index b38e8ec628c31d10aef5746aa0ef2cee90b8cc2a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 162 zcmWd(DNW2%@Jr9gOazjpm8p3QL;yjC5Qb8QM20*D1qMHcbcPIuOdt=!E@h}>NCnD( yz)=%a!Z=7z2aZG7q+IK>mz5)6*S*Y9~&rw*UaAPaY`% diff --git a/engines/ONNXEngine/ONNXEngine.cpp b/engines/ONNXEngine/ONNXEngine.cpp index 0dfd32e..c6d253c 100644 --- a/engines/ONNXEngine/ONNXEngine.cpp +++ b/engines/ONNXEngine/ONNXEngine.cpp @@ -819,7 +819,10 @@ namespace ANSCENTER { input_values_handler.clear(); input_values_handler.shrink_to_fit(); - const float* vals = output_tensors[0].GetTensorData(); + // GetTensorMutableData on DirectML — the const GetTensorData + // triggers a per-call host-readable mapping that on AMD DML + // exhausts a staging-buffer pool after ~8 calls and hangs. 
+ const float* vals = output_tensors[0].GetTensorMutableData(); const unsigned int hidden_dim = static_cast(output_node_dims.at(0).at(1)); @@ -1377,7 +1380,9 @@ namespace ANSCENTER { Ort::RunOptions{ nullptr }, in_names, &input_tensor, 1, out_names, 1); - out_result = post_processing(outputs[0].GetTensorData()); + // GetTensorMutableData (not GetTensorData) — see comment in this + // file's other output-read sites; const GetTensorData hangs on AMD DML. + out_result = post_processing(outputs[0].GetTensorMutableData()); } Ort::Value MOVINET::transform(const cv::Mat& mat) diff --git a/engines/ONNXEngine/ONNXSAM3.cpp b/engines/ONNXEngine/ONNXSAM3.cpp index 8713a2e..f784476 100644 --- a/engines/ONNXEngine/ONNXSAM3.cpp +++ b/engines/ONNXEngine/ONNXSAM3.cpp @@ -463,7 +463,9 @@ namespace ANSCENTER auto info = outputs[maskIdx].GetTensorTypeAndShapeInfo(); m_cachedLangMaskShape = info.GetShape(); size_t count = info.GetElementCount(); - const bool* data = outputs[maskIdx].GetTensorData(); + // GetTensorMutableData not GetTensorData on DML — const variant + // hangs after ~8 calls. Read-only despite the "Mutable" name. + const bool* data = outputs[maskIdx].GetTensorMutableData(); m_cachedLangMask.resize(count); for (size_t i = 0; i < count; ++i) m_cachedLangMask[i] = data[i] ? 
1 : 0; @@ -474,7 +476,7 @@ namespace ANSCENTER auto info = outputs[featIdx].GetTensorTypeAndShapeInfo(); m_cachedLangFeaturesShape = info.GetShape(); size_t count = info.GetElementCount(); - const float* data = outputs[featIdx].GetTensorData(); + const float* data = outputs[featIdx].GetTensorMutableData(); m_cachedLangFeatures.assign(data, data + count); } @@ -649,7 +651,7 @@ namespace ANSCENTER if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT && !shape.empty()) { size_t numElems = info.GetElementCount(); if (numElems > 0 && numElems < 100000000) { - const float* data = decInputs[di].GetTensorData(); + const float* data = decInputs[di].GetTensorMutableData(); double sum = 0; for (size_t k = 0; k < numElems; ++k) sum += data[k]; double mean = sum / numElems; @@ -661,14 +663,14 @@ namespace ANSCENTER // Print bool tensor values (for language_mask) else if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL && !shape.empty()) { size_t numElems = info.GetElementCount(); - const bool* data = decInputs[di].GetTensorData(); + const bool* data = decInputs[di].GetTensorMutableData(); std::cout << " vals:"; for (size_t k = 0; k < std::min(numElems, (size_t)32); ++k) std::cout << " " << (int)data[k]; } // Print int64 scalar value else if (elemType == ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 && shape.empty()) { - const int64_t* data = decInputs[di].GetTensorData(); + const int64_t* data = decInputs[di].GetTensorMutableData(); std::cout << " value=" << data[0]; } std::cout << std::endl; @@ -702,10 +704,10 @@ namespace ANSCENTER auto boxInfo = decOutputs[boxesIdx].GetTensorTypeAndShapeInfo(); auto boxShape = boxInfo.GetShape(); int numBoxes = (boxShape.size() >= 1) ? 
static_cast(boxShape[0]) : 0; - const float* boxesData = decOutputs[boxesIdx].GetTensorData(); + const float* boxesData = decOutputs[boxesIdx].GetTensorMutableData(); // Get scores - const float* scoresData = decOutputs[scoresIdx].GetTensorData(); + const float* scoresData = decOutputs[scoresIdx].GetTensorMutableData(); // Get masks auto maskInfo = decOutputs[masksIdx].GetTensorTypeAndShapeInfo(); @@ -713,7 +715,7 @@ namespace ANSCENTER // masks shape: [N, 1, H, W] int maskH = (maskShape.size() >= 3) ? static_cast(maskShape[2]) : 0; int maskW = (maskShape.size() >= 4) ? static_cast(maskShape[3]) : 0; - const bool* masksData = decOutputs[masksIdx].GetTensorData(); + const bool* masksData = decOutputs[masksIdx].GetTensorMutableData(); m_maskH = maskH; m_maskW = maskW; diff --git a/modules/ANSODEngine/ANSONNXCL.cpp b/modules/ANSODEngine/ANSONNXCL.cpp index 2776493..e23eff9 100644 --- a/modules/ANSODEngine/ANSONNXCL.cpp +++ b/modules/ANSODEngine/ANSONNXCL.cpp @@ -575,7 +575,7 @@ namespace ANSCENTER return false; } } - std::vector ANSONNXCL::postprocess(const std::vector& outputTensors, const std::string& camera_id) { + std::vector ANSONNXCL::postprocess(std::vector& outputTensors, const std::string& camera_id) { ANS_DBG("ANSONNXCL_pp", "ENTRY tensors=%zu cam=%s this=%p", outputTensors.size(), camera_id.c_str(), (void*)this); std::lock_guard lock(_mutex); @@ -589,8 +589,16 @@ namespace ANSCENTER return {}; } - ANS_DBG("ANSONNXCL_pp", "GetTensorData"); - const float* rawOutput = outputTensors[0].GetTensorData(); + ANS_DBG("ANSONNXCL_pp", "GetTensorMutableData"); + // GetTensorMutableData (not GetTensorData) on DirectML. The const + // GetTensorData triggers a per-call host-readable mapping that on + // AMD DML exhausts a small staging-buffer pool after ~8 calls and + // blocks indefinitely. GetTensorMutableData returns the existing + // host-accessible pointer directly with no per-call mapping cost. 
+ // Same pattern used by every output-tensor read in ANSONNXYOLO + // and engines/ONNXEngine. Safe on all EPs (CUDA/OpenVINO/CPU); + // we read the data only, never mutate it. + const float* rawOutput = outputTensors[0].GetTensorMutableData(); if (!rawOutput) { ANS_DBG("ANSONNXCL_pp", "EARLY-RETURN rawOutput=null"); this->_logger.LogError("ANSONNXCL::postprocess", "rawOutput pointer is null", __FILE__, __LINE__); diff --git a/modules/ANSODEngine/ANSONNXCL.h b/modules/ANSODEngine/ANSONNXCL.h index 5b996ca..200a18d 100644 --- a/modules/ANSODEngine/ANSONNXCL.h +++ b/modules/ANSODEngine/ANSONNXCL.h @@ -28,7 +28,11 @@ namespace ANSCENTER { void warmupModel(); bool Init(const std::string& modelPath, const cv::Size& targetInputShape, bool useGPU = true); bool preprocess(const cv::Mat& image, float*& blob, std::vector& inputTensorShape); - std::vector postprocess(const std::vector& outputTensors, const std::string& camera_id); + // outputTensors is non-const because GetTensorMutableData() (the + // ORT API that doesn't hang on AMD DirectML) requires a non-const + // Ort::Value receiver. See comment at the GetTensorMutableData + // call site in postprocess() for the full rationale. 
+ std::vector postprocess(std::vector& outputTensors, const std::string& camera_id); std::vector classify(const cv::Mat& image, const std::string& camera_id); private: diff --git a/modules/ANSODEngine/ANSONNXOBB.cpp b/modules/ANSODEngine/ANSONNXOBB.cpp index 9b79e05..f5b7877 100644 --- a/modules/ANSODEngine/ANSONNXOBB.cpp +++ b/modules/ANSODEngine/ANSONNXOBB.cpp @@ -1089,7 +1089,7 @@ namespace ANSCENTER { std::vector ANSONNXOBB::postprocess( const cv::Size& originalImageSize, const cv::Size& resizedImageShape, - const std::vector& outputTensors, + std::vector& outputTensors, int topk, const std::string& camera_id) { @@ -1103,8 +1103,10 @@ namespace ANSCENTER { return {}; } - // Extract output tensor data and shape [1, num_features, num_detections] - const float* rawOutput = outputTensors[0].GetTensorData(); + // Extract output tensor data and shape [1, num_features, num_detections]. + // GetTensorMutableData (not GetTensorData) on DML — const variant + // hangs on AMD after ~8 calls. Read-only despite the name. + const float* rawOutput = outputTensors[0].GetTensorMutableData(); const std::vector outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); if (outputShape.size() < 3) { diff --git a/modules/ANSODEngine/ANSONNXOBB.h b/modules/ANSODEngine/ANSONNXOBB.h index e22ba6b..f7ab2ac 100644 --- a/modules/ANSODEngine/ANSONNXOBB.h +++ b/modules/ANSODEngine/ANSONNXOBB.h @@ -74,10 +74,12 @@ namespace ANSCENTER { void warmupModel(); bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0); cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector& inputTensorShape); + // outputTensors is non-const because GetTensorMutableData() requires + // a non-const Ort::Value receiver — see ANSONNXCL.h for full note. 
std::vector postprocess( const cv::Size& originalImageSize, const cv::Size& resizedImageShape, - const std::vector& outputTensors, int topk, + std::vector& outputTensors, int topk, const std::string& camera_id); std::vector detect(const cv::Mat& image, const std::string& camera_id); private: diff --git a/modules/ANSODEngine/ANSONNXPOSE.cpp b/modules/ANSODEngine/ANSONNXPOSE.cpp index 7663ce7..2a366f5 100644 --- a/modules/ANSODEngine/ANSONNXPOSE.cpp +++ b/modules/ANSODEngine/ANSONNXPOSE.cpp @@ -759,7 +759,7 @@ namespace ANSCENTER { std::vector ANSONNXPOSE::postprocess( const cv::Size& originalImageSize, const cv::Size& resizedImageShape, - const std::vector& outputTensors, + std::vector& outputTensors, const std::string& camera_id) { std::lock_guard lock(_mutex); @@ -773,7 +773,9 @@ namespace ANSCENTER { return {}; } - const float* rawOutput = outputTensors[0].GetTensorData(); + // GetTensorMutableData (not GetTensorData) on DML — const variant + // hangs on AMD after ~8 calls. Read-only despite the name. + const float* rawOutput = outputTensors[0].GetTensorMutableData(); if (!rawOutput) { this->_logger.LogError("ANSONNXPOSE::postprocess", "rawOutput pointer is null", __FILE__, __LINE__); return {}; diff --git a/modules/ANSODEngine/ANSONNXPOSE.h b/modules/ANSODEngine/ANSONNXPOSE.h index a5922fa..c571721 100644 --- a/modules/ANSODEngine/ANSONNXPOSE.h +++ b/modules/ANSODEngine/ANSONNXPOSE.h @@ -41,8 +41,10 @@ namespace ANSCENTER { void warmupModel(); bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0); cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector& inputTensorShape); + // outputTensors is non-const because GetTensorMutableData() requires + // a non-const Ort::Value receiver — see ANSONNXCL.h for full note. 
std::vector postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape, - const std::vector& outputTensors, const std::string& camera_id); + std::vector& outputTensors, const std::string& camera_id); std::vector detect(const cv::Mat& image, const std::string& camera_id); private: static std::atomic instanceCounter_; // Thread-safe counter diff --git a/modules/ANSODEngine/ANSONNXSEG.cpp b/modules/ANSODEngine/ANSONNXSEG.cpp index d2f8dd2..a1f27dc 100644 --- a/modules/ANSODEngine/ANSONNXSEG.cpp +++ b/modules/ANSODEngine/ANSONNXSEG.cpp @@ -726,7 +726,7 @@ namespace ANSCENTER { std::vector ANSONNXSEG::postprocess( const cv::Size& origSize, const cv::Size& letterboxSize, - const std::vector& outputs, + std::vector& outputs, const std::string& camera_id) { std::lock_guard lock(_mutex); @@ -738,9 +738,11 @@ namespace ANSCENTER { std::to_string(outputs.size())); } - // Extract output tensors - const float* detections = outputs[0].GetTensorData(); - const float* prototypes = outputs[1].GetTensorData(); + // Extract output tensors. GetTensorMutableData (not GetTensorData) + // on DML — const variant hangs on AMD after ~8 calls. Read-only + // despite the name. + const float* detections = outputs[0].GetTensorMutableData(); + const float* prototypes = outputs[1].GetTensorMutableData(); // Get tensor shapes auto detectionShape = outputs[0].GetTensorTypeAndShapeInfo().GetShape(); // [1, 116, N] diff --git a/modules/ANSODEngine/ANSONNXSEG.h b/modules/ANSODEngine/ANSONNXSEG.h index 364d1da..fcc2e46 100644 --- a/modules/ANSODEngine/ANSONNXSEG.h +++ b/modules/ANSODEngine/ANSONNXSEG.h @@ -51,8 +51,10 @@ namespace ANSCENTER { void warmupModel(); bool Init(const std::string& modelPath, bool useGPU=true, int deviceId = 0); cv::Mat preprocess(const cv::Mat& image,float*& blobPtr,std::vector& inputTensorShape); + // outputs is non-const because GetTensorMutableData() requires a + // non-const Ort::Value receiver — see ANSONNXCL.h for full note. 
std::vector postprocess(const cv::Size& origSize,const cv::Size& letterboxSize, - const std::vector& outputs, const std::string& camera_id); + std::vector& outputs, const std::string& camera_id); std::vector segment(const cv::Mat& image, const std::string& camera_id); std::vector maskToPolygon(const cv::Mat& binaryMask, const cv::Rect& boundingBox, diff --git a/modules/ANSODEngine/ANSYOLO12OD.cpp b/modules/ANSODEngine/ANSYOLO12OD.cpp index 34b5afb..8e5d5c8 100644 --- a/modules/ANSODEngine/ANSYOLO12OD.cpp +++ b/modules/ANSODEngine/ANSYOLO12OD.cpp @@ -518,14 +518,16 @@ namespace ANSCENTER { } } std::vector YOLO12OD::postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape, - const std::vector& outputTensors, - float confThreshold, float iouThreshold) + std::vector& outputTensors, + float confThreshold, float iouThreshold) { std::lock_guard lock(_mutex); try { std::vector detections; - const float* rawOutput = outputTensors[0].GetTensorData(); // Extract raw output data from the first output tensor + // GetTensorMutableData (not GetTensorData) on DML — const variant + // hangs on AMD after ~8 calls. Read-only despite the name. 
+ const float* rawOutput = outputTensors[0].GetTensorMutableData(); const std::vector outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape(); // Determine the number of features and detections diff --git a/modules/ANSODEngine/ANSYOLO12OD.h b/modules/ANSODEngine/ANSYOLO12OD.h index 6d9c944..f7dc04f 100644 --- a/modules/ANSODEngine/ANSYOLO12OD.h +++ b/modules/ANSODEngine/ANSYOLO12OD.h @@ -49,8 +49,11 @@ namespace ANSCENTER { std::vector detect(const cv::Mat& image, float confThreshold = 0.4f, float iouThreshold = 0.45f); //cv::Mat preprocess(const cv::Mat& image, float*& blob, std::vector& inputTensorShape); cv::Mat preprocess(const cv::Mat& image, std::vector& blob, std::vector& inputTensorShape); + // outputTensors is non-const because GetTensorMutableData() + // requires a non-const Ort::Value receiver — see ANSONNXCL.h + // for full note. std::vector postprocess(const cv::Size& originalImageSize, const cv::Size& resizedImageShape, - const std::vector& outputTensors, + std::vector& outputTensors, float confThreshold, float iouThreshold); private: diff --git a/tests/ANSODEngine-UnitTest/ANSODEngine-UnitTest.cpp b/tests/ANSODEngine-UnitTest/ANSODEngine-UnitTest.cpp index 948e1c0..6961b5a 100644 --- a/tests/ANSODEngine-UnitTest/ANSODEngine-UnitTest.cpp +++ b/tests/ANSODEngine-UnitTest/ANSODEngine-UnitTest.cpp @@ -1867,12 +1867,13 @@ int main() //YOLO26POSEYolo11Test(); //YOLO26CLYolo11Test(); //YOLO26ODYolo12Test(); - YOLO26ODYolo11Test(); + //YOLO26ODYolo11Test(); //YOLO26ODYolo10Test(); //YOLO26OBBYolo11Test(); //SAM3ONNX_ImageTest(); // ORT reference — runs first, prints decoder input stats //SAM3TRT_ImageTest(); // TRT under test — compare decoder input stats with above - //CustomModel_StressTest_FilePlayer(); // Multi-task stress test (LabVIEW flow) + //CustomModel_StressTest_FilePlayer(); // Multi-task stress test (LabVIEW flow) + CustomModel_SingleStream_FilePlayer(); // 1 cam + 1 task — isolates concurrency from per-instance bugs 
//SAM3TRT_UnitTest(); // TensorRT SAM3 test (in ANSSAM3-UnitTest.cpp) //TensorRT10Test(); //FireNSmokeCustomDetection(); diff --git a/tests/ANSODEngine-UnitTest/ANSODTest.h b/tests/ANSODEngine-UnitTest/ANSODTest.h index 1658baa..2958441 100644 --- a/tests/ANSODEngine-UnitTest/ANSODTest.h +++ b/tests/ANSODEngine-UnitTest/ANSODTest.h @@ -116,4 +116,5 @@ int FaceYoloTest(); int TestYOLOV12(); int PPETest(); int RVATest(); -int CustomModel_StressTest_FilePlayer(); \ No newline at end of file +int CustomModel_StressTest_FilePlayer(); +int CustomModel_SingleStream_FilePlayer(); // 1 camera + 1 task — isolates concurrency from per-instance bugs \ No newline at end of file diff --git a/tests/ANSODEngine-UnitTest/CustomModel-StressTest.cpp b/tests/ANSODEngine-UnitTest/CustomModel-StressTest.cpp index a0fef88..78bdaaa 100644 --- a/tests/ANSODEngine-UnitTest/CustomModel-StressTest.cpp +++ b/tests/ANSODEngine-UnitTest/CustomModel-StressTest.cpp @@ -292,15 +292,15 @@ int CustomModel_StressTest_FilePlayer() { // Video files (one per stream) const std::string videoFiles[NUM_STREAMS] = { - "E:\\Programs\\DemoAssets\\Videos\\Helmet\\HM1.mp4", - "E:\\Programs\\DemoAssets\\Videos\\Helmet\\HM2.mp4", + "C:\\ProgramData\\ANSCENTER\\Shared\\HM1.mp4", + "C:\\ProgramData\\ANSCENTER\\Shared\\HM2.mp4", }; // Which stream each task uses const int taskStreamMap[NUM_TASKS] = { 0, 0, 1, 1 }; // Model config — EDIT for your custom model - const std::string modelFolder = "C:\\Projects\\ANSVIS\\Models\\ANS_Helmet_v2.0.zip"; + const std::string modelFolder = "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\Models\\B-IN_ANS_Helmet_v2.0_102728911.zip"; //const char* modelName = "detector"; //const char* className = "detector.names"; const int modelType = 16; // 16 = CustomDetector, 31 = RTYOLO, 30 = ONNXYOLO @@ -550,3 +550,237 @@ int CustomModel_StressTest_FilePlayer() { return 0; } + + +// ============================================================================= +// CustomModel_SingleStream_FilePlayer 
+//
+// Isolation test: one camera, one OD handle, one worker thread, so no two
+// inferences ever run concurrently. The per-frame flow matches
+// CustomModel_StressTest_FilePlayer (FilePlayer → CloneImage →
+// RunInferenceComplete_CPP → ReleaseImage) minus the multi-stream fan-out.
+//
+// Purpose: tell a per-instance hang (still reproduces here) apart from one
+// that needs cross-session DML contention (does NOT reproduce here). A clean
+// long run here plus a multi-stream hang points at concurrent DML
+// submissions on the AMD iGPU rather than at the engine code itself.
+//
+// Returns 0 once the run flag is cleared (ESC), -1 if LoadANSCV() fails,
+// -2 if the FilePlayer cannot be created, -3 if no OD handle is returned.
+// Shared file-level helpers: LoadANSCV / UnloadANSCV, ODWorkerThread, GetPerGpuFreeMiB.
+// =============================================================================
+int CustomModel_SingleStream_FilePlayer() {
+    printf("\n");
+    printf("============================================================\n");
+    printf(" Custom Model SINGLE-STREAM Isolation Test (FilePlayer)\n");
+    printf(" 1 camera + 1 model + 1 worker thread\n");
+    printf(" Press ESC to stop\n");
+    printf("============================================================\n\n");
+
+    // --- Load ANSCV.dll at runtime (same helper as stress test) ---
+    if (!LoadANSCV()) return -1;
+    if (pInitCameraNetwork) pInitCameraNetwork();
+
+    // =====================================================================
+    // CONFIGURATION — EDIT THESE FOR YOUR TEST
+    // =====================================================================
+    const std::string videoFile = "C:\\ProgramData\\ANSCENTER\\Shared\\HM1.mp4";
+
+    const std::string modelFolder =
+        "C:\\ProgramData\\ANSCENTER\\ANSVIS Server\\Models\\B-IN_ANS_Helmet_v2.0_102728911.zip";
+    const int modelType = 16; // 16 = CustomDetector (same as stress test)
+    const int detectorType = 1; // Detection; forwarded to CreateANSODHandle below
+    const float scoreThresh = 0.5f;
+    const float confThresh = 0.5f;
+    const float nmsThresh = 0.45f;
+    // =====================================================================
+
+    // Reset shared run flag (it's a static at file scope shared with stress test)
+    g_stressRunning.store(true);
+
+    std::cout << "\n--- Single-stream isolation test (no concurrency) ---\n" << std::endl;
+    // NOTE: deliberately NOT calling OptimizeModelStr here. OptimizeModelStr
+    // creates a separate "warmup" ANSCUSTOM instance whose detector and
+    // classifier sessions stay loaded for the lifetime of the process — even
+    // though that instance never runs inference, its 2 DML sessions hold AMD
+    // GPU resources and were suspected of contributing to a hang in the
+    // active session's GetTensorData. Skipping it here leaves exactly
+    // 1 ANSCUSTOM = 2 DML sessions (detector + classifier) in the process,
+    // for the cleanest possible single-session isolation.
+
+    // --- Per-task state (just one) ---
+    StressTaskState taskState;
+
+    // --- Create FilePlayer (single stream) ---
+    void* fpClient = nullptr;
+    {
+        printf("[Stream0] Creating FilePlayer: %s\n", videoFile.c_str());
+        int result = pCreateFilePlayer(&fpClient, "", videoFile.c_str());
+        if (result != 1 || !fpClient) {
+            printf("[Stream0] FAILED to create FilePlayer (result=%d)\n", result);
+            if (pDeinitCameraNetwork) pDeinitCameraNetwork(); // undo pInitCameraNetwork, as the normal exit path does
+            UnloadANSCV();
+            return -2;
+        }
+        if (pSetFilePlayerDisplayRes) {
+            pSetFilePlayerDisplayRes(&fpClient, 1920, 1080);
+        }
+        printf("[Stream0] FilePlayer created (display: 1920x1080)\n");
+    }
+
+    // --- Create OD handle (single instance) ---
+    ANSCENTER::ANSODBase* odHandle = nullptr;
+    {
+        printf("[Task0] Creating OD handle (modelType=%d)...\n", modelType);
+        auto loadStart = std::chrono::steady_clock::now();
+        auto vramBefore = GetPerGpuFreeMiB();
+
+        std::string labelMap = CreateANSODHandle(
+            &odHandle,
+            "", // licenseKey
+            modelFolder.c_str(), // modelFilePath (zip or folder)
+            "", // modelZipFilePassword
+            scoreThresh,
+            confThresh,
+            nmsThresh,
+            1, // autoDetectEngine
+            modelType,
+            detectorType, // detectionType (1 = Detection)
+            1); // loadEngineOnCreation
+
+        auto loadEnd = std::chrono::steady_clock::now();
+        double loadMs = std::chrono::duration<double, std::milli>(loadEnd - loadStart).count(); // explicit ms: value is printed as "ms"
+
+        if (!odHandle) {
+            printf("[Task0] FAILED to create OD handle\n");
+            pStopFilePlayer(&fpClient);
+            pReleaseFilePlayer(&fpClient);
+            if (pDeinitCameraNetwork) pDeinitCameraNetwork(); // undo pInitCameraNetwork, as the normal exit path does
+            UnloadANSCV();
+            return -3;
+        }
+
+        auto vramAfter = GetPerGpuFreeMiB();
+        int bestGpu = 0;
+        size_t maxDelta = 0;
+        for (size_t g = 0; g < vramBefore.size() && g < vramAfter.size(); g++) {
+            size_t delta = (vramBefore[g] > vramAfter[g]) ? vramBefore[g] - vramAfter[g] : 0; // clamp at 0: free memory may have grown
+            if (delta > maxDelta) { maxDelta = delta; bestGpu = (int)g; }
+        }
+
+        printf("[Task0] Model loaded in %.0f ms | GPU[%d] | VRAM: %zu MiB | Labels: %s\n",
+            loadMs, bestGpu, maxDelta,
+            labelMap.empty() ? "(none)" : labelMap.substr(0, 80).c_str());
+
+        std::lock_guard lk(taskState.mtx);
+        taskState.engineLoaded = true;
+        taskState.statusMsg = "Running";
+        taskState.gpuDeviceId = bestGpu;
+        taskState.vramUsedMiB = maxDelta;
+    }
+
+    // --- Start playback ---
+    pStartFilePlayer(&fpClient);
+    printf("[Stream0] Playback started\n");
+    std::this_thread::sleep_for(std::chrono::milliseconds(500));
+
+    // --- Single worker thread (reuse ODWorkerThread from stress test) ---
+    std::thread worker(ODWorkerThread, /*taskId=*/0, fpClient, odHandle, std::ref(taskState));
+
+    // --- Display loop (single cell) ---
+    const int cellW = 1280, cellH = 720;
+    const char* windowName = "Custom Model — Single Stream Isolation";
+    cv::namedWindow(windowName, cv::WINDOW_NORMAL);
+    cv::resizeWindow(windowName, cellW, cellH + 40);
+
+    auto testStart = std::chrono::steady_clock::now();
+
+    while (g_stressRunning.load()) {
+        cv::Mat canvas(cellH + 40, cellW, CV_8UC3, cv::Scalar(30, 30, 30));
+
+        cv::Mat cell;
+        double fps = 0, infMs = 0, grabMs = 0;
+        int fCount = 0, dCount = 0, gpuId = -1;
+        std::string statusMsg, lastDet;
+        bool engineLoaded = false;
+        { // snapshot the shared task state under its mutex; drawing happens after the lock is released
+            std::lock_guard lk(taskState.mtx);
+            if (!taskState.displayFrame.empty()) {
+                cv::resize(taskState.displayFrame, cell, cv::Size(cellW, cellH));
+            }
+            fps = taskState.fps;
+            infMs = taskState.inferenceMs;
+            grabMs = taskState.grabMs;
+            fCount = taskState.frameCount;
+            dCount = taskState.detectionCount;
+            gpuId = taskState.gpuDeviceId;
+            statusMsg = taskState.statusMsg;
+            lastDet = taskState.lastDetection;
+            engineLoaded = taskState.engineLoaded;
+        }
+
+        if (cell.empty()) { // no frame published yet: show a placeholder with the status text
+            cell = cv::Mat(cellH, cellW, CV_8UC3, cv::Scalar(40, 40, 40));
+            cv::putText(cell, "Task 0: " + statusMsg,
+                cv::Point(20, cellH / 2),
+                cv::FONT_HERSHEY_SIMPLEX, 0.8, cv::Scalar(100, 100, 255), 2);
+        }
+
+        cv::rectangle(cell, cv::Rect(0, cellH - 45, cellW, 45),
+            cv::Scalar(0, 0, 0), cv::FILLED);
+        char bar1[256], bar2[128];
+        snprintf(bar1, sizeof(bar1),
+            "%.1f FPS | inf:%.0fms grab:%.0fms | Frames:%d | Det:%d",
+            fps, infMs, grabMs, fCount, dCount);
+        snprintf(bar2, sizeof(bar2), "GPU[%d] | last:%s",
+            gpuId, lastDet.empty() ? "-" : lastDet.c_str());
+        cv::Scalar barColor = engineLoaded ? cv::Scalar(0, 255, 0) : cv::Scalar(0, 100, 255);
+        cv::putText(cell, bar1, cv::Point(5, cellH - 25),
+            cv::FONT_HERSHEY_SIMPLEX, 0.5, barColor, 1);
+        cv::putText(cell, bar2, cv::Point(5, cellH - 5),
+            cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 200, 255), 1);
+
+        cell.copyTo(canvas(cv::Rect(0, 0, cellW, cellH)));
+
+        double elapsed = std::chrono::duration<double>( // explicit seconds: value is printed as "s"
+            std::chrono::steady_clock::now() - testStart).count();
+        char bottomBar[256];
+        snprintf(bottomBar, sizeof(bottomBar),
+            "Single-stream | Elapsed: %.0fs | %.1f FPS | Press ESC to stop",
+            elapsed, fps);
+        cv::putText(canvas, bottomBar, cv::Point(10, cellH + 25),
+            cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(200, 200, 200), 1);
+
+        cv::imshow(windowName, canvas);
+        int key = cv::waitKey(30);
+        if (key == 27) { // 27 = ESC
+            printf("\nESC pressed - stopping...\n");
+            g_stressRunning.store(false);
+        }
+    }
+
+    printf("Waiting for worker thread...\n");
+    if (worker.joinable()) worker.join();
+
+    double totalElapsed = std::chrono::duration<double>( // explicit seconds: value is printed as "s"
+        std::chrono::steady_clock::now() - testStart).count();
+    printf("\n============================================================\n");
+    printf(" SINGLE-STREAM SUMMARY (runtime: %.0fs)\n", totalElapsed);
+    printf("============================================================\n");
+    printf(" GPU[%d] | %d frames | %d detections | %.1f FPS | Inf: %.0fms\n",
+        taskState.gpuDeviceId, taskState.frameCount, taskState.detectionCount,
+        taskState.fps, taskState.inferenceMs);
+    printf("============================================================\n");
+
+    if (odHandle) ReleaseANSODHandle(&odHandle);
+    if (fpClient) {
+        pStopFilePlayer(&fpClient);
+        pReleaseFilePlayer(&fpClient);
+    }
+
+    cv::destroyAllWindows();
+    if (pDeinitCameraNetwork) pDeinitCameraNetwork();
+    UnloadANSCV();
+
+    return 0;
+}