Use software decoder by default
This commit is contained in:
@@ -3,6 +3,7 @@
|
||||
#include <cmath>
|
||||
#include <json.hpp>
|
||||
#include "ANSODEngine.h"
|
||||
#include "ANSLicense.h" // ANS_DBG macro
|
||||
#include "ANSYOLOOD.h"
|
||||
#include "ANSTENSORRTOD.h"
|
||||
#include "ANSTENSORRTCL.h"
|
||||
@@ -879,6 +880,9 @@ namespace ANSCENTER
|
||||
std::vector<Object> allResults;
|
||||
allResults.clear();
|
||||
try {
|
||||
ANS_DBG("ODEngine", "SAHI START: %dx%d tile=%dx%d overlap=%.1f cam=%s",
|
||||
input.cols, input.rows, tiledWidth, tiledHeight, overLap, camera_id.c_str());
|
||||
auto _sahiStart = std::chrono::steady_clock::now();
|
||||
cv::Mat image = input.clone();
|
||||
if (image.empty() || !image.data || !image.u) {
|
||||
return allResults;
|
||||
@@ -920,6 +924,16 @@ namespace ANSCENTER
|
||||
//4. Apply Non-Maximum Suppression (NMS) to merge overlapping results
|
||||
float iouThreshold = 0.1;
|
||||
std::vector<Object> finalResults = ANSUtilityHelper::ApplyNMS(allResults, iouThreshold);
|
||||
{
|
||||
double _sahiMs = std::chrono::duration<double, std::milli>(
|
||||
std::chrono::steady_clock::now() - _sahiStart).count();
|
||||
ANS_DBG("ODEngine", "SAHI DONE: %.1fms patches=%zu results=%zu cam=%s",
|
||||
_sahiMs, patches.size() + 1, finalResults.size(), camera_id.c_str());
|
||||
if (_sahiMs > 2000.0) {
|
||||
ANS_DBG("ODEngine", "SAHI SLOW: %.1fms — %zu patches held _mutex entire time!",
|
||||
_sahiMs, patches.size() + 1);
|
||||
}
|
||||
}
|
||||
image.release();
|
||||
return finalResults;
|
||||
}
|
||||
@@ -2103,6 +2117,8 @@ namespace ANSCENTER
|
||||
// No coarse _mutex — sub-components (engines, trackers) have their own locks.
|
||||
// LabVIEW semaphore controls concurrency at the caller level.
|
||||
try {
|
||||
ANS_DBG("ODEngine", "RunInferenceWithOption: cam=%s %dx%d mode=%s",
|
||||
camera_id.c_str(), input.cols, input.rows, activeROIMode.c_str());
|
||||
int mode = 0;
|
||||
double confidenceThreshold = 0.35;
|
||||
std::vector<int> trackingObjectIds;
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
#include "ANSRTYOLO.h"
|
||||
#include "Utility.h"
|
||||
#include "ANSLicense.h" // ANS_DBG macro for DebugView
|
||||
#include <future>
|
||||
#include <numeric>
|
||||
#include <cmath>
|
||||
@@ -903,7 +904,6 @@ namespace ANSCENTER {
|
||||
return {};
|
||||
}
|
||||
|
||||
// Check if model is classification (output ndims <= 2)
|
||||
const auto& outputDims = m_trtEngine->getOutputDims();
|
||||
const bool isClassification = !outputDims.empty() && outputDims[0].nbDims <= 2;
|
||||
|
||||
@@ -914,11 +914,8 @@ namespace ANSCENTER {
|
||||
cv::cuda::GpuMat resized;
|
||||
if (imgRGB.rows != inputH || imgRGB.cols != inputW) {
|
||||
if (isClassification) {
|
||||
// Classification: direct resize (no letterbox padding)
|
||||
// Must use explicit stream to avoid conflict with CUDA Graph capture on null stream
|
||||
cv::cuda::resize(imgRGB, resized, cv::Size(inputW, inputH), 0, 0, cv::INTER_LINEAR, stream);
|
||||
} else {
|
||||
// Detection/Seg/Pose/OBB: letterbox resize + right-bottom pad
|
||||
resized = Engine<float>::resizeKeepAspectRatioPadRightBottom(imgRGB, inputH, inputW);
|
||||
}
|
||||
}
|
||||
@@ -1831,8 +1828,7 @@ namespace ANSCENTER {
|
||||
}
|
||||
|
||||
// --- 2. Preprocess under lock ---
|
||||
// Try NV12 fast path first (12MB upload vs 24MB BGR for 4K)
|
||||
// Falls back to standard GPU preprocessing if no NV12 data available.
|
||||
ANS_DBG("YOLO", "Preprocess START %dx%d", inputImage.cols, inputImage.rows);
|
||||
ImageMetadata meta;
|
||||
std::vector<std::vector<cv::cuda::GpuMat>> input;
|
||||
bool usedNV12 = false;
|
||||
@@ -1874,11 +1870,22 @@ namespace ANSCENTER {
|
||||
}
|
||||
|
||||
// --- 3. TRT Inference (mutex released for concurrent GPU slots) ---
|
||||
ANS_DBG("YOLO", "TRT inference START nv12=%d inputSize=%dx%d",
|
||||
(int)usedNV12,
|
||||
input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].cols),
|
||||
input.empty() ? 0 : (input[0].empty() ? 0 : input[0][0].rows));
|
||||
auto _trtStart = std::chrono::steady_clock::now();
|
||||
std::vector<std::vector<std::vector<float>>> featureVectors;
|
||||
if (!m_trtEngine->runInference(input, featureVectors)) {
|
||||
ANS_DBG("YOLO", "ERROR: TRT runInference FAILED");
|
||||
_logger.LogError("ANSRTYOLO::DetectObjects", "Error running inference", __FILE__, __LINE__);
|
||||
return {};
|
||||
}
|
||||
auto _trtEnd = std::chrono::steady_clock::now();
|
||||
double _trtMs = std::chrono::duration<double, std::milli>(_trtEnd - _trtStart).count();
|
||||
if (_trtMs > 500.0) {
|
||||
ANS_DBG("YOLO", "SLOW TRT inference: %.1fms", _trtMs);
|
||||
}
|
||||
double msInference = dbg ? elapsed() : 0;
|
||||
|
||||
// --- 4. Transform output ---
|
||||
|
||||
@@ -81,6 +81,7 @@ namespace ANSCENTER {
|
||||
std::vector<std::vector<cv::cuda::GpuMat>> PreprocessBatch(
|
||||
const std::vector<cv::Mat>& inputImages, BatchMetadata& outMetadata);
|
||||
|
||||
|
||||
// ── Detection pipeline ───────────────────────────────────────────
|
||||
std::vector<Object> DetectObjects(const cv::Mat& inputImage,
|
||||
const std::string& camera_id);
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
#include "NV12PreprocessHelper.h"
|
||||
#include "ANSGpuFrameRegistry.h"
|
||||
#include "ANSEngineCommon.h"
|
||||
#include "ANSLicense.h" // ANS_DBG macro
|
||||
#include <opencv2/cudaimgproc.hpp>
|
||||
#include <opencv2/cudawarping.hpp>
|
||||
#include <opencv2/core/cuda_stream_accessor.hpp>
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#include "engine/TRTEngineCache.h" // clearAll() on DLL_PROCESS_DETACH
|
||||
#include "engine/EnginePoolManager.h" // clearAll() on DLL_PROCESS_DETACH
|
||||
#include <climits> // INT_MIN
|
||||
#include "ANSLicense.h" // ANS_DBG macro for DebugView
|
||||
|
||||
// Process-wide flag: when true, all engines force single-GPU path (no pool, no idle timers).
|
||||
// Defined here, declared extern in EngineBuildLoadNetwork.inl.
|
||||
@@ -1696,6 +1697,8 @@ static int RunInferenceComplete_LV_Impl(
|
||||
auto* engine = guard.get();
|
||||
|
||||
try {
|
||||
auto _t0 = std::chrono::steady_clock::now();
|
||||
|
||||
// Save/restore thread-local to support nested calls (custom model DLLs
|
||||
// calling back into ANSODEngine via ANSLIB.dll).
|
||||
GpuFrameData* savedFrame = tl_currentGpuFrame();
|
||||
@@ -1708,6 +1711,10 @@ static int RunInferenceComplete_LV_Impl(
|
||||
int originalWidth = localImage.cols;
|
||||
int originalHeight = localImage.rows;
|
||||
|
||||
ANS_DBG("LV_Inference", "START cam=%s %dx%d gpuFrame=%p nv12=%s",
|
||||
cameraId ? cameraId : "?", originalWidth, originalHeight,
|
||||
(void*)gpuFrame, gpuFrame ? "YES" : "NO");
|
||||
|
||||
if (originalWidth == 0 || originalHeight == 0) {
|
||||
tl_currentGpuFrame() = savedFrame;
|
||||
return -2;
|
||||
@@ -1717,8 +1724,17 @@ static int RunInferenceComplete_LV_Impl(
|
||||
// Safe: *cvImage holds a refcount, keeping gpuFrame alive during inference.
|
||||
// Only use OWN gpuFrame — never inherit outer caller's frame (dimension mismatch on crops).
|
||||
tl_currentGpuFrame() = gpuFrame;
|
||||
auto _t1 = std::chrono::steady_clock::now();
|
||||
std::vector<ANSCENTER::Object> outputs = engine->RunInferenceWithOption(localImage, cameraId, activeROIMode);
|
||||
auto _t2 = std::chrono::steady_clock::now();
|
||||
tl_currentGpuFrame() = savedFrame;
|
||||
|
||||
double prepMs = std::chrono::duration<double, std::milli>(_t1 - _t0).count();
|
||||
double infMs = std::chrono::duration<double, std::milli>(_t2 - _t1).count();
|
||||
if (infMs > 500.0) {
|
||||
ANS_DBG("LV_Inference", "SLOW cam=%s prep=%.1fms inf=%.1fms results=%zu",
|
||||
cameraId ? cameraId : "?", prepMs, infMs, outputs.size());
|
||||
}
|
||||
bool getJpeg = (getJpegString == 1);
|
||||
std::string stImage;
|
||||
// NOTE: odMutex was removed here. All variables in this scope are local
|
||||
|
||||
@@ -402,6 +402,9 @@ private:
|
||||
cudaStream_t m_memoryStream; // ADD THIS - separate stream for memory operations
|
||||
std::vector<cv::cuda::GpuMat> m_preprocessedInputs; // Keep inputs alive
|
||||
|
||||
// Note: blobFromGpuMats and resizeKeepAspectRatioPadRightBottom are static,
|
||||
// so cached buffers use thread_local inside the functions themselves.
|
||||
|
||||
|
||||
// Thermal management
|
||||
//int m_consecutiveInferences;
|
||||
@@ -431,7 +434,7 @@ private:
|
||||
|
||||
Logger m_logger;
|
||||
bool m_verbose{ true }; // false for non-probe pool slots
|
||||
bool m_disableGraphs{ false }; // true for pool slots — concurrent graph captures corrupt CUDA context
|
||||
bool m_disableGraphs{ true }; // DISABLED by default — concurrent graph launches + uploads cause GPU deadlock on WDDM
|
||||
|
||||
// -- Multi-GPU pool data ---------------------------------------------------
|
||||
|
||||
|
||||
Reference in New Issue
Block a user