// 2026-03-28 16:54:11 +11:00  (stray VCS timestamp artifact — commented out; it broke compilation)
|
|
|
#pragma once

// C++ standard library
#include <algorithm>   // std::min, std::max, std::remove_if
#include <array>       // std::array (blobFromGpuMats signature)
#include <cctype>      // std::isspace
#include <cstddef>     // size_t
#include <filesystem>  // std::filesystem::path
#include <iostream>    // std::cerr
#include <string>
#include <vector>

// CUDA / TensorRT / cuDNN
#include <NvInfer.h>         // NV_TENSORRT_MAJOR/MINOR/PATCH
#include <NvInferVersion.h>  // also defines TRT version macros
#include <cudnn_version.h>   // CUDNN_MAJOR/MINOR/PATCHLEVEL
#include <cuda_runtime.h>    // cudaRuntimeGetVersion
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output) {
|
|
|
|
|
if (input.size() == 1) {
|
|
|
|
|
output = std::move(input[0]);
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
auto msg = "The feature vector has incorrect dimensions!";
|
|
|
|
|
std::cout<<msg;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
template <typename T>
|
|
|
|
|
void Engine<T>::transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output) {
|
|
|
|
|
if (input.size() != 1 || input[0].size() != 1) {
|
|
|
|
|
auto msg = "The feature vector has incorrect dimensions!";
|
|
|
|
|
std::cout<<msg;
|
|
|
|
|
}
|
|
|
|
|
output = std::move(input[0][0]);
|
|
|
|
|
}
|
|
|
|
|
template <typename T>
|
2026-04-04 20:19:54 +11:00
|
|
|
cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat& input,
|
2026-03-28 16:54:11 +11:00
|
|
|
size_t height, size_t width,
|
|
|
|
|
const cv::Scalar& bgcolor) {
|
|
|
|
|
if (input.empty()) {
|
2026-04-04 20:19:54 +11:00
|
|
|
return cv::cuda::GpuMat();
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
2026-04-04 20:19:54 +11:00
|
|
|
|
|
|
|
|
// Use a thread_local stream to avoid creating a new CUDA stream per call.
|
|
|
|
|
// Creating cv::cuda::Stream() each call leaks stream handles under WDDM.
|
|
|
|
|
thread_local cv::cuda::Stream stream;
|
|
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
float r = std::min(static_cast<float>(width) / input.cols, static_cast<float>(height) / input.rows);
|
|
|
|
|
size_t unpad_w = static_cast<size_t>(r * input.cols);
|
|
|
|
|
size_t unpad_h = static_cast<size_t>(r * input.rows);
|
2026-04-04 20:19:54 +11:00
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Resize the input image
|
|
|
|
|
cv::cuda::GpuMat re;
|
2026-04-04 20:19:54 +11:00
|
|
|
re.create(static_cast<int>(unpad_h), static_cast<int>(unpad_w), input.type());
|
2026-03-28 16:54:11 +11:00
|
|
|
cv::cuda::resize(input, re, re.size(), 0, 0, cv::INTER_LINEAR, stream);
|
2026-04-04 20:19:54 +11:00
|
|
|
|
2026-03-28 16:54:11 +11:00
|
|
|
// Create the output image and fill with the background color
|
|
|
|
|
cv::cuda::GpuMat out;
|
2026-04-04 20:19:54 +11:00
|
|
|
out.create(static_cast<int>(height), static_cast<int>(width), input.type());
|
2026-03-28 16:54:11 +11:00
|
|
|
out.setTo(bgcolor, stream);
|
2026-04-04 20:19:54 +11:00
|
|
|
|
|
|
|
|
// Copy the resized content into the top-left corner
|
2026-03-28 16:54:11 +11:00
|
|
|
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)), stream);
|
|
|
|
|
stream.waitForCompletion();
|
|
|
|
|
return out;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T> void Engine<T>::getDeviceNames(std::vector<std::string> &deviceNames) {
    // Appends the human-readable name of every visible CUDA device.
    // BUG FIX: `numGPUs` was uninitialized; if cudaGetDeviceCount failed, the
    // loop bound was garbage. Initialize to 0 and check the CUDA return codes.
    int numGPUs = 0;
    if (cudaGetDeviceCount(&numGPUs) != cudaSuccess) {
        return; // no devices enumerable — leave the list untouched
    }
    for (int device = 0; device < numGPUs; device++) {
        cudaDeviceProp prop{};
        if (cudaGetDeviceProperties(&prop, device) == cudaSuccess) {
            deviceNames.emplace_back(prop.name);
        }
    }
}
|
|
|
|
|
template <typename T> int Engine<T>::getBindingIndexByName(const std::string& name) {
    // Linear scan over the engine's I/O tensors; returns the index of the
    // tensor whose name matches `name`, or -1 when no such tensor exists.
    const int tensorCount = m_engine->getNbIOTensors();
    for (int idx = 0; idx < tensorCount; ++idx) {
        if (name == m_engine->getIOTensorName(idx)) {
            return idx;
        }
    }
    return -1;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
//template <typename T> std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options &options, const std::string &onnxModelPath) {
|
|
|
|
|
// const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
|
|
|
|
// std::string engineName = onnxModelPath.substr(filenamePos, onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
|
|
|
|
//
|
|
|
|
|
// // Add the GPU device name to the file to ensure that the model is only used
|
|
|
|
|
// // on devices with the exact same GPU
|
|
|
|
|
// std::vector<std::string> deviceNames;
|
|
|
|
|
// getDeviceNames(deviceNames);
|
|
|
|
|
//
|
|
|
|
|
// if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
|
|
|
|
// auto msg = "Error, provided device index is out of range!";
|
|
|
|
|
// std::cout<<msg;
|
|
|
|
|
// return "";
|
|
|
|
|
// }
|
|
|
|
|
//
|
|
|
|
|
// auto deviceName = deviceNames[options.deviceIndex];
|
|
|
|
|
// // Remove spaces from the device name
|
|
|
|
|
// deviceName.erase(std::remove_if(deviceName.begin(), deviceName.end(), ::isspace), deviceName.end());
|
|
|
|
|
// engineName += "." + deviceName;
|
|
|
|
|
// // Serialize the specified options into the filename
|
|
|
|
|
// if (options.precision == ANSCENTER::Precision::FP16) {
|
|
|
|
|
// engineName += ".fp16";
|
|
|
|
|
// } else if (options.precision == ANSCENTER::Precision::FP32) {
|
|
|
|
|
// engineName += ".fp32";
|
|
|
|
|
// } else {
|
|
|
|
|
// engineName += ".int8";
|
|
|
|
|
// }
|
|
|
|
|
// if (options.maxBatchSize > 1) {
|
|
|
|
|
// engineName += "." + std::to_string(options.maxBatchSize);
|
|
|
|
|
// }
|
|
|
|
|
// return engineName;
|
|
|
|
|
//}
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
std::string Engine<T>::serializeEngineOptions(const ANSCENTER::Options& options,
|
|
|
|
|
const std::string& onnxModelPath)
|
|
|
|
|
{
|
|
|
|
|
// -- Base name from ONNX file ---------------------------------------------
|
|
|
|
|
const auto filenamePos = onnxModelPath.find_last_of('/') + 1;
|
|
|
|
|
std::string engineName = onnxModelPath.substr(
|
|
|
|
|
filenamePos,
|
|
|
|
|
onnxModelPath.find_last_of('.') - filenamePos) + ".engine";
|
|
|
|
|
|
|
|
|
|
// -- GPU device name ------------------------------------------------------
|
|
|
|
|
// Ensures the engine is only loaded on the exact GPU it was built for.
|
|
|
|
|
std::vector<std::string> deviceNames;
|
|
|
|
|
getDeviceNames(deviceNames);
|
|
|
|
|
if (static_cast<size_t>(options.deviceIndex) >= deviceNames.size()) {
|
|
|
|
|
std::cout << "Error, provided device index is out of range!";
|
|
|
|
|
return "";
|
|
|
|
|
}
|
|
|
|
|
auto deviceName = deviceNames[options.deviceIndex];
|
|
|
|
|
deviceName.erase(
|
|
|
|
|
std::remove_if(deviceName.begin(), deviceName.end(), ::isspace),
|
|
|
|
|
deviceName.end());
|
|
|
|
|
engineName += "." + deviceName;
|
|
|
|
|
|
|
|
|
|
// -- Precision ------------------------------------------------------------
|
|
|
|
|
if (options.precision == ANSCENTER::Precision::FP16) {
|
|
|
|
|
engineName += ".fp16";
|
|
|
|
|
}
|
|
|
|
|
else if (options.precision == ANSCENTER::Precision::FP32) {
|
|
|
|
|
engineName += ".fp32";
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
engineName += ".int8";
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// -- Batch size -----------------------------------------------------------
|
|
|
|
|
if (options.maxBatchSize > 1) {
|
|
|
|
|
engineName += ".b" + std::to_string(options.maxBatchSize);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// -- Max spatial dims: intentionally NOT included in the filename ----------
|
|
|
|
|
// buildWithRetry() may reduce max dims (e.g. 2560→1920) when GPU memory
|
|
|
|
|
// is insufficient. If the filename included .s{H}x{W}, the next launch
|
|
|
|
|
// would look for .s2560x2560, miss the cached .s1920x1920, and waste
|
|
|
|
|
// minutes re-attempting the doomed 2560 build before falling back.
|
|
|
|
|
// Without the suffix, the cache is found immediately on the next launch.
|
|
|
|
|
// The actual profile max is queried at runtime via getProfileMaxHeight/Width.
|
|
|
|
|
|
|
|
|
|
// -- TensorRT version -----------------------------------------------------
|
|
|
|
|
// Engine format changes between TensorRT minor versions -- must rebuild.
|
|
|
|
|
// NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, NV_TENSORRT_PATCH are defined in
|
|
|
|
|
// <NvInferVersion.h> which is included via NvInfer.h.
|
|
|
|
|
engineName += ".trt"
|
|
|
|
|
+ std::to_string(NV_TENSORRT_MAJOR) + "."
|
|
|
|
|
+ std::to_string(NV_TENSORRT_MINOR) + "."
|
|
|
|
|
+ std::to_string(NV_TENSORRT_PATCH);
|
|
|
|
|
|
|
|
|
|
// -- CUDA runtime version -------------------------------------------------
|
|
|
|
|
// Engines built with different CUDA versions may use different PTX/cubin
|
|
|
|
|
// formats and must be rebuilt.
|
|
|
|
|
int cudaVersion = 0;
|
|
|
|
|
cudaRuntimeGetVersion(&cudaVersion);
|
|
|
|
|
const int cudaMajor = cudaVersion / 1000;
|
|
|
|
|
const int cudaMinor = (cudaVersion % 1000) / 10;
|
|
|
|
|
engineName += ".cuda"
|
|
|
|
|
+ std::to_string(cudaMajor) + "."
|
|
|
|
|
+ std::to_string(cudaMinor);
|
|
|
|
|
|
|
|
|
|
// -- cuDNN version --------------------------------------------------------
|
|
|
|
|
// cuDNN version affects layer implementations inside the engine.
|
|
|
|
|
// CUDNN_MAJOR, CUDNN_MINOR, CUDNN_PATCHLEVEL are defined in <cudnn_version.h>.
|
|
|
|
|
engineName += ".cudnn"
|
|
|
|
|
+ std::to_string(CUDNN_MAJOR) + "."
|
|
|
|
|
+ std::to_string(CUDNN_MINOR);
|
|
|
|
|
|
|
|
|
|
return engineName;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T>
|
|
|
|
|
cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,
|
|
|
|
|
const std::array<float, 3> &divVals, bool normalize, bool swapRB,
|
|
|
|
|
cv::cuda::Stream &stream) {
|
|
|
|
|
cv::cuda::GpuMat result;
|
|
|
|
|
if (batchInput.empty()) return result;
|
|
|
|
|
if (batchInput[0].channels() != 3) return result;
|
|
|
|
|
|
|
|
|
|
const int H = batchInput[0].rows;
|
|
|
|
|
const int W = batchInput[0].cols;
|
|
|
|
|
const int batch = static_cast<int>(batchInput.size());
|
|
|
|
|
const size_t planeSize = static_cast<size_t>(H) * W; // pixels per channel
|
2026-04-04 20:19:54 +11:00
|
|
|
const int totalElems = batch * 3 * static_cast<int>(planeSize);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
2026-04-04 20:19:54 +11:00
|
|
|
// thread_local cached buffers — reused across calls on the same thread.
|
|
|
|
|
// KEY: allocate for MAX seen size, never shrink. This prevents the VRAM leak
|
|
|
|
|
// caused by OpenCV's GpuMat pool growing unbounded when batch sizes alternate
|
|
|
|
|
// (e.g., batch=1,6,1,6 → each size triggers new alloc, old goes to pool, never freed).
|
|
|
|
|
thread_local cv::cuda::GpuMat tl_blob;
|
|
|
|
|
thread_local cv::cuda::GpuMat tl_floatImg;
|
|
|
|
|
thread_local int tl_blobMaxElems = 0;
|
|
|
|
|
|
|
|
|
|
if (totalElems > tl_blobMaxElems) {
|
|
|
|
|
tl_blob = cv::cuda::GpuMat(1, totalElems, CV_32FC1);
|
|
|
|
|
tl_blobMaxElems = totalElems;
|
|
|
|
|
size_t blobBytes = static_cast<size_t>(totalElems) * sizeof(float);
|
|
|
|
|
ANS_DBG("TRT_Preproc", "blobFromGpuMats: ALLOC blob batch=%d %dx%d %.1fMB (new max)",
|
|
|
|
|
batch, W, H, blobBytes / (1024.0 * 1024.0));
|
|
|
|
|
}
|
|
|
|
|
// Use a sub-region of the cached blob for the current batch
|
|
|
|
|
cv::cuda::GpuMat blob = tl_blob.colRange(0, totalElems);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
for (int img = 0; img < batch; ++img) {
|
|
|
|
|
if (normalize) {
|
2026-04-04 20:19:54 +11:00
|
|
|
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.f / 255.f, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
} else {
|
2026-04-04 20:19:54 +11:00
|
|
|
batchInput[img].convertTo(tl_floatImg, CV_32FC3, 1.0, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
|
2026-04-04 20:19:54 +11:00
|
|
|
cv::cuda::subtract(tl_floatImg, cv::Scalar(subVals[0], subVals[1], subVals[2]), tl_floatImg, cv::noArray(), -1, stream);
|
|
|
|
|
cv::cuda::divide(tl_floatImg, cv::Scalar(divVals[0], divVals[1], divVals[2]), tl_floatImg, 1, -1, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
|
|
|
|
|
// 2. Split normalised HWC image into CHW planes directly into the blob.
|
|
|
|
|
size_t offset = static_cast<size_t>(img) * 3 * planeSize;
|
|
|
|
|
|
|
|
|
|
if (swapRB) {
|
|
|
|
|
std::vector<cv::cuda::GpuMat> channels{
|
2026-04-04 20:19:54 +11:00
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize),
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset)};
|
|
|
|
|
cv::cuda::split(tl_floatImg, channels, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
} else {
|
|
|
|
|
std::vector<cv::cuda::GpuMat> channels{
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset),
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + planeSize),
|
|
|
|
|
cv::cuda::GpuMat(H, W, CV_32FC1, blob.ptr<float>() + offset + 2 * planeSize)};
|
2026-04-04 20:19:54 +11:00
|
|
|
cv::cuda::split(tl_floatImg, channels, stream);
|
2026-03-28 16:54:11 +11:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return blob;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
template <typename T> void Engine<T>::clearGpuBuffers() {
    // Releases every device-side I/O buffer (inputs and outputs alike).
    if (!m_buffers.empty()) {
        for (auto *devPtr : m_buffers) {
            if (devPtr != nullptr) {
                Util::checkCudaErrorCode(cudaFree(devPtr));
            }
        }
        m_buffers.clear();
    }
    // Note: blob/floatImg caches are thread_local inside blobFromGpuMats (static method).
    // They are cleaned up automatically when threads exit.
    ANS_DBG("TRT_Engine", "clearGpuBuffers: I/O buffers released");
}
|