diff --git a/modules/ANSCV/ANSOpenCV.cpp b/modules/ANSCV/ANSOpenCV.cpp
index 8540acf..6cbe5f5 100644
--- a/modules/ANSCV/ANSOpenCV.cpp
+++ b/modules/ANSCV/ANSOpenCV.cpp
@@ -19,6 +19,9 @@
 #include
 #include
 #include
+#include <cuda_runtime.h>
+#include <nvjpeg.h>
+#include "ANSCVVendorGate.h"
 #include
 #include
 #include
@@ -150,7 +153,126 @@ namespace ANSCENTER
         return jpegString;
     }
 
+    // ── NvJpegCompressor: GPU-accelerated JPEG (NVIDIA only) ──
+
+    // Creates the nvJPEG handle, encoder state, encoder params and CUDA stream.
+    // On any failure the partially created state is released and isValid() stays false.
+    NvJpegCompressor::NvJpegCompressor() {
+        if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) return;
+
+        auto handle = reinterpret_cast<nvjpegHandle_t*>(&_nvHandle);
+        auto state = reinterpret_cast<nvjpegEncoderState_t*>(&_encState);
+        auto params = reinterpret_cast<nvjpegEncoderParams_t*>(&_encParams);
+        auto stream = reinterpret_cast<cudaStream_t*>(&_stream);
+
+        if (nvjpegCreateSimple(handle) != NVJPEG_STATUS_SUCCESS) return;
+        if (nvjpegEncoderStateCreate(*handle, state, nullptr) != NVJPEG_STATUS_SUCCESS) { cleanup(); return; }
+        if (nvjpegEncoderParamsCreate(*handle, params, nullptr) != NVJPEG_STATUS_SUCCESS) { cleanup(); return; }
+        if (cudaStreamCreate(stream) != cudaSuccess) { cleanup(); return; }
+
+        _valid = true;
+    }
+
+    NvJpegCompressor::~NvJpegCompressor() noexcept { cleanup(); }
+
+    // Releases whatever was created, newest first, and nulls each member so the
+    // call is safe to repeat.
+    void NvJpegCompressor::cleanup() noexcept {
+        if (_gpuBuffer) { cudaFree(_gpuBuffer); _gpuBuffer = nullptr; _gpuBufferSize = 0; }
+        if (_stream) { cudaStreamDestroy(reinterpret_cast<cudaStream_t>(_stream)); _stream = nullptr; }
+        if (_encParams) { nvjpegEncoderParamsDestroy(reinterpret_cast<nvjpegEncoderParams_t>(_encParams)); _encParams = nullptr; }
+        if (_encState) { nvjpegEncoderStateDestroy(reinterpret_cast<nvjpegEncoderState_t>(_encState)); _encState = nullptr; }
+        if (_nvHandle) { nvjpegDestroy(reinterpret_cast<nvjpegHandle_t>(_nvHandle)); _nvHandle = nullptr; }
+        _valid = false;
+    }
+
+    // Encodes an 8-bit BGR image to JPEG (4:2:0, optimized Huffman) on the GPU.
+    // Returns an empty string when the compressor is not valid, the image is empty
+    // or not CV_8UC3, or any CUDA/nvJPEG call fails, so callers can fall back.
+    std::string NvJpegCompressor::compress(const cv::Mat& image, int quality) {
+        if (!_valid || image.empty()) return "";
+
+        // Only support BGR 8-bit (the common path)
+        if (image.type() != CV_8UC3) return "";
+
+        auto handle = reinterpret_cast<nvjpegHandle_t>(_nvHandle);
+        auto state = reinterpret_cast<nvjpegEncoderState_t>(_encState);
+        auto params = reinterpret_cast<nvjpegEncoderParams_t>(_encParams);
+        auto stream = reinterpret_cast<cudaStream_t>(_stream);
+
+        int width = image.cols;
+        int height = image.rows;
+        size_t rowBytes = static_cast<size_t>(width) * 3;
+        size_t imageSize = rowBytes * static_cast<size_t>(height);
+
+        // Reuse GPU buffer, grow if needed
+        if (imageSize > _gpuBufferSize) {
+            if (_gpuBuffer) cudaFree(_gpuBuffer);
+            // Allocate with 25% headroom to reduce reallocations
+            _gpuBufferSize = imageSize + imageSize / 4;
+            if (cudaMalloc(&_gpuBuffer, _gpuBufferSize) != cudaSuccess) {
+                _gpuBuffer = nullptr;
+                _gpuBufferSize = 0;
+                return "";
+            }
+        }
+
+        // Upload interleaved BGR to GPU. The Mat may be a strided view (padded rows,
+        // ROI), so copy row by row with its own step; a flat copy of width*height*3
+        // bytes would shear such an image. The device copy is tightly packed.
+        if (cudaMemcpy2D(_gpuBuffer, rowBytes, image.data, image.step[0],
+            rowBytes, static_cast<size_t>(height), cudaMemcpyHostToDevice) != cudaSuccess)
+            return "";
+
+        // Configure encoder
+        if (nvjpegEncoderParamsSetQuality(params, quality, stream) != NVJPEG_STATUS_SUCCESS) return "";
+        if (nvjpegEncoderParamsSetSamplingFactors(params, NVJPEG_CSS_420, stream) != NVJPEG_STATUS_SUCCESS) return "";
+        if (nvjpegEncoderParamsSetOptimizedHuffman(params, 1, stream) != NVJPEG_STATUS_SUCCESS) return "";
+
+        // Set up nvjpegImage_t for interleaved BGR
+        nvjpegImage_t nv_image = {};
+        nv_image.channel[0] = _gpuBuffer;
+        nv_image.pitch[0] = rowBytes;
+
+        // Encode
+        if (nvjpegEncodeImage(handle, state, params, &nv_image,
+            NVJPEG_INPUT_BGRI, width, height, stream) != NVJPEG_STATUS_SUCCESS)
+            return "";
+
+        // Get compressed size
+        size_t jpegSize = 0;
+        if (nvjpegEncodeRetrieveBitstream(handle, state, nullptr, &jpegSize, stream) != NVJPEG_STATUS_SUCCESS)
+            return "";
+
+        // NVIDIA's documented encode flow synchronizes the stream before the
+        // bitstream is copied out to host memory.
+        if (cudaStreamSynchronize(stream) != cudaSuccess)
+            return "";
+
+        // Retrieve bitstream
+        std::string jpegStr(jpegSize, '\0');
+        if (nvjpegEncodeRetrieveBitstream(handle, state,
+            reinterpret_cast<unsigned char*>(jpegStr.data()), &jpegSize, stream) != NVJPEG_STATUS_SUCCESS)
+            return "";
+
+        if (cudaStreamSynchronize(stream) != cudaSuccess)
+            return "";
+
+        jpegStr.resize(jpegSize);
+        return jpegStr;
+    }
+
+    // ── Unified entry point: nvJPEG on NVIDIA, TurboJPEG otherwise ──
+
     std::string CompressJpegToString(const cv::Mat& image, int quality) {
+        if (anscv_vendor_gate::IsNvidiaGpuAvailable()) {
+            static thread_local NvJpegCompressor nvCompressor;
+            if (nvCompressor.isValid()) {
+                std::string result = nvCompressor.compress(image, quality);
+                if (!result.empty()) return result;
+            }
+            // Fall through to TurboJPEG on failure
+        }
         static thread_local TurboJpegCompressor compressor;
         return compressor.compress(image, quality);
     }
@@ -6524,9 +6646,78 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
 
         if (quality <= 0 || quality > 100) quality = 85;
 
-        // Decode BMP from memory
         int bmpSize = (*bmpInput)->cnt;
-        std::vector bmpData((*bmpInput)->str, (*bmpInput)->str + bmpSize);
+        unsigned char* raw = reinterpret_cast<unsigned char*>((*bmpInput)->str);
+
+        // ── Fast path: parse BMP header directly, zero-copy ──
+        // Minimum BMP = file header (14) + info header (40) + some pixels
+        constexpr int kBmpHeaderSize = static_cast<int>(sizeof(BmpFileHeader) + sizeof(BmpInfoHeader));
+        constexpr int kMinBmpSize = kBmpHeaderSize + 1;
+        if (bmpSize >= kMinBmpSize && raw[0] == 'B' && raw[1] == 'M') {
+
+            const auto& fh = *reinterpret_cast<const BmpFileHeader*>(raw);
+            const auto& ih = *reinterpret_cast<const BmpInfoHeader*>(raw + sizeof(BmpFileHeader));
+
+            // The header comes straight from the caller's buffer, so treat every field
+            // as untrusted: do the size arithmetic in 64 bits and range-check it before
+            // narrowing or forming a pointer.
+            const int64_t width64 = ih.width;
+            const bool topDown = (ih.height < 0);   // negative = top-down
+            const int64_t height64 = topDown ? -static_cast<int64_t>(ih.height)
+                                             : static_cast<int64_t>(ih.height);
+
+            // Only handle 24-bit uncompressed (the format ImageToBmp produces)
+            if (ih.bitCount == 24 && ih.compression == 0 && width64 > 0 && height64 > 0) {
+                const int64_t rowBytes = width64 * 3;
+                const int64_t stride = (rowBytes + 3) & ~static_cast<int64_t>(3); // BMP rows are 4-byte aligned
+                const int64_t pixelOffset = static_cast<int64_t>(fh.offsetData);
+
+                // Verify the pixel array starts after the headers and fits in the buffer.
+                // stride and height are bounded by bmpSize first, so their product cannot
+                // overflow, and both are then known to fit in an int.
+                if (pixelOffset >= kBmpHeaderSize && stride <= bmpSize && height64 <= bmpSize &&
+                    pixelOffset + stride * height64 <= bmpSize) {
+                    const int width = static_cast<int>(width64);
+                    const int height = static_cast<int>(height64);
+                    const size_t step = static_cast<size_t>(stride);
+                    unsigned char* pixels = raw + pixelOffset;
+
+                    cv::Mat mat;
+                    if (topDown) {
+                        // Top-down BMP: rows are already in the correct order, so wrap the
+                        // buffer in place. The explicit step skips any row padding (it equals
+                        // width * 3 when there is none).
+                        mat = cv::Mat(height, width, CV_8UC3, pixels, step);
+                    } else {
+                        // Bottom-up BMP: flip to top-down for JPEG encoding
+                        // (OpenCV doesn't support a negative step, so this copies).
+                        cv::Mat bottomUp(height, width, CV_8UC3, pixels, step);
+                        cv::flip(bottomUp, mat, 0);
+                    }
+
+                    ANS_DBG("ANSCV", "BmpToJpeg: fast-path %dx%d, encoding JPEG q=%d", width, height, quality);
+
+                    std::string jpegStr = ANSCENTER::CompressJpegToString(mat, quality);
+                    if (!jpegStr.empty()) {
+                        int size = static_cast<int>(jpegStr.size());
+                        MgErr error = DSSetHandleSize(jpegOutput, sizeof(int32) + size * sizeof(uChar));
+                        if (error != noErr) {
+                            ANS_DBG("ANSCV", "BmpToJpeg: DSSetHandleSize failed - err=%d", error);
+                            return -4;
+                        }
+                        (*jpegOutput)->cnt = size;
+                        memcpy((*jpegOutput)->str, jpegStr.data(), size);
+                        ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS (fast) - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
+                        return 1;
+                    }
+                    // If fast-path encode failed, fall through to imdecode path
+                }
+            }
+        }
+
+        // ── Fallback: use imdecode for non-standard BMP formats ──
+        ANS_DBG("ANSCV", "BmpToJpeg: using imdecode fallback for %d bytes", bmpSize);
+        std::vector<unsigned char> bmpData(raw, raw + bmpSize);
         cv::Mat mat = cv::imdecode(bmpData, cv::IMREAD_COLOR);
         if (mat.empty()) {
             ANS_DBG("ANSCV", "BmpToJpeg: imdecode failed - %d bytes input", bmpSize);
@@ -6535,7 +6726,6 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
 
         ANS_DBG("ANSCV", "BmpToJpeg: decoded %dx%d, encoding JPEG q=%d", mat.cols, mat.rows, quality);
 
-        // Encode to JPEG using TurboJPEG if available, else cv::imencode
         std::string jpegStr = ANSCENTER::CompressJpegToString(mat, quality);
         if (jpegStr.empty()) {
             ANS_DBG("ANSCV", "BmpToJpeg: JPEG encode failed");
@@ -6551,7 +6741,7 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
 
         (*jpegOutput)->cnt = size;
         memcpy((*jpegOutput)->str, jpegStr.data(), size);
-        ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
+        ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS (fallback) - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
         return 1;
     }
     catch (const std::exception& e) {
diff --git a/modules/ANSCV/ANSOpenCV.h b/modules/ANSCV/ANSOpenCV.h
index dd0325f..137d32c 100644
--- a/modules/ANSCV/ANSOpenCV.h
+++ b/modules/ANSCV/ANSOpenCV.h
@@ -34,16 +34,35 @@ namespace ANSCENTER
     public:
         TurboJpegCompressor();
         ~TurboJpegCompressor() noexcept;
-        // Delete copy constructor and assignment operator
         TurboJpegCompressor(const TurboJpegCompressor&) = delete;
         TurboJpegCompressor& operator=(const TurboJpegCompressor&) = delete;
-        // Your original logic with minimal optimizations
         [[nodiscard]] std::string compress(const cv::Mat& image, int quality);
     private:
         void* _handle = nullptr;
         unsigned char* _buffer = nullptr;
         unsigned long _bufferSize = 0;
     };
+
+    // GPU-accelerated JPEG encoder using nvJPEG (NVIDIA only).
+    // If init fails or no NVIDIA GPU is present, isValid() is false and compress() returns "".
+    class NvJpegCompressor {
+    public:
+        NvJpegCompressor();
+        ~NvJpegCompressor() noexcept;
+        NvJpegCompressor(const NvJpegCompressor&) = delete;
+        NvJpegCompressor& operator=(const NvJpegCompressor&) = delete;
+        [[nodiscard]] std::string compress(const cv::Mat& image, int quality);
+        [[nodiscard]] bool isValid() const noexcept { return _valid; }
+    private:
+        void cleanup() noexcept;
+        bool _valid = false;
+        void* _nvHandle = nullptr;           // nvjpegHandle_t
+        void* _encState = nullptr;           // nvjpegEncoderState_t
+        void* _encParams = nullptr;          // nvjpegEncoderParams_t
+        void* _stream = nullptr;             // cudaStream_t
+        unsigned char* _gpuBuffer = nullptr; // reusable device memory
+        size_t _gpuBufferSize = 0;
+    };
     /// <summary>
     /// // ANSOPENCV class provides various image processing functionalities using OpenCV and ANS Center SDK.
     /// </summary>