Improve jpeg conversion

Add NvJpegPool (4 encoders) and JPEG passthrough in BmpToJpeg
- NvJpegPool: singleton pool of 4 NvJpegCompressor instances with lock-free slot acquisition (~160MB VRAM). Threads that can't grab a slot fall back to TurboJPEG with zero wait.
- JPEG passthrough: BmpToJpeg now checks if input is already JPEG (FF D8 FF magic) and copies directly without re-encoding.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-16 10:59:28 +10:00 · 2026-04-16 08:33:17 +10:00 · 2026-04-16 07:50:13 +10:00
2 changed files with 295 additions and 6 deletions
--- a/modules/ANSCV/ANSOpenCV.cpp
+++ b/modules/ANSCV/ANSOpenCV.cpp
@@ -19,6 +19,9 @@
 #include <chrono>
 #include <mutex>
 #include <turbojpeg.h>
 #include <nvjpeg.h>
 #include <cuda_runtime.h>
 #include "ANSCVVendorGate.h"
 #include <thread>
 #include <future>
 #include <opencv2/imgproc.hpp>
@@ -150,7 +153,172 @@ namespace ANSCENTER
 		return jpegString;
 	}
 	// ── NvJpegCompressor: GPU-accelerated JPEG (NVIDIA only) ──
 	NvJpegCompressor::NvJpegCompressor() {
 		// Creates, in order: nvJPEG handle -> encoder state -> encoder params -> CUDA stream.
 		// The object stays invalid (isValid() == false) when no NVIDIA GPU is reported
 		// or when any creation step fails.
 		if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
 			return;
 		}
 		// The members are stored as void*; view them through their real handle types.
 		nvjpegHandle_t* const pHandle = reinterpret_cast<nvjpegHandle_t*>(&_nvHandle);
 		nvjpegEncoderState_t* const pState = reinterpret_cast<nvjpegEncoderState_t*>(&_encState);
 		nvjpegEncoderParams_t* const pParams = reinterpret_cast<nvjpegEncoderParams_t*>(&_encParams);
 		cudaStream_t* const pStream = reinterpret_cast<cudaStream_t*>(&_stream);
 		// No handle means there is nothing to tear down yet.
 		if (nvjpegCreateSimple(pHandle) != NVJPEG_STATUS_SUCCESS) {
 			return;
 		}
 		// Short-circuit evaluation stops at the first failing step, in creation order.
 		const bool created =
 			nvjpegEncoderStateCreate(*pHandle, pState, nullptr) == NVJPEG_STATUS_SUCCESS &&
 			nvjpegEncoderParamsCreate(*pHandle, pParams, nullptr) == NVJPEG_STATUS_SUCCESS &&
 			cudaStreamCreate(pStream) == cudaSuccess;
 		if (!created) {
 			// Release whatever was created before the failure.
 			cleanup();
 			return;
 		}
 		_valid = true;
 	}
 	NvJpegCompressor::~NvJpegCompressor() noexcept {
 		// All teardown lives in cleanup(), which is also used on constructor failure.
 		cleanup();
 	}
 	void NvJpegCompressor::cleanup() noexcept {
 		// Releases every resource this object holds and marks it invalid.
 		// Order: device buffer, stream, encoder params, encoder state, nvJPEG handle
 		// (the reverse of the order in which the constructor creates them).
 		// Each member is nulled after release, so a second call does nothing.
 		if (_gpuBuffer != nullptr) {
 			cudaFree(_gpuBuffer);
 			_gpuBuffer = nullptr;
 			_gpuBufferSize = 0;
 		}
 		if (_stream != nullptr) {
 			cudaStreamDestroy(reinterpret_cast<cudaStream_t>(_stream));
 			_stream = nullptr;
 		}
 		if (_encParams != nullptr) {
 			nvjpegEncoderParamsDestroy(reinterpret_cast<nvjpegEncoderParams_t>(_encParams));
 			_encParams = nullptr;
 		}
 		if (_encState != nullptr) {
 			nvjpegEncoderStateDestroy(reinterpret_cast<nvjpegEncoderState_t>(_encState));
 			_encState = nullptr;
 		}
 		if (_nvHandle != nullptr) {
 			nvjpegDestroy(reinterpret_cast<nvjpegHandle_t>(_nvHandle));
 			_nvHandle = nullptr;
 		}
 		_valid = false;
 	}
 	// Encodes a BGR 8-bit image to JPEG on the GPU (4:2:0 subsampling, optimized Huffman).
 	//   image   - 2-D CV_8UC3 matrix; rows may be padded (non-continuous Mats are supported).
 	//   quality - passed unchanged to nvjpegEncoderParamsSetQuality.
 	// Returns the JPEG bytes, or an empty string on any failure (invalid encoder,
 	// unsupported input, CUDA/nvJPEG error) so the caller can fall back to a CPU encoder.
 	// Reuses and may reallocate the member device buffer and takes no lock itself.
 	std::string NvJpegCompressor::compress(const cv::Mat& image, int quality) {
 		if (!_valid || image.empty()) return "";
 		// Only support 2-D BGR 8-bit (the common path)
 		if (image.dims != 2 || image.type() != CV_8UC3) return "";
 		auto handle = reinterpret_cast<nvjpegHandle_t>(_nvHandle);
 		auto state  = reinterpret_cast<nvjpegEncoderState_t>(_encState);
 		auto params = reinterpret_cast<nvjpegEncoderParams_t>(_encParams);
 		auto stream = reinterpret_cast<cudaStream_t>(_stream);
 		const int width  = image.cols;
 		const int height = image.rows;
 		// Sizes are computed in size_t so large frames cannot overflow int arithmetic.
 		const size_t rowBytes  = static_cast<size_t>(width) * 3;
 		const size_t imageSize = rowBytes * static_cast<size_t>(height);
 		// Reuse GPU buffer, grow if needed
 		if (imageSize > _gpuBufferSize) {
 			if (_gpuBuffer) cudaFree(_gpuBuffer);
 			// Allocate with 25% headroom to reduce reallocations
 			_gpuBufferSize = imageSize + imageSize / 4;
 			if (cudaMalloc(&_gpuBuffer, _gpuBufferSize) != cudaSuccess) {
 				_gpuBuffer = nullptr;
 				_gpuBufferSize = 0;
 				return "";
 			}
 		}
 		// Upload interleaved BGR to GPU, tightly packed (device pitch == rowBytes).
 		// A Mat that wraps padded rows (step > rowBytes) is not continuous; copying
 		// imageSize bytes flat from image.data would pull the padding into the pixel
 		// stream, so such input is copied row by row using the host step.
 		cudaError_t uploadStatus;
 		if (image.isContinuous()) {
 			uploadStatus = cudaMemcpy(_gpuBuffer, image.data, imageSize, cudaMemcpyHostToDevice);
 		} else {
 			uploadStatus = cudaMemcpy2D(_gpuBuffer, rowBytes,
 				image.data, image.step[0],
 				rowBytes, static_cast<size_t>(height),
 				cudaMemcpyHostToDevice);
 		}
 		if (uploadStatus != cudaSuccess)
 			return "";
 		// Configure encoder
 		if (nvjpegEncoderParamsSetQuality(params, quality, stream) != NVJPEG_STATUS_SUCCESS) return "";
 		if (nvjpegEncoderParamsSetSamplingFactors(params, NVJPEG_CSS_420, stream) != NVJPEG_STATUS_SUCCESS) return "";
 		if (nvjpegEncoderParamsSetOptimizedHuffman(params, 1, stream) != NVJPEG_STATUS_SUCCESS) return "";
 		// Set up nvjpegImage_t for interleaved BGR
 		nvjpegImage_t nv_image = {};
 		nv_image.channel[0] = _gpuBuffer;
 		nv_image.pitch[0]   = static_cast<unsigned int>(rowBytes);
 		// Encode
 		if (nvjpegEncodeImage(handle, state, params, &nv_image,
 				NVJPEG_INPUT_BGRI, width, height, stream) != NVJPEG_STATUS_SUCCESS)
 			return "";
 		// Get compressed size
 		size_t jpegSize = 0;
 		if (nvjpegEncodeRetrieveBitstream(handle, state, nullptr, &jpegSize, stream) != NVJPEG_STATUS_SUCCESS)
 			return "";
 		// Make sure the encode work queued on the stream has finished before the
 		// bitstream is copied to host memory (same ordering as NVIDIA's nvJPEG encode sample).
 		if (cudaStreamSynchronize(stream) != cudaSuccess)
 			return "";
 		// Retrieve bitstream
 		std::string jpegStr(jpegSize, '\0');
 		if (nvjpegEncodeRetrieveBitstream(handle, state,
 				reinterpret_cast<unsigned char*>(jpegStr.data()), &jpegSize, stream) != NVJPEG_STATUS_SUCCESS)
 			return "";
 		if (cudaStreamSynchronize(stream) != cudaSuccess)
 			return "";
 		// The second call reports the number of bytes actually written.
 		jpegStr.resize(jpegSize);
 		return jpegStr;
 	}
 	// ── NvJpegPool: VRAM-scaled pool of GPU encoders, lock-free acquire ──
 	int NvJpegPool::detectPoolSize() {
 		// Query VRAM via CUDA and scale: 1 encoder per 2 GB, min 1
 		int deviceCount = 0;
 		if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount <= 0)
 			return 0;
 		cudaDeviceProp prop{};
 		if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess)
 			return 0;
 		size_t vramGB = prop.totalGlobalMem / (1024ULL * 1024ULL * 1024ULL);
 		int pool = static_cast<int>(vramGB / 2);
 		if (pool < 1) pool = 1;
 		ANS_DBG("ANSCV", "NvJpegPool: GPU=%s, VRAM=%zuGB, poolSize=%d", prop.name, vramGB, pool);
 		return pool;
 	}
 	NvJpegPool& NvJpegPool::Instance() {
 		// Function-local static: constructed on the first call, shared by all callers.
 		static NvJpegPool sharedPool;
 		return sharedPool;
 	}
 	// Builds the encoder pool. Leaves the pool unavailable when no NVIDIA GPU is
 	// reported, when detectPoolSize() yields 0, or when no encoder initializes.
 	NvJpegPool::NvJpegPool() {
 		if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) return;
 		_poolSize = detectPoolSize();
 		if (_poolSize <= 0) return;
 		_encoders.resize(_poolSize);
 		_inUse = std::make_unique<std::atomic<bool>[]>(_poolSize);
 		// Count the encoders that really came up, so the log below reports that
 		// number instead of the requested pool size.
 		int usable = 0;
 		for (int i = 0; i < _poolSize; ++i) {
 			_inUse[i].store(false, std::memory_order_relaxed);
 			_encoders[i] = std::make_unique<NvJpegCompressor>();
 			if (_encoders[i]->isValid()) {
 				++usable;
 			} else {
 				// Keep the slot but leave it empty; tryCompress() skips null entries.
 				_encoders[i].reset();
 			}
 		}
 		// Pool is available if at least one encoder initialized
 		_available = (usable > 0);
 		ANS_DBG("ANSCV", "NvJpegPool: initialized %d of %d encoder(s), available=%d", usable, _poolSize, _available ? 1 : 0);
 	}
 	// Encodes `image` on the first free GPU encoder without waiting.
 	// Returns the JPEG bytes, or an empty string when the pool is unavailable,
 	// every slot is busy, or the encode itself fails; the caller is expected to
 	// fall back to TurboJPEG in all of those cases.
 	std::string NvJpegPool::tryCompress(const cv::Mat& image, int quality) {
 		if (!_available) return "";
 		// Clears the in-use flag on every exit path. Without it, an exception thrown
 		// inside compress() (e.g. std::bad_alloc from the output string) would leave
 		// the slot marked busy forever.
 		struct SlotRelease {
 			std::atomic<bool>& flag;
 			~SlotRelease() { flag.store(false, std::memory_order_release); }
 		};
 		// Lock-free slot acquisition: try each slot with compare_exchange
 		for (int i = 0; i < _poolSize; ++i) {
 			if (!_encoders[i]) continue;  // encoder failed to initialize
 			bool expected = false;
 			if (_inUse[i].compare_exchange_strong(expected, true, std::memory_order_acquire)) {
 				SlotRelease release{_inUse[i]};
 				// The return value is built before `release` is destroyed, so the
 				// slot stays claimed for the whole encode.
 				return _encoders[i]->compress(image, quality);  // may be empty on encode failure — caller falls back
 			}
 		}
 		return "";  // All slots busy — caller falls back to TurboJPEG
 	}
 	// ── Unified entry point: nvJPEG pool on NVIDIA, TurboJPEG otherwise ──
 	std::string CompressJpegToString(const cv::Mat& image, int quality) {
 		// GPU attempt first. An empty result means the pool declined or failed
 		// (non-NVIDIA, pool full, or encode error).
 		std::string gpuJpeg = NvJpegPool::Instance().tryCompress(image, quality);
 		if (gpuJpeg.empty()) {
 			// CPU fallback: one TurboJPEG compressor per thread, created on first use.
 			static thread_local TurboJpegCompressor cpuEncoder;
 			return cpuEncoder.compress(image, quality);
 		}
 		return gpuJpeg;
 	}
@@ -6524,9 +6692,86 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
 		if (quality <= 0 || quality > 100) quality = 85;
 		// Decode BMP from memory
 		int bmpSize = (*bmpInput)->cnt;
-		std::vector<unsigned char> bmpData((*bmpInput)->str, (*bmpInput)->str + bmpSize);
+		unsigned char* raw = reinterpret_cast<unsigned char*>((*bmpInput)->str);
 		// ── Passthrough: input is already JPEG (starts with FF D8 FF) ──
 		if (bmpSize >= 3 && raw[0] == 0xFF && raw[1] == 0xD8 && raw[2] == 0xFF) {
 			MgErr error = DSSetHandleSize(jpegOutput, sizeof(int32) + bmpSize * sizeof(uChar));
 			if (error != noErr) {
 				ANS_DBG("ANSCV", "BmpToJpeg: DSSetHandleSize failed (passthrough) - err=%d", error);
 				return -4;
 			}
 			(*jpegOutput)->cnt = bmpSize;
 			memcpy((*jpegOutput)->str, raw, bmpSize);
 			ANS_DBG("ANSCV", "BmpToJpeg: PASSTHROUGH - input is already JPEG (%d bytes)", bmpSize);
 			return 1;
 		}
 		// ── Fast path: parse BMP header directly, zero-copy ──
 		// Minimum BMP = file header (14) + info header (40) + some pixels
 		constexpr int kMinBmpSize = sizeof(BmpFileHeader) + sizeof(BmpInfoHeader) + 1;
 		if (bmpSize >= kMinBmpSize && raw[0] == 'B' && raw[1] == 'M') {
 			const auto& fh = *reinterpret_cast<const BmpFileHeader*>(raw);
 			const auto& ih = *reinterpret_cast<const BmpInfoHeader*>(raw + sizeof(BmpFileHeader));
 			int width  = ih.width;
 			int height = ih.height;  // negative = top-down
 			bool topDown = (height < 0);
 			if (height < 0) height = -height;
 			// Only handle 24-bit uncompressed (the format ImageToBmp produces)
 			if (ih.bitCount == 24 && ih.compression == 0 && width > 0 && height > 0) {
 				int rowBytes = width * 3;
 				int stride   = (rowBytes + 3) & ~3;  // BMP rows are 4-byte aligned
 				// Verify the buffer is large enough
 				int pixelOffset = static_cast<int>(fh.offsetData);
 				int64_t neededSize = static_cast<int64_t>(pixelOffset) + static_cast<int64_t>(stride) * height;
 				if (bmpSize >= neededSize) {
 					unsigned char* pixels = raw + pixelOffset;
 					cv::Mat mat;
 					if (topDown) {
 						// Top-down BMP: rows are already in correct order
 						// If no padding, wrap directly; otherwise need to handle stride
 						if (stride == rowBytes) {
 							mat = cv::Mat(height, width, CV_8UC3, pixels);
 						} else {
 							mat = cv::Mat(height, width, CV_8UC3, pixels, stride);
 						}
 					} else {
 						// Bottom-up BMP: flip to top-down for JPEG encoding
 						// Create Mat pointing at the last row with negative step
 						// OpenCV doesn't support negative step, so flip
 						cv::Mat bottomUp(height, width, CV_8UC3, pixels, stride);
 						cv::flip(bottomUp, mat, 0);
 					}
 					ANS_DBG("ANSCV", "BmpToJpeg: fast-path %dx%d, encoding JPEG q=%d", width, height, quality);
 					std::string jpegStr = ANSCENTER::CompressJpegToString(mat, quality);
 					if (!jpegStr.empty()) {
 						int size = static_cast<int>(jpegStr.size());
 						MgErr error = DSSetHandleSize(jpegOutput, sizeof(int32) + size * sizeof(uChar));
 						if (error != noErr) {
 							ANS_DBG("ANSCV", "BmpToJpeg: DSSetHandleSize failed - err=%d", error);
 							return -4;
 						}
 						(*jpegOutput)->cnt = size;
 						memcpy((*jpegOutput)->str, jpegStr.data(), size);
 						ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS (fast) - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
 						return 1;
 					}
 					// If fast-path encode failed, fall through to imdecode path
 				}
 			}
 		}
 		// ── Fallback: use imdecode for non-standard BMP formats ──
 		ANS_DBG("ANSCV", "BmpToJpeg: using imdecode fallback for %d bytes", bmpSize);
 		std::vector<unsigned char> bmpData(raw, raw + bmpSize);
 		cv::Mat mat = cv::imdecode(bmpData, cv::IMREAD_COLOR);
 		if (mat.empty()) {
 			ANS_DBG("ANSCV", "BmpToJpeg: imdecode failed - %d bytes input", bmpSize);
@@ -6535,7 +6780,6 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
 		ANS_DBG("ANSCV", "BmpToJpeg: decoded %dx%d, encoding JPEG q=%d", mat.cols, mat.rows, quality);
 		// Encode to JPEG via CompressJpegToString: nvJPEG pool first, then the TurboJPEG compressor
 		std::string jpegStr = ANSCENTER::CompressJpegToString(mat, quality);
 		if (jpegStr.empty()) {
 			ANS_DBG("ANSCV", "BmpToJpeg: JPEG encode failed");
@@ -6551,7 +6795,7 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
 		(*jpegOutput)->cnt = size;
 		memcpy((*jpegOutput)->str, jpegStr.data(), size);
-		ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
+		ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS (fallback) - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
 		return 1;
 	}
 	catch (const std::exception& e) {
--- a/modules/ANSCV/ANSOpenCV.h
+++ b/modules/ANSCV/ANSOpenCV.h
@@ -5,6 +5,9 @@
 #include "ANSLicense.h"
 #include "LabVIEWHeader/extcode.h"
 #include <vector>
 #include <array>
 #include <atomic>
 #include <memory>
 #include <opencv2/opencv.hpp>
 // Forward declaration for NI Vision IMAQ Image (avoids nivision.h dependency for consumers)
@@ -34,16 +37,58 @@ namespace ANSCENTER
    public:
        TurboJpegCompressor();
 		~TurboJpegCompressor() noexcept;
        // Delete copy constructor and assignment operator
        TurboJpegCompressor(const TurboJpegCompressor&) = delete;
        TurboJpegCompressor& operator=(const TurboJpegCompressor&) = delete;
        // Your original logic with minimal optimizations
 		[[nodiscard]] std::string compress(const cv::Mat& image, int quality);
    private:
 		void* _handle = nullptr;
        unsigned char* _buffer = nullptr;
        unsigned long _bufferSize = 0;
    };
 	// GPU-accelerated JPEG encoder using nvJPEG (NVIDIA only).
 	// Falls back silently if init fails or on non-NVIDIA hardware: the object is
 	// then left invalid and compress() returns an empty string.
 	// Owns an nvJPEG handle, encoder state, encoder params, a CUDA stream and a
 	// reusable device buffer; all are released in the destructor. Non-copyable.
 	// compress() reuses and may reallocate the device buffer and does no locking
 	// of its own, so an instance must be used by one thread at a time.
 	class NvJpegCompressor {
 	public:
 		// Attempts to create all nvJPEG/CUDA resources; check isValid() afterwards.
 		NvJpegCompressor();
 		~NvJpegCompressor() noexcept;
 		NvJpegCompressor(const NvJpegCompressor&) = delete;
 		NvJpegCompressor& operator=(const NvJpegCompressor&) = delete;
 		// Encodes a CV_8UC3 (BGR) image to JPEG at the given quality.
 		// Returns an empty string on failure or for unsupported input.
 		[[nodiscard]] std::string compress(const cv::Mat& image, int quality);
 		// True once every resource was created successfully in the constructor.
 		[[nodiscard]] bool isValid() const noexcept { return _valid; }
 	private:
 		// Frees every held resource and resets the object to the invalid state.
 		void cleanup() noexcept;
 		// Handles are stored as void* so this header needs no nvJPEG/CUDA includes.
 		bool            _valid = false;
 		void*           _nvHandle = nullptr;      // nvjpegHandle_t
 		void*           _encState = nullptr;      // nvjpegEncoderState_t
 		void*           _encParams = nullptr;     // nvjpegEncoderParams_t
 		void*           _stream = nullptr;        // cudaStream_t
 		unsigned char*  _gpuBuffer = nullptr;     // reusable device memory
 		size_t          _gpuBufferSize = 0;       // capacity of _gpuBuffer in bytes
 	};
 	// Process-wide pool of NvJpegCompressors (~40MB VRAM each), sized once from GPU VRAM.
 	// Formula: poolSize = VRAM_GB / 2 (min 1, e.g. 2GB→1, 4GB→2, 8GB→4, 10GB→5).
 	// Threads that can't acquire an encoder fall back to TurboJPEG.
 	class NvJpegPool {
 	public:
 		// Returns the single shared pool, constructing it on first use.
 		static NvJpegPool& Instance();
 		// Encodes on the first free encoder without waiting. Returns an empty string
 		// when the pool is unavailable, all slots are busy, or the encode fails.
 		[[nodiscard]] std::string tryCompress(const cv::Mat& image, int quality);
 		// True if at least one encoder initialized successfully.
 		[[nodiscard]] bool isAvailable() const noexcept { return _available; }
 		// Number of slots requested from VRAM size; some may hold no encoder if init failed.
 		[[nodiscard]] int poolSize() const noexcept { return _poolSize; }
 	private:
 		// Construction and destruction are private: the only instance is the one
 		// returned by Instance().
 		NvJpegPool();
 		~NvJpegPool() = default;
 		NvJpegPool(const NvJpegPool&) = delete;
 		NvJpegPool& operator=(const NvJpegPool&) = delete;
 		// Computes the slot count from the VRAM of CUDA device 0; 0 if it cannot be queried.
 		static int detectPoolSize();
 		bool                                            _available = false;
 		int                                             _poolSize = 0;
 		std::vector<std::unique_ptr<NvJpegCompressor>>  _encoders; // null entry = encoder failed to initialize
 		std::unique_ptr<std::atomic<bool>[]>            _inUse;   // can't use vector — atomic is non-copyable
 	};
    /// <summary>
 	/// // ANSOPENCV class provides various image processing functionalities using OpenCV and ANS Center SDK.
    /// </summary>