|
|
|
|
@@ -19,6 +19,9 @@
|
|
|
|
|
#include <chrono>
|
|
|
|
|
#include <mutex>
|
|
|
|
|
#include <turbojpeg.h>
|
|
|
|
|
#include <nvjpeg.h>
|
|
|
|
|
#include <cuda_runtime.h>
|
|
|
|
|
#include "ANSCVVendorGate.h"
|
|
|
|
|
#include <thread>
|
|
|
|
|
#include <future>
|
|
|
|
|
#include <opencv2/imgproc.hpp>
|
|
|
|
|
@@ -150,7 +153,172 @@ namespace ANSCENTER
|
|
|
|
|
|
|
|
|
|
return jpegString;
|
|
|
|
|
}
|
|
|
|
|
// ── NvJpegCompressor: GPU-accelerated JPEG (NVIDIA only) ──
|
|
|
|
|
|
|
|
|
|
// Creates the nvJPEG handle, encoder state, encoder params and a CUDA stream.
// On any failure the object is left with _valid unset (compress() then returns
// an empty string); resources created before the failure are released through
// cleanup().
NvJpegCompressor::NvJpegCompressor() {
    // Vendor gate: skip all GPU setup when no NVIDIA GPU is reported.
    if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
        return;
    }

    // The members are stored as opaque pointers; view them as the native
    // nvJPEG / CUDA handle types so the create calls can fill them in.
    nvjpegHandle_t* const        pHandle = reinterpret_cast<nvjpegHandle_t*>(&_nvHandle);
    nvjpegEncoderState_t* const  pState  = reinterpret_cast<nvjpegEncoderState_t*>(&_encState);
    nvjpegEncoderParams_t* const pParams = reinterpret_cast<nvjpegEncoderParams_t*>(&_encParams);
    cudaStream_t* const          pStream = reinterpret_cast<cudaStream_t*>(&_stream);

    // Nothing has been created yet if the library handle itself fails,
    // so there is nothing to clean up on this path.
    if (nvjpegCreateSimple(pHandle) != NVJPEG_STATUS_SUCCESS) {
        return;
    }

    // Short-circuit evaluation keeps the creation order: state, params, stream.
    const bool allCreated =
        nvjpegEncoderStateCreate(*pHandle, pState, nullptr) == NVJPEG_STATUS_SUCCESS &&
        nvjpegEncoderParamsCreate(*pHandle, pParams, nullptr) == NVJPEG_STATUS_SUCCESS &&
        cudaStreamCreate(pStream) == cudaSuccess;

    if (!allCreated) {
        cleanup();
        return;
    }

    _valid = true;
}
|
|
|
|
|
|
|
|
|
|
// Releases every GPU / nvJPEG resource still held by this encoder.
NvJpegCompressor::~NvJpegCompressor() noexcept {
    cleanup();
}
|
|
|
|
|
|
|
|
|
|
void NvJpegCompressor::cleanup() noexcept {
|
|
|
|
|
if (_gpuBuffer) { cudaFree(_gpuBuffer); _gpuBuffer = nullptr; _gpuBufferSize = 0; }
|
|
|
|
|
if (_stream) { cudaStreamDestroy(reinterpret_cast<cudaStream_t>(_stream)); _stream = nullptr; }
|
|
|
|
|
if (_encParams) { nvjpegEncoderParamsDestroy(reinterpret_cast<nvjpegEncoderParams_t>(_encParams)); _encParams = nullptr; }
|
|
|
|
|
if (_encState) { nvjpegEncoderStateDestroy(reinterpret_cast<nvjpegEncoderState_t>(_encState)); _encState = nullptr; }
|
|
|
|
|
if (_nvHandle) { nvjpegDestroy(reinterpret_cast<nvjpegHandle_t>(_nvHandle)); _nvHandle = nullptr; }
|
|
|
|
|
_valid = false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Encodes an 8-bit, 3-channel BGR image to JPEG on the GPU with nvJPEG.
//
// Parameters:
//   image   - source pixels; must be non-empty and of type CV_8UC3. The Mat may
//             be non-continuous (row step larger than width * 3), e.g. when it
//             wraps row-padded BMP data.
//   quality - JPEG quality forwarded to nvjpegEncoderParamsSetQuality.
//
// Returns the JPEG bitstream, or an empty string when the encoder is not valid,
// the input is unsupported, or any CUDA / nvJPEG call fails. An empty result is
// the signal for the caller to use another encoder.
std::string NvJpegCompressor::compress(const cv::Mat& image, int quality) {
    if (!_valid || image.empty()) return "";

    // Only support BGR 8-bit (the common path)
    if (image.type() != CV_8UC3) return "";

    auto handle = reinterpret_cast<nvjpegHandle_t>(_nvHandle);
    auto state  = reinterpret_cast<nvjpegEncoderState_t>(_encState);
    auto params = reinterpret_cast<nvjpegEncoderParams_t>(_encParams);
    auto stream = reinterpret_cast<cudaStream_t>(_stream);

    const int width  = image.cols;
    const int height = image.rows;
    // Tightly packed layout used on the device: no padding between rows.
    const size_t rowBytes  = static_cast<size_t>(width) * 3;
    const size_t imageSize = rowBytes * static_cast<size_t>(height);

    // Reuse GPU buffer, grow if needed
    if (imageSize > _gpuBufferSize) {
        if (_gpuBuffer) {
            cudaFree(_gpuBuffer);
            _gpuBuffer = nullptr;
        }
        _gpuBufferSize = 0;

        // Allocate with 25% headroom to reduce reallocations
        const size_t newCapacity = imageSize + imageSize / 4;
        if (cudaMalloc(&_gpuBuffer, newCapacity) != cudaSuccess) {
            _gpuBuffer = nullptr;
            return "";
        }
        // Record the capacity only once the allocation has succeeded.
        _gpuBufferSize = newCapacity;
    }

    // Upload interleaved BGR to GPU.
    // A continuous Mat is one flat block of imageSize bytes. A non-continuous
    // Mat has image.step bytes between row starts, so a flat copy would upload
    // padding as pixels and skew every row; copy row-by-row with a 2D copy and
    // pack the rows tightly on the device instead.
    cudaError_t uploadStatus = cudaSuccess;
    if (image.isContinuous()) {
        uploadStatus = cudaMemcpy(_gpuBuffer, image.data, imageSize, cudaMemcpyHostToDevice);
    } else {
        uploadStatus = cudaMemcpy2D(_gpuBuffer, rowBytes,
                                    image.data, image.step[0],
                                    rowBytes, static_cast<size_t>(height),
                                    cudaMemcpyHostToDevice);
    }
    if (uploadStatus != cudaSuccess)
        return "";

    // Configure encoder
    if (nvjpegEncoderParamsSetQuality(params, quality, stream) != NVJPEG_STATUS_SUCCESS) return "";
    if (nvjpegEncoderParamsSetSamplingFactors(params, NVJPEG_CSS_420, stream) != NVJPEG_STATUS_SUCCESS) return "";
    if (nvjpegEncoderParamsSetOptimizedHuffman(params, 1, stream) != NVJPEG_STATUS_SUCCESS) return "";

    // Set up nvjpegImage_t for interleaved BGR: a single channel pointer whose
    // pitch is the packed row size used for the device buffer above.
    nvjpegImage_t nv_image = {};
    nv_image.channel[0] = _gpuBuffer;
    nv_image.pitch[0] = static_cast<unsigned int>(rowBytes);

    // Encode
    if (nvjpegEncodeImage(handle, state, params, &nv_image,
                          NVJPEG_INPUT_BGRI, width, height, stream) != NVJPEG_STATUS_SUCCESS)
        return "";

    // Get compressed size (null data pointer => size query only)
    size_t jpegSize = 0;
    if (nvjpegEncodeRetrieveBitstream(handle, state, nullptr, &jpegSize, stream) != NVJPEG_STATUS_SUCCESS)
        return "";

    // Retrieve bitstream
    std::string jpegStr(jpegSize, '\0');
    if (nvjpegEncodeRetrieveBitstream(handle, state,
                                      reinterpret_cast<unsigned char*>(jpegStr.data()), &jpegSize, stream) != NVJPEG_STATUS_SUCCESS)
        return "";

    // Wait for the stream before the bytes are handed back to the caller.
    if (cudaStreamSynchronize(stream) != cudaSuccess)
        return "";

    jpegStr.resize(jpegSize);
    return jpegStr;
}
|
|
|
|
|
|
|
|
|
|
// ── NvJpegPool: VRAM-scaled pool of GPU encoders, lock-free acquire ──
|
|
|
|
|
|
|
|
|
|
int NvJpegPool::detectPoolSize() {
|
|
|
|
|
// Query VRAM via CUDA and scale: 1 encoder per 2 GB, min 1
|
|
|
|
|
int deviceCount = 0;
|
|
|
|
|
if (cudaGetDeviceCount(&deviceCount) != cudaSuccess || deviceCount <= 0)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
cudaDeviceProp prop{};
|
|
|
|
|
if (cudaGetDeviceProperties(&prop, 0) != cudaSuccess)
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
|
|
size_t vramGB = prop.totalGlobalMem / (1024ULL * 1024ULL * 1024ULL);
|
|
|
|
|
int pool = static_cast<int>(vramGB / 2);
|
|
|
|
|
if (pool < 1) pool = 1;
|
|
|
|
|
|
|
|
|
|
ANS_DBG("ANSCV", "NvJpegPool: GPU=%s, VRAM=%zuGB, poolSize=%d", prop.name, vramGB, pool);
|
|
|
|
|
return pool;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Returns the process-wide pool. The pool is a function-local static, so it is
// constructed the first time this function is called.
NvJpegPool& NvJpegPool::Instance() {
    static NvJpegPool sharedPool;
    return sharedPool;
}
|
|
|
|
|
|
|
|
|
|
// Builds the encoder pool: sizes it with detectPoolSize(), creates one
// NvJpegCompressor per slot, and drops any encoder that failed to initialize.
// The pool is marked available when at least one slot holds a valid encoder.
// On a non-NVIDIA system, or when the detected size is not positive, the pool
// stays empty.
NvJpegPool::NvJpegPool() {
    if (!anscv_vendor_gate::IsNvidiaGpuAvailable()) {
        return;
    }

    _poolSize = detectPoolSize();
    if (_poolSize <= 0) {
        return;
    }

    _encoders.resize(_poolSize);
    _inUse = std::make_unique<std::atomic<bool>[]>(_poolSize);

    for (int slot = 0; slot < _poolSize; ++slot) {
        // Every slot starts out free.
        _inUse[slot].store(false, std::memory_order_relaxed);

        _encoders[slot] = std::make_unique<NvJpegCompressor>();
        if (_encoders[slot]->isValid()) {
            // Pool is available if at least one encoder initialized
            _available = true;
        } else {
            // Leave the slot empty; tryCompress() skips empty slots.
            _encoders[slot].reset();
        }
    }

    ANS_DBG("ANSCV", "NvJpegPool: initialized %d encoder(s), available=%d", _poolSize, _available ? 1 : 0);
}
|
|
|
|
|
|
|
|
|
|
// Attempts to encode `image` on the first free encoder in the pool.
//
// Parameters:
//   image   - image forwarded unchanged to NvJpegCompressor::compress.
//   quality - JPEG quality forwarded unchanged.
//
// Returns the JPEG bytes, or an empty string when the pool is unavailable,
// every slot is busy, or the chosen encoder fails. The call never waits for a
// slot; an empty result tells the caller to fall back to another encoder.
std::string NvJpegPool::tryCompress(const cv::Mat& image, int quality) {
    if (!_available) return "";

    // Clears the slot's in-use flag when it goes out of scope. Without this,
    // an exception escaping compress() (for example std::bad_alloc while the
    // output string is allocated) would leave the slot marked busy forever.
    struct SlotRelease {
        explicit SlotRelease(std::atomic<bool>& f) : flag(f) {}
        ~SlotRelease() { flag.store(false, std::memory_order_release); }
        SlotRelease(const SlotRelease&) = delete;
        SlotRelease& operator=(const SlotRelease&) = delete;
        std::atomic<bool>& flag;
    };

    // Slot acquisition: try each slot with compare_exchange, skipping slots
    // whose encoder failed to initialize.
    for (int i = 0; i < _poolSize; ++i) {
        if (!_encoders[i]) continue;

        bool expected = false;
        if (_inUse[i].compare_exchange_strong(expected, true, std::memory_order_acquire)) {
            SlotRelease release(_inUse[i]);
            // The return value is built before `release` is destroyed, so the
            // slot stays claimed for the whole encode.
            return _encoders[i]->compress(image, quality); // may be empty on encode failure — caller falls back
        }
    }

    return ""; // All slots busy — caller falls back to TurboJPEG
}
|
|
|
|
|
|
|
|
|
|
// ── Unified entry point: nvJPEG pool on NVIDIA, TurboJPEG otherwise ──
|
|
|
|
|
|
|
|
|
|
// Encodes `image` to JPEG at the given quality and returns the bytes.
// The nvJPEG pool is tried first; whenever it returns an empty string the
// image is encoded with a per-thread TurboJpegCompressor instead.
std::string CompressJpegToString(const cv::Mat& image, int quality) {
    // GPU attempt (returns "" if non-NVIDIA, pool full, or encode fails)
    std::string gpuJpeg = NvJpegPool::Instance().tryCompress(image, quality);

    if (gpuJpeg.empty()) {
        // CPU fallback — one compressor per thread, created on first use.
        static thread_local TurboJpegCompressor compressor;
        return compressor.compress(image, quality);
    }

    return gpuJpeg;
}
|
|
|
|
|
@@ -6524,9 +6692,86 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
|
|
|
|
|
|
|
|
|
|
if (quality <= 0 || quality > 100) quality = 85;
|
|
|
|
|
|
|
|
|
|
// Decode BMP from memory
|
|
|
|
|
int bmpSize = (*bmpInput)->cnt;
|
|
|
|
|
std::vector<unsigned char> bmpData((*bmpInput)->str, (*bmpInput)->str + bmpSize);
|
|
|
|
|
unsigned char* raw = reinterpret_cast<unsigned char*>((*bmpInput)->str);
|
|
|
|
|
|
|
|
|
|
// ── Passthrough: input is already JPEG (starts with FF D8 FF) ──
|
|
|
|
|
if (bmpSize >= 3 && raw[0] == 0xFF && raw[1] == 0xD8 && raw[2] == 0xFF) {
|
|
|
|
|
MgErr error = DSSetHandleSize(jpegOutput, sizeof(int32) + bmpSize * sizeof(uChar));
|
|
|
|
|
if (error != noErr) {
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: DSSetHandleSize failed (passthrough) - err=%d", error);
|
|
|
|
|
return -4;
|
|
|
|
|
}
|
|
|
|
|
(*jpegOutput)->cnt = bmpSize;
|
|
|
|
|
memcpy((*jpegOutput)->str, raw, bmpSize);
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: PASSTHROUGH - input is already JPEG (%d bytes)", bmpSize);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ── Fast path: parse BMP header directly, zero-copy ──
|
|
|
|
|
// Minimum BMP = file header (14) + info header (40) + some pixels
|
|
|
|
|
constexpr int kMinBmpSize = sizeof(BmpFileHeader) + sizeof(BmpInfoHeader) + 1;
|
|
|
|
|
if (bmpSize >= kMinBmpSize && raw[0] == 'B' && raw[1] == 'M') {
|
|
|
|
|
|
|
|
|
|
const auto& fh = *reinterpret_cast<const BmpFileHeader*>(raw);
|
|
|
|
|
const auto& ih = *reinterpret_cast<const BmpInfoHeader*>(raw + sizeof(BmpFileHeader));
|
|
|
|
|
|
|
|
|
|
int width = ih.width;
|
|
|
|
|
int height = ih.height; // negative = top-down
|
|
|
|
|
bool topDown = (height < 0);
|
|
|
|
|
if (height < 0) height = -height;
|
|
|
|
|
|
|
|
|
|
// Only handle 24-bit uncompressed (the format ImageToBmp produces)
|
|
|
|
|
if (ih.bitCount == 24 && ih.compression == 0 && width > 0 && height > 0) {
|
|
|
|
|
int rowBytes = width * 3;
|
|
|
|
|
int stride = (rowBytes + 3) & ~3; // BMP rows are 4-byte aligned
|
|
|
|
|
|
|
|
|
|
// Verify the buffer is large enough
|
|
|
|
|
int pixelOffset = static_cast<int>(fh.offsetData);
|
|
|
|
|
int64_t neededSize = static_cast<int64_t>(pixelOffset) + static_cast<int64_t>(stride) * height;
|
|
|
|
|
if (bmpSize >= neededSize) {
|
|
|
|
|
unsigned char* pixels = raw + pixelOffset;
|
|
|
|
|
|
|
|
|
|
cv::Mat mat;
|
|
|
|
|
if (topDown) {
|
|
|
|
|
// Top-down BMP: rows are already in correct order
|
|
|
|
|
// If no padding, wrap directly; otherwise need to handle stride
|
|
|
|
|
if (stride == rowBytes) {
|
|
|
|
|
mat = cv::Mat(height, width, CV_8UC3, pixels);
|
|
|
|
|
} else {
|
|
|
|
|
mat = cv::Mat(height, width, CV_8UC3, pixels, stride);
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
// Bottom-up BMP: flip to top-down for JPEG encoding
|
|
|
|
|
// Create Mat pointing at the last row with negative step
|
|
|
|
|
// OpenCV doesn't support negative step, so flip
|
|
|
|
|
cv::Mat bottomUp(height, width, CV_8UC3, pixels, stride);
|
|
|
|
|
cv::flip(bottomUp, mat, 0);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: fast-path %dx%d, encoding JPEG q=%d", width, height, quality);
|
|
|
|
|
|
|
|
|
|
std::string jpegStr = ANSCENTER::CompressJpegToString(mat, quality);
|
|
|
|
|
if (!jpegStr.empty()) {
|
|
|
|
|
int size = static_cast<int>(jpegStr.size());
|
|
|
|
|
MgErr error = DSSetHandleSize(jpegOutput, sizeof(int32) + size * sizeof(uChar));
|
|
|
|
|
if (error != noErr) {
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: DSSetHandleSize failed - err=%d", error);
|
|
|
|
|
return -4;
|
|
|
|
|
}
|
|
|
|
|
(*jpegOutput)->cnt = size;
|
|
|
|
|
memcpy((*jpegOutput)->str, jpegStr.data(), size);
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS (fast) - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
// If fast-path encode failed, fall through to imdecode path
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// ── Fallback: use imdecode for non-standard BMP formats ──
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: using imdecode fallback for %d bytes", bmpSize);
|
|
|
|
|
std::vector<unsigned char> bmpData(raw, raw + bmpSize);
|
|
|
|
|
cv::Mat mat = cv::imdecode(bmpData, cv::IMREAD_COLOR);
|
|
|
|
|
if (mat.empty()) {
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: imdecode failed - %d bytes input", bmpSize);
|
|
|
|
|
@@ -6535,7 +6780,6 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
|
|
|
|
|
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: decoded %dx%d, encoding JPEG q=%d", mat.cols, mat.rows, quality);
|
|
|
|
|
|
|
|
|
|
// Encode to JPEG via CompressJpegToString (nvJPEG pool first, TurboJPEG fallback)
|
|
|
|
|
std::string jpegStr = ANSCENTER::CompressJpegToString(mat, quality);
|
|
|
|
|
if (jpegStr.empty()) {
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: JPEG encode failed");
|
|
|
|
|
@@ -6551,7 +6795,7 @@ extern "C" __declspec(dllexport) int ANSCV_BmpToJpeg(LStrHandle bmpInput, int qu
|
|
|
|
|
(*jpegOutput)->cnt = size;
|
|
|
|
|
memcpy((*jpegOutput)->str, jpegStr.data(), size);
|
|
|
|
|
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
|
|
|
|
|
ANS_DBG("ANSCV", "BmpToJpeg: SUCCESS (fallback) - %d bytes BMP -> %d bytes JPEG", bmpSize, size);
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
catch (const std::exception& e) {
|
|
|
|
|
|