// ANSCORE/modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp
#include "RTOCRDetector.h"
#include "include/clipper.h"
#include "NV12PreprocessHelper.h"
#include "ANSGpuFrameRegistry.h"
#include <cuda_runtime.h>
#include <opencv2/imgproc.hpp>
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace rtocr {
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
const std::string& engineCacheDir,
int maxSideLen) {
// Engine cache directory
std::string cacheDir;
if (!engineCacheDir.empty()) {
cacheDir = engineCacheDir;
} else {
auto pos = onnxPath.find_last_of("/\\");
cacheDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
}
try {
ANSCENTER::Options options;
options.deviceIndex = gpuId;
// FP32 required for detection: this CNN (DBNet) produces NaN in FP16.
// The model has 142 Convolution + 87 Scale (fused BatchNorm) layers whose
// intermediate values overflow FP16 range (65504). Mixed precision
// (forcing only Sigmoid/Softmax to FP32) is insufficient because the NaN
// originates deep in the conv->scale->relu backbone before reaching those layers.
// Classifier and recognizer remain FP16 with mixed precision -- only the
// detector needs full FP32.
options.precision = ANSCENTER::Precision::FP32;
options.maxBatchSize = 1;
options.optBatchSize = 1;
// Dynamic spatial dimensions for detection (multiples of 32)
options.minInputHeight = 32;
options.minInputWidth = 32;
options.optInputHeight = std::min(640, maxSideLen);
options.optInputWidth = std::min(640, maxSideLen);
options.maxInputHeight = maxSideLen;
options.maxInputWidth = maxSideLen;
options.engineFileDir = cacheDir;
m_poolKey = { onnxPath,
static_cast<int>(options.precision),
options.maxBatchSize };
m_engine = EnginePoolManager<float>::instance().acquire(
m_poolKey, options, onnxPath,
kDetSubVals, kDetDivVals, true, getPoolMaxSlotsPerGpu());
2026-03-28 16:54:11 +11:00
m_usingSharedPool = (m_engine != nullptr);
if (!m_engine) {
std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
<< onnxPath << std::endl;
return false;
}
// Query actual profile max from the loaded engine
int profMaxH = m_engine->getProfileMaxHeight();
int profMaxW = m_engine->getProfileMaxWidth();
if (profMaxH > 0 && profMaxW > 0) {
m_engineMaxSideLen = std::min(profMaxH, profMaxW);
} else {
m_engineMaxSideLen = maxSideLen;
}
if (m_engineMaxSideLen < maxSideLen) {
std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
<< "x" << m_engineMaxSideLen << " (requested " << maxSideLen
<< " exceeded GPU capacity)" << std::endl;
}
std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
return true;
}
catch (const std::exception& e) {
std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
return false;
}
}
/// Runs DBNet text detection on one frame and returns quadrilateral text boxes
/// in original-image coordinates.
///
/// @param image       BGR input image (CPU). May be bypassed by the NV12 fast path.
/// @param maxSideLen  Max detection side length; clamped to the engine profile max.
/// @param dbThresh    Binarization threshold on the probability map.
/// @param boxThresh   Minimum mean probability inside a candidate box.
/// @param unclipRatio Expansion ratio for the unclip step.
/// @param useDilation Apply a 2x2 dilation to the binary map before contouring.
/// @return Detected boxes sorted via SortTextBoxes(); empty on any failure.
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image,
                                           int maxSideLen, float dbThresh,
                                           float boxThresh, float unclipRatio,
                                           bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || image.empty()) return {};
    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);
        // 1. Compute resize dimensions (multiples of 32)
        cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        int newH = resizeShape.height;
        int newW = resizeShape.width;
        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;
        // 2. Upload to GPU and resize — try NV12 fast path first
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;
        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        if (gpuFrame && gpuFrame->pixelFormat == 23 &&
            gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
            gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload (24MB) + separate resize
            int fW = gpuFrame->width;
            int fH = gpuFrame->height;
            int gpuIdx = m_engine ? m_engine->getOptions().deviceIndex : 0;
            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload)
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 to GPU
                    size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;
                    auto& reg = ANSGpuFrameRegistry::instance();
                    if (reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // BUGFIX: previously the cudaMalloc/cudaMemcpy return
                        // codes were ignored — a failed allocation would still
                        // mark the cache valid and copy through an invalid
                        // pointer. Check each step and roll back on failure so
                        // we fall through to the CPU resize path instead.
                        gpuFrame->gpuCacheY = nullptr;
                        gpuFrame->gpuCacheUV = nullptr;
                        cudaError_t rc = cudaMalloc(&gpuFrame->gpuCacheY, yBytes);
                        if (rc == cudaSuccess) {
                            rc = cudaMalloc(&gpuFrame->gpuCacheUV, uvBytes);
                        }
                        if (rc == cudaSuccess) {
                            rc = cudaMemcpy(gpuFrame->gpuCacheY, gpuFrame->cpuYPlane, yBytes, cudaMemcpyHostToDevice);
                        }
                        if (rc == cudaSuccess) {
                            rc = cudaMemcpy(gpuFrame->gpuCacheUV, gpuFrame->cpuUvPlane, uvBytes, cudaMemcpyHostToDevice);
                        }
                        if (rc == cudaSuccess) {
                            gpuFrame->gpuCacheValid = true;
                            gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                            gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                            gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                            gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                            reg.onGpuCacheCreated(yBytes + uvBytes);
                            devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                            devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                            yPitch = gpuFrame->cpuYLinesize;
                            uvPitch = gpuFrame->cpuUvLinesize;
                        } else {
                            // Roll back any partial allocation; leave the cache invalid.
                            if (gpuFrame->gpuCacheY) cudaFree(gpuFrame->gpuCacheY);
                            if (gpuFrame->gpuCacheUV) cudaFree(gpuFrame->gpuCacheUV);
                            gpuFrame->gpuCacheY = nullptr;
                            gpuFrame->gpuCacheUV = nullptr;
                            std::cerr << "[RTOCRDetector] NV12 GPU cache upload failed: "
                                      << cudaGetErrorString(rc) << std::endl;
                        }
                    }
                }
            } // release registry lock before GPU kernel
            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch, 1 output alloc)
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;
                // Update ratios to map from full-res NV12 to detection output
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }
        if (!usedNV12) {
            // Fallback: CPU resize then upload small image to GPU
            cv::Mat cpuResized;
            cv::resize(image, cpuResized, resizeShape, 0, 0, cv::INTER_LINEAR);
            gpuResized.upload(cpuResized);
        }
        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB)
        // 3. Run inference
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }
        if (featureVectors.empty() || featureVectors[0].empty()) return {};
        // 4. Reshape output to probability map [H, W]
        std::vector<float>& output = featureVectors[0][0];
        int outputSize = static_cast<int>(output.size());
        if (outputSize < newH * newW) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << outputSize << std::endl;
            return {};
        }
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());
        // 5. Threshold to binary (matches ONNX/PaddleOCR official order)
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);
        // 6. Apply dilation if requested (on binaryMap, matching ONNX version)
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }
        // 7. Find contours and build text boxes
        // (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow exactly)
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
        int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;
        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;
            // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;
            auto ordered = GetMiniBoxes(minRect);
            // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;
            // Step 3: UnclipPolygon - expand the 4-point box
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;
            // Step 4: Re-compute GetMiniBoxes on the expanded polygon
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
            // Filter by min_size + 2 = 5 (matches PaddleOCR official)
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;
            auto expandedOrdered = GetMiniBoxes(expandedRect);
            // Step 5: Scale to original image coordinates
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }
        SortTextBoxes(boxes);
        return boxes;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}
// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
/// Orders the four corners of a rotated rect as [TL, TR, BR, BL] (clockwise
/// from top-left), mirroring PaddleOCR's official get_mini_boxes: sort by x,
/// then disambiguate each side's pair by y.
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    std::array<cv::Point2f, 4> pts;
    rect.points(pts.data());
    // Ascending x puts the left pair at indices 0/1 and the right pair at 2/3.
    std::sort(pts.begin(), pts.end(),
              [](const cv::Point2f& lhs, const cv::Point2f& rhs) { return lhs.x < rhs.x; });
    // Within each pair the smaller y is the top corner.
    const int tlIdx = (pts[0].y <= pts[1].y) ? 0 : 1;
    const int trIdx = (pts[2].y <= pts[3].y) ? 2 : 3;
    // The remaining index of each pair is the bottom corner (1-tlIdx, 5-trIdx).
    return { pts[tlIdx], pts[trIdx], pts[5 - trIdx], pts[1 - tlIdx] };
}
// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
/// Mean probability inside the 4-point polygon (PaddleOCR box_score_fast):
/// rasterize the quad into a byte mask over its bounding rect, then average
/// the probability map under that mask.
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                  const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;
    // Axis-aligned bounds of the quad.
    float lox = box[0].x, hix = box[0].x;
    float loy = box[0].y, hiy = box[0].y;
    for (int i = 1; i < 4; ++i) {
        lox = std::min(lox, box[i].x);
        hix = std::max(hix, box[i].x);
        loy = std::min(loy, box[i].y);
        hiy = std::max(hiy, box[i].y);
    }
    // Clamp to the map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(lox)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(hix)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(loy)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(hiy)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Fill the quad (shifted to bounding-rect coordinates) into a mask.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<std::vector<cv::Point>> polys(1);
    polys[0].reserve(4);
    for (const auto& p : box) {
        polys[0].emplace_back(static_cast<int>(p.x) - xmin,
                              static_cast<int>(p.y) - ymin);
    }
    cv::fillPoly(mask, polys, cv::Scalar(1));
    const cv::Rect roi(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
    return static_cast<float>(cv::mean(probMap(roi), mask)[0]);
}
// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
// Uses integer coordinates for Clipper (matching PaddleOCR/ONNX version exactly)
/// Expands a 4-point box by `distance = area * unclipRatio / perimeter` using
/// Clipper's rounded polygon offset — matches PaddleOCR's official unclip.
/// Clipper works on integer coordinates, same as the ONNX version.
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                      float unclipRatio) {
    // Shoelace area (accumulated doubled, signed) and edge-sum perimeter.
    float twiceSignedArea = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; ++i) {
        const cv::Point2f& a = box[i];
        const cv::Point2f& b = box[(i + 1) % 4];
        twiceSignedArea += a.x * b.y - b.x * a.y;
        const float dx = b.x - a.x;
        const float dy = b.y - a.y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceSignedArea) * 0.5f;
    // Degenerate box: nothing sensible to expand.
    if (perimeter < 1.0f) return {};
    const float distance = area * unclipRatio / perimeter;
    ClipperLib::Path inputPath;
    for (const auto& p : box) {
        inputPath.push_back({ static_cast<ClipperLib::cInt>(p.x),
                              static_cast<ClipperLib::cInt>(p.y) });
    }
    ClipperLib::ClipperOffset offsetter;
    offsetter.AddPath(inputPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths solution;
    offsetter.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};
    std::vector<cv::Point2f> expanded;
    expanded.reserve(solution[0].size());
    for (const auto& p : solution[0]) {
        expanded.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return expanded;
}
/// Releases the engine. Pooled engines are returned to the shared pool rather
/// than destroyed outright; the local shared_ptr is dropped in either case.
/// Destructors must never throw, so everything is wrapped in a catch-all.
RTOCRDetector::~RTOCRDetector() {
    try {
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        // No-op when the engine pointer is already empty.
        m_engine.reset();
    }
    catch (...) {
        // Swallow: cleanup failures cannot be propagated from a destructor.
    }
}
} // namespace rtocr
} // namespace ANSCENTER