// RTOCRDetector.cpp — DBNet-style text detection stage of the RT-OCR pipeline,
// running on TensorRT with OpenCV CUDA preprocessing and Clipper box unclipping.
#include "RTOCRDetector.h"
|
|
#include "include/clipper.h"
|
|
#include "NV12PreprocessHelper.h"
|
|
#include "ANSGpuFrameRegistry.h"
|
|
|
|
#include <cuda_runtime.h>
|
|
#include <opencv2/imgproc.hpp>
|
|
|
|
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
|
|
#include <opencv2/cudaimgproc.hpp>
|
|
#include <opencv2/cudawarping.hpp>
|
|
#include <opencv2/cudaarithm.hpp>
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
|
|
namespace ANSCENTER {
|
|
namespace rtocr {
|
|
|
|
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
|
|
const std::string& engineCacheDir,
|
|
int maxSideLen) {
|
|
// Engine cache directory
|
|
std::string cacheDir;
|
|
if (!engineCacheDir.empty()) {
|
|
cacheDir = engineCacheDir;
|
|
} else {
|
|
auto pos = onnxPath.find_last_of("/\\");
|
|
cacheDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
|
|
}
|
|
|
|
try {
|
|
ANSCENTER::Options options;
|
|
options.deviceIndex = gpuId;
|
|
// FP32 required for detection: this CNN (DBNet) produces NaN in FP16.
|
|
// The model has 142 Convolution + 87 Scale (fused BatchNorm) layers whose
|
|
// intermediate values overflow FP16 range (65504). Mixed precision
|
|
// (forcing only Sigmoid/Softmax to FP32) is insufficient because the NaN
|
|
// originates deep in the conv->scale->relu backbone before reaching those layers.
|
|
// Classifier and recognizer remain FP16 with mixed precision -- only the
|
|
// detector needs full FP32.
|
|
options.precision = ANSCENTER::Precision::FP32;
|
|
options.maxBatchSize = 1;
|
|
options.optBatchSize = 1;
|
|
|
|
// Dynamic spatial dimensions for detection (multiples of 32)
|
|
options.minInputHeight = 32;
|
|
options.minInputWidth = 32;
|
|
options.optInputHeight = std::min(640, maxSideLen);
|
|
options.optInputWidth = std::min(640, maxSideLen);
|
|
options.maxInputHeight = maxSideLen;
|
|
options.maxInputWidth = maxSideLen;
|
|
options.engineFileDir = cacheDir;
|
|
|
|
m_poolKey = { onnxPath,
|
|
static_cast<int>(options.precision),
|
|
options.maxBatchSize };
|
|
m_engine = EnginePoolManager<float>::instance().acquire(
|
|
m_poolKey, options, onnxPath,
|
|
kDetSubVals, kDetDivVals, true, -1);
|
|
m_usingSharedPool = (m_engine != nullptr);
|
|
|
|
if (!m_engine) {
|
|
std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
|
|
<< onnxPath << std::endl;
|
|
return false;
|
|
}
|
|
|
|
// Query actual profile max from the loaded engine
|
|
int profMaxH = m_engine->getProfileMaxHeight();
|
|
int profMaxW = m_engine->getProfileMaxWidth();
|
|
if (profMaxH > 0 && profMaxW > 0) {
|
|
m_engineMaxSideLen = std::min(profMaxH, profMaxW);
|
|
} else {
|
|
m_engineMaxSideLen = maxSideLen;
|
|
}
|
|
|
|
if (m_engineMaxSideLen < maxSideLen) {
|
|
std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
|
|
<< "x" << m_engineMaxSideLen << " (requested " << maxSideLen
|
|
<< " exceeded GPU capacity)" << std::endl;
|
|
}
|
|
std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
|
|
return true;
|
|
}
|
|
catch (const std::exception& e) {
|
|
std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
|
|
return false;
|
|
}
|
|
}
|
|
|
|
/// Runs DBNet text detection on one frame and returns text boxes in
/// original-image coordinates, sorted via SortTextBoxes.
///
/// @param image       Input BGR frame (CPU). Also defines the output coordinate
///                    space unless the NV12 fast path substitutes its own dims.
/// @param maxSideLen  Max spatial side for this call (clamped to engine max).
/// @param dbThresh    Probability-map binarization threshold.
/// @param boxThresh   Minimum mean probability inside a box to keep it.
/// @param unclipRatio Expansion ratio for UnclipPolygon.
/// @param useDilation If true, dilate the binary map with a 2x2 kernel.
/// @return Detected boxes; empty on error or if nothing passes the thresholds.
///
/// Thread-safe: the whole call is serialized on _mutex.
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image,
                                           int maxSideLen, float dbThresh,
                                           float boxThresh, float unclipRatio,
                                           bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);

    if (!m_engine || image.empty()) return {};

    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);

        // 1. Compute resize dimensions (multiples of 32)
        cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        int newH = resizeShape.height;
        int newW = resizeShape.width;

        // Ratios map detection-output coordinates back to input coordinates.
        // They may be recomputed below if the NV12 fast path is taken.
        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;

        // 2. Upload to GPU and resize — try NV12 fast path first
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;

        // Thread-local current frame; non-null only when the caller registered
        // a GPU frame for this thread.
        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        // pixelFormat == 23 — presumably FFmpeg's AV_PIX_FMT_NV12; TODO confirm
        // against the producer of GpuFrameData.
        if (gpuFrame && gpuFrame->pixelFormat == 23 &&
            gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
            gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload (24MB) + separate resize
            int fW = gpuFrame->width;
            int fH = gpuFrame->height;
            int gpuIdx = m_engine ? m_engine->getOptions().deviceIndex : 0;

            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload)
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                // Registry lock guards the per-frame GPU cache fields; held only
                // for the cache lookup/upload, released before kernel launch.
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit — reuse previously uploaded planes.
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 to GPU.
                    // (A valid cache on a *different* device falls through to the
                    // BGR fallback rather than re-uploading.)
                    size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    // UV plane is half-height in NV12 (interleaved U/V rows).
                    size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;

                    auto& reg = ANSGpuFrameRegistry::instance();
                    // Budget check so frame caches cannot exhaust GPU memory.
                    if (reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // NOTE(review): cudaMalloc/cudaMemcpy return codes are not
                        // checked; on allocation failure devY/devUV would be cached
                        // as null — confirm this path cannot be hit in practice.
                        cudaMalloc(&gpuFrame->gpuCacheY, yBytes);
                        cudaMalloc(&gpuFrame->gpuCacheUV, uvBytes);
                        cudaMemcpy(gpuFrame->gpuCacheY, gpuFrame->cpuYPlane, yBytes, cudaMemcpyHostToDevice);
                        cudaMemcpy(gpuFrame->gpuCacheUV, gpuFrame->cpuUvPlane, uvBytes, cudaMemcpyHostToDevice);
                        gpuFrame->gpuCacheValid = true;
                        gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                        gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                        gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                        gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                        // Inform the registry so it can account the cache bytes.
                        reg.onGpuCacheCreated(yBytes + uvBytes);

                        devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                        devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                        yPitch = gpuFrame->cpuYLinesize;
                        uvPitch = gpuFrame->cpuUvLinesize;
                    }
                }
            } // release registry lock before GPU kernel

            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch, 1 output alloc)
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;

                // Update ratios to map from full-res NV12 to detection output
                // (NV12 frame dims may differ from `image` dims).
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }

        if (!usedNV12) {
            // Fallback: standard BGR upload of the CPU image + GPU resize.
            cv::cuda::GpuMat gpuImg;
            gpuImg.upload(image);
            cv::cuda::resize(gpuImg, gpuResized, resizeShape);
        }

        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB)

        // 3. Run inference (single image, single batch).
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;

        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }

        if (featureVectors.empty() || featureVectors[0].empty()) return {};

        // 4. Reshape output to probability map [H, W]
        std::vector<float>& output = featureVectors[0][0];
        int outputSize = static_cast<int>(output.size());

        // Guard against a truncated/garbage output buffer before wrapping it.
        if (outputSize < newH * newW) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << outputSize << std::endl;
            return {};
        }

        // Wraps the output buffer without copying; `output` must outlive `bitmap`
        // (it does — both are scoped to this try block).
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());

        // 5. Threshold to binary (matches ONNX/PaddleOCR official order)
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);

        // 6. Apply dilation if requested (on binaryMap, matching ONNX version)
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }

        // 7. Find contours and build text boxes
        // (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow exactly)
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);

        // Cap candidate count to bound post-processing cost.
        int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;

        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;

            // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            // Drop degenerate regions (shorter side < 3px before unclip).
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;

            auto ordered = GetMiniBoxes(minRect);

            // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;

            // Step 3: UnclipPolygon - expand the 4-point box
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;

            // Step 4: Re-compute GetMiniBoxes on the expanded polygon
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);

            // Filter by min_size + 2 = 5 (matches PaddleOCR official)
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;

            auto expandedOrdered = GetMiniBoxes(expandedRect);

            // Step 5: Scale to original image coordinates (clamped to bounds).
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }

        SortTextBoxes(boxes);
        return boxes;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}
|
|
|
|
// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
|
|
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
|
|
cv::Point2f vertices[4];
|
|
rect.points(vertices);
|
|
|
|
// Sort all 4 points by x-coordinate ascending
|
|
std::sort(vertices, vertices + 4,
|
|
[](const cv::Point2f& a, const cv::Point2f& b) { return a.x < b.x; });
|
|
|
|
// Left two (indices 0,1): smaller y = top-left, larger y = bottom-left
|
|
cv::Point2f topLeft, bottomLeft;
|
|
if (vertices[0].y <= vertices[1].y) {
|
|
topLeft = vertices[0];
|
|
bottomLeft = vertices[1];
|
|
} else {
|
|
topLeft = vertices[1];
|
|
bottomLeft = vertices[0];
|
|
}
|
|
|
|
// Right two (indices 2,3): smaller y = top-right, larger y = bottom-right
|
|
cv::Point2f topRight, bottomRight;
|
|
if (vertices[2].y <= vertices[3].y) {
|
|
topRight = vertices[2];
|
|
bottomRight = vertices[3];
|
|
} else {
|
|
topRight = vertices[3];
|
|
bottomRight = vertices[2];
|
|
}
|
|
|
|
// Order: [TL, TR, BR, BL] (clockwise from top-left)
|
|
return { topLeft, topRight, bottomRight, bottomLeft };
|
|
}
|
|
|
|
// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
|
|
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
|
|
const std::array<cv::Point2f, 4>& box) {
|
|
int h = probMap.rows;
|
|
int w = probMap.cols;
|
|
|
|
// Get bounding rectangle with proper clamping (matches PaddleOCR official)
|
|
float minX = std::min({box[0].x, box[1].x, box[2].x, box[3].x});
|
|
float maxX = std::max({box[0].x, box[1].x, box[2].x, box[3].x});
|
|
float minY = std::min({box[0].y, box[1].y, box[2].y, box[3].y});
|
|
float maxY = std::max({box[0].y, box[1].y, box[2].y, box[3].y});
|
|
|
|
int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
|
|
int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
|
|
int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
|
|
int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
|
|
|
|
if (xmin >= xmax || ymin >= ymax) return 0.0f;
|
|
|
|
cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
|
|
|
|
std::vector<cv::Point> pts(4);
|
|
for (int j = 0; j < 4; j++) {
|
|
pts[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
|
|
static_cast<int>(box[j].y) - ymin);
|
|
}
|
|
std::vector<std::vector<cv::Point>> polys = { pts };
|
|
cv::fillPoly(mask, polys, cv::Scalar(1));
|
|
|
|
cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
|
|
return static_cast<float>(cv::mean(roiMap, mask)[0]);
|
|
}
|
|
|
|
// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
|
|
// Uses integer coordinates for Clipper (matching PaddleOCR/ONNX version exactly)
|
|
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
|
|
float unclipRatio) {
|
|
// Compute area using Shoelace formula and perimeter
|
|
float area = 0.0f;
|
|
float perimeter = 0.0f;
|
|
for (int i = 0; i < 4; i++) {
|
|
int j = (i + 1) % 4;
|
|
area += box[i].x * box[j].y - box[j].x * box[i].y;
|
|
float dx = box[j].x - box[i].x;
|
|
float dy = box[j].y - box[i].y;
|
|
perimeter += std::sqrt(dx * dx + dy * dy);
|
|
}
|
|
area = std::abs(area) * 0.5f;
|
|
if (perimeter < 1.0f) return {};
|
|
|
|
float distance = area * unclipRatio / perimeter;
|
|
|
|
ClipperLib::Path clipperPath;
|
|
for (int i = 0; i < 4; i++) {
|
|
clipperPath.push_back({ static_cast<ClipperLib::cInt>(box[i].x),
|
|
static_cast<ClipperLib::cInt>(box[i].y) });
|
|
}
|
|
|
|
ClipperLib::ClipperOffset offset;
|
|
offset.AddPath(clipperPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
|
|
|
|
ClipperLib::Paths solution;
|
|
offset.Execute(solution, distance);
|
|
|
|
if (solution.empty() || solution[0].empty()) return {};
|
|
|
|
std::vector<cv::Point2f> result;
|
|
for (auto& p : solution[0]) {
|
|
result.push_back(cv::Point2f(static_cast<float>(p.X), static_cast<float>(p.Y)));
|
|
}
|
|
return result;
|
|
}
|
|
|
|
/// Releases the engine. A pool-acquired engine is returned to the shared pool
/// first so its refcount is decremented before our handle is dropped.
RTOCRDetector::~RTOCRDetector() {
    try {
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        // reset() is a no-op on an empty handle, so this covers both paths.
        m_engine.reset();
    }
    catch (...) {
        // Destructors must never throw.
    }
}
|
|
|
|
} // namespace rtocr
|
|
} // namespace ANSCENTER
|