Refactor project structure
This commit is contained in:
403
modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp
Normal file
403
modules/ANSOCR/ANSRTOCR/RTOCRDetector.cpp
Normal file
@@ -0,0 +1,403 @@
|
||||
#include "RTOCRDetector.h"
|
||||
#include "include/clipper.h"
|
||||
#include "NV12PreprocessHelper.h"
|
||||
#include "ANSGpuFrameRegistry.h"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
|
||||
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
|
||||
#include <opencv2/cudaimgproc.hpp>
|
||||
#include <opencv2/cudawarping.hpp>
|
||||
#include <opencv2/cudaarithm.hpp>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
|
||||
const std::string& engineCacheDir,
|
||||
int maxSideLen) {
|
||||
// Engine cache directory
|
||||
std::string cacheDir;
|
||||
if (!engineCacheDir.empty()) {
|
||||
cacheDir = engineCacheDir;
|
||||
} else {
|
||||
auto pos = onnxPath.find_last_of("/\\");
|
||||
cacheDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
|
||||
}
|
||||
|
||||
try {
|
||||
ANSCENTER::Options options;
|
||||
options.deviceIndex = gpuId;
|
||||
// FP32 required for detection: this CNN (DBNet) produces NaN in FP16.
|
||||
// The model has 142 Convolution + 87 Scale (fused BatchNorm) layers whose
|
||||
// intermediate values overflow FP16 range (65504). Mixed precision
|
||||
// (forcing only Sigmoid/Softmax to FP32) is insufficient because the NaN
|
||||
// originates deep in the conv->scale->relu backbone before reaching those layers.
|
||||
// Classifier and recognizer remain FP16 with mixed precision -- only the
|
||||
// detector needs full FP32.
|
||||
options.precision = ANSCENTER::Precision::FP32;
|
||||
options.maxBatchSize = 1;
|
||||
options.optBatchSize = 1;
|
||||
|
||||
// Dynamic spatial dimensions for detection (multiples of 32)
|
||||
options.minInputHeight = 32;
|
||||
options.minInputWidth = 32;
|
||||
options.optInputHeight = std::min(640, maxSideLen);
|
||||
options.optInputWidth = std::min(640, maxSideLen);
|
||||
options.maxInputHeight = maxSideLen;
|
||||
options.maxInputWidth = maxSideLen;
|
||||
options.engineFileDir = cacheDir;
|
||||
|
||||
m_poolKey = { onnxPath,
|
||||
static_cast<int>(options.precision),
|
||||
options.maxBatchSize };
|
||||
m_engine = EnginePoolManager<float>::instance().acquire(
|
||||
m_poolKey, options, onnxPath,
|
||||
kDetSubVals, kDetDivVals, true, -1);
|
||||
m_usingSharedPool = (m_engine != nullptr);
|
||||
|
||||
if (!m_engine) {
|
||||
std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
|
||||
<< onnxPath << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// Query actual profile max from the loaded engine
|
||||
int profMaxH = m_engine->getProfileMaxHeight();
|
||||
int profMaxW = m_engine->getProfileMaxWidth();
|
||||
if (profMaxH > 0 && profMaxW > 0) {
|
||||
m_engineMaxSideLen = std::min(profMaxH, profMaxW);
|
||||
} else {
|
||||
m_engineMaxSideLen = maxSideLen;
|
||||
}
|
||||
|
||||
if (m_engineMaxSideLen < maxSideLen) {
|
||||
std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
|
||||
<< "x" << m_engineMaxSideLen << " (requested " << maxSideLen
|
||||
<< " exceeded GPU capacity)" << std::endl;
|
||||
}
|
||||
std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
|
||||
return true;
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/// Runs single-pass text detection on `image` and returns the detected boxes.
///
/// The image is resized to the shape given by ComputeDetResizeShape (bounded
/// by both `maxSideLen` and the engine's profile maximum), run through the
/// engine, and the resulting probability map is post-processed: threshold,
/// optional dilation, contours, min-area boxes, score filter, unclip, rescale.
///
/// @param image        BGR input image; an empty image yields an empty result.
/// @param maxSideLen   Caller's limit on the resized side length.
/// @param dbThresh     Threshold applied to the probability map.
/// @param boxThresh    Minimum BoxScoreFast score for a box to be kept.
/// @param unclipRatio  Expansion ratio passed to UnclipPolygon.
/// @param useDilation  When true, the binary map is dilated with a 2x2 kernel.
/// @return Boxes after SortTextBoxes; empty on any failure (failures are
///         logged to std::cerr, std::exception is caught here).
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image,
                                           int maxSideLen, float dbThresh,
                                           float boxThresh, float unclipRatio,
                                           bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);

    if (!m_engine || image.empty()) return {};

    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        const int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);

        // 1. Compute resize dimensions (multiples of 32)
        const cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        const int newH = resizeShape.height;
        const int newW = resizeShape.width;
        if (newH <= 0 || newW <= 0) {
            // A degenerate shape would make the ratios below divide by zero.
            std::cerr << "[RTOCRDetector] Invalid resize shape: "
                      << newW << "x" << newH << std::endl;
            return {};
        }

        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;

        // 2. Upload to GPU and resize — try NV12 fast path first
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;

        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        // pixelFormat 23 is presumably FFmpeg's AV_PIX_FMT_NV12 — TODO confirm
        // against the producer of GpuFrameData.
        if (gpuFrame && gpuFrame->pixelFormat == 23 &&
            gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
            gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload (24MB) + separate resize
            const int fW = gpuFrame->width;
            const int fH = gpuFrame->height;
            // m_engine was checked for null at the top of the function.
            const int gpuIdx = m_engine->getOptions().deviceIndex;

            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload)
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 to GPU
                    // NOTE(review): the buffers are allocated on whatever CUDA
                    // device is current for this thread but recorded as gpuIdx;
                    // confirm the caller has already selected that device.
                    const size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    const size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;

                    auto& reg = ANSGpuFrameRegistry::instance();
                    if (yBytes > 0 && uvBytes > 0 &&
                        reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // Start from a known state so the rollback below only
                        // ever frees what this attempt allocated.
                        gpuFrame->gpuCacheY = nullptr;
                        gpuFrame->gpuCacheUV = nullptr;

                        // Each step runs only if every previous one succeeded.
                        cudaError_t err = cudaMalloc(&gpuFrame->gpuCacheY, yBytes);
                        if (err == cudaSuccess) {
                            err = cudaMalloc(&gpuFrame->gpuCacheUV, uvBytes);
                        }
                        if (err == cudaSuccess) {
                            err = cudaMemcpy(gpuFrame->gpuCacheY, gpuFrame->cpuYPlane,
                                             yBytes, cudaMemcpyHostToDevice);
                        }
                        if (err == cudaSuccess) {
                            err = cudaMemcpy(gpuFrame->gpuCacheUV, gpuFrame->cpuUvPlane,
                                             uvBytes, cudaMemcpyHostToDevice);
                        }

                        if (err == cudaSuccess) {
                            // Publish the cache only once both planes are on the device.
                            gpuFrame->gpuCacheValid = true;
                            gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                            gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                            gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                            gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                            reg.onGpuCacheCreated(yBytes + uvBytes);

                            devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                            devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                            yPitch = gpuFrame->cpuYLinesize;
                            uvPitch = gpuFrame->cpuUvLinesize;
                        } else {
                            // Roll back: free whatever was allocated, leave the
                            // cache invalid and do not charge the registry.
                            // devY/devUV stay null so the BGR fallback is used.
                            std::cerr << "[RTOCRDetector] NV12 GPU cache upload failed: "
                                      << cudaGetErrorString(err) << std::endl;
                            cudaFree(gpuFrame->gpuCacheY);
                            cudaFree(gpuFrame->gpuCacheUV);
                            gpuFrame->gpuCacheY = nullptr;
                            gpuFrame->gpuCacheUV = nullptr;
                            // Clear the recorded error so later CUDA error
                            // checks do not pick up this stale failure.
                            (void)cudaGetLastError();
                        }
                    }
                }
            } // release registry lock before GPU kernel

            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch, 1 output alloc)
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;

                // Update ratios to map from full-res NV12 to detection output
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }

        if (!usedNV12) {
            // Fallback: standard BGR upload
            cv::cuda::GpuMat gpuImg;
            gpuImg.upload(image);
            cv::cuda::resize(gpuImg, gpuResized, resizeShape);
        }

        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB)

        // 3. Run inference
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;

        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }

        if (featureVectors.empty() || featureVectors[0].empty()) return {};

        // 4. Reshape output to probability map [H, W]
        std::vector<float>& output = featureVectors[0][0];
        const size_t expectedSize = static_cast<size_t>(newH) * static_cast<size_t>(newW);

        if (output.size() < expectedSize) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << static_cast<int>(output.size()) << std::endl;
            return {};
        }

        // Wraps the engine output without copying; `output` outlives `bitmap`.
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());

        // 5. Threshold to binary (matches ONNX/PaddleOCR official order)
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);

        // 6. Apply dilation if requested (on binaryMap, matching ONNX version)
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }

        // 7. Find contours and build text boxes
        // (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow exactly)
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);

        const int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;

        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;

            // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;

            auto ordered = GetMiniBoxes(minRect);

            // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;

            // Step 3: UnclipPolygon - expand the 4-point box
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;

            // Step 4: Re-compute GetMiniBoxes on the expanded polygon
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (const auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);

            // Filter by min_size + 2 = 5 (matches PaddleOCR official)
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;

            auto expandedOrdered = GetMiniBoxes(expandedRect);

            // Step 5: Scale to original image coordinates
            // NOTE(review): on the NV12 path the ratios map to the NV12 frame
            // size (fW x fH) while the clamp below still uses `image`'s size.
            // These agree only when both have the same dimensions — confirm
            // which coordinate space callers expect.
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }

        SortTextBoxes(boxes);
        return boxes;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}
|
||||
|
||||
// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
|
||||
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
|
||||
cv::Point2f vertices[4];
|
||||
rect.points(vertices);
|
||||
|
||||
// Sort all 4 points by x-coordinate ascending
|
||||
std::sort(vertices, vertices + 4,
|
||||
[](const cv::Point2f& a, const cv::Point2f& b) { return a.x < b.x; });
|
||||
|
||||
// Left two (indices 0,1): smaller y = top-left, larger y = bottom-left
|
||||
cv::Point2f topLeft, bottomLeft;
|
||||
if (vertices[0].y <= vertices[1].y) {
|
||||
topLeft = vertices[0];
|
||||
bottomLeft = vertices[1];
|
||||
} else {
|
||||
topLeft = vertices[1];
|
||||
bottomLeft = vertices[0];
|
||||
}
|
||||
|
||||
// Right two (indices 2,3): smaller y = top-right, larger y = bottom-right
|
||||
cv::Point2f topRight, bottomRight;
|
||||
if (vertices[2].y <= vertices[3].y) {
|
||||
topRight = vertices[2];
|
||||
bottomRight = vertices[3];
|
||||
} else {
|
||||
topRight = vertices[3];
|
||||
bottomRight = vertices[2];
|
||||
}
|
||||
|
||||
// Order: [TL, TR, BR, BL] (clockwise from top-left)
|
||||
return { topLeft, topRight, bottomRight, bottomLeft };
|
||||
}
|
||||
|
||||
// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
|
||||
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
|
||||
const std::array<cv::Point2f, 4>& box) {
|
||||
int h = probMap.rows;
|
||||
int w = probMap.cols;
|
||||
|
||||
// Get bounding rectangle with proper clamping (matches PaddleOCR official)
|
||||
float minX = std::min({box[0].x, box[1].x, box[2].x, box[3].x});
|
||||
float maxX = std::max({box[0].x, box[1].x, box[2].x, box[3].x});
|
||||
float minY = std::min({box[0].y, box[1].y, box[2].y, box[3].y});
|
||||
float maxY = std::max({box[0].y, box[1].y, box[2].y, box[3].y});
|
||||
|
||||
int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
|
||||
int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
|
||||
int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
|
||||
int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
|
||||
|
||||
if (xmin >= xmax || ymin >= ymax) return 0.0f;
|
||||
|
||||
cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
|
||||
|
||||
std::vector<cv::Point> pts(4);
|
||||
for (int j = 0; j < 4; j++) {
|
||||
pts[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
|
||||
static_cast<int>(box[j].y) - ymin);
|
||||
}
|
||||
std::vector<std::vector<cv::Point>> polys = { pts };
|
||||
cv::fillPoly(mask, polys, cv::Scalar(1));
|
||||
|
||||
cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
|
||||
return static_cast<float>(cv::mean(roiMap, mask)[0]);
|
||||
}
|
||||
|
||||
// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
|
||||
// Uses integer coordinates for Clipper (matching PaddleOCR/ONNX version exactly)
|
||||
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
|
||||
float unclipRatio) {
|
||||
// Compute area using Shoelace formula and perimeter
|
||||
float area = 0.0f;
|
||||
float perimeter = 0.0f;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int j = (i + 1) % 4;
|
||||
area += box[i].x * box[j].y - box[j].x * box[i].y;
|
||||
float dx = box[j].x - box[i].x;
|
||||
float dy = box[j].y - box[i].y;
|
||||
perimeter += std::sqrt(dx * dx + dy * dy);
|
||||
}
|
||||
area = std::abs(area) * 0.5f;
|
||||
if (perimeter < 1.0f) return {};
|
||||
|
||||
float distance = area * unclipRatio / perimeter;
|
||||
|
||||
ClipperLib::Path clipperPath;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
clipperPath.push_back({ static_cast<ClipperLib::cInt>(box[i].x),
|
||||
static_cast<ClipperLib::cInt>(box[i].y) });
|
||||
}
|
||||
|
||||
ClipperLib::ClipperOffset offset;
|
||||
offset.AddPath(clipperPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
|
||||
|
||||
ClipperLib::Paths solution;
|
||||
offset.Execute(solution, distance);
|
||||
|
||||
if (solution.empty() || solution[0].empty()) return {};
|
||||
|
||||
std::vector<cv::Point2f> result;
|
||||
for (auto& p : solution[0]) {
|
||||
result.push_back(cv::Point2f(static_cast<float>(p.X), static_cast<float>(p.Y)));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Releases the engine. When it came from the shared pool, the pool reference
/// identified by m_poolKey is released first. Exceptions are swallowed so the
/// destructor never throws.
RTOCRDetector::~RTOCRDetector() {
    try {
        const bool fromPool = m_usingSharedPool;
        if (fromPool) {
            // Hand the pool reference back before dropping our own handle.
            EnginePoolManager<float>::instance().release(m_poolKey);
        }
        if (fromPool || m_engine) {
            m_engine.reset();
        }
        m_usingSharedPool = false;
    }
    catch (...) {
        // Deliberately ignored: a destructor must not propagate exceptions.
    }
}
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
Reference in New Issue
Block a user