Refactor project structure

This commit is contained in:
2026-03-28 19:56:39 +11:00
parent 1d267378b2
commit 8a2e721058
511 changed files with 59 additions and 48 deletions

View File

@@ -0,0 +1,151 @@
#include "PaddleOCRV5RTEngine.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
namespace ANSCENTER {
namespace rtocr {
bool PaddleOCRV5RTEngine::Initialize(const std::string& detModelPath,
                                     const std::string& clsModelPath,
                                     const std::string& recModelPath,
                                     const std::string& dictPath,
                                     int gpuId,
                                     const std::string& engineCacheDir) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    gpuId_ = gpuId;
    if (!engineCacheDir.empty()) engineCacheDir_ = engineCacheDir;
    try {
        // Detection model is mandatory: without boxes nothing else can run.
        detector_ = std::make_unique<RTOCRDetector>();
        const bool detOk =
            detector_->Initialize(detModelPath, gpuId_, engineCacheDir_, detMaxSideLen_);
        if (!detOk) {
            std::cerr << "[PaddleOCRV5RTEngine] Failed to initialize detector" << std::endl;
            return false;
        }
        // Orientation classifier is optional: a failure here only disables it
        // (classifier_ is reset to null and the pipeline proceeds without it).
        if (!clsModelPath.empty()) {
            classifier_ = std::make_unique<RTOCRClassifier>();
            if (!classifier_->Initialize(clsModelPath, gpuId_, engineCacheDir_)) {
                std::cerr << "[PaddleOCRV5RTEngine] Warning: Failed to initialize classifier, skipping"
                          << std::endl;
                classifier_.reset();
            }
        }
        // Recognition model is mandatory; apply the configured geometry first
        // so the engine profile is built for the right input sizes.
        recognizer_ = std::make_unique<RTOCRRecognizer>();
        recognizer_->SetRecImageHeight(recImgH_);
        recognizer_->SetRecImageMaxWidth(recImgMaxW_);
        if (!recognizer_->Initialize(recModelPath, dictPath, gpuId_, engineCacheDir_)) {
            std::cerr << "[PaddleOCRV5RTEngine] Failed to initialize recognizer" << std::endl;
            return false;
        }
        std::cout << "[PaddleOCRV5RTEngine] Initialized successfully"
                  << " (detector: yes, classifier: " << (classifier_ ? "yes" : "no")
                  << ", recognizer: yes)" << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[PaddleOCRV5RTEngine] Initialize failed: " << e.what() << std::endl;
        return false;
    }
}
// Run the full OCR pipeline: detect -> crop -> [classify ->] recognize.
// Returns one OCRPredictResult per text box whose crop succeeded; the
// result vector is empty on failure or empty input. Serialized on _mutex.
std::vector<OCRPredictResult> PaddleOCRV5RTEngine::ocr(const cv::Mat& image) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    std::vector<OCRPredictResult> results;
    if (!detector_ || !recognizer_ || image.empty()) return results;
    try {
        // 1. Detection: find text boxes.
        std::vector<TextBox> textBoxes = detector_->Detect(
            image, detMaxSideLen_, detDbThresh_, detBoxThresh_,
            detUnclipRatio_, useDilation_);
        if (textBoxes.empty()) return results;
        // 2. Crop text regions in a single pass, keeping boxes and crops
        //    aligned. (FIX: the previous code re-cropped EVERY region a
        //    second time whenever any single crop failed.)
        std::vector<TextBox> validBoxes;
        std::vector<cv::Mat> croppedImages;
        validBoxes.reserve(textBoxes.size());
        croppedImages.reserve(textBoxes.size());
        for (const TextBox& tb : textBoxes) {
            cv::Mat cropped = GetRotateCropImage(image, tb);
            if (cropped.empty()) continue; // drop boxes whose crop failed
            validBoxes.push_back(tb);
            croppedImages.push_back(cropped);
        }
        textBoxes.swap(validBoxes);
        if (croppedImages.empty()) return results;
        // 3. Classification (optional): check orientation, rotate 180° if needed.
        std::vector<int> clsLabels(croppedImages.size(), 0);
        std::vector<float> clsScores(croppedImages.size(), 0.0f);
        if (classifier_) {
            auto clsResults = classifier_->Classify(croppedImages, clsThresh_);
            for (size_t i = 0; i < clsResults.size() && i < croppedImages.size(); i++) {
                clsLabels[i] = clsResults[i].first;
                clsScores[i] = clsResults[i].second;
                // Rotate 180 degrees if label is odd and confidence is high enough.
                if (clsLabels[i] % 2 == 1 && clsScores[i] > clsThresh_) {
                    cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
                }
            }
        }
        // 4. Recognition: extract text from the cropped images.
        std::vector<TextLine> textLines = recognizer_->RecognizeBatch(croppedImages);
        // 5. Combine detection, classification and recognition per box.
        results.reserve(textBoxes.size());
        for (size_t i = 0; i < textBoxes.size(); i++) {
            OCRPredictResult res;
            // Convert the quad to [[x,y], ...] integer format.
            for (int j = 0; j < 4; j++) {
                res.box.push_back({
                    static_cast<int>(textBoxes[i].points[j].x),
                    static_cast<int>(textBoxes[i].points[j].y)
                });
            }
            if (i < textLines.size()) {
                res.text = textLines[i].text;
                res.score = textLines[i].score;
            }
            res.cls_label = clsLabels[i];
            res.cls_score = clsScores[i];
            results.push_back(res);
        }
        return results;
    }
    catch (const std::exception& e) {
        std::cerr << "[PaddleOCRV5RTEngine] OCR failed: " << e.what() << std::endl;
        return results;
    }
}
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,67 @@
#pragma once
#include "RTOCRTypes.h"
#include "RTOCRDetector.h"
#include "RTOCRClassifier.h"
#include "RTOCRRecognizer.h"
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include "ANSLicense.h"
namespace ANSCENTER {
namespace rtocr {
// Full PaddleOCR v5 pipeline on TensorRT:
//   detection (DB) -> optional orientation classification -> recognition (CTC).
// Initialize() and ocr() serialize on an internal recursive mutex; the
// configuration setters below are NOT guarded — call them before concurrent use.
class PaddleOCRV5RTEngine {
public:
    PaddleOCRV5RTEngine() = default;
    ~PaddleOCRV5RTEngine() = default;
    // Non-copyable: owns engine wrappers and a mutex.
    PaddleOCRV5RTEngine(const PaddleOCRV5RTEngine&) = delete;
    PaddleOCRV5RTEngine& operator=(const PaddleOCRV5RTEngine&) = delete;
    // Initialize all components
    // clsModelPath can be empty to skip classifier
    // Returns false if the detector or recognizer fails; a classifier failure
    // only disables classification.
    bool Initialize(const std::string& detModelPath,
                    const std::string& clsModelPath,
                    const std::string& recModelPath,
                    const std::string& dictPath,
                    int gpuId = 0,
                    const std::string& engineCacheDir = "");
    // Run full OCR pipeline: detect → crop → [classify →] recognize
    // Returns one result per detected text box; empty on failure/empty input.
    std::vector<OCRPredictResult> ocr(const cv::Mat& image);
    // Configuration setters (apply before Initialize()/ocr() as appropriate)
    void SetDetMaxSideLen(int v) { detMaxSideLen_ = v; }      // detection resize cap
    void SetDetDbThresh(float v) { detDbThresh_ = v; }        // DB binarization threshold
    void SetDetBoxThresh(float v) { detBoxThresh_ = v; }      // box confidence threshold
    void SetDetUnclipRatio(float v) { detUnclipRatio_ = v; }  // box expansion ratio
    void SetClsThresh(float v) { clsThresh_ = v; }            // 180°-rotation confidence threshold
    void SetUseDilation(bool v) { useDilation_ = v; }         // dilate DB binary map
    void SetRecImageHeight(int v) { recImgH_ = v; }           // recognizer input height
    void SetRecImageMaxWidth(int v) { recImgMaxW_ = v; }      // recognizer max input width
    void SetGpuId(int v) { gpuId_ = v; }
    void SetEngineCacheDir(const std::string& v) { engineCacheDir_ = v; }
private:
    std::unique_ptr<RTOCRDetector> detector_;
    std::unique_ptr<RTOCRClassifier> classifier_; // optional
    std::unique_ptr<RTOCRRecognizer> recognizer_;
    // Configuration (defaults come from the kXxx constants in RTOCRTypes.h)
    int detMaxSideLen_ = kDetMaxSideLen;
    float detDbThresh_ = kDetDbThresh;
    float detBoxThresh_ = kDetBoxThresh;
    float detUnclipRatio_ = kDetUnclipRatio;
    float clsThresh_ = kClsThresh;
    bool useDilation_ = false;
    int recImgH_ = kRecImgH;
    int recImgMaxW_ = kRecImgMaxW;
    int gpuId_ = 0;
    std::string engineCacheDir_;
    std::recursive_mutex _mutex;
};
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,143 @@
#include "RTOCRClassifier.h"
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <cmath>
namespace ANSCENTER {
namespace rtocr {
bool RTOCRClassifier::Initialize(const std::string& onnxPath, int gpuId,
                                 const std::string& engineCacheDir) {
    try {
        // The classifier runs at a fixed input size, so min/opt/max
        // spatial dimensions are all identical in the TRT profile.
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        options.precision = ANSCENTER::Precision::FP16;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;
        options.minInputHeight = kClsImageH;
        options.optInputHeight = kClsImageH;
        options.maxInputHeight = kClsImageH;
        options.minInputWidth = kClsImageW;
        options.optInputWidth = kClsImageW;
        options.maxInputWidth = kClsImageW;
        // Engine cache: explicit override wins, otherwise cache next to the model.
        if (engineCacheDir.empty()) {
            const auto sep = onnxPath.find_last_of("/\\");
            options.engineFileDir = (sep == std::string::npos) ? "." : onnxPath.substr(0, sep);
        } else {
            options.engineFileDir = engineCacheDir;
        }
        // Acquire a (possibly shared) engine from the pool.
        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kClsSubVals, kClsDivVals, true, -1);
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRClassifier] Failed to build/load TRT engine: " << onnxPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRClassifier] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRClassifier] Initialize failed: " << e.what() << std::endl;
        m_engine.reset();
        return false;
    }
}
// Classify text-line orientation for a batch of cropped images.
// Returns one (label, score) pair per input image, in input order; an empty
// or failed image yields {0, 0.0f} so the output always aligns with the input.
// NOTE(review): `clsThresh` is not used in this function — thresholding is
// applied by the caller (PaddleOCRV5RTEngine::ocr). Kept for interface
// stability; confirm before removing.
std::vector<std::pair<int, float>> RTOCRClassifier::Classify(
    const std::vector<cv::Mat>& images, float clsThresh) {
    std::lock_guard<std::mutex> lock(_mutex);
    std::vector<std::pair<int, float>> results;
    if (!m_engine || images.empty()) return results;
    results.reserve(images.size());
    for (size_t i = 0; i < images.size(); i++) {
        try {
            if (images[i].empty()) {
                results.push_back({ 0, 0.0f });
                continue;
            }
            // Preprocess: direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
            // No aspect ratio preservation — matches PaddleOCR official ResizeImage
            cv::Mat resized;
            cv::resize(images[i], resized, cv::Size(kClsImageW, kClsImageH));
            // Upload to GPU (keep BGR order - PaddleOCR official does NOT convert BGR→RGB)
            cv::cuda::GpuMat gpuImg;
            gpuImg.upload(resized);
            // Run inference (batch of 1 — the engine was built with maxBatchSize = 1)
            std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuImg } };
            std::vector<std::vector<std::vector<float>>> featureVectors;
            if (!m_engine->runInference(inputs, featureVectors)) {
                results.push_back({ 0, 0.0f });
                continue;
            }
            if (featureVectors.empty() || featureVectors[0].empty() ||
                featureVectors[0][0].empty()) {
                results.push_back({ 0, 0.0f });
                continue;
            }
            // Find argmax and use raw output value as score
            // PaddleOCR v5 models include softmax, so output values are probabilities
            // Matches PaddleOCR official: score = preds[i, argmax_idx]
            const std::vector<float>& output = featureVectors[0][0];
            int numClasses = static_cast<int>(output.size());
            int bestIdx = 0;
            float bestScore = output[0];
            for (int c = 1; c < numClasses; c++) {
                if (output[c] > bestScore) {
                    bestScore = output[c];
                    bestIdx = c;
                }
            }
            results.push_back({ bestIdx, bestScore });
        }
        catch (const std::exception& e) {
            // A per-image failure must not abort the batch; emit a neutral result
            // so indices stay aligned with the caller's crop list.
            std::cerr << "[RTOCRClassifier] Classify failed for image " << i
                      << ": " << e.what() << std::endl;
            results.push_back({ 0, 0.0f });
        }
    }
    return results;
}
RTOCRClassifier::~RTOCRClassifier() {
    try {
        // Hand our reference back to the shared pool (if we hold one),
        // then drop the local engine pointer. reset() on a null engine
        // is a harmless no-op, so no extra branch is needed.
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        m_engine.reset();
    }
    catch (...) {
        // Never let an exception escape a destructor.
    }
}
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,36 @@
#pragma once
#include "RTOCRTypes.h"
#include "engine.h"
#include "engine/EnginePoolManager.h"
#include <memory>
#include <mutex>
namespace ANSCENTER {
namespace rtocr {
// Text-line orientation classifier on TensorRT.
// The engine is acquired from the shared EnginePoolManager, so multiple
// instances using the same model/precision share one TRT engine.
class RTOCRClassifier {
public:
    RTOCRClassifier() = default;
    ~RTOCRClassifier();  // releases the pooled engine reference
    RTOCRClassifier(const RTOCRClassifier&) = delete;
    RTOCRClassifier& operator=(const RTOCRClassifier&) = delete;
    // Build or load the TRT engine from the ONNX model.
    // engineCacheDir = "" caches next to the model file. Returns false on failure.
    bool Initialize(const std::string& onnxPath, int gpuId = 0,
                    const std::string& engineCacheDir = "");
    // Classify a batch of text images
    // Returns vector of (cls_label, cls_score) per image
    // cls_label: 0 = normal, 1 = rotated 180 degrees
    std::vector<std::pair<int, float>> Classify(
        const std::vector<cv::Mat>& images, float clsThresh = kClsThresh);
private:
    std::shared_ptr<Engine<float>> m_engine = nullptr;
    EnginePoolManager<float>::PoolKey m_poolKey;   // identifies our pooled engine
    bool m_usingSharedPool = false;                // true once acquire() succeeded
    std::mutex _mutex;                             // serializes Classify()
};
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,403 @@
#include "RTOCRDetector.h"
#include "include/clipper.h"
#include "NV12PreprocessHelper.h"
#include "ANSGpuFrameRegistry.h"
#include <cuda_runtime.h>
#include <opencv2/imgproc.hpp>
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace rtocr {
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
                               const std::string& engineCacheDir,
                               int maxSideLen) {
    // Resolve engine cache directory: explicit override wins, otherwise
    // cache next to the ONNX model.
    std::string cacheDir = engineCacheDir;
    if (cacheDir.empty()) {
        const auto sep = onnxPath.find_last_of("/\\");
        cacheDir = (sep == std::string::npos) ? "." : onnxPath.substr(0, sep);
    }
    try {
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        // FP32 is required for detection: this DBNet produces NaN under FP16.
        // The model's 142 Convolution + 87 Scale (fused BatchNorm) layers
        // produce intermediates that overflow the FP16 range (65504), and the
        // overflow originates deep in the conv->scale->relu backbone, so mixed
        // precision (forcing only Sigmoid/Softmax to FP32) is insufficient.
        // Classifier and recognizer stay FP16 — only the detector needs FP32.
        options.precision = ANSCENTER::Precision::FP32;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;
        // Dynamic spatial dimensions for detection (multiples of 32).
        options.minInputHeight = 32;
        options.minInputWidth = 32;
        options.optInputHeight = std::min(640, maxSideLen);
        options.optInputWidth = std::min(640, maxSideLen);
        options.maxInputHeight = maxSideLen;
        options.maxInputWidth = maxSideLen;
        options.engineFileDir = cacheDir;
        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kDetSubVals, kDetDivVals, true, -1);
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
                      << onnxPath << std::endl;
            return false;
        }
        // The loaded engine may carry a smaller profile than requested
        // (e.g. a cached engine built on a smaller GPU) — query the real one.
        const int profMaxH = m_engine->getProfileMaxHeight();
        const int profMaxW = m_engine->getProfileMaxWidth();
        m_engineMaxSideLen = (profMaxH > 0 && profMaxW > 0)
                                 ? std::min(profMaxH, profMaxW)
                                 : maxSideLen;
        if (m_engineMaxSideLen < maxSideLen) {
            std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
                      << "x" << m_engineMaxSideLen << " (requested " << maxSideLen
                      << " exceeded GPU capacity)" << std::endl;
        }
        std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
        return false;
    }
}
// Detect text boxes in `image` using the DB detector.
// Returns boxes in original-image coordinates, sorted by SortTextBoxes;
// empty on failure. Serialized on _mutex.
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image,
                                           int maxSideLen, float dbThresh,
                                           float boxThresh, float unclipRatio,
                                           bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || image.empty()) return {};
    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);
        // 1. Compute resize dimensions (multiples of 32).
        cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        int newH = resizeShape.height;
        int newW = resizeShape.width;
        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;
        // 2. Upload to GPU and resize — try the NV12 fast path first.
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;
        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        // pixelFormat == 23: presumably NV12 in the producer's enum — confirm.
        if (gpuFrame && gpuFrame->pixelFormat == 23 &&
            gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
            gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload + separate resize.
            int fW = gpuFrame->width;
            int fH = gpuFrame->height;
            int gpuIdx = m_engine ? m_engine->getOptions().deviceIndex : 0;
            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload).
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit: reuse the previously uploaded planes.
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 planes to the GPU.
                    size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;
                    auto& reg = ANSGpuFrameRegistry::instance();
                    if (reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // BUGFIX: check CUDA return codes. The previous code
                        // ignored cudaMalloc/cudaMemcpy failures and could mark
                        // the cache valid while holding undefined device
                        // pointers. On any failure we free partial allocations,
                        // leave the cache invalid, and fall back to BGR upload.
                        void* dY = nullptr;
                        void* dUV = nullptr;
                        const bool uploaded =
                            cudaMalloc(&dY, yBytes) == cudaSuccess &&
                            cudaMalloc(&dUV, uvBytes) == cudaSuccess &&
                            cudaMemcpy(dY, gpuFrame->cpuYPlane, yBytes, cudaMemcpyHostToDevice) == cudaSuccess &&
                            cudaMemcpy(dUV, gpuFrame->cpuUvPlane, uvBytes, cudaMemcpyHostToDevice) == cudaSuccess;
                        if (uploaded) {
                            gpuFrame->gpuCacheY = dY;
                            gpuFrame->gpuCacheUV = dUV;
                            gpuFrame->gpuCacheValid = true;
                            gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                            gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                            gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                            gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                            reg.onGpuCacheCreated(yBytes + uvBytes);
                            devY = static_cast<const uint8_t*>(dY);
                            devUV = static_cast<const uint8_t*>(dUV);
                            yPitch = gpuFrame->cpuYLinesize;
                            uvPitch = gpuFrame->cpuUvLinesize;
                        } else {
                            if (dY) cudaFree(dY);
                            if (dUV) cudaFree(dUV);
                        }
                    }
                }
            } // release registry lock before launching the GPU kernel
            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch, 1 output alloc).
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;
                // Update ratios to map from full-res NV12 to detection output.
                // NOTE(review): the final clamp below uses image.cols/rows;
                // this assumes the NV12 frame has the same dimensions as
                // `image` — confirm against the frame producer.
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }
        if (!usedNV12) {
            // Fallback: standard BGR upload + GPU resize.
            cv::cuda::GpuMat gpuImg;
            gpuImg.upload(image);
            cv::cuda::resize(gpuImg, gpuResized, resizeShape);
        }
        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB).
        // 3. Run inference.
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }
        if (featureVectors.empty() || featureVectors[0].empty()) return {};
        // 4. Reshape output to probability map [H, W].
        std::vector<float>& output = featureVectors[0][0];
        int outputSize = static_cast<int>(output.size());
        if (outputSize < newH * newW) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << outputSize << std::endl;
            return {};
        }
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());
        // 5. Threshold to binary (matches ONNX/PaddleOCR official order).
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);
        // 6. Apply dilation if requested (on binaryMap, matching ONNX version).
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }
        // 7. Find contours and build text boxes
        //    (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow).
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
        int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;
        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;
            // Step 1: GetMiniBoxes - ordered 4 corners of the min-area rect.
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;
            auto ordered = GetMiniBoxes(minRect);
            // Step 2: BoxScoreFast - mean prob inside the 4-point box polygon.
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;
            // Step 3: UnclipPolygon - expand the 4-point box.
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;
            // Step 4: Re-compute GetMiniBoxes on the expanded polygon.
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
            // Filter by min_size + 2 = 5 (matches PaddleOCR official).
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;
            auto expandedOrdered = GetMiniBoxes(expandedRect);
            // Step 5: Scale to original image coordinates.
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }
        SortTextBoxes(boxes);
        return boxes;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}
// Matches PaddleOCR official GetMiniBoxes: take the 4 corners of a rotated
// rect and return them ordered [top-left, top-right, bottom-right, bottom-left].
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    cv::Point2f pts[4];
    rect.points(pts);
    // Sort corners by x so pts[0..1] form the left pair and pts[2..3] the right pair.
    std::sort(std::begin(pts), std::end(pts),
              [](const cv::Point2f& lhs, const cv::Point2f& rhs) { return lhs.x < rhs.x; });
    // Within each pair, the smaller y is the top corner.
    const bool leftSwap = pts[0].y > pts[1].y;
    const bool rightSwap = pts[2].y > pts[3].y;
    const cv::Point2f topLeft = leftSwap ? pts[1] : pts[0];
    const cv::Point2f bottomLeft = leftSwap ? pts[0] : pts[1];
    const cv::Point2f topRight = rightSwap ? pts[3] : pts[2];
    const cv::Point2f bottomRight = rightSwap ? pts[2] : pts[3];
    // Clockwise from top-left: [TL, TR, BR, BL].
    return { topLeft, topRight, bottomRight, bottomLeft };
}
// Matches PaddleOCR official box_score_fast: mean probability inside the
// 4-point polygon, computed over the polygon's clamped bounding rectangle.
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                  const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;
    // Axis-aligned bounds of the quad.
    float minX = box[0].x, maxX = box[0].x;
    float minY = box[0].y, maxY = box[0].y;
    for (int i = 1; i < 4; i++) {
        minX = std::min(minX, box[i].x);
        maxX = std::max(maxX, box[i].x);
        minY = std::min(minY, box[i].y);
        maxY = std::max(maxY, box[i].y);
    }
    // Clamp the bounding rect to the probability map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Rasterize the quad into a mask local to the bounding rect.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<cv::Point> localQuad(4);
    for (int j = 0; j < 4; j++) {
        localQuad[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
                                 static_cast<int>(box[j].y) - ymin);
    }
    std::vector<std::vector<cv::Point>> polys = { localQuad };
    cv::fillPoly(mask, polys, cv::Scalar(1));
    // Mean of probMap restricted to the masked region.
    cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
    return static_cast<float>(cv::mean(roiMap, mask)[0]);
}
// Matches PaddleOCR official unclip: expand the 4-point box with Clipper
// (jtRound) by distance = area * unclipRatio / perimeter. Clipper works on
// integer coordinates, matching the PaddleOCR/ONNX version exactly.
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                      float unclipRatio) {
    // Shoelace area (twice) and perimeter of the quad in one pass.
    float twiceArea = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; i++) {
        const cv::Point2f& a = box[i];
        const cv::Point2f& b = box[(i + 1) % 4];
        twiceArea += a.x * b.y - b.x * a.y;
        const float dx = b.x - a.x;
        const float dy = b.y - a.y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceArea) * 0.5f;
    if (perimeter < 1.0f) return {};  // degenerate quad — nothing to expand
    const float distance = area * unclipRatio / perimeter;
    // Offset the quad outward via Clipper.
    ClipperLib::Path path;
    for (const auto& p : box) {
        path.push_back({ static_cast<ClipperLib::cInt>(p.x),
                         static_cast<ClipperLib::cInt>(p.y) });
    }
    ClipperLib::ClipperOffset offset;
    offset.AddPath(path, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths solution;
    offset.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};
    std::vector<cv::Point2f> expanded;
    expanded.reserve(solution[0].size());
    for (const auto& p : solution[0]) {
        expanded.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return expanded;
}
RTOCRDetector::~RTOCRDetector() {
    try {
        // Return our reference to the shared pool first (if held), then drop
        // the engine pointer. reset() on a null shared_ptr is a no-op, so
        // the original's separate else-branch collapses away.
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        m_engine.reset();
    }
    catch (...) {
        // Destructors must not throw.
    }
}
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,44 @@
#pragma once
#include "RTOCRTypes.h"
#include "engine.h"
#include "engine/EnginePoolManager.h"
#include <memory>
#include <mutex>
namespace ANSCENTER {
namespace rtocr {
// DB-style text detector on TensorRT. The engine is acquired from the shared
// EnginePoolManager; Detect() is serialized on an internal mutex.
class RTOCRDetector {
public:
    RTOCRDetector() = default;
    ~RTOCRDetector();  // releases the pooled engine reference
    RTOCRDetector(const RTOCRDetector&) = delete;
    RTOCRDetector& operator=(const RTOCRDetector&) = delete;
    // Build or load the TRT engine (dynamic spatial profile up to maxSideLen).
    // engineCacheDir = "" caches next to the model file. Returns false on failure.
    bool Initialize(const std::string& onnxPath, int gpuId = 0,
                    const std::string& engineCacheDir = "",
                    int maxSideLen = kDetMaxSideLen);
    // Detect text boxes; coordinates are returned in original-image space.
    // dbThresh binarizes the probability map, boxThresh filters boxes by mean
    // probability, unclipRatio controls box expansion.
    std::vector<TextBox> Detect(const cv::Mat& image,
                                int maxSideLen = kDetMaxSideLen,
                                float dbThresh = kDetDbThresh,
                                float boxThresh = kDetBoxThresh,
                                float unclipRatio = kDetUnclipRatio,
                                bool useDilation = false);
private:
    // Postprocessing helpers (matches ONNX/PaddleOCR official flow exactly)
    std::array<cv::Point2f, 4> GetMiniBoxes(const cv::RotatedRect& rect);
    float BoxScoreFast(const cv::Mat& probMap, const std::array<cv::Point2f, 4>& box);
    std::vector<cv::Point2f> UnclipPolygon(const std::array<cv::Point2f, 4>& box, float unclipRatio);
    std::shared_ptr<Engine<float>> m_engine = nullptr;
    EnginePoolManager<float>::PoolKey m_poolKey;   // identifies our pooled engine
    bool m_usingSharedPool = false;                // true once acquire() succeeded
    int m_engineMaxSideLen = kDetMaxSideLen; // Actual TRT engine max spatial dim
    std::mutex _mutex;                             // serializes Detect()
};
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,206 @@
#include "RTOCRRecognizer.h"
#include <opencv2/imgproc.hpp>
#include <opencv2/cudaimgproc.hpp>
#include <opencv2/cudawarping.hpp>
#include <opencv2/cudaarithm.hpp>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <cfloat>
namespace ANSCENTER {
namespace rtocr {
// Load the character dictionary and build/load the recognition TRT engine.
// Returns false if the dictionary is unusable or the engine cannot be
// acquired. Call SetRecImageHeight/SetRecImageMaxWidth BEFORE Initialize so
// the engine profile matches the configured geometry.
bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string& dictPath,
                                 int gpuId, const std::string& engineCacheDir) {
    try {
        // Load dictionary first — recognition output is meaningless without it.
        keys_ = LoadDict(dictPath);
        if (keys_.size() < 2) {
            std::cerr << "[RTOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRRecognizer] Loaded dictionary with " << keys_.size()
                  << " characters from: " << dictPath << std::endl;
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        options.precision = ANSCENTER::Precision::FP16;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;
        // Fixed height, dynamic width for recognition.
        options.minInputHeight = imgH_;
        options.optInputHeight = imgH_;
        options.maxInputHeight = imgH_;
        options.minInputWidth = 32;
        options.optInputWidth = imgMaxW_;
        // BUGFIX: the profile max width was hard-coded to 960, so a caller
        // raising imgMaxW_ via SetRecImageMaxWidth() above 960 would build an
        // engine whose profile cannot accept its own resized inputs. Keep 960
        // as the floor (backward compatible) but honor a larger imgMaxW_.
        options.maxInputWidth = std::max(960, imgMaxW_);
        // Engine cache: explicit override, otherwise next to the model file.
        if (!engineCacheDir.empty()) {
            options.engineFileDir = engineCacheDir;
        }
        else {
            auto pos = onnxPath.find_last_of("/\\");
            options.engineFileDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
        }
        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kRecSubVals, kRecDivVals, true, -1);
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRRecognizer] Failed to build/load TRT engine: " << onnxPath << std::endl;
            return false;
        }
        std::cout << "[RTOCRRecognizer] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Initialize failed: " << e.what() << std::endl;
        m_engine.reset();
        return false;
    }
}
// Recognize the text in one pre-cropped (and orientation-corrected) line image.
// Returns a default-constructed TextLine (empty text, zero score) on any failure.
TextLine RTOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || croppedImage.empty() || keys_.empty()) {
        return {};
    }
    try {
        // Preprocess: resize to fixed height, proportional width
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        int resizedW = resized.cols;
        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
        // Official PaddleOCR pads with 0.0 in normalized space ≈ pixel value 128 (gray)
        int imgW = std::max(resizedW, kRecImgW);
        if (imgW > resizedW) {
            cv::Mat padded(imgH_, imgW, resized.type(), cv::Scalar(128, 128, 128));
            resized.copyTo(padded(cv::Rect(0, 0, resizedW, imgH_)));
            resized = padded;
        }
        // Upload to GPU (keep BGR order - PaddleOCR official does NOT convert BGR→RGB)
        cv::cuda::GpuMat gpuImg;
        gpuImg.upload(resized);
        // Run inference (batch of 1 — the engine was built with maxBatchSize = 1)
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuImg } };
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRRecognizer] Inference failed" << std::endl;
            return {};
        }
        if (featureVectors.empty() || featureVectors[0].empty() ||
            featureVectors[0][0].empty()) {
            return {};
        }
        // Output shape: [1, seqLen, numClasses] flattened to [seqLen * numClasses]
        // IMPORTANT: The TRT engine output buffer is pre-allocated to MAX dimensions
        // (e.g. 120 timesteps for max width 960), but the actual inference produces
        // fewer timesteps for narrower images. We must use the ACTUAL seqLen
        // derived from the input width, not getOutputDims() which returns max dims.
        const std::vector<float>& output = featureVectors[0][0];
        // numClasses from dictionary size (keys_ includes blank at index 0)
        int numClasses = static_cast<int>(keys_.size());
        // Actual seqLen from input width: recognition model stride = 8
        // (confirmed: 960px input → 120 timesteps, 960/120 = 8)
        int seqLen = imgW / 8;
        // Sanity check: seqLen * numClasses must not exceed buffer size
        if (seqLen * numClasses > static_cast<int>(output.size())) {
            // Fallback: infer from buffer size
            // (assumes the buffer holds an integral number of timesteps)
            seqLen = static_cast<int>(output.size()) / numClasses;
        }
        return CTCDecode(output.data(), seqLen, numClasses);
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Recognize failed: " << e.what() << std::endl;
        return {};
    }
}
// Recognize a batch of cropped line images, one result per input, in order.
// Each crop has a different width, so every image runs through the
// single-image path rather than a true batched inference.
std::vector<TextLine> RTOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> lines;
    lines.reserve(croppedImages.size());
    for (const cv::Mat& crop : croppedImages) {
        lines.push_back(Recognize(crop));
    }
    return lines;
}
// Greedy CTC decode: per timestep take the argmax class, then drop blanks
// (index 0) and collapse consecutive repeats. The score is the mean of the
// raw argmax values over emitted characters (PaddleOCR v5 models include
// softmax, so these are probabilities).
TextLine RTOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine line;
    float scoreSum = 0.0f;
    int emitted = 0;
    int prevIdx = 0;  // index 0 is the CTC blank
    for (int t = 0; t < seqLen; ++t) {
        const float* probs = outputData + t * numClasses;
        // Argmax for this timestep (first max wins on ties, as before).
        const int best = static_cast<int>(std::max_element(probs, probs + numClasses) - probs);
        // Emit only non-blank, non-repeated, in-dictionary indices.
        // keys_[0] is the blank placeholder; keys_[1] is the first real char.
        if (best != 0 && best != prevIdx && best < static_cast<int>(keys_.size())) {
            line.text += keys_[best];
            scoreSum += probs[best];
            ++emitted;
        }
        prevIdx = best;
    }
    if (emitted > 0) {
        line.score = scoreSum / static_cast<float>(emitted);
    }
    return line;
}
RTOCRRecognizer::~RTOCRRecognizer() {
    try {
        // Release our pooled reference first (if held), then drop the engine.
        // reset() on a null shared_ptr is a no-op, so one path suffices.
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        m_engine.reset();
    }
    catch (...) {
        // Destructors must not throw.
    }
}
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,41 @@
#pragma once
#include "RTOCRTypes.h"
#include "engine.h"
#include "engine/EnginePoolManager.h"
#include <memory>
#include <mutex>
namespace ANSCENTER {
namespace rtocr {
// TensorRT-backed text-line recognizer for PaddleOCR v5.
// Wraps a (possibly pool-shared) inference engine plus the CTC character
// dictionary, and greedily decodes the model output into text lines.
class RTOCRRecognizer {
public:
RTOCRRecognizer() = default;
~RTOCRRecognizer();
// Non-copyable: holds an engine handle that may be owned by a shared pool.
RTOCRRecognizer(const RTOCRRecognizer&) = delete;
RTOCRRecognizer& operator=(const RTOCRRecognizer&) = delete;
// Loads/builds the inference engine from the ONNX model and loads the CTC
// character dictionary from dictPath. Returns false on failure.
bool Initialize(const std::string& onnxPath, const std::string& dictPath,
int gpuId = 0, const std::string& engineCacheDir = "");
// Recognizes one cropped text-line image.
TextLine Recognize(const cv::Mat& croppedImage);
// Recognizes each crop in order; returns one TextLine per input image.
std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);
// Preprocessing knobs: target input height and maximum input width.
void SetRecImageHeight(int h) { imgH_ = h; }
void SetRecImageMaxWidth(int w) { imgMaxW_ = w; }
private:
// Greedy CTC decode over a [seqLen x numClasses] model output buffer.
TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);
std::shared_ptr<Engine<float>> m_engine = nullptr; // inference engine (may be pool-owned)
EnginePoolManager<float>::PoolKey m_poolKey; // key used to release the pooled engine
bool m_usingSharedPool = false; // true when m_engine came from the shared pool
std::vector<std::string> keys_; // dictionary: [0]="#" (CTC blank), last=" "
int imgH_ = kRecImgH; // recognition input height
int imgMaxW_ = kRecImgMaxW; // maximum recognition input width
std::mutex _mutex; // NOTE(review): presumably guards engine access — confirm usage in the .cpp
};
} // namespace rtocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,196 @@
#pragma once
#include <string>
#include <vector>
#include <array>
#include <fstream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
namespace ANSCENTER {
namespace rtocr {
// ============================================================================
// Engine normalization constants (BGR channel order, matching PaddleOCR official)
// ============================================================================
// PaddleOCR processes images in BGR order (no BGR→RGB conversion).
// Engine applies: (pixel/255.0 - subVals[c]) / divVals[c] per channel.
// When feeding BGR input (no cvtColor), subVals/divVals indices map to:
// [0]=B channel, [1]=G channel, [2]=R channel
//
// PaddleOCR config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
// These are applied to BGR channels: B=0.485/0.229, G=0.456/0.224, R=0.406/0.225
// Detection normalization (BGR order)
constexpr std::array<float, 3> kDetSubVals = { 0.485f, 0.456f, 0.406f };
constexpr std::array<float, 3> kDetDivVals = { 0.229f, 0.224f, 0.225f };
// Classifier normalization: PP-LCNet_x1_0_textline_ori uses ImageNet normalization (BGR order)
// Config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], scale=1/255
constexpr std::array<float, 3> kClsSubVals = { 0.485f, 0.456f, 0.406f };
constexpr std::array<float, 3> kClsDivVals = { 0.229f, 0.224f, 0.225f };
// Recognition normalization: (pixel/255 - 0.5) / 0.5, mapping pixels to [-1, 1].
// (Unlike the detector/classifier above, this is NOT ImageNet normalization.)
constexpr std::array<float, 3> kRecSubVals = { 0.5f, 0.5f, 0.5f };
constexpr std::array<float, 3> kRecDivVals = { 0.5f, 0.5f, 0.5f };
// ============================================================================
// Detection defaults (PP-OCRv5 server)
// kDetMaxSideLen is the default max side length for detection preprocessing.
// TRT engine auto-fallbacks to smaller max if GPU memory is insufficient during build.
// ============================================================================
constexpr int kDetMaxSideLen = 2560;
constexpr int kDetMaxSideLimit = 4000; // Safety cap on max dimension
constexpr float kDetDbThresh = 0.3f;
constexpr float kDetBoxThresh = 0.6f;
constexpr float kDetUnclipRatio = 1.5f;
constexpr int kDetMaxCandidates = 1000;
// ============================================================================
// Classifier defaults (PP-LCNet_x1_0_textline_ori model)
// Input: [B, 3, 80, 160], ImageNet normalization, 2-class (0°/180°)
// Direct resize to 80x160 (no aspect ratio preservation)
// ============================================================================
constexpr int kClsImageH = 80;
constexpr int kClsImageW = 160;
constexpr float kClsThresh = 0.9f;
// ============================================================================
// Recognition defaults
// ============================================================================
constexpr int kRecImgH = 48;
constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shape[2]=320, min padded width)
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
constexpr int kRecBatchSize = 6;
// ============================================================================
// Data structures
// ============================================================================
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
struct TextBox {
std::array<cv::Point2f, 4> points; // quad corners in TL, TR, BR, BL order
float score = 0.0f; // detection score for this box
};
// A single recognized text line
struct TextLine {
std::string text; // decoded characters (empty when nothing recognized)
float score = 0.0f; // mean per-character confidence; 0 if no characters decoded
};
// OCR result matching PaddleOCR::OCRPredictResult format
struct OCRPredictResult {
std::vector<std::vector<int>> box; // 4 corner points [[x,y], ...]
std::string text;
float score = -1.0f; // recognition confidence; -1 means "not recognized"
float cls_score = 0.0f; // orientation-classifier confidence (0°/180° model)
int cls_label = -1; // orientation class; -1 presumably means "classifier not run" — confirm in engine code
};
// ============================================================================
// Utility functions
// ============================================================================
// Load the CTC character dictionary from a text file (one character per line).
// Returns an empty vector when the file cannot be opened. Otherwise the result
// is: index 0 = "#" (CTC blank), then one entry per dictionary line (a single
// trailing '\r' is stripped to tolerate CRLF files), and a final " " entry.
inline std::vector<std::string> LoadDict(const std::string& dictPath) {
    std::vector<std::string> keys;
    std::ifstream file(dictPath);
    if (!file.is_open()) {
        return keys;
    }
    // Reserve index 0 for the CTC blank token.
    keys.emplace_back("#");
    for (std::string line; std::getline(file, line);) {
        if (!line.empty() && line.back() == '\r') {
            line.pop_back(); // tolerate CRLF dictionaries
        }
        keys.push_back(line);
    }
    // PaddleOCR appends the space character as the last class.
    keys.emplace_back(" ");
    return keys;
}
// Compute resize dimensions for the detection model (multiples of 32).
// limit_type='max': scale down only if the longest side exceeds maxSideLen
// (PP-OCRv5 server default). maxSideLimit is a hard safety cap on the final
// max dimension (default 4000).
inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
    int maxSideLimit = kDetMaxSideLimit) {
    // Shrink (never grow) so the longest side fits within maxSideLen.
    const int longest = std::max(srcH, srcW);
    float scale = 1.0f;
    if (longest > maxSideLen) {
        scale = static_cast<float>(maxSideLen) / static_cast<float>(longest);
    }
    int outH = static_cast<int>(srcH * scale);
    int outW = static_cast<int>(srcW * scale);
    // Hard safety cap: rescale again if either dimension still exceeds the limit.
    const int current = std::max(outH, outW);
    if (current > maxSideLimit) {
        const float cap = static_cast<float>(maxSideLimit) / static_cast<float>(current);
        outH = static_cast<int>(outH * cap);
        outW = static_cast<int>(outW * cap);
    }
    // Snap both sides to the nearest multiple of 32, never below 32.
    const auto snap32 = [](int v) {
        return std::max(32, static_cast<int>(std::round(v / 32.0) * 32));
    };
    return cv::Size(snap32(outW), snap32(outH));
}
// Sort text boxes into reading order: top to bottom, then left to right.
inline void SortTextBoxes(std::vector<TextBox>& boxes) {
    const auto byReadingOrder = [](const TextBox& lhs, const TextBox& rhs) {
        const cv::Point2f& a = lhs.points[0];
        const cv::Point2f& b = rhs.points[0];
        // Top-left corners within 10px vertically count as the same line:
        // order those left-to-right; otherwise order by vertical position.
        if (std::abs(a.y - b.y) < 10.0f) {
            return a.x < b.x;
        }
        return a.y < b.y;
    };
    std::sort(boxes.begin(), boxes.end(), byReadingOrder);
}
// Get rotated and cropped image from text box polygon.
// Warps the quad (TL, TR, BR, BL) onto an axis-aligned rectangle sized by the
// longer of each pair of opposite edges, then rotates the crop 90° when it is
// much taller than wide (vertical text).
inline cv::Mat GetRotateCropImage(const cv::Mat& srcImage, const TextBox& box) {
    const auto& pts = box.points;
    float width = static_cast<float>(std::max(
        cv::norm(pts[0] - pts[1]),
        cv::norm(pts[2] - pts[3])));
    float height = static_cast<float>(std::max(
        cv::norm(pts[0] - pts[3]),
        cv::norm(pts[1] - pts[2])));
    std::vector<cv::Point2f> srcPts = { pts[0], pts[1], pts[2], pts[3] };
    std::vector<cv::Point2f> dstPts = {
        {0, 0}, {width, 0}, {width, height}, {0, height}
    };
    cv::Mat M = cv::getPerspectiveTransform(srcPts, dstPts);
    cv::Mat cropped;
    // BUGFIX: warpPerspective's 5th parameter is the interpolation FLAGS, not
    // the border mode. Passing cv::BORDER_REPLICATE there was interpreted as an
    // interpolation flag (BORDER_REPLICATE == 1 == INTER_LINEAR), so the border
    // mode silently stayed BORDER_CONSTANT. Pass both arguments explicitly so
    // edge pixels are actually replicated; effective interpolation is unchanged.
    cv::warpPerspective(srcImage, cropped, M,
        cv::Size(static_cast<int>(width), static_cast<int>(height)),
        cv::INTER_LINEAR, cv::BORDER_REPLICATE);
    // Tall, narrow crops are treated as vertical text: rotate 90 degrees.
    if (cropped.rows > cropped.cols * 1.5f) {
        cv::Mat rotated;
        cv::transpose(cropped, rotated);
        cv::flip(rotated, rotated, 0);
        return rotated;
    }
    return cropped;
}
// Resize a recognition crop to a fixed height, scaling the width
// proportionally and clamping it to [1, maxW].
inline cv::Mat ResizeRecImage(const cv::Mat& img, int targetH, int maxW) {
    const float scale = static_cast<float>(targetH) / img.rows;
    int outW = static_cast<int>(img.cols * scale);
    if (outW > maxW) {
        outW = maxW; // cap very wide lines at the engine's max input width
    }
    if (outW < 1) {
        outW = 1; // guard against degenerate (zero-width) crops
    }
    cv::Mat resized;
    cv::resize(img, resized, cv::Size(outW, targetH));
    return resized;
}
} // namespace rtocr
} // namespace ANSCENTER