Initial setup for CLion
This commit is contained in:
151
ANSOCR/ANSRTOCR/PaddleOCRV5RTEngine.cpp
Normal file
151
ANSOCR/ANSRTOCR/PaddleOCRV5RTEngine.cpp
Normal file
@@ -0,0 +1,151 @@
|
||||
#include "PaddleOCRV5RTEngine.h"
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <iostream>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
bool PaddleOCRV5RTEngine::Initialize(const std::string& detModelPath,
|
||||
const std::string& clsModelPath,
|
||||
const std::string& recModelPath,
|
||||
const std::string& dictPath,
|
||||
int gpuId,
|
||||
const std::string& engineCacheDir) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
|
||||
gpuId_ = gpuId;
|
||||
if (!engineCacheDir.empty()) {
|
||||
engineCacheDir_ = engineCacheDir;
|
||||
}
|
||||
|
||||
try {
|
||||
// 1. Initialize detector
|
||||
detector_ = std::make_unique<RTOCRDetector>();
|
||||
if (!detector_->Initialize(detModelPath, gpuId_, engineCacheDir_, detMaxSideLen_)) {
|
||||
std::cerr << "[PaddleOCRV5RTEngine] Failed to initialize detector" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
// 2. Initialize classifier (optional - only if path provided)
|
||||
if (!clsModelPath.empty()) {
|
||||
classifier_ = std::make_unique<RTOCRClassifier>();
|
||||
if (!classifier_->Initialize(clsModelPath, gpuId_, engineCacheDir_)) {
|
||||
std::cerr << "[PaddleOCRV5RTEngine] Warning: Failed to initialize classifier, skipping"
|
||||
<< std::endl;
|
||||
classifier_.reset();
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Initialize recognizer
|
||||
recognizer_ = std::make_unique<RTOCRRecognizer>();
|
||||
recognizer_->SetRecImageHeight(recImgH_);
|
||||
recognizer_->SetRecImageMaxWidth(recImgMaxW_);
|
||||
if (!recognizer_->Initialize(recModelPath, dictPath, gpuId_, engineCacheDir_)) {
|
||||
std::cerr << "[PaddleOCRV5RTEngine] Failed to initialize recognizer" << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "[PaddleOCRV5RTEngine] Initialized successfully"
|
||||
<< " (detector: yes, classifier: " << (classifier_ ? "yes" : "no")
|
||||
<< ", recognizer: yes)" << std::endl;
|
||||
return true;
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "[PaddleOCRV5RTEngine] Initialize failed: " << e.what() << std::endl;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the full OCR pipeline on a single BGR image:
//   detect -> perspective-crop -> [classify/rotate] -> recognize -> merge.
// Returns one OCRPredictResult per detected (and successfully cropped) box;
// empty vector on failure or when no text is found. Thread-safe via _mutex.
//
// Fix vs. original: boxes and crops are kept aligned in a single pass.
// The original cropped every box once, and when ANY crop failed it re-ran
// GetRotateCropImage on ALL boxes a second time to rebuild aligned arrays —
// doubling the perspective-warp work in exactly the failure case.
std::vector<OCRPredictResult> PaddleOCRV5RTEngine::ocr(const cv::Mat& image) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    std::vector<OCRPredictResult> results;

    if (!detector_ || !recognizer_ || image.empty()) return results;

    try {
        // 1. Detection: find text boxes
        std::vector<TextBox> textBoxes = detector_->Detect(
            image, detMaxSideLen_, detDbThresh_, detBoxThresh_,
            detUnclipRatio_, useDilation_);

        if (textBoxes.empty()) return results;

        // 2. Crop text regions, dropping boxes whose crop fails so that
        //    index i refers to the same region in both arrays.
        std::vector<TextBox> validBoxes;
        std::vector<cv::Mat> croppedImages;
        validBoxes.reserve(textBoxes.size());
        croppedImages.reserve(textBoxes.size());
        for (size_t i = 0; i < textBoxes.size(); i++) {
            cv::Mat cropped = GetRotateCropImage(image, textBoxes[i]);
            if (cropped.empty()) continue;
            validBoxes.push_back(textBoxes[i]);
            croppedImages.push_back(std::move(cropped));
        }
        textBoxes = std::move(validBoxes);

        // 3. Classification (optional): check orientation and rotate if needed
        std::vector<int> clsLabels(croppedImages.size(), 0);
        std::vector<float> clsScores(croppedImages.size(), 0.0f);

        if (classifier_) {
            auto clsResults = classifier_->Classify(croppedImages, clsThresh_);
            for (size_t i = 0; i < clsResults.size() && i < croppedImages.size(); i++) {
                clsLabels[i] = clsResults[i].first;
                clsScores[i] = clsResults[i].second;

                // Rotate 180 degrees if label is odd and confidence is high enough
                if (clsLabels[i] % 2 == 1 && clsScores[i] > clsThresh_) {
                    cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
                }
            }
        }

        // 4. Recognition: extract text from cropped images
        std::vector<TextLine> textLines = recognizer_->RecognizeBatch(croppedImages);

        // 5. Combine results (box + optional cls + recognized text)
        results.reserve(textBoxes.size());
        for (size_t i = 0; i < textBoxes.size(); i++) {
            OCRPredictResult res;

            // Convert box to [[x,y], ...] format
            for (int j = 0; j < 4; j++) {
                res.box.push_back({
                    static_cast<int>(textBoxes[i].points[j].x),
                    static_cast<int>(textBoxes[i].points[j].y)
                });
            }

            // Recognizer may return fewer lines than boxes; missing entries
            // keep the default-constructed text/score.
            if (i < textLines.size()) {
                res.text = textLines[i].text;
                res.score = textLines[i].score;
            }

            res.cls_label = clsLabels[i];
            res.cls_score = clsScores[i];

            results.push_back(res);
        }

        return results;
    }
    catch (const std::exception& e) {
        std::cerr << "[PaddleOCRV5RTEngine] OCR failed: " << e.what() << std::endl;
        return results;
    }
}
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
67
ANSOCR/ANSRTOCR/PaddleOCRV5RTEngine.h
Normal file
67
ANSOCR/ANSRTOCR/PaddleOCRV5RTEngine.h
Normal file
@@ -0,0 +1,67 @@
|
||||
#pragma once
|
||||
|
||||
#include "RTOCRTypes.h"
|
||||
#include "RTOCRDetector.h"
|
||||
#include "RTOCRClassifier.h"
|
||||
#include "RTOCRRecognizer.h"
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include "ANSLicense.h"
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
// Full PaddleOCR v5 pipeline on TensorRT:
// detection -> crop -> optional orientation classification -> recognition.
// Public entry points serialize on a recursive mutex.
class PaddleOCRV5RTEngine {
public:
    PaddleOCRV5RTEngine() = default;
    ~PaddleOCRV5RTEngine() = default;
    // Non-copyable: owns engine components and a mutex.
    PaddleOCRV5RTEngine(const PaddleOCRV5RTEngine&) = delete;
    PaddleOCRV5RTEngine& operator=(const PaddleOCRV5RTEngine&) = delete;

    // Initialize all components
    // clsModelPath can be empty to skip classifier
    bool Initialize(const std::string& detModelPath,
                    const std::string& clsModelPath,
                    const std::string& recModelPath,
                    const std::string& dictPath,
                    int gpuId = 0,
                    const std::string& engineCacheDir = "");

    // Run full OCR pipeline: detect → crop → [classify →] recognize
    std::vector<OCRPredictResult> ocr(const cv::Mat& image);

    // Configuration setters. Most take effect on the next ocr() call;
    // gpuId/engineCacheDir/rec image geometry are consumed by Initialize().
    void SetDetMaxSideLen(int v) { detMaxSideLen_ = v; }
    void SetDetDbThresh(float v) { detDbThresh_ = v; }
    void SetDetBoxThresh(float v) { detBoxThresh_ = v; }
    void SetDetUnclipRatio(float v) { detUnclipRatio_ = v; }
    void SetClsThresh(float v) { clsThresh_ = v; }
    void SetUseDilation(bool v) { useDilation_ = v; }
    void SetRecImageHeight(int v) { recImgH_ = v; }
    void SetRecImageMaxWidth(int v) { recImgMaxW_ = v; }
    void SetGpuId(int v) { gpuId_ = v; }
    void SetEngineCacheDir(const std::string& v) { engineCacheDir_ = v; }

private:
    // Pipeline stages; classifier_ stays null when no cls model was given.
    std::unique_ptr<RTOCRDetector> detector_;
    std::unique_ptr<RTOCRClassifier> classifier_; // optional
    std::unique_ptr<RTOCRRecognizer> recognizer_;

    // Configuration (defaults are the k* constants from RTOCRTypes.h).
    int detMaxSideLen_ = kDetMaxSideLen;
    float detDbThresh_ = kDetDbThresh;
    float detBoxThresh_ = kDetBoxThresh;
    float detUnclipRatio_ = kDetUnclipRatio;
    float clsThresh_ = kClsThresh;
    bool useDilation_ = false;
    int recImgH_ = kRecImgH;
    int recImgMaxW_ = kRecImgMaxW;
    int gpuId_ = 0;
    std::string engineCacheDir_;

    // Guards Initialize() and ocr(). Recursive — presumably to tolerate
    // re-entrant calls from the same thread; confirm before relying on it.
    std::recursive_mutex _mutex;
};
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
143
ANSOCR/ANSRTOCR/RTOCRClassifier.cpp
Normal file
143
ANSOCR/ANSRTOCR/RTOCRClassifier.cpp
Normal file
@@ -0,0 +1,143 @@
|
||||
#include "RTOCRClassifier.h"
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/cudaimgproc.hpp>
|
||||
#include <opencv2/cudawarping.hpp>
|
||||
#include <opencv2/cudaarithm.hpp>
|
||||
#include <iostream>
|
||||
#include <cmath>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
// Build or load (via the shared engine pool) the TensorRT engine for the
// text-line orientation classifier. Returns false on any failure.
bool RTOCRClassifier::Initialize(const std::string& onnxPath, int gpuId,
                                 const std::string& engineCacheDir) {
    try {
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        // FP16 is sufficient here (unlike the detector, which needs FP32).
        options.precision = ANSCENTER::Precision::FP16;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;

        // Fixed input size for classifier
        options.minInputHeight = kClsImageH;
        options.optInputHeight = kClsImageH;
        options.maxInputHeight = kClsImageH;
        options.minInputWidth = kClsImageW;
        options.optInputWidth = kClsImageW;
        options.maxInputWidth = kClsImageW;

        // Engine cache directory: explicit dir if given, else next to the ONNX file.
        if (!engineCacheDir.empty()) {
            options.engineFileDir = engineCacheDir;
        }
        else {
            auto pos = onnxPath.find_last_of("/\\");
            options.engineFileDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
        }

        // Acquire from the shared pool so multiple consumers of the same
        // model/precision/batch reuse one engine instance.
        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kClsSubVals, kClsDivVals, true, -1);
        // Remember we hold a pool reference so the destructor releases it.
        m_usingSharedPool = (m_engine != nullptr);

        if (!m_engine) {
            std::cerr << "[RTOCRClassifier] Failed to build/load TRT engine: " << onnxPath << std::endl;
            return false;
        }

        std::cout << "[RTOCRClassifier] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRClassifier] Initialize failed: " << e.what() << std::endl;
        m_engine.reset();
        return false;
    }
}
|
||||
|
||||
std::vector<std::pair<int, float>> RTOCRClassifier::Classify(
|
||||
const std::vector<cv::Mat>& images, float clsThresh) {
|
||||
|
||||
std::lock_guard<std::mutex> lock(_mutex);
|
||||
std::vector<std::pair<int, float>> results;
|
||||
|
||||
if (!m_engine || images.empty()) return results;
|
||||
results.reserve(images.size());
|
||||
|
||||
for (size_t i = 0; i < images.size(); i++) {
|
||||
try {
|
||||
if (images[i].empty()) {
|
||||
results.push_back({ 0, 0.0f });
|
||||
continue;
|
||||
}
|
||||
|
||||
// Preprocess: direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
|
||||
// No aspect ratio preservation — matches PaddleOCR official ResizeImage
|
||||
cv::Mat resized;
|
||||
cv::resize(images[i], resized, cv::Size(kClsImageW, kClsImageH));
|
||||
|
||||
// Upload to GPU (keep BGR order - PaddleOCR official does NOT convert BGR→RGB)
|
||||
cv::cuda::GpuMat gpuImg;
|
||||
gpuImg.upload(resized);
|
||||
|
||||
// Run inference
|
||||
std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuImg } };
|
||||
std::vector<std::vector<std::vector<float>>> featureVectors;
|
||||
|
||||
if (!m_engine->runInference(inputs, featureVectors)) {
|
||||
results.push_back({ 0, 0.0f });
|
||||
continue;
|
||||
}
|
||||
|
||||
if (featureVectors.empty() || featureVectors[0].empty() ||
|
||||
featureVectors[0][0].empty()) {
|
||||
results.push_back({ 0, 0.0f });
|
||||
continue;
|
||||
}
|
||||
|
||||
// Find argmax and use raw output value as score
|
||||
// PaddleOCR v5 models include softmax, so output values are probabilities
|
||||
// Matches PaddleOCR official: score = preds[i, argmax_idx]
|
||||
const std::vector<float>& output = featureVectors[0][0];
|
||||
int numClasses = static_cast<int>(output.size());
|
||||
|
||||
int bestIdx = 0;
|
||||
float bestScore = output[0];
|
||||
for (int c = 1; c < numClasses; c++) {
|
||||
if (output[c] > bestScore) {
|
||||
bestScore = output[c];
|
||||
bestIdx = c;
|
||||
}
|
||||
}
|
||||
|
||||
results.push_back({ bestIdx, bestScore });
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "[RTOCRClassifier] Classify failed for image " << i
|
||||
<< ": " << e.what() << std::endl;
|
||||
results.push_back({ 0, 0.0f });
|
||||
}
|
||||
}
|
||||
|
||||
return results;
|
||||
}
|
||||
|
||||
// Release the pooled engine reference (or the privately owned engine).
// Swallows all exceptions: destructors must not throw.
RTOCRClassifier::~RTOCRClassifier() {
    try {
        if (m_usingSharedPool) {
            // We hold a pool reference — hand it back before dropping ours.
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        m_engine.reset();
    }
    catch (...) {}
}
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
36
ANSOCR/ANSRTOCR/RTOCRClassifier.h
Normal file
36
ANSOCR/ANSRTOCR/RTOCRClassifier.h
Normal file
@@ -0,0 +1,36 @@
|
||||
#pragma once
|
||||
|
||||
#include "RTOCRTypes.h"
|
||||
#include "engine.h"
|
||||
#include "engine/EnginePoolManager.h"
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
// TensorRT wrapper for the PaddleOCR text-line orientation classifier.
// Engines are shared across instances through EnginePoolManager.
class RTOCRClassifier {
public:
    RTOCRClassifier() = default;
    ~RTOCRClassifier();  // releases the pooled engine reference
    // Non-copyable: owns a pool reference and a mutex.
    RTOCRClassifier(const RTOCRClassifier&) = delete;
    RTOCRClassifier& operator=(const RTOCRClassifier&) = delete;

    // Build/load the TRT engine from an ONNX model; false on failure.
    bool Initialize(const std::string& onnxPath, int gpuId = 0,
                    const std::string& engineCacheDir = "");

    // Classify a batch of text images
    // Returns vector of (cls_label, cls_score) per image
    // cls_label: 0 = normal, 1 = rotated 180 degrees
    std::vector<std::pair<int, float>> Classify(
        const std::vector<cv::Mat>& images, float clsThresh = kClsThresh);

private:
    std::shared_ptr<Engine<float>> m_engine = nullptr;  // TRT engine (pooled)
    EnginePoolManager<float>::PoolKey m_poolKey;        // key used to release it
    bool m_usingSharedPool = false;  // true when m_engine came from the pool
    std::mutex _mutex;               // serializes Classify() calls
};
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
403
ANSOCR/ANSRTOCR/RTOCRDetector.cpp
Normal file
403
ANSOCR/ANSRTOCR/RTOCRDetector.cpp
Normal file
@@ -0,0 +1,403 @@
|
||||
#include "RTOCRDetector.h"
|
||||
#include "include/clipper.h"
|
||||
#include "NV12PreprocessHelper.h"
|
||||
#include "ANSGpuFrameRegistry.h"
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
|
||||
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
|
||||
#include <opencv2/cudaimgproc.hpp>
|
||||
#include <opencv2/cudawarping.hpp>
|
||||
#include <opencv2/cudaarithm.hpp>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
// Build or load (via the shared engine pool) the TensorRT engine for the
// DBNet text detector, with dynamic spatial dimensions up to maxSideLen.
// Returns false on any failure.
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
                               const std::string& engineCacheDir,
                               int maxSideLen) {
    // Engine cache directory: explicit dir if given, else next to the ONNX file.
    std::string cacheDir;
    if (!engineCacheDir.empty()) {
        cacheDir = engineCacheDir;
    } else {
        auto pos = onnxPath.find_last_of("/\\");
        cacheDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
    }

    try {
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        // FP32 required for detection: this CNN (DBNet) produces NaN in FP16.
        // The model has 142 Convolution + 87 Scale (fused BatchNorm) layers whose
        // intermediate values overflow FP16 range (65504). Mixed precision
        // (forcing only Sigmoid/Softmax to FP32) is insufficient because the NaN
        // originates deep in the conv->scale->relu backbone before reaching those layers.
        // Classifier and recognizer remain FP16 with mixed precision -- only the
        // detector needs full FP32.
        options.precision = ANSCENTER::Precision::FP32;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;

        // Dynamic spatial dimensions for detection (multiples of 32)
        options.minInputHeight = 32;
        options.minInputWidth = 32;
        options.optInputHeight = std::min(640, maxSideLen);
        options.optInputWidth = std::min(640, maxSideLen);
        options.maxInputHeight = maxSideLen;
        options.maxInputWidth = maxSideLen;
        options.engineFileDir = cacheDir;

        // Acquire from the shared pool so multiple consumers of the same
        // model/precision/batch reuse one engine instance.
        m_poolKey = { onnxPath,
                      static_cast<int>(options.precision),
                      options.maxBatchSize };
        m_engine = EnginePoolManager<float>::instance().acquire(
            m_poolKey, options, onnxPath,
            kDetSubVals, kDetDivVals, true, -1);
        // Remember we hold a pool reference so the destructor releases it.
        m_usingSharedPool = (m_engine != nullptr);

        if (!m_engine) {
            std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
                      << onnxPath << std::endl;
            return false;
        }

        // Query actual profile max from the loaded engine: a cached/pooled
        // engine may have been built with a smaller profile than requested.
        int profMaxH = m_engine->getProfileMaxHeight();
        int profMaxW = m_engine->getProfileMaxWidth();
        if (profMaxH > 0 && profMaxW > 0) {
            m_engineMaxSideLen = std::min(profMaxH, profMaxW);
        } else {
            m_engineMaxSideLen = maxSideLen;
        }

        if (m_engineMaxSideLen < maxSideLen) {
            std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
                      << "x" << m_engineMaxSideLen << " (requested " << maxSideLen
                      << " exceeded GPU capacity)" << std::endl;
        }
        std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
        return false;
    }
}
|
||||
|
||||
// Run DBNet text detection on a BGR image.
// Returns quadrilateral text boxes scaled back to input coordinates, sorted
// by SortTextBoxes; empty vector on any failure.
//
// Fix vs. original: the NV12 cache-miss path called cudaMalloc/cudaMemcpy
// without checking return codes and set gpuCacheValid = true regardless.
// A failed allocation left null (or partially copied) cached pointers that
// every subsequent cache-hit frame would feed to the GPU kernel. All CUDA
// calls are now checked; on failure partial buffers are freed and we fall
// back to the plain BGR upload path.
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image,
                                           int maxSideLen, float dbThresh,
                                           float boxThresh, float unclipRatio,
                                           bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);

    if (!m_engine || image.empty()) return {};

    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);

        // 1. Compute resize dimensions (multiples of 32)
        cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        int newH = resizeShape.height;
        int newW = resizeShape.width;

        // Ratios map detection-resolution coordinates back to the input image.
        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;

        // 2. Upload to GPU and resize — try NV12 fast path first
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;

        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        // pixelFormat 23: NV12 (assumed from the fast-path naming — confirm
        // against the frame producer's pixel-format enum).
        if (gpuFrame && gpuFrame->pixelFormat == 23 &&
            gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
            gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload + separate resize.
            int fW = gpuFrame->width;
            int fH = gpuFrame->height;
            int gpuIdx = m_engine ? m_engine->getOptions().deviceIndex : 0;

            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload)
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit: reuse previously uploaded planes.
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 planes to GPU.
                    size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;

                    auto& reg = ANSGpuFrameRegistry::instance();
                    if (reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // Every CUDA call is checked: an unchecked failed
                        // cudaMalloc would publish null cached pointers.
                        void* dY = nullptr;
                        void* dUV = nullptr;
                        bool ok =
                            cudaMalloc(&dY, yBytes) == cudaSuccess &&
                            cudaMalloc(&dUV, uvBytes) == cudaSuccess &&
                            cudaMemcpy(dY, gpuFrame->cpuYPlane, yBytes,
                                       cudaMemcpyHostToDevice) == cudaSuccess &&
                            cudaMemcpy(dUV, gpuFrame->cpuUvPlane, uvBytes,
                                       cudaMemcpyHostToDevice) == cudaSuccess;
                        if (ok) {
                            gpuFrame->gpuCacheY = dY;
                            gpuFrame->gpuCacheUV = dUV;
                            gpuFrame->gpuCacheValid = true;
                            gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                            gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                            gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                            gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                            reg.onGpuCacheCreated(yBytes + uvBytes);

                            devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                            devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                            yPitch = gpuFrame->cpuYLinesize;
                            uvPitch = gpuFrame->cpuUvLinesize;
                        } else {
                            // Free partial buffers; cache stays invalid and we
                            // fall back to the BGR upload path below.
                            if (dY) cudaFree(dY);
                            if (dUV) cudaFree(dUV);
                            std::cerr << "[RTOCRDetector] NV12 GPU cache upload failed; "
                                         "falling back to BGR upload" << std::endl;
                        }
                    }
                }
            } // release registry lock before GPU kernel

            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch)
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr<uint8_t>(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;

                // Remap ratios so boxes scale back to the full-res NV12 frame.
                // NOTE(review): step 5 below clamps to image.cols/rows — this
                // assumes `image` has the same dimensions as the NV12 frame;
                // confirm with the caller's contract.
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }

        if (!usedNV12) {
            // Fallback: standard BGR upload + GPU resize
            cv::cuda::GpuMat gpuImg;
            gpuImg.upload(image);
            cv::cuda::resize(gpuImg, gpuResized, resizeShape);
        }

        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB)

        // 3. Run inference
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;

        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }

        if (featureVectors.empty() || featureVectors[0].empty()) return {};

        // 4. Reshape output to probability map [H, W]
        std::vector<float>& output = featureVectors[0][0];
        int outputSize = static_cast<int>(output.size());

        if (outputSize < newH * newW) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << outputSize << std::endl;
            return {};
        }

        // Non-owning view over the engine output buffer.
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());

        // 5. Threshold to binary (matches ONNX/PaddleOCR official order)
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);

        // 6. Apply dilation if requested (on binaryMap, matching ONNX version)
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }

        // 7. Find contours and build text boxes
        // (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow exactly)
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);

        int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;

        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;

            // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;  // discard degenerate boxes

            auto ordered = GetMiniBoxes(minRect);

            // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;

            // Step 3: UnclipPolygon - expand the 4-point box
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;

            // Step 4: Re-compute GetMiniBoxes on the expanded polygon
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);

            // Filter by min_size + 2 = 5 (matches PaddleOCR official)
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;

            auto expandedOrdered = GetMiniBoxes(expandedRect);

            // Step 5: Scale to original image coordinates
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }

        SortTextBoxes(boxes);
        return boxes;
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}
|
||||
|
||||
// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
|
||||
// Order the 4 corners of a rotated rect as [TL, TR, BR, BL] (clockwise),
// following PaddleOCR's get_mini_boxes: split the corners into the two
// leftmost and two rightmost, then pick top/bottom by y within each half.
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    cv::Point2f pts[4];
    rect.points(pts);

    // Arrange so pts[0..1] are the leftmost pair and pts[2..3] the rightmost.
    std::sort(pts, pts + 4,
              [](const cv::Point2f& a, const cv::Point2f& b) { return a.x < b.x; });

    // Within each x-half, the smaller y is the top corner.
    const bool leftSwapped = pts[0].y > pts[1].y;
    const cv::Point2f topLeft = leftSwapped ? pts[1] : pts[0];
    const cv::Point2f bottomLeft = leftSwapped ? pts[0] : pts[1];

    const bool rightSwapped = pts[2].y > pts[3].y;
    const cv::Point2f topRight = rightSwapped ? pts[3] : pts[2];
    const cv::Point2f bottomRight = rightSwapped ? pts[2] : pts[3];

    // Order: [TL, TR, BR, BL] (clockwise from top-left)
    return { topLeft, topRight, bottomRight, bottomLeft };
}
|
||||
|
||||
// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
|
||||
// Mean probability inside the 4-point polygon (PaddleOCR box_score_fast):
// rasterize the box into a small mask over its bounding rect and average
// the probability map under that mask.
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                  const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;

    // Axis-aligned bounds of the quadrilateral.
    float minX = box[0].x, maxX = box[0].x;
    float minY = box[0].y, maxY = box[0].y;
    for (int i = 1; i < 4; i++) {
        minX = std::min(minX, box[i].x);
        maxX = std::max(maxX, box[i].x);
        minY = std::min(minY, box[i].y);
        maxY = std::max(maxY, box[i].y);
    }

    // Clamp into the probability map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);

    if (xmin >= xmax || ymin >= ymax) return 0.0f;

    // Polygon shifted into bounding-rect-local coordinates.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<cv::Point> poly;
    poly.reserve(4);
    for (const auto& p : box) {
        poly.emplace_back(static_cast<int>(p.x) - xmin,
                          static_cast<int>(p.y) - ymin);
    }
    std::vector<std::vector<cv::Point>> polys = { poly };
    cv::fillPoly(mask, polys, cv::Scalar(1));

    const cv::Rect roi(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
    return static_cast<float>(cv::mean(probMap(roi), mask)[0]);
}
|
||||
|
||||
// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
|
||||
// Uses integer coordinates for Clipper (matching PaddleOCR/ONNX version exactly)
|
||||
// Expand a 4-point box outward by distance = area * unclipRatio / perimeter,
// using Clipper polygon offsetting with round joins (PaddleOCR unclip).
// Integer Clipper coordinates match the ONNX/PaddleOCR reference exactly.
// Returns {} for degenerate boxes or an empty offset solution.
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                      float unclipRatio) {
    // Shoelace area (accumulated doubled, halved below) and edge-length perimeter.
    float twiceArea = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; i++) {
        const int nxt = (i + 1) % 4;
        twiceArea += box[i].x * box[nxt].y - box[nxt].x * box[i].y;
        const float dx = box[nxt].x - box[i].x;
        const float dy = box[nxt].y - box[i].y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceArea) * 0.5f;
    if (perimeter < 1.0f) return {};  // degenerate box

    const float distance = area * unclipRatio / perimeter;

    ClipperLib::Path path;
    for (const auto& p : box) {
        path.push_back({ static_cast<ClipperLib::cInt>(p.x),
                         static_cast<ClipperLib::cInt>(p.y) });
    }

    ClipperLib::ClipperOffset offsetter;
    offsetter.AddPath(path, ClipperLib::jtRound, ClipperLib::etClosedPolygon);

    ClipperLib::Paths solution;
    offsetter.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};

    std::vector<cv::Point2f> expanded;
    expanded.reserve(solution[0].size());
    for (const auto& p : solution[0]) {
        expanded.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return expanded;
}
|
||||
|
||||
// Release the pooled engine reference (or the privately owned engine).
// Swallows all exceptions: destructors must not throw.
RTOCRDetector::~RTOCRDetector() {
    try {
        if (m_usingSharedPool) {
            // We hold a pool reference — hand it back before dropping ours.
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        m_engine.reset();
    }
    catch (...) {}
}
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
44
ANSOCR/ANSRTOCR/RTOCRDetector.h
Normal file
44
ANSOCR/ANSRTOCR/RTOCRDetector.h
Normal file
@@ -0,0 +1,44 @@
|
||||
#pragma once
|
||||
|
||||
#include "RTOCRTypes.h"
|
||||
#include "engine.h"
|
||||
#include "engine/EnginePoolManager.h"
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
// TensorRT wrapper for the DBNet text detector with PaddleOCR-compatible
// DB postprocessing. Engines are shared across instances via EnginePoolManager.
class RTOCRDetector {
public:
    RTOCRDetector() = default;
    ~RTOCRDetector();  // releases the pooled engine reference
    // Non-copyable: owns a pool reference and a mutex.
    RTOCRDetector(const RTOCRDetector&) = delete;
    RTOCRDetector& operator=(const RTOCRDetector&) = delete;

    // Build/load the TRT engine (dynamic spatial dims up to maxSideLen);
    // false on failure.
    bool Initialize(const std::string& onnxPath, int gpuId = 0,
                    const std::string& engineCacheDir = "",
                    int maxSideLen = kDetMaxSideLen);

    // Detect text boxes in a BGR image; returns quads in input coordinates.
    std::vector<TextBox> Detect(const cv::Mat& image,
                                int maxSideLen = kDetMaxSideLen,
                                float dbThresh = kDetDbThresh,
                                float boxThresh = kDetBoxThresh,
                                float unclipRatio = kDetUnclipRatio,
                                bool useDilation = false);

private:
    // Postprocessing helpers (matches ONNX/PaddleOCR official flow exactly)
    std::array<cv::Point2f, 4> GetMiniBoxes(const cv::RotatedRect& rect);
    float BoxScoreFast(const cv::Mat& probMap, const std::array<cv::Point2f, 4>& box);
    std::vector<cv::Point2f> UnclipPolygon(const std::array<cv::Point2f, 4>& box, float unclipRatio);

    std::shared_ptr<Engine<float>> m_engine = nullptr;  // TRT engine (pooled)
    EnginePoolManager<float>::PoolKey m_poolKey;        // key used to release it
    bool m_usingSharedPool = false;  // true when m_engine came from the pool
    int m_engineMaxSideLen = kDetMaxSideLen; // Actual TRT engine max spatial dim
    std::mutex _mutex;               // serializes Detect() calls
};
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
206
ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
Normal file
206
ANSOCR/ANSRTOCR/RTOCRRecognizer.cpp
Normal file
@@ -0,0 +1,206 @@
|
||||
#include "RTOCRRecognizer.h"
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <opencv2/cudaimgproc.hpp>
|
||||
#include <opencv2/cudawarping.hpp>
|
||||
#include <opencv2/cudaarithm.hpp>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <cmath>
|
||||
#include <cfloat>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
bool RTOCRRecognizer::Initialize(const std::string& onnxPath, const std::string& dictPath,
|
||||
int gpuId, const std::string& engineCacheDir) {
|
||||
try {
|
||||
// Load dictionary first
|
||||
keys_ = LoadDict(dictPath);
|
||||
if (keys_.size() < 2) {
|
||||
std::cerr << "[RTOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
|
||||
return false;
|
||||
}
|
||||
std::cout << "[RTOCRRecognizer] Loaded dictionary with " << keys_.size()
|
||||
<< " characters from: " << dictPath << std::endl;
|
||||
|
||||
ANSCENTER::Options options;
|
||||
options.deviceIndex = gpuId;
|
||||
options.precision = ANSCENTER::Precision::FP16;
|
||||
options.maxBatchSize = 1;
|
||||
options.optBatchSize = 1;
|
||||
|
||||
// Fixed height, dynamic width for recognition
|
||||
options.minInputHeight = imgH_;
|
||||
options.optInputHeight = imgH_;
|
||||
options.maxInputHeight = imgH_;
|
||||
options.minInputWidth = 32;
|
||||
options.optInputWidth = imgMaxW_;
|
||||
options.maxInputWidth = 960;
|
||||
|
||||
if (!engineCacheDir.empty()) {
|
||||
options.engineFileDir = engineCacheDir;
|
||||
}
|
||||
else {
|
||||
auto pos = onnxPath.find_last_of("/\\");
|
||||
options.engineFileDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
|
||||
}
|
||||
|
||||
m_poolKey = { onnxPath,
|
||||
static_cast<int>(options.precision),
|
||||
options.maxBatchSize };
|
||||
m_engine = EnginePoolManager<float>::instance().acquire(
|
||||
m_poolKey, options, onnxPath,
|
||||
kRecSubVals, kRecDivVals, true, -1);
|
||||
m_usingSharedPool = (m_engine != nullptr);
|
||||
|
||||
if (!m_engine) {
|
||||
std::cerr << "[RTOCRRecognizer] Failed to build/load TRT engine: " << onnxPath << std::endl;
|
||||
return false;
|
||||
}
|
||||
|
||||
std::cout << "[RTOCRRecognizer] Initialized TRT engine from: " << onnxPath << std::endl;
|
||||
return true;
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "[RTOCRRecognizer] Initialize failed: " << e.what() << std::endl;
|
||||
m_engine.reset();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Recognize the text in one pre-cropped text-line image.
// Pipeline: resize to fixed height -> pad to minimum width -> upload to GPU ->
// TRT inference -> greedy CTC decode. Returns a default TextLine (empty text,
// score 0) on any failure. Serialized on _mutex, so safe to call from
// multiple threads.
TextLine RTOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    std::lock_guard<std::mutex> lock(_mutex);

    // Not initialized, or nothing to do.
    if (!m_engine || croppedImage.empty() || keys_.empty()) {
        return {};
    }

    try {
        // Preprocess: resize to fixed height, proportional width
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        int resizedW = resized.cols;

        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
        // Official PaddleOCR pads with 0.0 in normalized space ≈ pixel value 128 (gray)
        int imgW = std::max(resizedW, kRecImgW);
        if (imgW > resizedW) {
            // Original content goes to the left; the right side stays gray.
            cv::Mat padded(imgH_, imgW, resized.type(), cv::Scalar(128, 128, 128));
            resized.copyTo(padded(cv::Rect(0, 0, resizedW, imgH_)));
            resized = padded;
        }

        // Upload to GPU (keep BGR order - PaddleOCR official does NOT convert BGR→RGB)
        cv::cuda::GpuMat gpuImg;
        gpuImg.upload(resized);

        // Run inference
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuImg } };
        std::vector<std::vector<std::vector<float>>> featureVectors;

        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRRecognizer] Inference failed" << std::endl;
            return {};
        }

        // Defensive: engine produced no output.
        if (featureVectors.empty() || featureVectors[0].empty() ||
            featureVectors[0][0].empty()) {
            return {};
        }

        // Output shape: [1, seqLen, numClasses] flattened to [seqLen * numClasses]
        // IMPORTANT: The TRT engine output buffer is pre-allocated to MAX dimensions
        // (e.g. 120 timesteps for max width 960), but the actual inference produces
        // fewer timesteps for narrower images. We must use the ACTUAL seqLen
        // derived from the input width, not getOutputDims() which returns max dims.
        const std::vector<float>& output = featureVectors[0][0];

        // numClasses from dictionary size (keys_ includes blank at index 0)
        int numClasses = static_cast<int>(keys_.size());

        // Actual seqLen from input width: recognition model stride = 8
        // (confirmed: 960px input → 120 timesteps, 960/120 = 8)
        int seqLen = imgW / 8;

        // Sanity check: seqLen * numClasses must not exceed buffer size
        if (seqLen * numClasses > static_cast<int>(output.size())) {
            // Fallback: infer from buffer size
            seqLen = static_cast<int>(output.size()) / numClasses;
        }

        return CTCDecode(output.data(), seqLen, numClasses);
    }
    catch (const std::exception& e) {
        std::cerr << "[RTOCRRecognizer] Recognize failed: " << e.what() << std::endl;
        return {};
    }
}
|
||||
|
||||
// Recognize a batch of pre-cropped text-line images.
// Each crop has its own width, so they are run sequentially through
// Recognize(); the i-th result corresponds to the i-th input crop.
std::vector<TextLine> RTOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> lines;
    lines.reserve(croppedImages.size());

    for (const cv::Mat& crop : croppedImages) {
        lines.push_back(Recognize(crop));
    }

    return lines;
}
|
||||
|
||||
// Greedy CTC decoding: per timestep take the argmax class, then collapse
// consecutive repeats and drop blanks (blank token = index 0). The returned
// score is the mean of the emitted characters' probabilities.
TextLine RTOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine line;
    std::vector<float> charScores;
    int prevClass = 0; // CTC blank is index 0

    for (int t = 0; t < seqLen; ++t) {
        // Argmax over the class probabilities of this timestep.
        const float* probs = outputData + t * numClasses;
        const float* best = std::max_element(probs, probs + numClasses);
        const int cls = static_cast<int>(best - probs);

        // Emit only non-blank classes that differ from the previous timestep;
        // the bounds check guards against a dictionary/model size mismatch.
        if (cls != 0 && cls != prevClass && cls < static_cast<int>(keys_.size())) {
            line.text += keys_[cls]; // keys_[0]="#"(blank), keys_[1]=first_char, etc.
            // Raw model output is used as confidence (PaddleOCR v5 models
            // already include softmax).
            charScores.push_back(*best);
        }
        prevClass = cls;
    }

    if (!charScores.empty()) {
        line.score = std::accumulate(charScores.begin(), charScores.end(), 0.0f) /
                     static_cast<float>(charScores.size());
    }
    return line;
}
|
||||
|
||||
RTOCRRecognizer::~RTOCRRecognizer() {
    try {
        // Pooled engines are refcounted: give our reference back first.
        if (m_usingSharedPool) {
            EnginePoolManager<float>::instance().release(m_poolKey);
            m_usingSharedPool = false;
        }
        // Works for both the pooled and the privately-owned engine; resetting
        // an empty shared_ptr is harmless.
        m_engine.reset();
    }
    catch (...) {
        // Never let an exception escape a destructor.
    }
}
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
41
ANSOCR/ANSRTOCR/RTOCRRecognizer.h
Normal file
41
ANSOCR/ANSRTOCR/RTOCRRecognizer.h
Normal file
@@ -0,0 +1,41 @@
|
||||
#pragma once
|
||||
|
||||
#include "RTOCRTypes.h"
|
||||
#include "engine.h"
|
||||
#include "engine/EnginePoolManager.h"
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
// TensorRT-backed PP-OCRv5 text-line recognizer with greedy CTC decoding.
// Non-copyable; the TRT engine may be shared process-wide through
// EnginePoolManager. Recognize() is serialized on an internal mutex.
class RTOCRRecognizer {
public:
    RTOCRRecognizer() = default;
    ~RTOCRRecognizer();
    RTOCRRecognizer(const RTOCRRecognizer&) = delete;
    RTOCRRecognizer& operator=(const RTOCRRecognizer&) = delete;

    // Build/load the TRT engine and load the character dictionary.
    // Returns false if either step fails.
    bool Initialize(const std::string& onnxPath, const std::string& dictPath,
                    int gpuId = 0, const std::string& engineCacheDir = "");

    // Recognize one pre-cropped text-line image; returns a default TextLine
    // on failure. RecognizeBatch processes crops one at a time (each crop has
    // its own width) and preserves input order.
    TextLine Recognize(const cv::Mat& croppedImage);
    std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);

    // Override the recognition input geometry. Call BEFORE Initialize():
    // the values are baked into the TRT optimization profile there.
    void SetRecImageHeight(int h) { imgH_ = h; }
    void SetRecImageMaxWidth(int w) { imgMaxW_ = w; }

private:
    // Greedy CTC decode of the raw model output [seqLen x numClasses].
    TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);

    std::shared_ptr<Engine<float>> m_engine = nullptr;  // TRT engine (may be pool-shared)
    EnginePoolManager<float>::PoolKey m_poolKey;        // Key used to release the pooled engine
    bool m_usingSharedPool = false;                     // True when m_engine came from the pool
    std::vector<std::string> keys_;                     // Dictionary: [0]=blank, last=space
    int imgH_ = kRecImgH;                               // Fixed input height
    int imgMaxW_ = kRecImgMaxW;                         // Max input width before clamping
    std::mutex _mutex;                                  // Serializes Recognize()
};
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
196
ANSOCR/ANSRTOCR/RTOCRTypes.h
Normal file
196
ANSOCR/ANSRTOCR/RTOCRTypes.h
Normal file
@@ -0,0 +1,196 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <cmath>
|
||||
#include <opencv2/core.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace rtocr {
|
||||
|
||||
// ============================================================================
// Engine normalization constants (BGR channel order, matching PaddleOCR official)
// ============================================================================
// PaddleOCR processes images in BGR order (no BGR→RGB conversion).
// Engine applies: (pixel/255.0 - subVals[c]) / divVals[c] per channel.
// When feeding BGR input (no cvtColor), subVals/divVals indices map to:
//   [0]=B channel, [1]=G channel, [2]=R channel
//
// PaddleOCR config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
// These are applied to BGR channels: B=0.485/0.229, G=0.456/0.224, R=0.406/0.225

// Detection normalization (BGR order)
constexpr std::array<float, 3> kDetSubVals = { 0.485f, 0.456f, 0.406f };
constexpr std::array<float, 3> kDetDivVals = { 0.229f, 0.224f, 0.225f };

// Classifier normalization: PP-LCNet_x1_0_textline_ori uses ImageNet normalization (BGR order)
// Config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225], scale=1/255
constexpr std::array<float, 3> kClsSubVals = { 0.485f, 0.456f, 0.406f };
constexpr std::array<float, 3> kClsDivVals = { 0.229f, 0.224f, 0.225f };

// Recognition normalization: (pixel/255 - 0.5) / 0.5, mapping pixels to [-1, 1].
// NOTE: this differs from the classifier above, which uses ImageNet mean/std.
constexpr std::array<float, 3> kRecSubVals = { 0.5f, 0.5f, 0.5f };
constexpr std::array<float, 3> kRecDivVals = { 0.5f, 0.5f, 0.5f };

// ============================================================================
// Detection defaults (PP-OCRv5 server)
// kDetMaxSideLen is the default max side length for detection preprocessing.
// TRT engine auto-fallbacks to smaller max if GPU memory is insufficient during build.
// ============================================================================
constexpr int kDetMaxSideLen = 2560;
constexpr int kDetMaxSideLimit = 4000; // Safety cap on max dimension
constexpr float kDetDbThresh = 0.3f;      // Binarization threshold on the DB probability map
constexpr float kDetBoxThresh = 0.6f;     // Minimum box score to keep a detection
constexpr float kDetUnclipRatio = 1.5f;   // Box expansion factor applied after detection
constexpr int kDetMaxCandidates = 1000;   // Upper bound on contours considered

// ============================================================================
// Classifier defaults (PP-LCNet_x1_0_textline_ori model)
// Input: [B, 3, 80, 160], ImageNet normalization, 2-class (0°/180°)
// Direct resize to 80x160 (no aspect ratio preservation)
// ============================================================================
constexpr int kClsImageH = 80;
constexpr int kClsImageW = 160;
constexpr float kClsThresh = 0.9f;   // Min confidence required to apply a 180° flip

// ============================================================================
// Recognition defaults
// ============================================================================
constexpr int kRecImgH = 48;   // Fixed recognition input height
constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shape[2]=320, min padded width)
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
constexpr int kRecBatchSize = 6;
|
||||
|
||||
// ============================================================================
|
||||
// Data structures
|
||||
// ============================================================================
|
||||
|
||||
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
struct TextBox {
    std::array<cv::Point2f, 4> points;  // Corner coordinates, in source-image pixels
    float score = 0.0f;                 // Detection confidence
};

// A single recognized text line
struct TextLine {
    std::string text;      // Decoded UTF-8 text (empty when nothing recognized)
    float score = 0.0f;    // Mean per-character confidence from CTC decoding
};

// OCR result matching PaddleOCR::OCRPredictResult format
struct OCRPredictResult {
    std::vector<std::vector<int>> box; // 4 corner points [[x,y], ...]
    std::string text;                  // Recognized text for this box
    float score = -1.0f;               // Recognition score; -1 presumably means "not yet recognized" — TODO confirm against producer
    float cls_score = 0.0f;            // Orientation classifier confidence
    int cls_label = -1;                // Orientation class; -1 presumably means "classifier not run" — TODO confirm
};
|
||||
|
||||
// ============================================================================
|
||||
// Utility functions
|
||||
// ============================================================================
|
||||
|
||||
// Load a recognition character dictionary (one character per line, UTF-8).
// A CTC blank token "#" is inserted at index 0 and a space character is
// appended at the end, matching the class indices the recognition model emits.
//
// @param dictPath  Path to the dictionary text file.
// @return The key table, or an empty vector when the file cannot be opened.
inline std::vector<std::string> LoadDict(const std::string& dictPath) {
    std::vector<std::string> keys;
    std::ifstream file(dictPath);
    if (!file.is_open()) return keys;
    std::string line;
    bool firstLine = true;
    while (std::getline(file, line)) {
        // Strip Windows line endings that getline leaves behind.
        if (!line.empty() && line.back() == '\r') {
            line.pop_back();
        }
        // BUGFIX: strip a UTF-8 BOM (EF BB BF) from the first line; otherwise
        // the first dictionary character is silently corrupted and every text
        // containing it decodes wrong.
        if (firstLine) {
            if (line.size() >= 3 &&
                static_cast<unsigned char>(line[0]) == 0xEF &&
                static_cast<unsigned char>(line[1]) == 0xBB &&
                static_cast<unsigned char>(line[2]) == 0xBF) {
                line.erase(0, 3);
            }
            firstLine = false;
        }
        keys.push_back(line);
    }
    // CTC blank token at index 0
    keys.insert(keys.begin(), "#");
    // Space at end
    keys.push_back(" ");
    return keys;
}
|
||||
|
||||
// Compute resize dimensions for detection model (multiples of 32)
|
||||
// limit_type='max': scale down if max side > maxSideLen (PP-OCRv5 server default)
|
||||
// maxSideLimit: safety cap on final max dimension (default 4000)
|
||||
inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
|
||||
int maxSideLimit = kDetMaxSideLimit) {
|
||||
float ratio = 1.0f;
|
||||
int maxSide = std::max(srcH, srcW);
|
||||
if (maxSide > maxSideLen) {
|
||||
ratio = static_cast<float>(maxSideLen) / static_cast<float>(maxSide);
|
||||
}
|
||||
int newH = static_cast<int>(srcH * ratio);
|
||||
int newW = static_cast<int>(srcW * ratio);
|
||||
|
||||
// Safety cap: clamp if either dimension exceeds maxSideLimit
|
||||
if (std::max(newH, newW) > maxSideLimit) {
|
||||
float clampRatio = static_cast<float>(maxSideLimit) / static_cast<float>(std::max(newH, newW));
|
||||
newH = static_cast<int>(newH * clampRatio);
|
||||
newW = static_cast<int>(newW * clampRatio);
|
||||
}
|
||||
|
||||
newH = std::max(32, static_cast<int>(std::round(newH / 32.0) * 32));
|
||||
newW = std::max(32, static_cast<int>(std::round(newW / 32.0) * 32));
|
||||
return cv::Size(newW, newH);
|
||||
}
|
||||
|
||||
// Sort text boxes from top to bottom, left to right
|
||||
inline void SortTextBoxes(std::vector<TextBox>& boxes) {
|
||||
std::sort(boxes.begin(), boxes.end(),
|
||||
[](const TextBox& a, const TextBox& b) {
|
||||
if (std::abs(a.points[0].y - b.points[0].y) < 10.0f) {
|
||||
return a.points[0].x < b.points[0].x;
|
||||
}
|
||||
return a.points[0].y < b.points[0].y;
|
||||
});
|
||||
}
|
||||
|
||||
// Get rotated and cropped image from text box polygon
|
||||
inline cv::Mat GetRotateCropImage(const cv::Mat& srcImage, const TextBox& box) {
|
||||
auto pts = box.points;
|
||||
float width = static_cast<float>(std::max(
|
||||
cv::norm(pts[0] - pts[1]),
|
||||
cv::norm(pts[2] - pts[3])));
|
||||
float height = static_cast<float>(std::max(
|
||||
cv::norm(pts[0] - pts[3]),
|
||||
cv::norm(pts[1] - pts[2])));
|
||||
|
||||
std::vector<cv::Point2f> srcPts = { pts[0], pts[1], pts[2], pts[3] };
|
||||
std::vector<cv::Point2f> dstPts = {
|
||||
{0, 0}, {width, 0}, {width, height}, {0, height}
|
||||
};
|
||||
|
||||
cv::Mat M = cv::getPerspectiveTransform(srcPts, dstPts);
|
||||
cv::Mat cropped;
|
||||
cv::warpPerspective(srcImage, cropped, M,
|
||||
cv::Size(static_cast<int>(width), static_cast<int>(height)),
|
||||
cv::BORDER_REPLICATE);
|
||||
|
||||
if (cropped.rows > cropped.cols * 1.5f) {
|
||||
cv::Mat rotated;
|
||||
cv::transpose(cropped, rotated);
|
||||
cv::flip(rotated, rotated, 0);
|
||||
return rotated;
|
||||
}
|
||||
return cropped;
|
||||
}
|
||||
|
||||
// Resize recognition image to fixed height, proportional width
|
||||
inline cv::Mat ResizeRecImage(const cv::Mat& img, int targetH, int maxW) {
|
||||
float ratio = static_cast<float>(targetH) / img.rows;
|
||||
int targetW = static_cast<int>(img.cols * ratio);
|
||||
targetW = std::min(targetW, maxW);
|
||||
targetW = std::max(targetW, 1);
|
||||
|
||||
cv::Mat resized;
|
||||
cv::resize(img, resized, cv::Size(targetW, targetH));
|
||||
return resized;
|
||||
}
|
||||
|
||||
} // namespace rtocr
|
||||
} // namespace ANSCENTER
|
||||
Reference in New Issue
Block a user