Refactor project structure

This commit is contained in:
2026-03-28 19:56:39 +11:00
parent 1d267378b2
commit 8a2e721058
511 changed files with 59 additions and 48 deletions

View File

@@ -0,0 +1,107 @@
#include "ONNXOCRClassifier.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace onnxocr {
// Construct the orientation classifier on top of the shared ORT session
// handler; all session setup happens in the BasicOrtHandler base.
ONNXOCRClassifier::ONNXOCRClassifier(const std::string& model_path, unsigned int thread_count)
    : BasicOrtHandler(model_path, thread_count) {}
// Preprocess a single crop into the classifier's fixed input tensor.
// Direct (non-aspect-preserving) resize to 160x80, float conversion, then
// ImageNet-style normalization into CHW order — matches PaddleOCR's official
// ResizeImage for PP-LCNet_x1_0_textline_ori.
Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) {
    cv::Mat canvas;
    cv::resize(mat, canvas, cv::Size(kClsImageW, kClsImageH));
    canvas.convertTo(canvas, CV_32FC3);
    // PP-LCNet shares the detector's ImageNet normalization.
    input_values_handler = NormalizeAndPermute(canvas);
    // The tensor wraps input_values_handler's buffer (a class member), so the
    // data stays alive while the caller uses the returned Ort::Value.
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
// Batch path is unused in practice: Classify() runs images one by one
// because the model is executed with dynamic shapes. Delegate the first
// image if any, otherwise return a null tensor.
Ort::Value ONNXOCRClassifier::transformBatch(const std::vector<cv::Mat>& images) {
    return images.empty() ? Ort::Value(nullptr) : transform(images.front());
}
// Predict the 0/180-degree orientation of each cropped text-line image.
//
// Outputs are index-aligned with img_list:
//   cls_labels[i] - argmax class (0 = upright, 1 = rotated 180 degrees)
//   cls_scores[i] - raw model output at the argmax; per the note below this
//                   is treated as a probability (model ends in softmax)
// Empty or failed images keep the defaults {0, 0.0f}.
// NOTE(review): cls_thresh is part of the signature but unused here —
// thresholding is applied by the caller (see PaddleOCRV5Engine::ocr).
void ONNXOCRClassifier::Classify(std::vector<cv::Mat>& img_list,
                                 std::vector<int>& cls_labels,
                                 std::vector<float>& cls_scores,
                                 float cls_thresh) {
    // Serialize access: the ORT session is shared across callers.
    std::lock_guard<std::mutex> lock(_mutex);
    cls_labels.clear();
    cls_scores.clear();
    if (!ort_session || img_list.empty()) return;
    cls_labels.resize(img_list.size(), 0);
    cls_scores.resize(img_list.size(), 0.0f);
    // Process one image at a time (dynamic shapes)
    for (size_t i = 0; i < img_list.size(); i++) {
        if (img_list[i].empty()) continue;
        try {
            // Preprocess: direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
            // No aspect ratio preservation — matches PaddleOCR official ResizeImage
            cv::Mat resized;
            cv::resize(img_list[i], resized, cv::Size(kClsImageW, kClsImageH));
            resized.convertTo(resized, CV_32FC3);
            // PP-LCNet uses ImageNet normalization (same as detection)
            auto inputData = NormalizeAndPermute(resized);
            std::array<int64_t, 4> inputShape = { 1, 3, kClsImageH, kClsImageW };
            // CreateTensor wraps inputData's buffer WITHOUT copying, so
            // inputData must remain alive until Run() below returns.
            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                *memory_info_handler, inputData.data(), inputData.size(),
                inputShape.data(), inputShape.size());
            auto outputTensors = ort_session->Run(
                Ort::RunOptions{ nullptr },
                input_node_names.data(), &inputTensor, 1,
                output_node_names.data(), num_outputs);
            float* outData = outputTensors[0].GetTensorMutableData<float>();
            auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
            // Fall back to 2 classes if the model reports fewer than 2 dims.
            int numClasses = (outShape.size() > 1) ? static_cast<int>(outShape[1]) : 2;
            // Find argmax and use raw output value as score
            // PaddleOCR v5 models include softmax, so output values are probabilities
            // Matches PaddleOCR official: score = preds[i, argmax_idx]
            int maxIdx = 0;
            float maxVal = outData[0];
            for (int c = 1; c < numClasses; c++) {
                if (outData[c] > maxVal) {
                    maxVal = outData[c];
                    maxIdx = c;
                }
            }
            cls_labels[i] = maxIdx;
            cls_scores[i] = maxVal;
        }
        catch (const Ort::Exception& e) {
            // Best-effort: a failed crop keeps the neutral default result.
            std::cerr << "[ONNXOCRClassifier] Inference failed for image " << i
                      << ": " << e.what() << std::endl;
            cls_labels[i] = 0;
            cls_scores[i] = 0.0f;
        }
    }
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,32 @@
#pragma once
#include "ONNXOCRTypes.h"
#include "ONNXEngine.h"
#include <vector>
#include <mutex>
namespace ANSCENTER {
namespace onnxocr {
// Text-line orientation classifier (PP-LCNet_x1_0_textline_ori, ONNX).
// Wraps a BasicOrtHandler ORT session; calls are serialized via _mutex.
class ONNXOCRClassifier : public BasicOrtHandler {
public:
    explicit ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRClassifier() override = default;
    // Classify text orientation for a list of cropped images.
    // Results are written index-aligned into cls_labels / cls_scores:
    //   cls_label: 0 = normal, 1 = rotated 180 degrees
    //   cls_score: model output at the predicted label
    // NOTE: cls_thresh is accepted for interface parity; thresholding is
    // applied by the caller.
    void Classify(std::vector<cv::Mat>& img_list,
                  std::vector<int>& cls_labels,
                  std::vector<float>& cls_scores,
                  float cls_thresh = kClsThresh);
private:
    // BasicOrtHandler hooks; Classify() builds its own tensors instead.
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    // Guards the shared ORT session across calling threads.
    std::mutex _mutex;
};
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,312 @@
#include "ONNXOCRDetector.h"
#include "include/clipper.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace onnxocr {
// Construct the DB text detector; session setup is handled by the
// BasicOrtHandler base class.
ONNXOCRDetector::ONNXOCRDetector(const std::string& model_path, unsigned int thread_count)
    : BasicOrtHandler(model_path, thread_count) {}
// Exists only to satisfy BasicOrtHandler's pure virtual; Detect() performs
// its own preprocessing and builds tensors with dynamic shapes.
Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
    cv::Mat asFloat;
    mat.convertTo(asFloat, CV_32FC3);
    input_values_handler = NormalizeAndPermute(asFloat);
    // Tensor wraps the member buffer, keeping the data alive for the caller.
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
// Unused batch hook: detection runs single images with dynamic shapes.
// Forward the first image when present, otherwise return a null tensor.
Ort::Value ONNXOCRDetector::transformBatch(const std::vector<cv::Mat>& images) {
    return images.empty() ? Ort::Value(nullptr) : transform(images.front());
}
// Run DB text detection on srcImage and return boxes in srcImage coordinates.
//
// Parameters mirror PaddleOCR's DBPostProcess knobs:
//   maxSideLen  - resize limit for the longest side (limit_type='max')
//   dbThresh    - binarization threshold on the probability map
//   boxThresh   - minimum mean probability inside a candidate box
//   unclipRatio - box expansion factor
//   useDilation - optional 2x2 dilation of the binary map
// If a full-resolution NV12 frame is registered for the current thread,
// inference runs on that frame and the resulting boxes are scaled back to
// srcImage (display) coordinates.
std::vector<TextBox> ONNXOCRDetector::Detect(const cv::Mat& srcImage,
                                             int maxSideLen,
                                             float dbThresh,
                                             float boxThresh,
                                             float unclipRatio,
                                             bool useDilation) {
    // Serialize access to the shared ORT session.
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ort_session || srcImage.empty()) {
        return {};
    }
    // Try to get full-resolution image from NV12 frame (same pattern as ANSONNXYOLO)
    cv::Mat inferenceImage = srcImage;
    float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
    GpuFrameData* gpuFrame = tl_currentGpuFrame();
    // pixelFormat 23 — presumably NV12 (FFmpeg AV_PIX_FMT_NV12); TODO confirm
    // against the registry's producer.
    if (gpuFrame && gpuFrame->pixelFormat == 23 &&
        gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
        gpuFrame->width > 0 && gpuFrame->height > 0) {
        // Full-res NV12 available — convert to BGR on CPU for ONNX.
        // These Mat headers wrap the registry's buffers without copying.
        cv::Mat yPlane(gpuFrame->height, gpuFrame->width, CV_8UC1,
                       gpuFrame->cpuYPlane, gpuFrame->cpuYLinesize);
        cv::Mat uvPlane(gpuFrame->height / 2, gpuFrame->width, CV_8UC1,
                        gpuFrame->cpuUvPlane, gpuFrame->cpuUvLinesize);
        cv::Mat fullResBGR;
        cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
        if (!fullResBGR.empty()) {
            // Scale factors map full-res coords back to display-res coords.
            bgrScaleX = static_cast<float>(srcImage.cols) / fullResBGR.cols;
            bgrScaleY = static_cast<float>(srcImage.rows) / fullResBGR.rows;
            inferenceImage = fullResBGR;
        }
    }
    int resizeH, resizeW;
    float ratioH, ratioW;
    // Preprocess (using full-res image if NV12 was available)
    auto inputData = Preprocess(inferenceImage, maxSideLen, resizeH, resizeW, ratioH, ratioW);
    // Create input tensor with dynamic shape.
    // CreateTensor wraps inputData's buffer without copying — inputData must
    // stay alive until Run() completes (it does; both are in this scope).
    std::array<int64_t, 4> inputShape = { 1, 3, resizeH, resizeW };
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
        *memory_info_handler, inputData.data(), inputData.size(),
        inputShape.data(), inputShape.size());
    // Run inference
    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &inputTensor, 1,
        output_node_names.data(), num_outputs);
    // Get output data (owned by outputTensors — keep it alive through Postprocess)
    float* outputData = outputTensors[0].GetTensorMutableData<float>();
    auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    int outH = static_cast<int>(outputShape[2]);
    int outW = static_cast<int>(outputShape[3]);
    // Postprocess — detection coords are relative to inferenceImage (full-res),
    // then scaled back to srcImage (display-res) coordinates
    auto boxes = Postprocess(outputData, outH, outW, ratioH, ratioW,
                             inferenceImage.rows, inferenceImage.cols,
                             dbThresh, boxThresh, unclipRatio, useDilation);
    // Rescale box coordinates from full-res to display-res
    if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
        for (auto& box : boxes) {
            for (auto& pt : box.points) {
                pt.x *= bgrScaleX;
                pt.y *= bgrScaleY;
            }
        }
    }
    return boxes;
}
// Resize srcImage to a 32-aligned shape bounded by maxSideLen, record the
// output size and the inverse scale factors, and return the normalized
// CHW float buffer ready for tensor creation.
std::vector<float> ONNXOCRDetector::Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                               int& resizeH, int& resizeW,
                                               float& ratioH, float& ratioW) {
    const cv::Size target = ComputeDetResizeShape(srcImage.rows, srcImage.cols, maxSideLen);
    resizeW = target.width;
    resizeH = target.height;
    // Ratios map resized coords back to the original image.
    ratioH = static_cast<float>(srcImage.rows) / static_cast<float>(resizeH);
    ratioW = static_cast<float>(srcImage.cols) / static_cast<float>(resizeW);
    cv::Mat scaled;
    cv::resize(srcImage, scaled, target);
    scaled.convertTo(scaled, CV_32FC3);
    return NormalizeAndPermute(scaled);
}
// Matches PaddleOCR official DBPostProcess.boxes_from_bitmap flow.
// Converts the raw DB probability map into scored quadrilateral text boxes:
//   threshold -> (optional dilate) -> contours -> minAreaRect -> score
//   -> unclip -> re-fit -> rescale to (srcW, srcH).
// outputData is the model's outH x outW probability map; ratioH/ratioW map
// map-space coords back to the inference image.
std::vector<TextBox> ONNXOCRDetector::Postprocess(const float* outputData, int outH, int outW,
                                                  float ratioH, float ratioW,
                                                  int srcH, int srcW,
                                                  float dbThresh, float boxThresh,
                                                  float unclipRatio, bool useDilation) {
    // Create probability map from output.
    // NOTE: this Mat header wraps outputData without copying — the caller
    // must keep the ORT output tensor alive for the duration of this call.
    cv::Mat probMap(outH, outW, CV_32FC1, const_cast<float*>(outputData));
    // Binary threshold
    cv::Mat binaryMap;
    cv::threshold(probMap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
    binaryMap.convertTo(binaryMap, CV_8UC1);
    // Optional dilation (PaddleOCR default: use_dilation=False, kernel=[[1,1],[1,1]])
    if (useDilation) {
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
        cv::dilate(binaryMap, binaryMap, kernel);
    }
    // Find contours
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
    std::vector<TextBox> boxes;
    // Cap candidate count (kDetMaxCandidates) like PaddleOCR's max_candidates.
    int numContours = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
    for (int i = 0; i < numContours; i++) {
        if (contours[i].size() < 4) continue;
        // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
        cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
        // Discard boxes whose shorter side is under 3px (min_size filter).
        float sside = std::min(minRect.size.width, minRect.size.height);
        if (sside < 3.0f) continue;
        auto ordered = GetMiniBoxes(minRect);
        // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
        float score = BoxScoreFast(probMap, ordered);
        if (score < boxThresh) continue;
        // Step 3: UnclipPolygon - expand the 4-point box
        auto expanded = UnclipPolygon(ordered, unclipRatio);
        if (expanded.size() < 4) continue;
        // Step 4: Re-compute GetMiniBoxes on the expanded polygon
        std::vector<cv::Point> expandedInt;
        expandedInt.reserve(expanded.size());
        for (auto& p : expanded) {
            expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
        }
        cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
        // Filter by min_size + 2 = 5 (matches PaddleOCR official)
        float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
        if (expandedSside < 5.0f) continue;
        auto expandedOrdered = GetMiniBoxes(expandedRect);
        // Step 5: Scale to original image coordinates (clamped into bounds)
        TextBox box;
        for (int j = 0; j < 4; j++) {
            box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(srcW - 1));
            box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(srcH - 1));
        }
        box.score = score;
        boxes.push_back(box);
    }
    // Reading order: top-to-bottom, then left-to-right.
    SortTextBoxes(boxes);
    return boxes;
}
// Order the four corners of a rotated rect as [TL, TR, BR, BL] (clockwise
// from top-left). Matches PaddleOCR's GetMiniBoxes: sort corners by x, then
// within each left/right pair the smaller-y point is the "top" corner.
std::array<cv::Point2f, 4> ONNXOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    std::array<cv::Point2f, 4> corners;
    rect.points(corners.data());
    std::sort(corners.begin(), corners.end(),
              [](const cv::Point2f& lhs, const cv::Point2f& rhs) { return lhs.x < rhs.x; });
    // Left pair: corners[0], corners[1].
    const bool leftTopFirst = corners[0].y <= corners[1].y;
    const cv::Point2f topLeft = leftTopFirst ? corners[0] : corners[1];
    const cv::Point2f bottomLeft = leftTopFirst ? corners[1] : corners[0];
    // Right pair: corners[2], corners[3].
    const bool rightTopFirst = corners[2].y <= corners[3].y;
    const cv::Point2f topRight = rightTopFirst ? corners[2] : corners[3];
    const cv::Point2f bottomRight = rightTopFirst ? corners[3] : corners[2];
    return { topLeft, topRight, bottomRight, bottomLeft };
}
// Mean probability inside the 4-point polygon (PaddleOCR box_score_fast):
// rasterize the quad into a mask over its clamped bounding rect and average
// the probability map under that mask.
float ONNXOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                    const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;
    // Axis-aligned bounds of the quad.
    float lox = box[0].x, hix = box[0].x;
    float loy = box[0].y, hiy = box[0].y;
    for (int k = 1; k < 4; k++) {
        lox = std::min(lox, box[k].x);
        hix = std::max(hix, box[k].x);
        loy = std::min(loy, box[k].y);
        hiy = std::max(hiy, box[k].y);
    }
    // Clamp into the map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(lox)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(hix)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(loy)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(hiy)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Rasterize the quad shifted into ROI space.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<std::vector<cv::Point>> polys(1);
    for (const auto& p : box) {
        polys[0].emplace_back(static_cast<int>(p.x) - xmin,
                              static_cast<int>(p.y) - ymin);
    }
    cv::fillPoly(mask, polys, cv::Scalar(1));
    const cv::Rect roi(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
    return static_cast<float>(cv::mean(probMap(roi), mask)[0]);
}
// Expand the 4-point box outward with Clipper (round joins), matching
// PaddleOCR's unclip: offset distance = area * unclipRatio / perimeter.
// Returns an empty vector for degenerate inputs or an empty offset result.
std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                        float unclipRatio) {
    // Shoelace (signed, doubled) area plus perimeter in one pass.
    float twiceArea = 0.0f;
    float perim = 0.0f;
    for (int i = 0; i < 4; i++) {
        const int nxt = (i + 1) % 4;
        twiceArea += box[i].x * box[nxt].y - box[nxt].x * box[i].y;
        const float dx = box[nxt].x - box[i].x;
        const float dy = box[nxt].y - box[i].y;
        perim += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceArea) * 0.5f;
    if (perim < 1.0f) return {};
    const float distance = area * unclipRatio / perim;
    // Feed the (integer-truncated) quad to Clipper.
    ClipperLib::Path quad;
    for (const auto& p : box) {
        quad.push_back({ static_cast<ClipperLib::cInt>(p.x),
                         static_cast<ClipperLib::cInt>(p.y) });
    }
    ClipperLib::ClipperOffset offsetter;
    offsetter.AddPath(quad, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths solution;
    offsetter.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};
    std::vector<cv::Point2f> expanded;
    expanded.reserve(solution[0].size());
    for (const auto& p : solution[0]) {
        expanded.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return expanded;
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,52 @@
#pragma once
#include "ONNXOCRTypes.h"
#include "ONNXEngine.h"
#include <vector>
#include <mutex>
namespace ANSCENTER {
namespace onnxocr {
// DB-style text detector (PP-OCRv5 det model, ONNX).
// Wraps a BasicOrtHandler ORT session; Detect() calls are serialized.
class ONNXOCRDetector : public BasicOrtHandler {
public:
    explicit ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRDetector() override = default;
    // Run text detection on an image; returns boxes in srcImage coordinates.
    // Parameters mirror PaddleOCR's DBPostProcess configuration.
    std::vector<TextBox> Detect(const cv::Mat& srcImage,
                                int maxSideLen = kDetMaxSideLen,
                                float dbThresh = kDetDbThresh,
                                float boxThresh = kDetBoxThresh,
                                float unclipRatio = kDetUnclipRatio,
                                bool useDilation = false);
private:
    // BasicOrtHandler hooks; Detect() builds its own dynamic-shape tensors.
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    // Preprocessing: 32-aligned resize + normalization into CHW floats.
    std::vector<float> Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                  int& resizeH, int& resizeW,
                                  float& ratioH, float& ratioW);
    // Postprocessing: threshold -> contours -> boxes (matches PaddleOCR official flow)
    std::vector<TextBox> Postprocess(const float* outputData, int outH, int outW,
                                     float ratioH, float ratioW, int srcH, int srcW,
                                     float dbThresh, float boxThresh, float unclipRatio,
                                     bool useDilation);
    // Get ordered 4 corners [TL, TR, BR, BL] from rotated rect (matches PaddleOCR GetMiniBoxes)
    std::array<cv::Point2f, 4> GetMiniBoxes(const cv::RotatedRect& rect);
    // Compute mean score inside box polygon on the probability map
    float BoxScoreFast(const cv::Mat& probMap, const std::array<cv::Point2f, 4>& box);
    // Expand 4-point box using Clipper offset
    std::vector<cv::Point2f> UnclipPolygon(const std::array<cv::Point2f, 4>& box, float unclipRatio);
    // Guards the shared ORT session across calling threads.
    std::mutex _mutex;
};
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,165 @@
#include "ONNXOCRRecognizer.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <cfloat>
#include <cstring>
namespace ANSCENTER {
namespace onnxocr {
// Construct the CTC text recognizer; the dictionary must be loaded
// separately via LoadDictionary() before Recognize() is usable.
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& model_path, unsigned int thread_count)
    : BasicOrtHandler(model_path, thread_count) {}
// Load the character dictionary used by CTC decoding. Returns false when
// the file is missing/unreadable (LoadDict then yields fewer than the two
// mandatory tokens: blank "#" and trailing space).
bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
    keys_ = LoadDict(dictPath);
    const bool loaded = keys_.size() >= 2;
    if (loaded) {
        std::cout << "[ONNXOCRRecognizer] Loaded dictionary with " << keys_.size()
                  << " characters from: " << dictPath << std::endl;
    } else {
        std::cerr << "[ONNXOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
    }
    return loaded;
}
// Satisfies the BasicOrtHandler interface; Recognize() does its own
// dynamic-width preprocessing and tensor creation.
Ort::Value ONNXOCRRecognizer::transform(const cv::Mat& mat) {
    cv::Mat scaled = ResizeRecImage(mat, imgH_, imgMaxW_);
    scaled.convertTo(scaled, CV_32FC3);
    // Recognition uses the classifier-style (x/255 - 0.5) / 0.5 normalization.
    input_values_handler = NormalizeAndPermuteCls(scaled);
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
// Unused batch hook: widths vary per image, so recognition is sequential.
// Forward the first image when present, otherwise return a null tensor.
Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images) {
    return images.empty() ? Ort::Value(nullptr) : transform(images.front());
}
// Recognize the text content of one cropped text-line image.
// Returns a default-constructed TextLine (empty text, score 0) when the
// session/dictionary is unavailable, the image is empty, or inference fails.
TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    // Serialize access to the shared ORT session.
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ort_session || croppedImage.empty() || keys_.empty()) {
        return {};
    }
    try {
        // Preprocess: resize to fixed height, proportional width
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        int resizedW = resized.cols;
        resized.convertTo(resized, CV_32FC3);
        // Recognition uses (pixel/255 - 0.5) / 0.5 normalization (same as classifier)
        auto normalizedData = NormalizeAndPermuteCls(resized);
        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
        // Official PaddleOCR: padding_im = np.zeros((C, H, W)), then copies normalized
        // image into left portion. Padding value = 0.0 in normalized space.
        int imgW = std::max(resizedW, kRecImgW);
        std::vector<float> inputData;
        if (imgW > resizedW) {
            // Zero-pad on the right (CHW layout): copy each source row (stride
            // resizedW) into the wider destination row (stride imgW).
            inputData.resize(3 * imgH_ * imgW, 0.0f);
            for (int c = 0; c < 3; c++) {
                for (int y = 0; y < imgH_; y++) {
                    std::memcpy(
                        &inputData[c * imgH_ * imgW + y * imgW],
                        &normalizedData[c * imgH_ * resizedW + y * resizedW],
                        resizedW * sizeof(float));
                }
            }
        } else {
            // Already wide enough — take the buffer as-is.
            inputData = std::move(normalizedData);
        }
        // Create input tensor with (possibly padded) width.
        // CreateTensor wraps inputData's buffer without copying — inputData
        // must stay alive until Run() returns (it does; same scope).
        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, imgW };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, inputData.data(), inputData.size(),
            inputShape.data(), inputShape.size());
        // Run inference
        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);
        // Get output: [1, seqLen, numClasses] per-timestep class scores
        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
        int seqLen = static_cast<int>(outputShape[1]);
        int numClasses = static_cast<int>(outputShape[2]);
        return CTCDecode(outputData, seqLen, numClasses);
    }
    catch (const Ort::Exception& e) {
        std::cerr << "[ONNXOCRRecognizer] Inference failed: " << e.what() << std::endl;
        return {};
    }
}
// Recognize a list of crops. Widths differ per image, which rules out real
// batching, so each crop is run through Recognize() sequentially. The
// result vector is index-aligned with croppedImages.
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> lines;
    lines.reserve(croppedImages.size());
    for (const auto& crop : croppedImages) {
        lines.push_back(Recognize(crop));
    }
    return lines;
}
// Greedy CTC decode of a [seqLen x numClasses] score matrix: take the
// argmax per timestep, drop blanks (index 0) and repeats, and average the
// kept argmax values as the line confidence. keys_[0] is the blank "#".
TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine decoded;
    float scoreSum = 0.0f;
    int scoreCount = 0;
    int prevIdx = 0;  // previous argmax; starts at blank
    for (int t = 0; t < seqLen; t++) {
        const float* step = outputData + t * numClasses;
        const float* best = std::max_element(step, step + numClasses);
        const int bestIdx = static_cast<int>(best - step);
        // Emit only non-blank, non-repeated, in-dictionary indices.
        if (bestIdx != 0 && bestIdx != prevIdx &&
            bestIdx < static_cast<int>(keys_.size())) {
            decoded.text += keys_[bestIdx];
            // Raw output is already a probability (PP-OCRv5 ends in softmax).
            scoreSum += *best;
            scoreCount++;
        }
        prevIdx = bestIdx;
    }
    if (scoreCount > 0) {
        decoded.score = scoreSum / static_cast<float>(scoreCount);
    }
    return decoded;
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,40 @@
#pragma once
#include "ONNXOCRTypes.h"
#include "ONNXEngine.h"
#include <vector>
#include <string>
#include <mutex>
namespace ANSCENTER {
namespace onnxocr {
// CTC text recognizer (PP-OCRv5 rec model, ONNX).
// Requires LoadDictionary() before Recognize(); calls are serialized.
class ONNXOCRRecognizer : public BasicOrtHandler {
public:
    explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRRecognizer() override = default;
    // Load character dictionary (must be called before Recognize)
    bool LoadDictionary(const std::string& dictPath);
    // Recognize text from a single cropped text image; returns an empty
    // TextLine on failure.
    TextLine Recognize(const cv::Mat& croppedImage);
    // Batch recognition for multiple cropped images (sequential internally;
    // output is index-aligned with the input)
    std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);
private:
    // BasicOrtHandler hooks; Recognize() builds its own tensors.
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
    // CTC greedy decode
    TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);
    // Label space: keys_[0] = "#" (CTC blank), last entry = " " (space).
    std::vector<std::string> keys_;
    // Fixed input height and maximum input width for recognition.
    int imgH_ = kRecImgH;
    int imgMaxW_ = kRecImgMaxW;
    // Guards the shared ORT session across calling threads.
    std::mutex _mutex;
};
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,212 @@
#pragma once
#include <string>
#include <vector>
#include <array>
#include <fstream>
#include <algorithm>
#include <numeric>
#include <cmath>
#include <opencv2/core.hpp>
#include <opencv2/imgproc.hpp>
namespace ANSCENTER {
namespace onnxocr {
// Detection normalization constants (BGR channel order, matching PaddleOCR official)
// PaddleOCR config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
// Applied directly to BGR channels WITHOUT BGR→RGB conversion:
// Channel 0 (B) → mean=0.485, std=0.229
// Channel 1 (G) → mean=0.456, std=0.224
// Channel 2 (R) → mean=0.406, std=0.225
constexpr float kDetMean0 = 0.485f; // B channel
constexpr float kDetMean1 = 0.456f; // G channel
constexpr float kDetMean2 = 0.406f; // R channel
constexpr float kDetStd0 = 0.229f; // B channel
constexpr float kDetStd1 = 0.224f; // G channel
constexpr float kDetStd2 = 0.225f; // R channel
// Scale applied before mean/std: maps 8-bit pixel values into [0, 1].
constexpr float kScale = 1.0f / 255.0f;
// Detection defaults (PP-OCRv5 server: limit_type=max, limit_side_len=960)
constexpr int kDetMaxSideLen = 960;
constexpr int kDetMaxSideLimit = 4000; // Safety cap on max dimension
constexpr float kDetDbThresh = 0.3f;
constexpr float kDetBoxThresh = 0.6f;
constexpr float kDetUnclipRatio = 1.5f;
// Upper bound on contours considered per image in DB postprocessing.
constexpr int kDetMaxCandidates = 1000;
// Classifier defaults (PP-LCNet_x1_0_textline_ori model)
// Input: [B, 3, 80, 160], ImageNet normalization, 2-class (0°/180°)
// Direct resize to 80x160 (no aspect ratio preservation)
constexpr int kClsImageH = 80;
constexpr int kClsImageW = 160;
constexpr float kClsThresh = 0.9f;
// Recognition defaults
constexpr int kRecImgH = 48;
constexpr int kRecImgW = 320; // Default rec width (PP-OCRv5 rec_image_shape[2]=320, min padded width)
constexpr int kRecImgMaxW = 960; // Allow wide recognition input for long text lines
constexpr int kRecBatchSize = 6;
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
struct TextBox {
    std::array<cv::Point2f, 4> points;
    float score = 0.0f;  // mean DB probability inside the box
};
// A single recognized text line
struct TextLine {
    std::string text;    // UTF-8 decoded characters
    float score = 0.0f;  // mean confidence over emitted characters
};
// OCR result matching PaddleOCR::OCRPredictResult format
struct OCRPredictResult {
    std::vector<std::vector<int>> box; // 4 corner points [[x,y], ...]
    std::string text;                  // recognized line text
    float score = -1.0f;               // recognition confidence (-1 = not set)
    float cls_score = 0.0f;            // orientation classifier confidence
    int cls_label = -1;                // 0 = normal, 1 = rotated 180° (-1 = not run)
};
// Load the recognizer character dictionary: one character per line.
// Returned layout matches the CTC label space:
//   index 0          -> "#"  (CTC blank)
//   index 1..N       -> dictionary lines (trailing CR stripped for CRLF files)
//   index N+1 (last) -> " "  (space)
// Returns an empty vector when the file cannot be opened.
inline std::vector<std::string> LoadDict(const std::string& dictPath) {
    std::ifstream in(dictPath);
    if (!in.is_open()) return {};
    std::vector<std::string> keys{ "#" };  // CTC blank token up front
    for (std::string entry; std::getline(in, entry); ) {
        if (!entry.empty() && entry.back() == '\r') {
            entry.pop_back();
        }
        keys.push_back(entry);
    }
    keys.push_back(" ");  // trailing space token
    return keys;
}
// Compute the detection input size (width, height), both multiples of 32.
// limit_type='max' semantics: shrink only when the longest side exceeds
// maxSideLen (PP-OCRv5 server default); maxSideLimit is a hard safety cap.
inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
                                      int maxSideLimit = kDetMaxSideLimit) {
    const int longest = std::max(srcH, srcW);
    const float shrink = (longest > maxSideLen)
        ? static_cast<float>(maxSideLen) / static_cast<float>(longest)
        : 1.0f;
    int h = static_cast<int>(srcH * shrink);
    int w = static_cast<int>(srcW * shrink);
    // Hard cap so pathological inputs cannot blow up memory.
    const int biggest = std::max(h, w);
    if (biggest > maxSideLimit) {
        const float cap = static_cast<float>(maxSideLimit) / static_cast<float>(biggest);
        h = static_cast<int>(h * cap);
        w = static_cast<int>(w * cap);
    }
    // Snap both sides to multiples of 32 (network stride), minimum 32.
    h = std::max(32, static_cast<int>(std::round(h / 32.0) * 32));
    w = std::max(32, static_cast<int>(std::round(w / 32.0) * 32));
    return cv::Size(w, h);
}
// Convert an HxW CV_32FC3 BGR image into a CHW float vector for detection:
// each channel is scaled to [0,1] then normalized with the ImageNet
// mean/std constants above. BGR channel order is preserved (PaddleOCR
// official applies no BGR→RGB conversion).
inline std::vector<float> NormalizeAndPermute(const cv::Mat& img) {
    const int h = img.rows;
    const int w = img.cols;
    const int plane = h * w;
    std::vector<float> chw(3 * plane);
    float* bDst = chw.data();              // channel 0 = B
    float* gDst = chw.data() + plane;      // channel 1 = G
    float* rDst = chw.data() + 2 * plane;  // channel 2 = R
    for (int y = 0; y < h; y++) {
        const float* src = img.ptr<float>(y);
        for (int x = 0; x < w; x++) {
            const int dst = y * w + x;
            bDst[dst] = (src[x * 3 + 0] * kScale - kDetMean0) / kDetStd0;
            gDst[dst] = (src[x * 3 + 1] * kScale - kDetMean1) / kDetStd1;
            rDst[dst] = (src[x * 3 + 2] * kScale - kDetMean2) / kDetStd2;
        }
    }
    return chw;
}
// Convert an HxW CV_32FC3 BGR image into a CHW float vector for the
// classifier/recognizer, using (x/255 - 0.5) / 0.5 normalization. BGR
// channel order is preserved (PaddleOCR official — no BGR→RGB conversion).
inline std::vector<float> NormalizeAndPermuteCls(const cv::Mat& img) {
    const int h = img.rows;
    const int w = img.cols;
    const int plane = h * w;
    std::vector<float> chw(3 * plane);
    for (int y = 0; y < h; y++) {
        const float* src = img.ptr<float>(y);
        for (int x = 0; x < w; x++) {
            const int dst = y * w + x;
            // Channel 0=B, 1=G, 2=R.
            chw[dst]             = (src[x * 3 + 0] * kScale - 0.5f) / 0.5f;
            chw[plane + dst]     = (src[x * 3 + 1] * kScale - 0.5f) / 0.5f;
            chw[2 * plane + dst] = (src[x * 3 + 2] * kScale - 0.5f) / 0.5f;
        }
    }
    return chw;
}
// Sort text boxes from top to bottom, left to right
inline void SortTextBoxes(std::vector<TextBox>& boxes) {
std::sort(boxes.begin(), boxes.end(),
[](const TextBox& a, const TextBox& b) {
if (std::abs(a.points[0].y - b.points[0].y) < 10.0f) {
return a.points[0].x < b.points[0].x;
}
return a.points[0].y < b.points[0].y;
});
}
// Perspective-crop the text box polygon out of srcImage and rotate it to a
// horizontal orientation if the crop is strongly vertical (rows > 1.5*cols),
// mirroring PaddleOCR's GetRotateCropImage.
inline cv::Mat GetRotateCropImage(const cv::Mat& srcImage, const TextBox& box) {
    auto pts = box.points;
    // Target width/height from the longer of each opposite edge pair.
    float width = static_cast<float>(std::max(
        cv::norm(pts[0] - pts[1]),
        cv::norm(pts[2] - pts[3])));
    float height = static_cast<float>(std::max(
        cv::norm(pts[0] - pts[3]),
        cv::norm(pts[1] - pts[2])));
    std::vector<cv::Point2f> srcPts = { pts[0], pts[1], pts[2], pts[3] };
    std::vector<cv::Point2f> dstPts = {
        {0, 0}, {width, 0}, {width, height}, {0, height}
    };
    cv::Mat M = cv::getPerspectiveTransform(srcPts, dstPts);
    cv::Mat cropped;
    // BUGFIX: cv::warpPerspective's 5th parameter is `flags` (interpolation),
    // not borderMode. The previous call passed cv::BORDER_REPLICATE there,
    // which silently selected interpolation mode 1 (INTER_LINEAR) and left
    // the border mode at its default (BORDER_CONSTANT). Pass both explicitly
    // so edge pixels are actually replicated at the crop border.
    cv::warpPerspective(srcImage, cropped, M,
                        cv::Size(static_cast<int>(width), static_cast<int>(height)),
                        cv::INTER_LINEAR, cv::BORDER_REPLICATE);
    // Strongly vertical crops are rotated 90° CCW into horizontal layout.
    if (cropped.rows > cropped.cols * 1.5f) {
        cv::Mat rotated;
        cv::transpose(cropped, rotated);
        cv::flip(rotated, rotated, 0);
        return rotated;
    }
    return cropped;
}
// Resize a recognition crop to the fixed height targetH while keeping the
// aspect ratio; the resulting width is clamped into [1, maxW].
inline cv::Mat ResizeRecImage(const cv::Mat& img, int targetH, int maxW) {
    const float scale = static_cast<float>(targetH) / static_cast<float>(img.rows);
    const int targetW = std::max(1, std::min(static_cast<int>(img.cols * scale), maxW));
    cv::Mat out;
    cv::resize(img, out, cv::Size(targetW, targetH));
    return out;
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,130 @@
#include "PaddleOCRV5Engine.h"
#include "EPLoader.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
namespace ANSCENTER {
namespace onnxocr {
// Initialize the detection -> (classification) -> recognition pipeline.
//
// Parameters are model/dictionary file paths; clsModelPath may be empty to
// skip the orientation classifier. Returns false (and resets all stages)
// if any stage fails to construct or the dictionary cannot be loaded.
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
                                   const std::string& clsModelPath,
                                   const std::string& recModelPath,
                                   const std::string& dictPath) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    try {
        // Initialize detector (also triggers EPLoader init in BasicOrtHandler)
        detector_ = std::make_unique<ONNXOCRDetector>(detModelPath);
        std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
        // Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
        // BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own
        // inline-static copy. Without this, inference calls from ANSOCR.dll crash.
        if (Ort::Global<void>::api_ == nullptr) {
            Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
        }
        // Initialize classifier (optional)
        if (!clsModelPath.empty()) {
            classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath);
            std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
        }
        else {
            // No classifier model — ocr() will skip the orientation step.
            classifier_.reset();
            std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl;
        }
        // Initialize recognizer (requires a valid dictionary to decode CTC output)
        recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath);
        if (!recognizer_->LoadDictionary(dictPath)) {
            std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
            return false;
        }
        std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
        _initialized = true;
        std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
        return true;
    }
    catch (const std::exception& e) {
        // Roll back to a clean, uninitialized state on any failure.
        std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl;
        detector_.reset();
        classifier_.reset();
        recognizer_.reset();
        _initialized = false;
        return false;
    }
}
// Run the full OCR pipeline: detect -> crop -> (orient) -> recognize.
// Returns one OCRPredictResult per detected box (integer corner coords,
// recognized text + score, and classifier label/score when enabled).
// Returns an empty vector if uninitialized, the image is empty, or nothing
// is detected.
std::vector<OCRPredictResult> PaddleOCRV5Engine::ocr(const cv::Mat& img) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);
    std::vector<OCRPredictResult> results;
    if (!_initialized || img.empty()) {
        return results;
    }
    // Step 1: Text Detection
    auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh,
                                   _detUnclipRatio, _useDilation);
    if (boxes.empty()) {
        return results;
    }
    // Step 2: Crop detected text regions.
    // BUGFIX: crops are pushed unconditionally so croppedImages stays
    // index-aligned with boxes. The previous code skipped empty crops,
    // which shifted every subsequent label/score/text one slot away from
    // its box. Downstream stages tolerate empty mats: Classify() skips
    // them and Recognize() returns an empty TextLine.
    std::vector<cv::Mat> croppedImages;
    croppedImages.reserve(boxes.size());
    for (const auto& box : boxes) {
        croppedImages.push_back(GetRotateCropImage(img, box));
    }
    // Step 3: Classification (optional orientation fix-up)
    std::vector<int> cls_labels(croppedImages.size(), 0);
    std::vector<float> cls_scores(croppedImages.size(), 0.0f);
    if (classifier_) {
        classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh);
        // Rotate images classified as upside-down (label=1 and score > threshold)
        for (size_t i = 0; i < croppedImages.size(); i++) {
            if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) {
                cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
            }
        }
    }
    // Step 4: Text Recognition
    auto textLines = recognizer_->RecognizeBatch(croppedImages);
    // Step 5: Combine results — all vectors are boxes.size() long by construction.
    for (size_t i = 0; i < boxes.size() && i < textLines.size(); i++) {
        OCRPredictResult result;
        // Convert TextBox points to box format [[x0,y0], [x1,y1], [x2,y2], [x3,y3]]
        result.box.resize(4);
        for (int j = 0; j < 4; j++) {
            result.box[j] = {
                static_cast<int>(boxes[i].points[j].x),
                static_cast<int>(boxes[i].points[j].y)
            };
        }
        result.text = textLines[i].text;
        result.score = textLines[i].score;
        result.cls_label = cls_labels[i];
        result.cls_score = cls_scores[i];
        results.push_back(result);
    }
    return results;
}
} // namespace onnxocr
} // namespace ANSCENTER

View File

@@ -0,0 +1,63 @@
#pragma once
#include "ONNXOCRTypes.h"
#include "ONNXOCRDetector.h"
#include "ONNXOCRClassifier.h"
#include "ONNXOCRRecognizer.h"
#include <memory>
#include <mutex>
#include <string>
#include <vector>
namespace ANSCENTER {
namespace onnxocr {
// PaddleOCR V5 pipeline engine: Detection -> (Classification) -> Recognition
// Mirrors the PaddleOCR::PPOCR interface for drop-in replacement
// PaddleOCR V5 pipeline engine: Detection -> (Classification) -> Recognition
// Mirrors the PaddleOCR::PPOCR interface for drop-in replacement.
// All public methods are serialized via an internal recursive mutex.
class PaddleOCRV5Engine {
public:
    PaddleOCRV5Engine() = default;
    ~PaddleOCRV5Engine() = default;
    // Initialize the OCR pipeline from ONNX model files.
    // clsModelPath can be empty to skip classification.
    // Returns false (and resets all stages) on any failure.
    bool Initialize(const std::string& detModelPath,
                    const std::string& clsModelPath,
                    const std::string& recModelPath,
                    const std::string& dictPath);
    // Run full OCR pipeline on an image.
    // Returns results matching PaddleOCR::OCRPredictResult format; empty if
    // not initialized, the image is empty, or nothing was detected.
    std::vector<OCRPredictResult> ocr(const cv::Mat& img);
    // Configuration setters (matching OCRModelConfig parameters).
    // These affect subsequent ocr() calls only.
    void SetDetMaxSideLen(int val) { _maxSideLen = val; }
    void SetDetDbThresh(float val) { _detDbThresh = val; }
    void SetDetBoxThresh(float val) { _detBoxThresh = val; }
    void SetDetUnclipRatio(float val) { _detUnclipRatio = val; }
    void SetClsThresh(float val) { _clsThresh = val; }
    void SetUseDilation(bool val) { _useDilation = val; }
private:
    std::unique_ptr<ONNXOCRDetector> detector_;
    std::unique_ptr<ONNXOCRClassifier> classifier_; // nullptr if not used
    std::unique_ptr<ONNXOCRRecognizer> recognizer_;
    // Recursive: ocr() and setters may be re-entered from the same thread.
    std::recursive_mutex _mutex;
    // Detection parameters
    int _maxSideLen = kDetMaxSideLen;
    float _detDbThresh = kDetDbThresh;
    float _detBoxThresh = kDetBoxThresh;
    float _detUnclipRatio = kDetUnclipRatio;
    bool _useDilation = false;
    // Classifier parameters
    float _clsThresh = kClsThresh;
    // Set by Initialize(); guards ocr() against use before setup.
    bool _initialized = false;
};
} // namespace onnxocr
} // namespace ANSCENTER