Initial setup for CLion
This commit is contained in:
107
ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp
Normal file
107
ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp
Normal file
@@ -0,0 +1,107 @@
|
||||
#include "ONNXOCRClassifier.h"
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
// Construct the orientation classifier; model loading and session setup are
// delegated to the shared BasicOrtHandler base.
ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads)
    : BasicOrtHandler(onnx_path, num_threads) {
}
|
||||
|
||||
// Preprocess one crop into an ORT input tensor for the orientation model.
// Resizes directly to kClsImageW x kClsImageH (no aspect-ratio preservation,
// matching PaddleOCR's official ResizeImage for PP-LCNet_x1_0_textline_ori),
// converts to float and applies ImageNet normalization (same as detection),
// then packs the CHW data into the handler's input buffer.
Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) {
    cv::Mat canvas;
    cv::resize(mat, canvas, cv::Size(kClsImageW, kClsImageH));
    canvas.convertTo(canvas, CV_32FC3);

    const std::vector<float> chw = NormalizeAndPermute(canvas);
    input_values_handler.assign(chw.begin(), chw.end());

    return Ort::Value::CreateTensor<float>(*memory_info_handler,
                                           input_values_handler.data(),
                                           input_values_handler.size(),
                                           input_node_dims.data(),
                                           input_node_dims.size());
}
|
||||
|
||||
// Batch preprocessing is not used by the classifier (Classify() iterates per
// image); fall back to transforming the first image when one is available.
Ort::Value ONNXOCRClassifier::transformBatch(const std::vector<cv::Mat>& images) {
    if (images.empty()) {
        return Ort::Value(nullptr);
    }
    return transform(images.front());
}
|
||||
|
||||
// Classify text-line orientation for each cropped image.
// Fills cls_labels / cls_scores parallel to img_list:
//   cls_labels[i]: argmax class index (0 = normal, 1 = rotated 180 degrees)
//   cls_scores[i]: raw model output at the argmax (PaddleOCR v5 models include
//                  softmax, so this is a probability)
// Empty or failed images keep the defaults label=0, score=0.
// FIX: cls_thresh was accepted but never referenced; it is kept for API parity
// with PaddleOCR's cls interface — the rotation decision (score >= cls_thresh)
// is applied by the caller, so it is explicitly voided here to document that
// and silence unused-parameter warnings.
void ONNXOCRClassifier::Classify(std::vector<cv::Mat>& img_list,
                                 std::vector<int>& cls_labels,
                                 std::vector<float>& cls_scores,
                                 float cls_thresh) {
    std::lock_guard<std::mutex> lock(_mutex);

    (void)cls_thresh;  // thresholding is the caller's responsibility

    cls_labels.clear();
    cls_scores.clear();

    if (!ort_session || img_list.empty()) return;

    cls_labels.resize(img_list.size(), 0);
    cls_scores.resize(img_list.size(), 0.0f);

    // Process one image at a time (the model accepts dynamic batch shapes but
    // this path keeps memory use flat and error handling per-image)
    for (size_t i = 0; i < img_list.size(); i++) {
        if (img_list[i].empty()) continue;

        try {
            // Preprocess: direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
            // No aspect ratio preservation — matches PaddleOCR official ResizeImage
            cv::Mat resized;
            cv::resize(img_list[i], resized, cv::Size(kClsImageW, kClsImageH));

            resized.convertTo(resized, CV_32FC3);
            // PP-LCNet uses ImageNet normalization (same as detection)
            auto inputData = NormalizeAndPermute(resized);

            std::array<int64_t, 4> inputShape = { 1, 3, kClsImageH, kClsImageW };
            Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
                *memory_info_handler, inputData.data(), inputData.size(),
                inputShape.data(), inputShape.size());

            auto outputTensors = ort_session->Run(
                Ort::RunOptions{ nullptr },
                input_node_names.data(), &inputTensor, 1,
                output_node_names.data(), num_outputs);

            float* outData = outputTensors[0].GetTensorMutableData<float>();
            auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
            // Fall back to 2 classes (0deg/180deg) if the model reports no class dim
            int numClasses = (outShape.size() > 1) ? static_cast<int>(outShape[1]) : 2;

            // Find argmax and use raw output value as score
            // PaddleOCR v5 models include softmax, so output values are probabilities
            // Matches PaddleOCR official: score = preds[i, argmax_idx]
            int maxIdx = 0;
            float maxVal = outData[0];
            for (int c = 1; c < numClasses; c++) {
                if (outData[c] > maxVal) {
                    maxVal = outData[c];
                    maxIdx = c;
                }
            }

            cls_labels[i] = maxIdx;
            cls_scores[i] = maxVal;
        }
        catch (const Ort::Exception& e) {
            // Best-effort: a failed crop keeps the "normal orientation" default
            std::cerr << "[ONNXOCRClassifier] Inference failed for image " << i
                      << ": " << e.what() << std::endl;
            cls_labels[i] = 0;
            cls_scores[i] = 0.0f;
        }
    }
}
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
32
ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h
Normal file
32
ANSOCR/ANSONNXOCR/ONNXOCRClassifier.h
Normal file
@@ -0,0 +1,32 @@
|
||||
#pragma once
|
||||
|
||||
#include "ONNXOCRTypes.h"
|
||||
#include "ONNXEngine.h"
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
// Text-line orientation classifier (PP-LCNet_x1_0_textline_ori via ONNX Runtime).
// Thread-safe: Classify() serializes access to the shared ORT session.
class ONNXOCRClassifier : public BasicOrtHandler {
public:
    explicit ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRClassifier() override = default;

    // Classify text orientation for a list of cropped images.
    // Fills cls_labels / cls_scores in parallel with img_list:
    //   cls_label: 0 = normal, 1 = rotated 180 degrees
    //   cls_score: confidence of the predicted label
    // cls_thresh is the suggested cutoff for acting on a "rotated" prediction.
    void Classify(std::vector<cv::Mat>& img_list,
                  std::vector<int>& cls_labels,
                  std::vector<float>& cls_scores,
                  float cls_thresh = kClsThresh);

private:
    // BasicOrtHandler hooks; the classifier's real path is Classify().
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

    std::mutex _mutex;  // serializes Classify() on the shared session
};
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
312
ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
Normal file
312
ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
Normal file
@@ -0,0 +1,312 @@
|
||||
#include "ONNXOCRDetector.h"
|
||||
#include "include/clipper.h"
|
||||
#include "ANSGpuFrameRegistry.h"
|
||||
#include "NV12PreprocessHelper.h"
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
// Construct the DB text detector; model loading and session setup are
// delegated to the shared BasicOrtHandler base.
ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads)
    : BasicOrtHandler(onnx_path, num_threads) {
}
|
||||
|
||||
// Minimal BasicOrtHandler hook. Detection runs through Detect() with its own
// Preprocess(); this fallback only converts to float, applies ImageNet
// normalization at the input's native size, and wraps the handler buffer.
Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
    cv::Mat asFloat;
    mat.convertTo(asFloat, CV_32FC3);

    const std::vector<float> chw = NormalizeAndPermute(asFloat);
    input_values_handler.assign(chw.begin(), chw.end());

    return Ort::Value::CreateTensor<float>(*memory_info_handler,
                                           input_values_handler.data(),
                                           input_values_handler.size(),
                                           input_node_dims.data(),
                                           input_node_dims.size());
}
|
||||
|
||||
// Batch preprocessing is unused — detection handles single images with
// dynamic shapes. Delegate the first image if present, else a null tensor.
Ort::Value ONNXOCRDetector::transformBatch(const std::vector<cv::Mat>& images) {
    if (images.empty()) {
        return Ort::Value(nullptr);
    }
    return transform(images.front());
}
|
||||
|
||||
// Run DB text detection on srcImage.
// Returns TextBoxes in srcImage (display-resolution) coordinates, sorted by
// Postprocess(). If a full-resolution NV12 frame is registered for the current
// thread, inference runs on that frame instead and box coordinates are scaled
// back down to srcImage resolution.
std::vector<TextBox> ONNXOCRDetector::Detect(const cv::Mat& srcImage,
                                             int maxSideLen,
                                             float dbThresh,
                                             float boxThresh,
                                             float unclipRatio,
                                             bool useDilation) {
    // One inference at a time per detector instance
    std::lock_guard<std::mutex> lock(_mutex);

    if (!ort_session || srcImage.empty()) {
        return {};
    }

    // Try to get full-resolution image from NV12 frame (same pattern as ANSONNXYOLO)
    cv::Mat inferenceImage = srcImage;
    float bgrScaleX = 1.0f, bgrScaleY = 1.0f;

    GpuFrameData* gpuFrame = tl_currentGpuFrame();
    // pixelFormat 23: NV12 — presumably matches the frame producer's enum; verify against ANSGpuFrameRegistry
    if (gpuFrame && gpuFrame->pixelFormat == 23 &&
        gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
        gpuFrame->width > 0 && gpuFrame->height > 0) {
        // Full-res NV12 available — convert to BGR on CPU for ONNX
        cv::Mat yPlane(gpuFrame->height, gpuFrame->width, CV_8UC1,
                       gpuFrame->cpuYPlane, gpuFrame->cpuYLinesize);
        // Interleaved UV plane: half height, full width in bytes
        cv::Mat uvPlane(gpuFrame->height / 2, gpuFrame->width, CV_8UC1,
                        gpuFrame->cpuUvPlane, gpuFrame->cpuUvLinesize);
        cv::Mat fullResBGR;
        cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
        if (!fullResBGR.empty()) {
            // Factors that map full-res coordinates back to srcImage resolution
            bgrScaleX = static_cast<float>(srcImage.cols) / fullResBGR.cols;
            bgrScaleY = static_cast<float>(srcImage.rows) / fullResBGR.rows;
            inferenceImage = fullResBGR;
        }
    }

    int resizeH, resizeW;
    float ratioH, ratioW;

    // Preprocess (using full-res image if NV12 was available)
    auto inputData = Preprocess(inferenceImage, maxSideLen, resizeH, resizeW, ratioH, ratioW);

    // Create input tensor with dynamic shape (NCHW)
    std::array<int64_t, 4> inputShape = { 1, 3, resizeH, resizeW };
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
        *memory_info_handler, inputData.data(), inputData.size(),
        inputShape.data(), inputShape.size());

    // Run inference
    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &inputTensor, 1,
        output_node_names.data(), num_outputs);

    // Get output data (probability map; shape dims [2]=H, [3]=W)
    float* outputData = outputTensors[0].GetTensorMutableData<float>();
    auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

    int outH = static_cast<int>(outputShape[2]);
    int outW = static_cast<int>(outputShape[3]);

    // Postprocess — detection coords are relative to inferenceImage (full-res),
    // then scaled back to srcImage (display-res) coordinates
    auto boxes = Postprocess(outputData, outH, outW, ratioH, ratioW,
                             inferenceImage.rows, inferenceImage.cols,
                             dbThresh, boxThresh, unclipRatio, useDilation);

    // Rescale box coordinates from full-res to display-res
    if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
        for (auto& box : boxes) {
            for (auto& pt : box.points) {
                pt.x *= bgrScaleX;
                pt.y *= bgrScaleY;
            }
        }
    }

    return boxes;
}
|
||||
|
||||
// Resize srcImage for the DB model (dimensions snapped to multiples of 32),
// convert to float and produce ImageNet-normalized CHW data.
// Out-params: resizeH/resizeW = network input size; ratioH/ratioW = factors
// that map network coordinates back to srcImage coordinates.
std::vector<float> ONNXOCRDetector::Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                               int& resizeH, int& resizeW,
                                               float& ratioH, float& ratioW) {
    const cv::Size target = ComputeDetResizeShape(srcImage.rows, srcImage.cols, maxSideLen);

    resizeH = target.height;
    resizeW = target.width;
    ratioH = static_cast<float>(srcImage.rows) / resizeH;
    ratioW = static_cast<float>(srcImage.cols) / resizeW;

    cv::Mat scaled;
    cv::resize(srcImage, scaled, target);
    scaled.convertTo(scaled, CV_32FC3);

    return NormalizeAndPermute(scaled);
}
|
||||
|
||||
// Matches PaddleOCR official DBPostProcess.boxes_from_bitmap flow:
// threshold -> (optional dilate) -> contours -> score filter -> unclip ->
// re-box -> scale to source coordinates -> sort.
std::vector<TextBox> ONNXOCRDetector::Postprocess(const float* outputData, int outH, int outW,
                                                  float ratioH, float ratioW,
                                                  int srcH, int srcW,
                                                  float dbThresh, float boxThresh,
                                                  float unclipRatio, bool useDilation) {
    // Create probability map from output — wraps the ONNX output buffer without
    // copying; the const_cast is required by the cv::Mat ctor but the map is only read
    cv::Mat probMap(outH, outW, CV_32FC1, const_cast<float*>(outputData));

    // Binary threshold
    cv::Mat binaryMap;
    cv::threshold(probMap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
    binaryMap.convertTo(binaryMap, CV_8UC1);

    // Optional dilation (PaddleOCR default: use_dilation=False, kernel=[[1,1],[1,1]])
    if (useDilation) {
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
        cv::dilate(binaryMap, binaryMap, kernel);
    }

    // Find contours
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);

    std::vector<TextBox> boxes;
    // Cap candidate count (PaddleOCR max_candidates)
    int numContours = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);

    for (int i = 0; i < numContours; i++) {
        if (contours[i].size() < 4) continue;

        // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
        cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
        float sside = std::min(minRect.size.width, minRect.size.height);
        if (sside < 3.0f) continue;  // min_size filter

        auto ordered = GetMiniBoxes(minRect);

        // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
        float score = BoxScoreFast(probMap, ordered);
        if (score < boxThresh) continue;

        // Step 3: UnclipPolygon - expand the 4-point box
        auto expanded = UnclipPolygon(ordered, unclipRatio);
        if (expanded.size() < 4) continue;

        // Step 4: Re-compute GetMiniBoxes on the expanded polygon
        std::vector<cv::Point> expandedInt;
        expandedInt.reserve(expanded.size());
        for (auto& p : expanded) {
            expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
        }
        cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);

        // Filter by min_size + 2 = 5 (matches PaddleOCR official)
        float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
        if (expandedSside < 5.0f) continue;

        auto expandedOrdered = GetMiniBoxes(expandedRect);

        // Step 5: Scale to original image coordinates, clamped into bounds
        TextBox box;
        for (int j = 0; j < 4; j++) {
            box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(srcW - 1));
            box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(srcH - 1));
        }
        box.score = score;
        boxes.push_back(box);
    }

    // Order boxes for reading (helper defined elsewhere in the project)
    SortTextBoxes(boxes);
    return boxes;
}
|
||||
|
||||
// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
|
||||
std::array<cv::Point2f, 4> ONNXOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
|
||||
cv::Point2f vertices[4];
|
||||
rect.points(vertices);
|
||||
|
||||
// Sort all 4 points by x-coordinate ascending
|
||||
std::sort(vertices, vertices + 4,
|
||||
[](const cv::Point2f& a, const cv::Point2f& b) { return a.x < b.x; });
|
||||
|
||||
// Left two (indices 0,1): smaller y = top-left, larger y = bottom-left
|
||||
cv::Point2f topLeft, bottomLeft;
|
||||
if (vertices[0].y <= vertices[1].y) {
|
||||
topLeft = vertices[0];
|
||||
bottomLeft = vertices[1];
|
||||
} else {
|
||||
topLeft = vertices[1];
|
||||
bottomLeft = vertices[0];
|
||||
}
|
||||
|
||||
// Right two (indices 2,3): smaller y = top-right, larger y = bottom-right
|
||||
cv::Point2f topRight, bottomRight;
|
||||
if (vertices[2].y <= vertices[3].y) {
|
||||
topRight = vertices[2];
|
||||
bottomRight = vertices[3];
|
||||
} else {
|
||||
topRight = vertices[3];
|
||||
bottomRight = vertices[2];
|
||||
}
|
||||
|
||||
// Order: [TL, TR, BR, BL] (clockwise from top-left)
|
||||
return { topLeft, topRight, bottomRight, bottomLeft };
|
||||
}
|
||||
|
||||
// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
|
||||
float ONNXOCRDetector::BoxScoreFast(const cv::Mat& probMap,
|
||||
const std::array<cv::Point2f, 4>& box) {
|
||||
int h = probMap.rows;
|
||||
int w = probMap.cols;
|
||||
|
||||
// Get bounding rectangle with proper clamping (matches PaddleOCR official)
|
||||
float minX = std::min({box[0].x, box[1].x, box[2].x, box[3].x});
|
||||
float maxX = std::max({box[0].x, box[1].x, box[2].x, box[3].x});
|
||||
float minY = std::min({box[0].y, box[1].y, box[2].y, box[3].y});
|
||||
float maxY = std::max({box[0].y, box[1].y, box[2].y, box[3].y});
|
||||
|
||||
int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
|
||||
int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
|
||||
int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
|
||||
int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
|
||||
|
||||
if (xmin >= xmax || ymin >= ymax) return 0.0f;
|
||||
|
||||
cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
|
||||
|
||||
std::vector<cv::Point> pts(4);
|
||||
for (int j = 0; j < 4; j++) {
|
||||
pts[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
|
||||
static_cast<int>(box[j].y) - ymin);
|
||||
}
|
||||
std::vector<std::vector<cv::Point>> polys = { pts };
|
||||
cv::fillPoly(mask, polys, cv::Scalar(1));
|
||||
|
||||
cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
|
||||
return static_cast<float>(cv::mean(roiMap, mask)[0]);
|
||||
}
|
||||
|
||||
// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
|
||||
std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
|
||||
float unclipRatio) {
|
||||
// Compute area using Shoelace formula and perimeter
|
||||
float area = 0.0f;
|
||||
float perimeter = 0.0f;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
int j = (i + 1) % 4;
|
||||
area += box[i].x * box[j].y - box[j].x * box[i].y;
|
||||
float dx = box[j].x - box[i].x;
|
||||
float dy = box[j].y - box[i].y;
|
||||
perimeter += std::sqrt(dx * dx + dy * dy);
|
||||
}
|
||||
area = std::abs(area) * 0.5f;
|
||||
if (perimeter < 1.0f) return {};
|
||||
|
||||
float distance = area * unclipRatio / perimeter;
|
||||
|
||||
ClipperLib::Path clipperPath;
|
||||
for (int i = 0; i < 4; i++) {
|
||||
clipperPath.push_back({ static_cast<ClipperLib::cInt>(box[i].x),
|
||||
static_cast<ClipperLib::cInt>(box[i].y) });
|
||||
}
|
||||
|
||||
ClipperLib::ClipperOffset offset;
|
||||
offset.AddPath(clipperPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
|
||||
|
||||
ClipperLib::Paths solution;
|
||||
offset.Execute(solution, distance);
|
||||
|
||||
if (solution.empty() || solution[0].empty()) return {};
|
||||
|
||||
std::vector<cv::Point2f> result;
|
||||
for (auto& p : solution[0]) {
|
||||
result.push_back(cv::Point2f(static_cast<float>(p.X), static_cast<float>(p.Y)));
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
52
ANSOCR/ANSONNXOCR/ONNXOCRDetector.h
Normal file
52
ANSOCR/ANSONNXOCR/ONNXOCRDetector.h
Normal file
@@ -0,0 +1,52 @@
|
||||
#pragma once
|
||||
|
||||
#include "ONNXOCRTypes.h"
|
||||
#include "ONNXEngine.h"
|
||||
#include <vector>
|
||||
#include <mutex>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
// DB-style text detector (PP-OCRv5 det model via ONNX Runtime).
// Thread-safe: Detect() serializes access to the shared ORT session.
class ONNXOCRDetector : public BasicOrtHandler {
public:
    explicit ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRDetector() override = default;

    // Run text detection on an image.
    // Returns quadrilateral boxes in srcImage coordinates with scores.
    std::vector<TextBox> Detect(const cv::Mat& srcImage,
                                int maxSideLen = kDetMaxSideLen,
                                float dbThresh = kDetDbThresh,
                                float boxThresh = kDetBoxThresh,
                                float unclipRatio = kDetUnclipRatio,
                                bool useDilation = false);

private:
    // BasicOrtHandler hooks; the detector's real path is Detect().
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

    // Preprocessing: resize to multiples of 32 + ImageNet-normalized CHW data.
    std::vector<float> Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                  int& resizeH, int& resizeW,
                                  float& ratioH, float& ratioW);

    // Postprocessing: threshold -> contours -> boxes (matches PaddleOCR official flow)
    std::vector<TextBox> Postprocess(const float* outputData, int outH, int outW,
                                     float ratioH, float ratioW, int srcH, int srcW,
                                     float dbThresh, float boxThresh, float unclipRatio,
                                     bool useDilation);

    // Get ordered 4 corners [TL, TR, BR, BL] from rotated rect (matches PaddleOCR GetMiniBoxes)
    std::array<cv::Point2f, 4> GetMiniBoxes(const cv::RotatedRect& rect);

    // Compute mean score inside box polygon on the probability map
    float BoxScoreFast(const cv::Mat& probMap, const std::array<cv::Point2f, 4>& box);

    // Expand 4-point box using Clipper offset
    std::vector<cv::Point2f> UnclipPolygon(const std::array<cv::Point2f, 4>& box, float unclipRatio);

    std::mutex _mutex;  // serializes Detect() on the shared session
};
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
165
ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
Normal file
165
ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.cpp
Normal file
@@ -0,0 +1,165 @@
|
||||
#include "ONNXOCRRecognizer.h"
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <cmath>
|
||||
#include <cfloat>
|
||||
#include <cstring>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
// Construct the CTC text recognizer; model loading and session setup are
// delegated to the shared BasicOrtHandler base. LoadDictionary() must be
// called before Recognize().
ONNXOCRRecognizer::ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads)
    : BasicOrtHandler(onnx_path, num_threads) {
}
|
||||
|
||||
// Load the CTC character dictionary. Fails (returns false) when the file is
// missing or yields fewer than two entries — a usable dictionary needs at
// least the blank token plus one character.
bool ONNXOCRRecognizer::LoadDictionary(const std::string& dictPath) {
    keys_ = LoadDict(dictPath);

    const bool usable = keys_.size() >= 2;
    if (!usable) {
        std::cerr << "[ONNXOCRRecognizer] Failed to load dictionary: " << dictPath << std::endl;
        return false;
    }

    std::cout << "[ONNXOCRRecognizer] Loaded dictionary with " << keys_.size()
              << " characters from: " << dictPath << std::endl;
    return true;
}
|
||||
|
||||
// Minimal BasicOrtHandler hook. Recognition runs through Recognize() with its
// own dynamic-width preprocessing; this fallback resizes to the recognizer
// shape, applies (x/255 - 0.5)/0.5 normalization and wraps the handler buffer.
Ort::Value ONNXOCRRecognizer::transform(const cv::Mat& mat) {
    cv::Mat scaled = ResizeRecImage(mat, imgH_, imgMaxW_);
    scaled.convertTo(scaled, CV_32FC3);

    const std::vector<float> chw = NormalizeAndPermuteCls(scaled);
    input_values_handler.assign(chw.begin(), chw.end());

    return Ort::Value::CreateTensor<float>(*memory_info_handler,
                                           input_values_handler.data(),
                                           input_values_handler.size(),
                                           input_node_dims.data(),
                                           input_node_dims.size());
}
|
||||
|
||||
// Batch preprocessing is unused — each crop has its own dynamic width.
// Delegate the first image if present, else a null tensor.
Ort::Value ONNXOCRRecognizer::transformBatch(const std::vector<cv::Mat>& images) {
    if (images.empty()) {
        return Ort::Value(nullptr);
    }
    return transform(images.front());
}
|
||||
|
||||
// Recognize text in a single cropped line image.
// Pipeline: resize (fixed height, proportional width) -> normalize ->
// right-pad to at least kRecImgW -> ONNX inference -> CTC greedy decode.
// Returns a default-constructed TextLine on any failure.
TextLine ONNXOCRRecognizer::Recognize(const cv::Mat& croppedImage) {
    // One inference at a time per recognizer instance
    std::lock_guard<std::mutex> lock(_mutex);

    if (!ort_session || croppedImage.empty() || keys_.empty()) {
        return {};
    }

    try {
        // Preprocess: resize to fixed height, proportional width
        cv::Mat resized = ResizeRecImage(croppedImage, imgH_, imgMaxW_);
        int resizedW = resized.cols;

        resized.convertTo(resized, CV_32FC3);
        // Recognition uses (pixel/255 - 0.5) / 0.5 normalization (same as classifier)
        auto normalizedData = NormalizeAndPermuteCls(resized);

        // Pad to at least kRecImgW width (matching official PaddleOCR behavior)
        // Official PaddleOCR: padding_im = np.zeros((C, H, W)), then copies normalized
        // image into left portion. Padding value = 0.0 in normalized space.
        int imgW = std::max(resizedW, kRecImgW);

        std::vector<float> inputData;
        if (imgW > resizedW) {
            // Zero-pad on the right (CHW layout): copy each source row of each
            // channel into the wider destination row, leaving the tail at 0.0f
            inputData.resize(3 * imgH_ * imgW, 0.0f);
            for (int c = 0; c < 3; c++) {
                for (int y = 0; y < imgH_; y++) {
                    std::memcpy(
                        &inputData[c * imgH_ * imgW + y * imgW],
                        &normalizedData[c * imgH_ * resizedW + y * resizedW],
                        resizedW * sizeof(float));
                }
            }
        } else {
            // Already wide enough — take the buffer without copying
            inputData = std::move(normalizedData);
        }

        // Create input tensor with (possibly padded) width
        std::array<int64_t, 4> inputShape = { 1, 3, imgH_, imgW };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, inputData.data(), inputData.size(),
            inputShape.data(), inputShape.size());

        // Run inference
        auto outputTensors = ort_session->Run(
            Ort::RunOptions{ nullptr },
            input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);

        // Get output — assumes shape [1, seqLen, numClasses] (CTC logits/probs)
        float* outputData = outputTensors[0].GetTensorMutableData<float>();
        auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

        int seqLen = static_cast<int>(outputShape[1]);
        int numClasses = static_cast<int>(outputShape[2]);

        return CTCDecode(outputData, seqLen, numClasses);
    }
    catch (const Ort::Exception& e) {
        // Best-effort: a failed crop yields an empty result
        std::cerr << "[ONNXOCRRecognizer] Inference failed: " << e.what() << std::endl;
        return {};
    }
}
|
||||
|
||||
// Recognize a list of cropped line images sequentially.
// Each crop has its own dynamic width, so images are processed one at a time;
// the result vector is index-aligned with the input.
std::vector<TextLine> ONNXOCRRecognizer::RecognizeBatch(const std::vector<cv::Mat>& croppedImages) {
    std::vector<TextLine> lines;
    lines.reserve(croppedImages.size());

    for (const cv::Mat& crop : croppedImages) {
        lines.push_back(Recognize(crop));
    }

    return lines;
}
|
||||
|
||||
// CTC greedy decode over [seqLen, numClasses] output: take the argmax at each
// timestep, drop blanks (index 0) and consecutive repeats, and map indices
// through keys_ (keys_[0]="#" blank, keys_[1..] characters).
// Score is the mean of the kept timesteps' raw argmax values (PaddleOCR v5
// models include softmax, so these are probabilities).
TextLine ONNXOCRRecognizer::CTCDecode(const float* outputData, int seqLen, int numClasses) {
    TextLine line;
    float scoreSum = 0.0f;
    int keptCount = 0;
    int prevIdx = 0;  // CTC blank is index 0

    for (int t = 0; t < seqLen; t++) {
        const float* stepProbs = outputData + t * numClasses;

        // Argmax for this timestep (first occurrence wins on ties)
        const float* bestIt = std::max_element(stepProbs, stepProbs + numClasses);
        const int bestIdx = static_cast<int>(bestIt - stepProbs);

        // Keep only non-blank, non-repeated, in-dictionary indices
        if (bestIdx != 0 && bestIdx != prevIdx &&
            bestIdx < static_cast<int>(keys_.size())) {
            line.text += keys_[bestIdx];
            scoreSum += *bestIt;
            keptCount++;
        }
        prevIdx = bestIdx;
    }

    if (keptCount > 0) {
        line.score = scoreSum / static_cast<float>(keptCount);
    }
    return line;
}
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
40
ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h
Normal file
40
ANSOCR/ANSONNXOCR/ONNXOCRRecognizer.h
Normal file
@@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
|
||||
#include "ONNXOCRTypes.h"
|
||||
#include "ONNXEngine.h"
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <mutex>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
// CTC text recognizer (PP-OCRv5 rec model via ONNX Runtime).
// Thread-safe: Recognize() serializes access to the shared ORT session.
// LoadDictionary() must succeed before Recognize() returns useful results.
class ONNXOCRRecognizer : public BasicOrtHandler {
public:
    explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1);
    ~ONNXOCRRecognizer() override = default;

    // Load character dictionary (must be called before Recognize)
    bool LoadDictionary(const std::string& dictPath);

    // Recognize text from a single cropped text image
    TextLine Recognize(const cv::Mat& croppedImage);

    // Batch recognition for multiple cropped images (sequential; result is
    // index-aligned with the input)
    std::vector<TextLine> RecognizeBatch(const std::vector<cv::Mat>& croppedImages);

private:
    // BasicOrtHandler hooks; the recognizer's real path is Recognize().
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;

    // CTC greedy decode
    TextLine CTCDecode(const float* outputData, int seqLen, int numClasses);

    std::vector<std::string> keys_;  // blank "#" + dictionary chars + " "
    int imgH_ = kRecImgH;            // fixed recognition input height
    int imgMaxW_ = kRecImgMaxW;      // width cap for very long lines
    std::mutex _mutex;               // serializes Recognize() on the shared session
};
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
212
ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
Normal file
212
ANSOCR/ANSONNXOCR/ONNXOCRTypes.h
Normal file
@@ -0,0 +1,212 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
#include <array>
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <numeric>
|
||||
#include <cmath>
|
||||
#include <opencv2/core.hpp>
|
||||
#include <opencv2/imgproc.hpp>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
// Detection normalization constants (BGR channel order, matching PaddleOCR official)
// PaddleOCR config: mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
// Applied directly to BGR channels WITHOUT BGR→RGB conversion:
//   Channel 0 (B) → mean=0.485, std=0.229
//   Channel 1 (G) → mean=0.456, std=0.224
//   Channel 2 (R) → mean=0.406, std=0.225
constexpr float kDetMean0 = 0.485f; // B channel mean
constexpr float kDetMean1 = 0.456f; // G channel mean
constexpr float kDetMean2 = 0.406f; // R channel mean
constexpr float kDetStd0 = 0.229f;  // B channel std
constexpr float kDetStd1 = 0.224f;  // G channel std
constexpr float kDetStd2 = 0.225f;  // R channel std
constexpr float kScale = 1.0f / 255.0f; // uint8 → [0,1] scale factor

// Detection defaults (PP-OCRv5 server: limit_type=max, limit_side_len=960)
constexpr int kDetMaxSideLen = 960;       // target longest side before inference
constexpr int kDetMaxSideLimit = 4000;    // Safety cap on max dimension
constexpr float kDetDbThresh = 0.3f;      // DB probability-map binarization threshold
constexpr float kDetBoxThresh = 0.6f;     // minimum mean score to keep a box
constexpr float kDetUnclipRatio = 1.5f;   // box expansion ratio (unclip)
constexpr int kDetMaxCandidates = 1000;   // max contours considered per image

// Classifier defaults (PP-LCNet_x1_0_textline_ori model)
// Input: [B, 3, 80, 160], ImageNet normalization, 2-class (0°/180°)
// Direct resize to 80x160 (no aspect ratio preservation)
constexpr int kClsImageH = 80;
constexpr int kClsImageW = 160;
constexpr float kClsThresh = 0.9f; // min score to act on a "rotated" prediction

// Recognition defaults
constexpr int kRecImgH = 48;       // fixed recognition input height
constexpr int kRecImgW = 320;      // Default rec width (PP-OCRv5 rec_image_shape[2]=320, min padded width)
constexpr int kRecImgMaxW = 960;   // Allow wide recognition input for long text lines
constexpr int kRecBatchSize = 6;   // suggested batch size for callers
|
||||
|
||||
// A detected text box: 4 corner points (top-left, top-right, bottom-right, bottom-left)
struct TextBox {
    std::array<cv::Point2f, 4> points;  // ordered clockwise from top-left
    float score = 0.0f;                 // mean probability inside the box polygon
};
|
||||
|
||||
// A single recognized text line
struct TextLine {
    std::string text;    // decoded characters (empty when nothing recognized)
    float score = 0.0f;  // mean per-character confidence
};
|
||||
|
||||
// OCR result matching PaddleOCR::OCRPredictResult format
struct OCRPredictResult {
    std::vector<std::vector<int>> box; // 4 corner points [[x,y], ...]
    std::string text;                  // recognized text
    float score = -1.0f;               // recognition confidence (-1 = not recognized)
    float cls_score = 0.0f;            // orientation classifier confidence
    int cls_label = -1;                // orientation label (-1 = not classified)
};
|
||||
|
||||
// Read a character dictionary (one entry per line) and wrap it for CTC
// decoding: a "#" blank token is inserted at index 0 and a space entry is
// appended at the end. Trailing '\r' is stripped so CRLF files work.
// Returns an empty vector when the file cannot be opened.
inline std::vector<std::string> LoadDict(const std::string& dictPath) {
    std::vector<std::string> keys;
    std::ifstream file(dictPath);
    if (!file.is_open()) return keys;

    for (std::string entry; std::getline(file, entry);) {
        // Tolerate Windows line endings
        if (!entry.empty() && entry.back() == '\r') {
            entry.pop_back();
        }
        keys.push_back(entry);
    }

    keys.insert(keys.begin(), "#");  // CTC blank token at index 0
    keys.push_back(" ");             // space character at the end
    return keys;
}
|
||||
|
||||
// Compute resize dimensions for detection model (multiples of 32)
|
||||
// limit_type='max': scale down if max side > maxSideLen (PP-OCRv5 server default)
|
||||
// maxSideLimit: safety cap on final max dimension (default 4000)
|
||||
inline cv::Size ComputeDetResizeShape(int srcH, int srcW, int maxSideLen,
|
||||
int maxSideLimit = kDetMaxSideLimit) {
|
||||
float ratio = 1.0f;
|
||||
int maxSide = std::max(srcH, srcW);
|
||||
if (maxSide > maxSideLen) {
|
||||
ratio = static_cast<float>(maxSideLen) / static_cast<float>(maxSide);
|
||||
}
|
||||
int newH = static_cast<int>(srcH * ratio);
|
||||
int newW = static_cast<int>(srcW * ratio);
|
||||
|
||||
// Safety cap: clamp if either dimension exceeds maxSideLimit
|
||||
if (std::max(newH, newW) > maxSideLimit) {
|
||||
float clampRatio = static_cast<float>(maxSideLimit) / static_cast<float>(std::max(newH, newW));
|
||||
newH = static_cast<int>(newH * clampRatio);
|
||||
newW = static_cast<int>(newW * clampRatio);
|
||||
}
|
||||
|
||||
newH = std::max(32, static_cast<int>(std::round(newH / 32.0) * 32));
|
||||
newW = std::max(32, static_cast<int>(std::round(newW / 32.0) * 32));
|
||||
return cv::Size(newW, newH);
|
||||
}
|
||||
|
||||
// Normalize BGR float image to CHW BGR vector for detection
|
||||
// BGR channel order preserved (matching PaddleOCR official - no BGR→RGB conversion)
|
||||
inline std::vector<float> NormalizeAndPermute(const cv::Mat& img) {
|
||||
int h = img.rows;
|
||||
int w = img.cols;
|
||||
std::vector<float> result(3 * h * w);
|
||||
for (int y = 0; y < h; y++) {
|
||||
const float* row = img.ptr<float>(y);
|
||||
for (int x = 0; x < w; x++) {
|
||||
float b = row[x * 3 + 0];
|
||||
float g = row[x * 3 + 1];
|
||||
float r = row[x * 3 + 2];
|
||||
// BGR order: channel 0=B, 1=G, 2=R (matching PaddleOCR official)
|
||||
result[0 * h * w + y * w + x] = (b * kScale - kDetMean0) / kDetStd0;
|
||||
result[1 * h * w + y * w + x] = (g * kScale - kDetMean1) / kDetStd1;
|
||||
result[2 * h * w + y * w + x] = (r * kScale - kDetMean2) / kDetStd2;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Normalize for classifier and recognizer: (x/255 - 0.5) / 0.5
|
||||
// BGR channel order preserved (matching PaddleOCR official - no BGR→RGB conversion)
|
||||
inline std::vector<float> NormalizeAndPermuteCls(const cv::Mat& img) {
|
||||
int h = img.rows;
|
||||
int w = img.cols;
|
||||
std::vector<float> result(3 * h * w);
|
||||
for (int y = 0; y < h; y++) {
|
||||
const float* row = img.ptr<float>(y);
|
||||
for (int x = 0; x < w; x++) {
|
||||
float b = row[x * 3 + 0];
|
||||
float g = row[x * 3 + 1];
|
||||
float r = row[x * 3 + 2];
|
||||
// BGR order: channel 0=B, 1=G, 2=R (matching PaddleOCR official)
|
||||
result[0 * h * w + y * w + x] = (b * kScale - 0.5f) / 0.5f;
|
||||
result[1 * h * w + y * w + x] = (g * kScale - 0.5f) / 0.5f;
|
||||
result[2 * h * w + y * w + x] = (r * kScale - 0.5f) / 0.5f;
|
||||
}
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
// Sort text boxes from top to bottom, left to right
|
||||
inline void SortTextBoxes(std::vector<TextBox>& boxes) {
|
||||
std::sort(boxes.begin(), boxes.end(),
|
||||
[](const TextBox& a, const TextBox& b) {
|
||||
if (std::abs(a.points[0].y - b.points[0].y) < 10.0f) {
|
||||
return a.points[0].x < b.points[0].x;
|
||||
}
|
||||
return a.points[0].y < b.points[0].y;
|
||||
});
|
||||
}
|
||||
|
||||
// Get rotated and cropped image from text box polygon
|
||||
inline cv::Mat GetRotateCropImage(const cv::Mat& srcImage, const TextBox& box) {
|
||||
auto pts = box.points;
|
||||
float width = static_cast<float>(std::max(
|
||||
cv::norm(pts[0] - pts[1]),
|
||||
cv::norm(pts[2] - pts[3])));
|
||||
float height = static_cast<float>(std::max(
|
||||
cv::norm(pts[0] - pts[3]),
|
||||
cv::norm(pts[1] - pts[2])));
|
||||
|
||||
std::vector<cv::Point2f> srcPts = { pts[0], pts[1], pts[2], pts[3] };
|
||||
std::vector<cv::Point2f> dstPts = {
|
||||
{0, 0}, {width, 0}, {width, height}, {0, height}
|
||||
};
|
||||
|
||||
cv::Mat M = cv::getPerspectiveTransform(srcPts, dstPts);
|
||||
cv::Mat cropped;
|
||||
cv::warpPerspective(srcImage, cropped, M,
|
||||
cv::Size(static_cast<int>(width), static_cast<int>(height)),
|
||||
cv::BORDER_REPLICATE);
|
||||
|
||||
if (cropped.rows > cropped.cols * 1.5f) {
|
||||
cv::Mat rotated;
|
||||
cv::transpose(cropped, rotated);
|
||||
cv::flip(rotated, rotated, 0);
|
||||
return rotated;
|
||||
}
|
||||
return cropped;
|
||||
}
|
||||
|
||||
// Resize recognition image to fixed height, proportional width
|
||||
inline cv::Mat ResizeRecImage(const cv::Mat& img, int targetH, int maxW) {
|
||||
float ratio = static_cast<float>(targetH) / img.rows;
|
||||
int targetW = static_cast<int>(img.cols * ratio);
|
||||
targetW = std::min(targetW, maxW);
|
||||
targetW = std::max(targetW, 1);
|
||||
|
||||
cv::Mat resized;
|
||||
cv::resize(img, resized, cv::Size(targetW, targetH));
|
||||
return resized;
|
||||
}
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
130
ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
Normal file
130
ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.cpp
Normal file
@@ -0,0 +1,130 @@
|
||||
#include "PaddleOCRV5Engine.h"
|
||||
#include "EPLoader.h"
|
||||
|
||||
#include <opencv2/imgproc.hpp>
|
||||
#include <iostream>
|
||||
#include <algorithm>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
bool PaddleOCRV5Engine::Initialize(const std::string& detModelPath,
|
||||
const std::string& clsModelPath,
|
||||
const std::string& recModelPath,
|
||||
const std::string& dictPath) {
|
||||
std::lock_guard<std::recursive_mutex> lock(_mutex);
|
||||
|
||||
try {
|
||||
// Initialize detector (also triggers EPLoader init in BasicOrtHandler)
|
||||
detector_ = std::make_unique<ONNXOCRDetector>(detModelPath);
|
||||
std::cout << "[PaddleOCRV5Engine] Detector initialized: " << detModelPath << std::endl;
|
||||
|
||||
// Ensure this DLL's copy of Ort::Global<void>::api_ is initialized.
|
||||
// BasicOrtHandler sets it in ONNXEngine.dll, but each DLL has its own
|
||||
// inline-static copy. Without this, inference calls from ANSOCR.dll crash.
|
||||
if (Ort::Global<void>::api_ == nullptr) {
|
||||
Ort::InitApi(static_cast<const OrtApi*>(EPLoader::GetOrtApiRaw()));
|
||||
}
|
||||
|
||||
// Initialize classifier (optional)
|
||||
if (!clsModelPath.empty()) {
|
||||
classifier_ = std::make_unique<ONNXOCRClassifier>(clsModelPath);
|
||||
std::cout << "[PaddleOCRV5Engine] Classifier initialized: " << clsModelPath << std::endl;
|
||||
}
|
||||
else {
|
||||
classifier_.reset();
|
||||
std::cout << "[PaddleOCRV5Engine] Classifier skipped (no model path)" << std::endl;
|
||||
}
|
||||
|
||||
// Initialize recognizer
|
||||
recognizer_ = std::make_unique<ONNXOCRRecognizer>(recModelPath);
|
||||
if (!recognizer_->LoadDictionary(dictPath)) {
|
||||
std::cerr << "[PaddleOCRV5Engine] Failed to load dictionary" << std::endl;
|
||||
return false;
|
||||
}
|
||||
std::cout << "[PaddleOCRV5Engine] Recognizer initialized: " << recModelPath << std::endl;
|
||||
|
||||
_initialized = true;
|
||||
std::cout << "[PaddleOCRV5Engine] Pipeline initialized successfully" << std::endl;
|
||||
return true;
|
||||
}
|
||||
catch (const std::exception& e) {
|
||||
std::cerr << "[PaddleOCRV5Engine] Initialization failed: " << e.what() << std::endl;
|
||||
detector_.reset();
|
||||
classifier_.reset();
|
||||
recognizer_.reset();
|
||||
_initialized = false;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
// Run the full OCR pipeline on one image:
//   detect -> crop -> (optional orientation classify) -> recognize -> combine.
// Returns one OCRPredictResult per successfully cropped text box, in detector
// output order. Returns empty when uninitialized, the image is empty, or no
// text is detected.
std::vector<OCRPredictResult> PaddleOCRV5Engine::ocr(const cv::Mat& img) {
    std::lock_guard<std::recursive_mutex> lock(_mutex);

    std::vector<OCRPredictResult> results;
    if (!_initialized || img.empty()) {
        return results;
    }

    // Step 1: Text Detection
    auto boxes = detector_->Detect(img, _maxSideLen, _detDbThresh, _detBoxThresh,
                                   _detUnclipRatio, _useDilation);
    if (boxes.empty()) {
        return results;
    }

    // Step 2: Crop detected text regions.
    // BUG FIX: the original skipped empty crops but later paired boxes[i] with
    // textLines[i] by index, so a single dropped crop shifted every following
    // recognized text onto the wrong box. Track the source box index of each
    // kept crop so the final pairing stays aligned.
    std::vector<cv::Mat> croppedImages;
    std::vector<size_t> croppedBoxIdx;
    croppedImages.reserve(boxes.size());
    croppedBoxIdx.reserve(boxes.size());
    for (size_t i = 0; i < boxes.size(); i++) {
        cv::Mat cropped = GetRotateCropImage(img, boxes[i]);
        if (!cropped.empty()) {
            croppedImages.push_back(cropped);
            croppedBoxIdx.push_back(i);
        }
    }

    // Step 3: Classification (optional)
    std::vector<int> cls_labels(croppedImages.size(), 0);
    std::vector<float> cls_scores(croppedImages.size(), 0.0f);
    if (classifier_) {
        classifier_->Classify(croppedImages, cls_labels, cls_scores, _clsThresh);

        // Rotate images classified as upside-down (odd label, confident score).
        for (size_t i = 0; i < croppedImages.size(); i++) {
            if (cls_labels[i] % 2 == 1 && cls_scores[i] > _clsThresh) {
                cv::rotate(croppedImages[i], croppedImages[i], cv::ROTATE_180);
            }
        }
    }

    // Step 4: Text Recognition
    auto textLines = recognizer_->RecognizeBatch(croppedImages);

    // Step 5: Combine boxes, classification, and recognized text.
    for (size_t i = 0; i < croppedImages.size() && i < textLines.size(); i++) {
        const TextBox& srcBox = boxes[croppedBoxIdx[i]];

        OCRPredictResult result;
        // Convert corner points to box format [[x0,y0], ..., [x3,y3]].
        result.box.resize(4);
        for (int j = 0; j < 4; j++) {
            result.box[j] = {
                static_cast<int>(srcBox.points[j].x),
                static_cast<int>(srcBox.points[j].y)
            };
        }

        result.text = textLines[i].text;
        result.score = textLines[i].score;
        result.cls_label = cls_labels[i];
        result.cls_score = cls_scores[i];

        results.push_back(result);
    }

    return results;
}
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
63
ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h
Normal file
63
ANSOCR/ANSONNXOCR/PaddleOCRV5Engine.h
Normal file
@@ -0,0 +1,63 @@
|
||||
#pragma once
|
||||
|
||||
#include "ONNXOCRTypes.h"
|
||||
#include "ONNXOCRDetector.h"
|
||||
#include "ONNXOCRClassifier.h"
|
||||
#include "ONNXOCRRecognizer.h"
|
||||
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
namespace ANSCENTER {
|
||||
namespace onnxocr {
|
||||
|
||||
// PaddleOCR V5 pipeline engine: Detection -> (Classification) -> Recognition
|
||||
// Mirrors the PaddleOCR::PPOCR interface for drop-in replacement
|
||||
class PaddleOCRV5Engine {
|
||||
public:
|
||||
PaddleOCRV5Engine() = default;
|
||||
~PaddleOCRV5Engine() = default;
|
||||
|
||||
// Initialize the OCR pipeline
|
||||
// clsModelPath can be empty to skip classification
|
||||
bool Initialize(const std::string& detModelPath,
|
||||
const std::string& clsModelPath,
|
||||
const std::string& recModelPath,
|
||||
const std::string& dictPath);
|
||||
|
||||
// Run full OCR pipeline on an image
|
||||
// Returns results matching PaddleOCR::OCRPredictResult format
|
||||
std::vector<OCRPredictResult> ocr(const cv::Mat& img);
|
||||
|
||||
// Configuration setters (matching OCRModelConfig parameters)
|
||||
void SetDetMaxSideLen(int val) { _maxSideLen = val; }
|
||||
void SetDetDbThresh(float val) { _detDbThresh = val; }
|
||||
void SetDetBoxThresh(float val) { _detBoxThresh = val; }
|
||||
void SetDetUnclipRatio(float val) { _detUnclipRatio = val; }
|
||||
void SetClsThresh(float val) { _clsThresh = val; }
|
||||
void SetUseDilation(bool val) { _useDilation = val; }
|
||||
|
||||
private:
|
||||
std::unique_ptr<ONNXOCRDetector> detector_;
|
||||
std::unique_ptr<ONNXOCRClassifier> classifier_; // nullptr if not used
|
||||
std::unique_ptr<ONNXOCRRecognizer> recognizer_;
|
||||
|
||||
std::recursive_mutex _mutex;
|
||||
|
||||
// Detection parameters
|
||||
int _maxSideLen = kDetMaxSideLen;
|
||||
float _detDbThresh = kDetDbThresh;
|
||||
float _detBoxThresh = kDetBoxThresh;
|
||||
float _detUnclipRatio = kDetUnclipRatio;
|
||||
bool _useDilation = false;
|
||||
|
||||
// Classifier parameters
|
||||
float _clsThresh = kClsThresh;
|
||||
|
||||
bool _initialized = false;
|
||||
};
|
||||
|
||||
} // namespace onnxocr
|
||||
} // namespace ANSCENTER
|
||||
Reference in New Issue
Block a user