// File: ANSCORE/modules/ANSOCR/ANSONNXOCR/ONNXOCRDetector.cpp
// (356 lines, 14 KiB, C++)
#include "ONNXOCRDetector.h"
#include "include/clipper.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
#include <chrono>
namespace ANSCENTER {
namespace onnxocr {
// Constructs the DB text detector with default ORT handler options.
// Model loading and session creation are delegated to BasicOrtHandler.
ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads)
: BasicOrtHandler(onnx_path, num_threads) {
}
// Constructs the DB text detector with explicit ORT handler options
// (e.g. execution-provider selection). Delegates to BasicOrtHandler.
ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path,
const OrtHandlerOptions& options,
unsigned int num_threads)
: BasicOrtHandler(onnx_path, options, num_threads) {
}
// Satisfies the BasicOrtHandler pure-virtual interface. Detection itself goes
// through Preprocess() plus a manually created dynamic-shape tensor, so this
// path is not used by Detect().
Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
    cv::Mat floatImage;
    mat.convertTo(floatImage, CV_32FC3);
    const auto normalized = NormalizeAndPermute(floatImage);
    // Keep the tensor's backing storage alive in the handler-owned buffer.
    input_values_handler.assign(normalized.begin(), normalized.end());
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
// Unused for detection — images are processed one at a time with dynamic
// shapes. Delegates the first image to transform(); empty input yields a
// null tensor.
Ort::Value ONNXOCRDetector::transformBatch(const std::vector<cv::Mat>& images) {
    if (images.empty()) {
        return Ort::Value(nullptr);
    }
    return transform(images[0]);
}
// Runs DB text detection on srcImage and returns text boxes in srcImage
// (display-resolution) coordinates.
//
// If a full-resolution NV12 frame is registered for the current thread, it is
// converted to BGR and used for inference instead of srcImage; detected boxes
// are then scaled back down to srcImage coordinates.
//
// Parameters mirror PaddleOCR's DBPostProcess: dbThresh binarizes the
// probability map, boxThresh filters candidates by mean score, unclipRatio
// controls box expansion, useDilation enables the optional 2x2 dilation.
// Thread-safe: the whole pipeline runs under _mutex.
// Returns an empty vector if the session is missing, the image is empty, or
// the model output has an unexpected layout.
std::vector<TextBox> ONNXOCRDetector::Detect(const cv::Mat& srcImage,
                                             int maxSideLen,
                                             float dbThresh,
                                             float boxThresh,
                                             float unclipRatio,
                                             bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ort_session || srcImage.empty()) {
        return {};
    }
    // Try to get full-resolution image from NV12 frame (same pattern as ANSONNXYOLO).
    cv::Mat inferenceImage = srcImage;
    float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
    GpuFrameData* gpuFrame = tl_currentGpuFrame();
    // NOTE(review): pixelFormat 23 presumably identifies NV12 (matches FFmpeg's
    // AV_PIX_FMT_NV12) — confirm against ANSGpuFrameRegistry.
    if (gpuFrame && gpuFrame->pixelFormat == 23 &&
        gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
        gpuFrame->width > 0 && gpuFrame->height > 0) {
        // Full-res NV12 available — convert to BGR on CPU for ONNX.
        cv::Mat yPlane(gpuFrame->height, gpuFrame->width, CV_8UC1,
                       gpuFrame->cpuYPlane, gpuFrame->cpuYLinesize);
        cv::Mat uvPlane(gpuFrame->height / 2, gpuFrame->width, CV_8UC1,
                        gpuFrame->cpuUvPlane, gpuFrame->cpuUvLinesize);
        cv::Mat fullResBGR;
        cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
        if (!fullResBGR.empty()) {
            // Scale factors that map full-res coordinates back to display-res.
            bgrScaleX = static_cast<float>(srcImage.cols) / fullResBGR.cols;
            bgrScaleY = static_cast<float>(srcImage.rows) / fullResBGR.rows;
            inferenceImage = fullResBGR;
        }
    }
    int resizeH = 0, resizeW = 0;
    float ratioH = 1.0f, ratioW = 1.0f;
    // Preprocess (using the full-res image if NV12 was available).
    auto inputData = Preprocess(inferenceImage, maxSideLen, resizeH, resizeW, ratioH, ratioW);
    // Create input tensor with dynamic shape [1, 3, H, W].
    std::array<int64_t, 4> inputShape = { 1, 3, resizeH, resizeW };
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
        *memory_info_handler, inputData.data(), inputData.size(),
        inputShape.data(), inputShape.size());
    // Run inference.
    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &inputTensor, 1,
        output_node_names.data(), num_outputs);
    // Defensive: validate the output before indexing — an unexpected model or
    // output layout would otherwise read outputShape[2]/[3] out of bounds.
    if (outputTensors.empty()) {
        return {};
    }
    auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    if (outputShape.size() < 4) {
        return {};
    }
    float* outputData = outputTensors[0].GetTensorMutableData<float>();
    int outH = static_cast<int>(outputShape[2]);
    int outW = static_cast<int>(outputShape[3]);
    // Postprocess — detection coords are relative to inferenceImage (full-res),
    // then scaled back to srcImage (display-res) coordinates.
    auto boxes = Postprocess(outputData, outH, outW, ratioH, ratioW,
                             inferenceImage.rows, inferenceImage.cols,
                             dbThresh, boxThresh, unclipRatio, useDilation);
    // Rescale box coordinates from full-res to display-res.
    if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
        for (auto& box : boxes) {
            for (auto& pt : box.points) {
                pt.x *= bgrScaleX;
                pt.y *= bgrScaleY;
            }
        }
    }
    return boxes;
}
// Resizes srcImage (bounded by maxSideLen via ComputeDetResizeShape), records
// both the resized dimensions and the src/resized ratios needed to map
// detections back, then normalizes the pixels into CHW float data.
std::vector<float> ONNXOCRDetector::Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                               int& resizeH, int& resizeW,
                                               float& ratioH, float& ratioW) {
    const cv::Size target = ComputeDetResizeShape(srcImage.rows, srcImage.cols, maxSideLen);
    resizeW = target.width;
    resizeH = target.height;
    // Ratios are src/resized — multiplying map coordinates by these restores
    // source-image pixels.
    ratioH = static_cast<float>(srcImage.rows) / resizeH;
    ratioW = static_cast<float>(srcImage.cols) / resizeW;
    cv::Mat scaled;
    cv::resize(srcImage, scaled, target);
    scaled.convertTo(scaled, CV_32FC3);
    return NormalizeAndPermute(scaled);
}
// Matches PaddleOCR official DBPostProcess.boxes_from_bitmap flow.
// outputData: DB probability map (outH x outW floats).
// ratioH/ratioW: src/resized ratios from Preprocess, used to map box corners
// back to source-image pixels; srcH/srcW bound the clamped result.
// Returns score- and size-filtered boxes, ordered by SortTextBoxes.
std::vector<TextBox> ONNXOCRDetector::Postprocess(const float* outputData, int outH, int outW,
float ratioH, float ratioW,
int srcH, int srcW,
float dbThresh, float boxThresh,
float unclipRatio, bool useDilation) {
// Create probability map from output. Wraps the tensor buffer without
// copying; the const_cast is safe because probMap is only read below
// (threshold writes into binaryMap, not probMap).
cv::Mat probMap(outH, outW, CV_32FC1, const_cast<float*>(outputData));
// Binary threshold: pixels above dbThresh become 255 (text candidates).
cv::Mat binaryMap;
cv::threshold(probMap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
binaryMap.convertTo(binaryMap, CV_8UC1);
// Optional dilation (PaddleOCR default: use_dilation=False, kernel=[[1,1],[1,1]])
if (useDilation) {
cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
cv::dilate(binaryMap, binaryMap, kernel);
}
// Find contours — each contour is a candidate text region.
std::vector<std::vector<cv::Point>> contours;
cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
std::vector<TextBox> boxes;
// Cap the candidate count (kDetMaxCandidates ~ PaddleOCR's max_candidates).
int numContours = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
for (int i = 0; i < numContours; i++) {
if (contours[i].size() < 4) continue;
// Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
// Drop degenerate boxes whose short side is under 3 px (PaddleOCR min_size).
float sside = std::min(minRect.size.width, minRect.size.height);
if (sside < 3.0f) continue;
auto ordered = GetMiniBoxes(minRect);
// Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
float score = BoxScoreFast(probMap, ordered);
if (score < boxThresh) continue;
// Step 3: UnclipPolygon - expand the 4-point box
auto expanded = UnclipPolygon(ordered, unclipRatio);
if (expanded.size() < 4) continue;
// Step 4: Re-compute GetMiniBoxes on the expanded polygon
std::vector<cv::Point> expandedInt;
expandedInt.reserve(expanded.size());
for (auto& p : expanded) {
expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
}
cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
// Filter by min_size + 2 = 5 (matches PaddleOCR official)
float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
if (expandedSside < 5.0f) continue;
auto expandedOrdered = GetMiniBoxes(expandedRect);
// Step 5: Scale to original image coordinates
// ratioW/ratioH undo the Preprocess resize; clamping keeps corners in-bounds.
TextBox box;
for (int j = 0; j < 4; j++) {
box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(srcW - 1));
box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(srcH - 1));
}
box.score = score;
boxes.push_back(box);
}
// Order boxes for downstream recognition — presumably reading order; the
// exact ordering/tie-breaking lives in SortTextBoxes (defined elsewhere).
SortTextBoxes(boxes);
return boxes;
}
// Matches PaddleOCR's get_mini_boxes: returns the rotated rect's corners
// ordered clockwise as [top-left, top-right, bottom-right, bottom-left].
std::array<cv::Point2f, 4> ONNXOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    cv::Point2f corners[4];
    rect.points(corners);
    // Arrange by ascending x: the first two corners form the left pair,
    // the last two the right pair.
    std::sort(corners, corners + 4,
              [](const cv::Point2f& lhs, const cv::Point2f& rhs) { return lhs.x < rhs.x; });
    // Within each pair the smaller y is the top corner.
    const bool leftFlipped = corners[0].y > corners[1].y;
    const cv::Point2f topLeft = leftFlipped ? corners[1] : corners[0];
    const cv::Point2f bottomLeft = leftFlipped ? corners[0] : corners[1];
    const bool rightFlipped = corners[2].y > corners[3].y;
    const cv::Point2f topRight = rightFlipped ? corners[3] : corners[2];
    const cv::Point2f bottomRight = rightFlipped ? corners[2] : corners[3];
    return { topLeft, topRight, bottomRight, bottomLeft };
}
// Matches PaddleOCR's box_score_fast: the mean probability-map value inside
// the filled 4-point polygon, computed over the box's bounding ROI only.
float ONNXOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                    const std::array<cv::Point2f, 4>& box) {
    const int h = probMap.rows;
    const int w = probMap.cols;
    // Axis-aligned bounds of the quad.
    float minX = box[0].x, maxX = box[0].x;
    float minY = box[0].y, maxY = box[0].y;
    for (int i = 1; i < 4; i++) {
        minX = std::min(minX, box[i].x);
        maxX = std::max(maxX, box[i].x);
        minY = std::min(minY, box[i].y);
        maxY = std::max(maxY, box[i].y);
    }
    // Clamp to the probability map (matches PaddleOCR official).
    const int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
    const int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
    const int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
    const int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Rasterize the quad (shifted into ROI-local coordinates) as a 0/1 mask.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<cv::Point> quad;
    quad.reserve(4);
    for (const auto& p : box) {
        quad.emplace_back(static_cast<int>(p.x) - xmin, static_cast<int>(p.y) - ymin);
    }
    std::vector<std::vector<cv::Point>> polys{ quad };
    cv::fillPoly(mask, polys, cv::Scalar(1));
    const cv::Rect roi(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1);
    return static_cast<float>(cv::mean(probMap(roi), mask)[0]);
}
// Matches PaddleOCR's unclip: expands the 4-point box outward by
// area * unclipRatio / perimeter using Clipper's rounded-join offset.
// Returns an empty vector when the box is degenerate or the offset fails.
std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                        float unclipRatio) {
    // Shoelace area (accumulated as twice the signed area) and edge-sum perimeter.
    float twiceSignedArea = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; i++) {
        const cv::Point2f& a = box[i];
        const cv::Point2f& b = box[(i + 1) % 4];
        twiceSignedArea += a.x * b.y - b.x * a.y;
        const float dx = b.x - a.x;
        const float dy = b.y - a.y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    const float area = std::abs(twiceSignedArea) * 0.5f;
    if (perimeter < 1.0f) return {};
    const float distance = area * unclipRatio / perimeter;
    // Hand the quad to Clipper (integer coordinates) and offset it outward.
    ClipperLib::Path path;
    for (const auto& p : box) {
        path.push_back({ static_cast<ClipperLib::cInt>(p.x),
                         static_cast<ClipperLib::cInt>(p.y) });
    }
    ClipperLib::ClipperOffset offsetter;
    offsetter.AddPath(path, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths expanded;
    offsetter.Execute(expanded, distance);
    if (expanded.empty() || expanded[0].empty()) return {};
    std::vector<cv::Point2f> result;
    result.reserve(expanded[0].size());
    for (const auto& p : expanded[0]) {
        result.emplace_back(static_cast<float>(p.X), static_cast<float>(p.Y));
    }
    return result;
}
void ONNXOCRDetector::Warmup() {
std::lock_guard<std::mutex> lock(_mutex);
if (_warmedUp || !ort_session) return;
// 320x320 covers the typical license-plate ROI after LPD crop +
// multiple-of-32 rounding. cuDNN caches the algorithm for this
// shape so the first real inference doesn't pay the picker cost.
constexpr int kWarmupSide = 320;
try {
cv::Mat dummy(kWarmupSide, kWarmupSide, CV_8UC3, cv::Scalar(128, 128, 128));
cv::Mat dummyF;
dummy.convertTo(dummyF, CV_32FC3);
auto inputData = NormalizeAndPermute(dummyF);
std::array<int64_t, 4> inputShape = { 1, 3, kWarmupSide, kWarmupSide };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
auto t0 = std::chrono::high_resolution_clock::now();
(void)ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
auto t1 = std::chrono::high_resolution_clock::now();
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
std::cout << "[ONNXOCRDetector] Warmup [1,3,"
<< kWarmupSide << "," << kWarmupSide << "] "
<< ms << " ms" << std::endl;
}
catch (const Ort::Exception& e) {
std::cerr << "[ONNXOCRDetector] Warmup failed: " << e.what() << std::endl;
}
_warmedUp = true;
}
} // namespace onnxocr
} // namespace ANSCENTER