// ONNXOCRDetector.cpp — DB-based OCR text detection via ONNX Runtime
// (PaddleOCR-compatible postprocessing).
#include "ONNXOCRDetector.h"
|
|
#include "include/clipper.h"
|
|
#include "ANSGpuFrameRegistry.h"
|
|
#include "NV12PreprocessHelper.h"
|
|
|
|
#include <opencv2/imgproc.hpp>
|
|
#include <iostream>
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <chrono>
|
|
|
|
namespace ANSCENTER {
|
|
namespace onnxocr {
|
|
|
|
// Constructs a detector from an ONNX model path, delegating session setup
// to BasicOrtHandler with its default options.
ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads)
    : BasicOrtHandler(onnx_path, num_threads) {
}
|
|
|
|
ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path,
|
|
const OrtHandlerOptions& options,
|
|
unsigned int num_threads)
|
|
: BasicOrtHandler(onnx_path, options, num_threads) {
|
|
}
|
|
|
|
// Satisfies the BasicOrtHandler pure-virtual interface. The detection path
// does not call this: Detect() builds its tensor manually so it can use a
// dynamic input shape, whereas this uses the handler's fixed input_node_dims.
Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
    cv::Mat floatImage;
    mat.convertTo(floatImage, CV_32FC3);

    const auto chw = NormalizeAndPermute(floatImage);
    input_values_handler.assign(chw.begin(), chw.end());

    return Ort::Value::CreateTensor<float>(
        *memory_info_handler,
        input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}
|
|
|
|
// Batch transform is unused by detection (single images, dynamic shapes).
// Falls back to transforming the first image; a null tensor signals "no input".
Ort::Value ONNXOCRDetector::transformBatch(const std::vector<cv::Mat>& images) {
    if (images.empty()) {
        return Ort::Value(nullptr);
    }
    return transform(images.front());
}
|
|
|
|
// Runs DB text detection on srcImage and returns boxes in srcImage
// (display-resolution) coordinates.
//
// Thread-safe: the entire pipeline is serialized on _mutex because the ORT
// session and handler buffers are shared.
//
// Parameters:
//   srcImage    - BGR frame at display resolution; also the coordinate frame
//                 of the returned boxes.
//   maxSideLen  - longest-side cap applied by Preprocess when resizing.
//   dbThresh    - binarization threshold applied to the probability map.
//   boxThresh   - minimum mean probability inside a candidate box.
//   unclipRatio - DB unclip expansion ratio.
//   useDilation - apply an optional 2x2 dilation to the binary map.
// Returns an empty vector when the session is missing or the image is empty.
std::vector<TextBox> ONNXOCRDetector::Detect(const cv::Mat& srcImage,
                                             int maxSideLen,
                                             float dbThresh,
                                             float boxThresh,
                                             float unclipRatio,
                                             bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);

    if (!ort_session || srcImage.empty()) {
        return {};
    }

    // Try to get full-resolution image from NV12 frame (same pattern as ANSONNXYOLO)
    cv::Mat inferenceImage = srcImage;
    float bgrScaleX = 1.0f, bgrScaleY = 1.0f;

    // Thread-local frame registry lookup. NOTE(review): pixelFormat 23 is a
    // magic number — presumably the producer's NV12 enum value, consistent
    // with the Y/UV plane fields consumed below; confirm against the producer.
    GpuFrameData* gpuFrame = tl_currentGpuFrame();
    if (gpuFrame && gpuFrame->pixelFormat == 23 &&
        gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
        gpuFrame->width > 0 && gpuFrame->height > 0) {
        // Full-res NV12 available — convert to BGR on CPU for ONNX
        cv::Mat yPlane(gpuFrame->height, gpuFrame->width, CV_8UC1,
                       gpuFrame->cpuYPlane, gpuFrame->cpuYLinesize);
        cv::Mat uvPlane(gpuFrame->height / 2, gpuFrame->width, CV_8UC1,
                        gpuFrame->cpuUvPlane, gpuFrame->cpuUvLinesize);
        cv::Mat fullResBGR;
        cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
        if (!fullResBGR.empty()) {
            // Factors that map full-res coordinates back to display-res.
            bgrScaleX = static_cast<float>(srcImage.cols) / fullResBGR.cols;
            bgrScaleY = static_cast<float>(srcImage.rows) / fullResBGR.rows;
            inferenceImage = fullResBGR;
        }
    }

    int resizeH, resizeW;
    float ratioH, ratioW;

    // Preprocess (using full-res image if NV12 was available)
    auto inputData = Preprocess(inferenceImage, maxSideLen, resizeH, resizeW, ratioH, ratioW);

    // Create input tensor with dynamic shape
    std::array<int64_t, 4> inputShape = { 1, 3, resizeH, resizeW };
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
        *memory_info_handler, inputData.data(), inputData.size(),
        inputShape.data(), inputShape.size());

    // Run inference
    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr },
        input_node_names.data(), &inputTensor, 1,
        output_node_names.data(), num_outputs);

    // Get output data. The indexing below assumes the model emits an NCHW
    // [1, 1, H, W] probability map — TODO confirm for any new model.
    float* outputData = outputTensors[0].GetTensorMutableData<float>();
    auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();

    int outH = static_cast<int>(outputShape[2]);
    int outW = static_cast<int>(outputShape[3]);

    // Postprocess — detection coords are relative to inferenceImage (full-res),
    // then scaled back to srcImage (display-res) coordinates
    auto boxes = Postprocess(outputData, outH, outW, ratioH, ratioW,
                             inferenceImage.rows, inferenceImage.cols,
                             dbThresh, boxThresh, unclipRatio, useDilation);

    // Rescale box coordinates from full-res to display-res
    if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
        for (auto& box : boxes) {
            for (auto& pt : box.points) {
                pt.x *= bgrScaleX;
                pt.y *= bgrScaleY;
            }
        }
    }

    return boxes;
}
|
|
|
|
// Resizes srcImage to the detector input shape chosen by ComputeDetResizeShape
// (bounded by maxSideLen), reports the chosen size and the inverse scale
// factors (source / resized) via the out-parameters, and returns the image as
// normalized NCHW float data ready for tensor creation.
std::vector<float> ONNXOCRDetector::Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                               int& resizeH, int& resizeW,
                                               float& ratioH, float& ratioW) {
    const cv::Size targetSize = ComputeDetResizeShape(srcImage.rows, srcImage.cols, maxSideLen);
    resizeH = targetSize.height;
    resizeW = targetSize.width;
    ratioH = static_cast<float>(srcImage.rows) / resizeH;
    ratioW = static_cast<float>(srcImage.cols) / resizeW;

    cv::Mat scaled;
    cv::resize(srcImage, scaled, targetSize);
    scaled.convertTo(scaled, CV_32FC3);
    return NormalizeAndPermute(scaled);
}
|
|
|
|
// Matches PaddleOCR official DBPostProcess.boxes_from_bitmap flow.
// Converts the raw DB probability map into scored, sorted text boxes in
// source-image coordinates.
//
// Parameters:
//   outputData       - outH x outW float probability map (not owned, not freed).
//   ratioH / ratioW  - map-to-source scale factors produced by Preprocess.
//   srcH / srcW      - source image size; final coordinates are clamped to it.
//   dbThresh         - binarization threshold on the probability map.
//   boxThresh        - minimum mean probability inside a candidate box.
//   unclipRatio      - expansion ratio passed to UnclipPolygon.
//   useDilation      - apply an optional 2x2 dilation to the binary map.
std::vector<TextBox> ONNXOCRDetector::Postprocess(const float* outputData, int outH, int outW,
                                                  float ratioH, float ratioW,
                                                  int srcH, int srcW,
                                                  float dbThresh, float boxThresh,
                                                  float unclipRatio, bool useDilation) {
    // Create probability map from output. Wraps outputData without copying;
    // the const_cast is safe because probMap is only read below.
    cv::Mat probMap(outH, outW, CV_32FC1, const_cast<float*>(outputData));

    // Binary threshold
    cv::Mat binaryMap;
    cv::threshold(probMap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
    binaryMap.convertTo(binaryMap, CV_8UC1);

    // Optional dilation (PaddleOCR default: use_dilation=False, kernel=[[1,1],[1,1]])
    if (useDilation) {
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
        cv::dilate(binaryMap, binaryMap, kernel);
    }

    // Find contours
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);

    std::vector<TextBox> boxes;
    // Cap the candidate count (PaddleOCR max_candidates).
    int numContours = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);

    for (int i = 0; i < numContours; i++) {
        if (contours[i].size() < 4) continue;

        // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
        cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
        float sside = std::min(minRect.size.width, minRect.size.height);
        if (sside < 3.0f) continue;  // min_size filter (PaddleOCR min_size=3)

        auto ordered = GetMiniBoxes(minRect);

        // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
        float score = BoxScoreFast(probMap, ordered);
        if (score < boxThresh) continue;

        // Step 3: UnclipPolygon - expand the 4-point box
        auto expanded = UnclipPolygon(ordered, unclipRatio);
        if (expanded.size() < 4) continue;

        // Step 4: Re-compute GetMiniBoxes on the expanded polygon
        std::vector<cv::Point> expandedInt;
        expandedInt.reserve(expanded.size());
        for (auto& p : expanded) {
            expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
        }
        cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);

        // Filter by min_size + 2 = 5 (matches PaddleOCR official)
        float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
        if (expandedSside < 5.0f) continue;

        auto expandedOrdered = GetMiniBoxes(expandedRect);

        // Step 5: Scale to original image coordinates
        TextBox box;
        for (int j = 0; j < 4; j++) {
            box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f, static_cast<float>(srcW - 1));
            box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f, static_cast<float>(srcH - 1));
        }
        box.score = score;
        boxes.push_back(box);
    }

    // Put boxes into a consistent order (ordering defined by SortTextBoxes).
    SortTextBoxes(boxes);
    return boxes;
}
|
|
|
|
// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
|
|
std::array<cv::Point2f, 4> ONNXOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
|
|
cv::Point2f vertices[4];
|
|
rect.points(vertices);
|
|
|
|
// Sort all 4 points by x-coordinate ascending
|
|
std::sort(vertices, vertices + 4,
|
|
[](const cv::Point2f& a, const cv::Point2f& b) { return a.x < b.x; });
|
|
|
|
// Left two (indices 0,1): smaller y = top-left, larger y = bottom-left
|
|
cv::Point2f topLeft, bottomLeft;
|
|
if (vertices[0].y <= vertices[1].y) {
|
|
topLeft = vertices[0];
|
|
bottomLeft = vertices[1];
|
|
} else {
|
|
topLeft = vertices[1];
|
|
bottomLeft = vertices[0];
|
|
}
|
|
|
|
// Right two (indices 2,3): smaller y = top-right, larger y = bottom-right
|
|
cv::Point2f topRight, bottomRight;
|
|
if (vertices[2].y <= vertices[3].y) {
|
|
topRight = vertices[2];
|
|
bottomRight = vertices[3];
|
|
} else {
|
|
topRight = vertices[3];
|
|
bottomRight = vertices[2];
|
|
}
|
|
|
|
// Order: [TL, TR, BR, BL] (clockwise from top-left)
|
|
return { topLeft, topRight, bottomRight, bottomLeft };
|
|
}
|
|
|
|
// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
|
|
float ONNXOCRDetector::BoxScoreFast(const cv::Mat& probMap,
|
|
const std::array<cv::Point2f, 4>& box) {
|
|
int h = probMap.rows;
|
|
int w = probMap.cols;
|
|
|
|
// Get bounding rectangle with proper clamping (matches PaddleOCR official)
|
|
float minX = std::min({box[0].x, box[1].x, box[2].x, box[3].x});
|
|
float maxX = std::max({box[0].x, box[1].x, box[2].x, box[3].x});
|
|
float minY = std::min({box[0].y, box[1].y, box[2].y, box[3].y});
|
|
float maxY = std::max({box[0].y, box[1].y, box[2].y, box[3].y});
|
|
|
|
int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
|
|
int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
|
|
int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
|
|
int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
|
|
|
|
if (xmin >= xmax || ymin >= ymax) return 0.0f;
|
|
|
|
cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
|
|
|
|
std::vector<cv::Point> pts(4);
|
|
for (int j = 0; j < 4; j++) {
|
|
pts[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
|
|
static_cast<int>(box[j].y) - ymin);
|
|
}
|
|
std::vector<std::vector<cv::Point>> polys = { pts };
|
|
cv::fillPoly(mask, polys, cv::Scalar(1));
|
|
|
|
cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
|
|
return static_cast<float>(cv::mean(roiMap, mask)[0]);
|
|
}
|
|
|
|
// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
|
|
std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
|
|
float unclipRatio) {
|
|
// Compute area using Shoelace formula and perimeter
|
|
float area = 0.0f;
|
|
float perimeter = 0.0f;
|
|
for (int i = 0; i < 4; i++) {
|
|
int j = (i + 1) % 4;
|
|
area += box[i].x * box[j].y - box[j].x * box[i].y;
|
|
float dx = box[j].x - box[i].x;
|
|
float dy = box[j].y - box[i].y;
|
|
perimeter += std::sqrt(dx * dx + dy * dy);
|
|
}
|
|
area = std::abs(area) * 0.5f;
|
|
if (perimeter < 1.0f) return {};
|
|
|
|
float distance = area * unclipRatio / perimeter;
|
|
|
|
ClipperLib::Path clipperPath;
|
|
for (int i = 0; i < 4; i++) {
|
|
clipperPath.push_back({ static_cast<ClipperLib::cInt>(box[i].x),
|
|
static_cast<ClipperLib::cInt>(box[i].y) });
|
|
}
|
|
|
|
ClipperLib::ClipperOffset offset;
|
|
offset.AddPath(clipperPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
|
|
|
|
ClipperLib::Paths solution;
|
|
offset.Execute(solution, distance);
|
|
|
|
if (solution.empty() || solution[0].empty()) return {};
|
|
|
|
std::vector<cv::Point2f> result;
|
|
for (auto& p : solution[0]) {
|
|
result.push_back(cv::Point2f(static_cast<float>(p.X), static_cast<float>(p.Y)));
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void ONNXOCRDetector::Warmup() {
|
|
std::lock_guard<std::mutex> lock(_mutex);
|
|
if (_warmedUp || !ort_session) return;
|
|
|
|
// 320x320 covers the typical license-plate ROI after LPD crop +
|
|
// multiple-of-32 rounding. cuDNN caches the algorithm for this
|
|
// shape so the first real inference doesn't pay the picker cost.
|
|
constexpr int kWarmupSide = 320;
|
|
try {
|
|
cv::Mat dummy(kWarmupSide, kWarmupSide, CV_8UC3, cv::Scalar(128, 128, 128));
|
|
cv::Mat dummyF;
|
|
dummy.convertTo(dummyF, CV_32FC3);
|
|
auto inputData = NormalizeAndPermute(dummyF);
|
|
|
|
std::array<int64_t, 4> inputShape = { 1, 3, kWarmupSide, kWarmupSide };
|
|
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
|
|
*memory_info_handler, inputData.data(), inputData.size(),
|
|
inputShape.data(), inputShape.size());
|
|
|
|
auto t0 = std::chrono::high_resolution_clock::now();
|
|
(void)ort_session->Run(
|
|
Ort::RunOptions{ nullptr },
|
|
input_node_names.data(), &inputTensor, 1,
|
|
output_node_names.data(), num_outputs);
|
|
auto t1 = std::chrono::high_resolution_clock::now();
|
|
double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
|
|
std::cout << "[ONNXOCRDetector] Warmup [1,3,"
|
|
<< kWarmupSide << "," << kWarmupSide << "] "
|
|
<< ms << " ms" << std::endl;
|
|
}
|
|
catch (const Ort::Exception& e) {
|
|
std::cerr << "[ONNXOCRDetector] Warmup failed: " << e.what() << std::endl;
|
|
}
|
|
_warmedUp = true;
|
|
}
|
|
|
|
} // namespace onnxocr
|
|
} // namespace ANSCENTER
|