#include "ONNXOCRDetector.h"
#include "include/clipper.h"
#include "ANSGpuFrameRegistry.h"
#include "NV12PreprocessHelper.h"
#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <iostream>

namespace ANSCENTER {
namespace onnxocr {

ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, unsigned int num_threads)
    : BasicOrtHandler(onnx_path, num_threads) {
}

ONNXOCRDetector::ONNXOCRDetector(const std::string& onnx_path, const OrtHandlerOptions& options,
                                 unsigned int num_threads)
    : BasicOrtHandler(onnx_path, options, num_threads) {
}

/// Single-image tensor conversion required by the BasicOrtHandler interface.
/// Not used directly — detection uses the custom Preprocess() + manual tensor
/// creation path in Detect(). Provided only to satisfy the pure virtual.
Ort::Value ONNXOCRDetector::transform(const cv::Mat& mat) {
    cv::Mat canvas;
    mat.convertTo(canvas, CV_32FC3);
    auto data = NormalizeAndPermute(canvas);
    input_values_handler.assign(data.begin(), data.end());
    return Ort::Value::CreateTensor<float>(
        *memory_info_handler, input_values_handler.data(), input_values_handler.size(),
        input_node_dims.data(), input_node_dims.size());
}

/// Batch conversion required by the BasicOrtHandler interface.
/// Not used — detection processes single images with dynamic shapes, so this
/// simply forwards the first image (or a null tensor for an empty batch).
Ort::Value ONNXOCRDetector::transformBatch(const std::vector<cv::Mat>& images) {
    if (!images.empty()) {
        return transform(images[0]);
    }
    return Ort::Value(nullptr);
}

/// Runs DB text detection on srcImage and returns text boxes in srcImage
/// (display-resolution) coordinates.
///
/// @param srcImage     Display-resolution BGR frame.
/// @param maxSideLen   Longest-side cap used when resizing for inference.
/// @param dbThresh     Binarization threshold applied to the probability map.
/// @param boxThresh    Minimum mean-probability score for a candidate box.
/// @param unclipRatio  Expansion ratio for the Clipper unclip step.
/// @param useDilation  Apply a 2x2 dilation to the binary map (PaddleOCR
///                     default is false).
/// @return Detected text boxes; empty if the session is missing or the image
///         is empty.
std::vector<TextBox> ONNXOCRDetector::Detect(const cv::Mat& srcImage, int maxSideLen,
                                             float dbThresh, float boxThresh,
                                             float unclipRatio, bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!ort_session || srcImage.empty()) {
        return {};
    }

    // Try to get the full-resolution image from the thread-local NV12 frame
    // (same pattern as ANSONNXYOLO). pixelFormat 23 is NV12 here — TODO confirm
    // against GpuFrameData's enum.
    cv::Mat inferenceImage = srcImage;
    float bgrScaleX = 1.0f, bgrScaleY = 1.0f;
    GpuFrameData* gpuFrame = tl_currentGpuFrame();
    if (gpuFrame && gpuFrame->pixelFormat == 23 && gpuFrame->cpuYPlane && gpuFrame->cpuUvPlane &&
        gpuFrame->width > 0 && gpuFrame->height > 0) {
        // Full-res NV12 available — convert to BGR on CPU for ONNX. The UV
        // plane of NV12 is half-height, full-width (interleaved CbCr).
        cv::Mat yPlane(gpuFrame->height, gpuFrame->width, CV_8UC1,
                       gpuFrame->cpuYPlane, gpuFrame->cpuYLinesize);
        cv::Mat uvPlane(gpuFrame->height / 2, gpuFrame->width, CV_8UC1,
                        gpuFrame->cpuUvPlane, gpuFrame->cpuUvLinesize);
        cv::Mat fullResBGR;
        cv::cvtColorTwoPlane(yPlane, uvPlane, fullResBGR, cv::COLOR_YUV2BGR_NV12);
        if (!fullResBGR.empty()) {
            // Scale factors map full-res coordinates back to display-res.
            bgrScaleX = static_cast<float>(srcImage.cols) / fullResBGR.cols;
            bgrScaleY = static_cast<float>(srcImage.rows) / fullResBGR.rows;
            inferenceImage = fullResBGR;
        }
    }

    int resizeH, resizeW;
    float ratioH, ratioW;
    // Preprocess (using the full-res image if NV12 was available).
    auto inputData = Preprocess(inferenceImage, maxSideLen, resizeH, resizeW, ratioH, ratioW);

    // Create the input tensor with a dynamic [1, 3, H, W] shape.
    std::array<int64_t, 4> inputShape = { 1, 3, resizeH, resizeW };
    Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
        *memory_info_handler, inputData.data(), inputData.size(),
        inputShape.data(), inputShape.size());

    // Run inference.
    auto outputTensors = ort_session->Run(
        Ort::RunOptions{ nullptr }, input_node_names.data(), &inputTensor, 1,
        output_node_names.data(), num_outputs);

    // Output is a probability map; shape is [N, C, outH, outW].
    float* outputData = outputTensors[0].GetTensorMutableData<float>();
    auto outputShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
    int outH = static_cast<int>(outputShape[2]);
    int outW = static_cast<int>(outputShape[3]);

    // Postprocess — detection coords are relative to inferenceImage (full-res),
    // then scaled back to srcImage (display-res) coordinates below.
    auto boxes = Postprocess(outputData, outH, outW, ratioH, ratioW,
                             inferenceImage.rows, inferenceImage.cols,
                             dbThresh, boxThresh, unclipRatio, useDilation);

    // Rescale box coordinates from full-res to display-res.
    if (bgrScaleX != 1.0f || bgrScaleY != 1.0f) {
        for (auto& box : boxes) {
            for (auto& pt : box.points) {
                pt.x *= bgrScaleX;
                pt.y *= bgrScaleY;
            }
        }
    }
    return boxes;
}

/// Resizes srcImage for detection and converts it to a normalized CHW float
/// buffer.
///
/// @param srcImage  Input BGR image.
/// @param maxSideLen Longest-side cap passed to ComputeDetResizeShape.
/// @param resizeH/resizeW [out] Network input size actually used.
/// @param ratioH/ratioW   [out] src/resized ratios used to map detections back.
/// @return Flat NCHW float tensor data (N=1).
std::vector<float> ONNXOCRDetector::Preprocess(const cv::Mat& srcImage, int maxSideLen,
                                               int& resizeH, int& resizeW,
                                               float& ratioH, float& ratioW) {
    cv::Size newSize = ComputeDetResizeShape(srcImage.rows, srcImage.cols, maxSideLen);
    resizeW = newSize.width;
    resizeH = newSize.height;
    ratioH = static_cast<float>(srcImage.rows) / resizeH;
    ratioW = static_cast<float>(srcImage.cols) / resizeW;
    cv::Mat resized;
    cv::resize(srcImage, resized, newSize);
    resized.convertTo(resized, CV_32FC3);
    return NormalizeAndPermute(resized);
}

/// Converts the raw probability map into scored, unclipped, sorted text boxes.
/// Matches the PaddleOCR official DBPostProcess.boxes_from_bitmap flow.
///
/// @param outputData Probability map data (outH x outW floats).
/// @param ratioH/ratioW Resize ratios from Preprocess (map back to srcH/srcW).
/// @param srcH/srcW  Dimensions of the image the boxes are expressed in.
/// @return Text boxes sorted by SortTextBoxes, clamped to [0, src-1].
std::vector<TextBox> ONNXOCRDetector::Postprocess(const float* outputData, int outH, int outW,
                                                  float ratioH, float ratioW, int srcH, int srcW,
                                                  float dbThresh, float boxThresh,
                                                  float unclipRatio, bool useDilation) {
    // Wrap the output buffer without copying; const_cast is safe because the
    // map is only read (threshold + mean).
    cv::Mat probMap(outH, outW, CV_32FC1, const_cast<float*>(outputData));

    // Binary threshold.
    cv::Mat binaryMap;
    cv::threshold(probMap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
    binaryMap.convertTo(binaryMap, CV_8UC1);

    // Optional dilation (PaddleOCR default: use_dilation=False, kernel=[[1,1],[1,1]]).
    if (useDilation) {
        cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
        cv::dilate(binaryMap, binaryMap, kernel);
    }

    // Find contours.
    std::vector<std::vector<cv::Point>> contours;
    cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);

    std::vector<TextBox> boxes;
    int numContours = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
    for (int i = 0; i < numContours; i++) {
        if (contours[i].size() < 4) continue;

        // Step 1: GetMiniBoxes — get ordered 4 corners of the min-area rect.
        cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
        float sside = std::min(minRect.size.width, minRect.size.height);
        if (sside < 3.0f) continue;
        auto ordered = GetMiniBoxes(minRect);

        // Step 2: BoxScoreFast — mean probability inside the 4-point polygon.
        float score = BoxScoreFast(probMap, ordered);
        if (score < boxThresh) continue;

        // Step 3: UnclipPolygon — expand the 4-point box.
        auto expanded = UnclipPolygon(ordered, unclipRatio);
        if (expanded.size() < 4) continue;

        // Step 4: Re-compute GetMiniBoxes on the expanded polygon.
        std::vector<cv::Point> expandedInt;
        expandedInt.reserve(expanded.size());
        for (auto& p : expanded) {
            expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
        }
        cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
        // Filter by min_size + 2 = 5 (matches PaddleOCR official).
        float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
        if (expandedSside < 5.0f) continue;
        auto expandedOrdered = GetMiniBoxes(expandedRect);

        // Step 5: Scale to original image coordinates, clamped to the image.
        TextBox box;
        for (int j = 0; j < 4; j++) {
            box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW,
                                         0.0f, static_cast<float>(srcW - 1));
            box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH,
                                         0.0f, static_cast<float>(srcH - 1));
        }
        box.score = score;
        boxes.push_back(box);
    }

    SortTextBoxes(boxes);
    return boxes;
}

/// Returns the rect's corners ordered [TL, TR, BR, BL] (clockwise from
/// top-left). Matches PaddleOCR official GetMiniBoxes: sort by X, then assign
/// top/bottom on each side by Y.
std::array<cv::Point2f, 4> ONNXOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    cv::Point2f vertices[4];
    rect.points(vertices);

    // Sort all 4 points by x-coordinate ascending.
    std::sort(vertices, vertices + 4,
              [](const cv::Point2f& a, const cv::Point2f& b) { return a.x < b.x; });

    // Left two (indices 0,1): smaller y = top-left, larger y = bottom-left.
    cv::Point2f topLeft, bottomLeft;
    if (vertices[0].y <= vertices[1].y) {
        topLeft = vertices[0];
        bottomLeft = vertices[1];
    } else {
        topLeft = vertices[1];
        bottomLeft = vertices[0];
    }

    // Right two (indices 2,3): smaller y = top-right, larger y = bottom-right.
    cv::Point2f topRight, bottomRight;
    if (vertices[2].y <= vertices[3].y) {
        topRight = vertices[2];
        bottomRight = vertices[3];
    } else {
        topRight = vertices[3];
        bottomRight = vertices[2];
    }

    return { topLeft, topRight, bottomRight, bottomLeft };
}

/// Mean probability value inside the 4-point polygon (PaddleOCR
/// box_score_fast). Returns 0 if the clamped bounding rect degenerates.
float ONNXOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                    const std::array<cv::Point2f, 4>& box) {
    int h = probMap.rows;
    int w = probMap.cols;

    // Get the bounding rectangle with proper clamping (matches PaddleOCR official).
    float minX = std::min({ box[0].x, box[1].x, box[2].x, box[3].x });
    float maxX = std::max({ box[0].x, box[1].x, box[2].x, box[3].x });
    float minY = std::min({ box[0].y, box[1].y, box[2].y, box[3].y });
    float maxY = std::max({ box[0].y, box[1].y, box[2].y, box[3].y });
    int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
    int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
    int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
    int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;

    // Rasterize the polygon (shifted into ROI coordinates) into a mask, then
    // take the masked mean over the probability ROI.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<cv::Point> pts(4);
    for (int j = 0; j < 4; j++) {
        pts[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
                           static_cast<int>(box[j].y) - ymin);
    }
    std::vector<std::vector<cv::Point>> polys = { pts };
    cv::fillPoly(mask, polys, cv::Scalar(1));

    cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
    return static_cast<float>(cv::mean(roiMap, mask)[0]);
}

/// Expands the 4-point box by distance = area * unclipRatio / perimeter using
/// Clipper with jtRound (matches PaddleOCR official unclip). Returns an empty
/// vector when the polygon is degenerate or offsetting produces no output.
std::vector<cv::Point2f> ONNXOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                        float unclipRatio) {
    // Compute area using the Shoelace formula, and the perimeter.
    float area = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; i++) {
        int j = (i + 1) % 4;
        area += box[i].x * box[j].y - box[j].x * box[i].y;
        float dx = box[j].x - box[i].x;
        float dy = box[j].y - box[i].y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    area = std::abs(area) * 0.5f;
    if (perimeter < 1.0f) return {};
    float distance = area * unclipRatio / perimeter;

    ClipperLib::Path clipperPath;
    for (int i = 0; i < 4; i++) {
        clipperPath.push_back({ static_cast<ClipperLib::cInt>(box[i].x),
                                static_cast<ClipperLib::cInt>(box[i].y) });
    }

    ClipperLib::ClipperOffset offset;
    offset.AddPath(clipperPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths solution;
    offset.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};

    std::vector<cv::Point2f> result;
    for (auto& p : solution[0]) {
        result.push_back(cv::Point2f(static_cast<float>(p.X), static_cast<float>(p.Y)));
    }
    return result;
}

/// One-shot warmup inference so the first real Detect() call doesn't pay the
/// cuDNN algorithm-picker cost. Thread-safe; no-op after the first call or
/// when no session exists.
void ONNXOCRDetector::Warmup() {
    std::lock_guard<std::mutex> lock(_mutex);
    if (_warmedUp || !ort_session) return;

    // 320x320 covers the typical license-plate ROI after LPD crop +
    // multiple-of-32 rounding. cuDNN caches the algorithm for this
    // shape so the first real inference doesn't pay the picker cost.
    constexpr int kWarmupSide = 320;
    try {
        cv::Mat dummy(kWarmupSide, kWarmupSide, CV_8UC3, cv::Scalar(128, 128, 128));
        cv::Mat dummyF;
        dummy.convertTo(dummyF, CV_32FC3);
        auto inputData = NormalizeAndPermute(dummyF);
        std::array<int64_t, 4> inputShape = { 1, 3, kWarmupSide, kWarmupSide };
        Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
            *memory_info_handler, inputData.data(), inputData.size(),
            inputShape.data(), inputShape.size());
        auto t0 = std::chrono::high_resolution_clock::now();
        (void)ort_session->Run(
            Ort::RunOptions{ nullptr }, input_node_names.data(), &inputTensor, 1,
            output_node_names.data(), num_outputs);
        auto t1 = std::chrono::high_resolution_clock::now();
        double ms = std::chrono::duration<double, std::milli>(t1 - t0).count();
        std::cout << "[ONNXOCRDetector] Warmup [1,3," << kWarmupSide << "," << kWarmupSide
                  << "] " << ms << " ms" << std::endl;
    } catch (const Ort::Exception& e) {
        std::cerr << "[ONNXOCRDetector] Warmup failed: " << e.what() << std::endl;
    }
    // Mark warmed-up even on failure so we don't retry every frame.
    _warmedUp = true;
}

} // namespace onnxocr
} // namespace ANSCENTER