#include "RTOCRDetector.h"
#include "include/clipper.h"
#include "NV12PreprocessHelper.h"
#include "ANSGpuFrameRegistry.h"
// NV12→BGR fused resize via NV12PreprocessHelper (linked from ANSODEngine.dll)
// NOTE(review): the angle-bracket include targets were lost in extraction; the
// set below is reconstructed from the names this translation unit uses
// (cudaMalloc/cudaMemcpy, cv::*, std::*) — verify against the build.
#include <cuda_runtime.h>
#include <opencv2/opencv.hpp>
#include <algorithm>
#include <array>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <mutex>
#include <string>
#include <vector>

namespace ANSCENTER {
namespace rtocr {

/// Builds or loads (from the shared engine pool) the TensorRT engine for the
/// DBNet text-detection model.
///
/// @param onnxPath        Path to the detector ONNX model.
/// @param gpuId           CUDA device index to bind the engine to.
/// @param engineCacheDir  Directory for the serialized engine cache; when
///                        empty, the directory containing onnxPath is used.
/// @param maxSideLen      Requested maximum spatial dimension for the dynamic
///                        input profile (the engine may clamp it lower).
/// @return true on success; false if the engine could not be built/loaded.
bool RTOCRDetector::Initialize(const std::string& onnxPath, int gpuId,
                               const std::string& engineCacheDir, int maxSideLen) {
    // Engine cache directory: explicit argument wins, otherwise the model's
    // own directory (or "." when the path has no separator).
    std::string cacheDir;
    if (!engineCacheDir.empty()) {
        cacheDir = engineCacheDir;
    } else {
        auto pos = onnxPath.find_last_of("/\\");
        cacheDir = (pos != std::string::npos) ? onnxPath.substr(0, pos) : ".";
    }
    try {
        ANSCENTER::Options options;
        options.deviceIndex = gpuId;
        // FP32 required for detection: this CNN (DBNet) produces NaN in FP16.
        // The model has 142 Convolution + 87 Scale (fused BatchNorm) layers whose
        // intermediate values overflow FP16 range (65504). Mixed precision
        // (forcing only Sigmoid/Softmax to FP32) is insufficient because the NaN
        // originates deep in the conv->scale->relu backbone before reaching those layers.
        // Classifier and recognizer remain FP16 with mixed precision -- only the
        // detector needs full FP32.
        options.precision = ANSCENTER::Precision::FP32;
        options.maxBatchSize = 1;
        options.optBatchSize = 1;
        // Dynamic spatial dimensions for detection (multiples of 32)
        options.minInputHeight = 32;
        options.minInputWidth = 32;
        options.optInputHeight = std::min(640, maxSideLen);
        options.optInputWidth = std::min(640, maxSideLen);
        options.maxInputHeight = maxSideLen;
        options.maxInputWidth = maxSideLen;
        options.engineFileDir = cacheDir;
        // NOTE(review): the cast target below was lost in extraction and is
        // reconstructed as int — confirm the pool-key precision field type.
        m_poolKey = { onnxPath, static_cast<int>(options.precision), options.maxBatchSize };
        m_engine = EnginePoolManager::instance().acquire(
            m_poolKey, options, onnxPath, kDetSubVals, kDetDivVals, true,
            getPoolMaxSlotsPerGpu());
        m_usingSharedPool = (m_engine != nullptr);
        if (!m_engine) {
            std::cerr << "[RTOCRDetector] Failed to build/load TRT engine for: "
                      << onnxPath << std::endl;
            return false;
        }
        // Query actual profile max from the loaded engine — the pool may have
        // built a smaller profile than requested (GPU memory limits).
        int profMaxH = m_engine->getProfileMaxHeight();
        int profMaxW = m_engine->getProfileMaxWidth();
        if (profMaxH > 0 && profMaxW > 0) {
            m_engineMaxSideLen = std::min(profMaxH, profMaxW);
        } else {
            m_engineMaxSideLen = maxSideLen;
        }
        if (m_engineMaxSideLen < maxSideLen) {
            std::cout << "[RTOCRDetector] Engine built with max " << m_engineMaxSideLen
                      << "x" << m_engineMaxSideLen << " (requested " << maxSideLen
                      << " exceeded GPU capacity)" << std::endl;
        }
        std::cout << "[RTOCRDetector] Initialized TRT engine from: " << onnxPath << std::endl;
        return true;
    } catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Initialize failed: " << e.what() << std::endl;
        return false;
    }
}

/// Runs DBNet text detection on one image and returns the detected text boxes
/// in original-image coordinates, sorted via SortTextBoxes.
///
/// Thread-safe (serialized on _mutex). Follows the official PaddleOCR
/// DBPostProcess flow: resize → infer → threshold → contours → mini-box →
/// score → unclip → rescale.
///
/// @param image        Input BGR image (used directly on the CPU fallback
///                     path; on the NV12 fast path only its dimensions matter
///                     for clamping).
/// @param maxSideLen   Per-call cap on the resized longer side (further
///                     clamped by the engine profile).
/// @param dbThresh     Probability-map binarization threshold.
/// @param boxThresh    Minimum mean probability inside a candidate box.
/// @param unclipRatio  Expansion ratio for the unclip step.
/// @param useDilation  When true, dilates the binary map with a 2x2 kernel.
/// @return Detected boxes; empty on failure or empty input.
std::vector<TextBox> RTOCRDetector::Detect(const cv::Mat& image, int maxSideLen,
                                           float dbThresh, float boxThresh,
                                           float unclipRatio, bool useDilation) {
    std::lock_guard<std::mutex> lock(_mutex);
    if (!m_engine || image.empty()) return {};
    try {
        // Single-pass detection: resize the full image to fit within
        // the engine's max spatial dimension (same approach as ONNX version).
        int effectiveMaxSide = std::min(maxSideLen, m_engineMaxSideLen);

        // 1. Compute resize dimensions (multiples of 32)
        cv::Size resizeShape = ComputeDetResizeShape(image.rows, image.cols, effectiveMaxSide);
        int newH = resizeShape.height;
        int newW = resizeShape.width;
        float ratioH = static_cast<float>(image.rows) / newH;
        float ratioW = static_cast<float>(image.cols) / newW;

        // 2. Upload to GPU and resize — try NV12 fast path first
        cv::cuda::GpuMat gpuResized;
        bool usedNV12 = false;
        GpuFrameData* gpuFrame = tl_currentGpuFrame();
        // pixelFormat 23 corresponds to the NV12 layout this helper handles
        // (presumably FFmpeg AV_PIX_FMT_NV12 — TODO confirm).
        if (gpuFrame && gpuFrame->pixelFormat == 23 && gpuFrame->cpuYPlane &&
            gpuFrame->cpuUvPlane && gpuFrame->width > 0 && gpuFrame->height > 0) {
            // NV12 fast path: fused NV12→BGR+resize kernel (1 kernel launch)
            // instead of CPU BGR upload (24MB) + separate resize
            int fW = gpuFrame->width;
            int fH = gpuFrame->height;
            int gpuIdx = m_engine ? m_engine->getOptions().deviceIndex : 0;
            // Get NV12 Y/UV pointers on GPU (from cache or fresh upload)
            const uint8_t* devY = nullptr;
            const uint8_t* devUV = nullptr;
            int yPitch = 0, uvPitch = 0;
            {
                auto regLock = ANSGpuFrameRegistry::instance().acquire_lock();
                if (gpuFrame->gpuCacheValid && gpuFrame->gpuCacheDeviceIdx == gpuIdx) {
                    // Cache hit
                    devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                    devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                    yPitch = static_cast<int>(gpuFrame->gpuCacheYPitch);
                    uvPitch = static_cast<int>(gpuFrame->gpuCacheUVPitch);
                } else if (!gpuFrame->gpuCacheValid) {
                    // Cache miss — upload CPU NV12 to GPU.
                    // (If the cache is valid but on a different device, neither
                    // branch runs and we fall back to the CPU path below.)
                    size_t yBytes = static_cast<size_t>(fH) * gpuFrame->cpuYLinesize;
                    size_t uvBytes = static_cast<size_t>(fH / 2) * gpuFrame->cpuUvLinesize;
                    auto& reg = ANSGpuFrameRegistry::instance();
                    if (reg.canAllocateGpuCache(yBytes + uvBytes)) {
                        // NOTE(review): cudaMalloc/cudaMemcpy results are not
                        // checked; an allocation failure would poison the cache
                        // with null pointers — consider checking cudaError_t.
                        cudaMalloc(&gpuFrame->gpuCacheY, yBytes);
                        cudaMalloc(&gpuFrame->gpuCacheUV, uvBytes);
                        cudaMemcpy(gpuFrame->gpuCacheY, gpuFrame->cpuYPlane, yBytes,
                                   cudaMemcpyHostToDevice);
                        cudaMemcpy(gpuFrame->gpuCacheUV, gpuFrame->cpuUvPlane, uvBytes,
                                   cudaMemcpyHostToDevice);
                        gpuFrame->gpuCacheValid = true;
                        gpuFrame->gpuCacheDeviceIdx = gpuIdx;
                        // NOTE(review): pitch-field cast targets reconstructed
                        // as size_t — confirm GpuFrameData field types.
                        gpuFrame->gpuCacheYPitch = static_cast<size_t>(gpuFrame->cpuYLinesize);
                        gpuFrame->gpuCacheUVPitch = static_cast<size_t>(gpuFrame->cpuUvLinesize);
                        gpuFrame->gpuCacheBytes = yBytes + uvBytes;
                        reg.onGpuCacheCreated(yBytes + uvBytes);
                        devY = static_cast<const uint8_t*>(gpuFrame->gpuCacheY);
                        devUV = static_cast<const uint8_t*>(gpuFrame->gpuCacheUV);
                        yPitch = gpuFrame->cpuYLinesize;
                        uvPitch = gpuFrame->cpuUvLinesize;
                    }
                }
            } // release registry lock before GPU kernel
            if (devY && devUV) {
                // Single fused kernel: NV12→BGR + bilinear resize (1 launch, 1 output alloc)
                gpuResized.create(newH, newW, CV_8UC3);
                NV12PreprocessHelper::nv12ToBGRResize(
                    devY, yPitch, devUV, uvPitch,
                    gpuResized.ptr(), static_cast<int>(gpuResized.step),
                    newW, newH, fW, fH);
                usedNV12 = true;
                // Update ratios to map from full-res NV12 to detection output
                ratioH = static_cast<float>(fH) / newH;
                ratioW = static_cast<float>(fW) / newW;
            }
        }
        if (!usedNV12) {
            // Fallback: CPU resize then upload small image to GPU
            cv::Mat cpuResized;
            cv::resize(image, cpuResized, resizeShape, 0, 0, cv::INTER_LINEAR);
            gpuResized.upload(cpuResized);
        }
        // Keep BGR order (PaddleOCR official does NOT convert BGR->RGB)

        // 3. Run inference
        std::vector<std::vector<cv::cuda::GpuMat>> inputs = { { gpuResized } };
        std::vector<std::vector<std::vector<float>>> featureVectors;
        if (!m_engine->runInference(inputs, featureVectors)) {
            std::cerr << "[RTOCRDetector] Inference failed" << std::endl;
            return {};
        }
        if (featureVectors.empty() || featureVectors[0].empty()) return {};

        // 4. Reshape output to probability map [H, W]
        std::vector<float>& output = featureVectors[0][0];
        int outputSize = static_cast<int>(output.size());
        if (outputSize < newH * newW) {
            std::cerr << "[RTOCRDetector] Output too small: expected at least "
                      << newH * newW << " got " << outputSize << std::endl;
            return {};
        }
        cv::Mat bitmap(newH, newW, CV_32FC1, output.data());

        // 5. Threshold to binary (matches ONNX/PaddleOCR official order)
        cv::Mat binaryMap;
        cv::threshold(bitmap, binaryMap, dbThresh, 255, cv::THRESH_BINARY);
        binaryMap.convertTo(binaryMap, CV_8UC1);

        // 6. Apply dilation if requested (on binaryMap, matching ONNX version)
        if (useDilation) {
            cv::Mat kernel = cv::getStructuringElement(cv::MORPH_RECT, cv::Size(2, 2));
            cv::dilate(binaryMap, binaryMap, kernel);
        }

        // 7. Find contours and build text boxes
        // (matches ONNX/PaddleOCR official DBPostProcess.boxes_from_bitmap flow exactly)
        std::vector<std::vector<cv::Point>> contours;
        cv::findContours(binaryMap, contours, cv::RETR_LIST, cv::CHAIN_APPROX_SIMPLE);
        int numCandidates = std::min(static_cast<int>(contours.size()), kDetMaxCandidates);
        std::vector<TextBox> boxes;
        for (int i = 0; i < numCandidates; i++) {
            if (contours[i].size() < 4) continue;
            // Step 1: GetMiniBoxes - get ordered 4 corners of min area rect
            cv::RotatedRect minRect = cv::minAreaRect(contours[i]);
            float sside = std::min(minRect.size.width, minRect.size.height);
            if (sside < 3.0f) continue;
            auto ordered = GetMiniBoxes(minRect);
            // Step 2: BoxScoreFast - compute mean prob inside the 4-point box polygon
            float score = BoxScoreFast(bitmap, ordered);
            if (score < boxThresh) continue;
            // Step 3: UnclipPolygon - expand the 4-point box
            auto expanded = UnclipPolygon(ordered, unclipRatio);
            if (expanded.size() < 4) continue;
            // Step 4: Re-compute GetMiniBoxes on the expanded polygon
            std::vector<cv::Point> expandedInt;
            expandedInt.reserve(expanded.size());
            for (auto& p : expanded) {
                expandedInt.push_back(cv::Point(static_cast<int>(p.x), static_cast<int>(p.y)));
            }
            cv::RotatedRect expandedRect = cv::minAreaRect(expandedInt);
            // Filter by min_size + 2 = 5 (matches PaddleOCR official)
            float expandedSside = std::min(expandedRect.size.width, expandedRect.size.height);
            if (expandedSside < 5.0f) continue;
            auto expandedOrdered = GetMiniBoxes(expandedRect);
            // Step 5: Scale to original image coordinates
            TextBox box;
            for (int j = 0; j < 4; j++) {
                box.points[j].x = std::clamp(expandedOrdered[j].x * ratioW, 0.0f,
                                             static_cast<float>(image.cols - 1));
                box.points[j].y = std::clamp(expandedOrdered[j].y * ratioH, 0.0f,
                                             static_cast<float>(image.rows - 1));
            }
            box.score = score;
            boxes.push_back(box);
        }
        SortTextBoxes(boxes);
        return boxes;
    } catch (const std::exception& e) {
        std::cerr << "[RTOCRDetector] Detect failed: " << e.what() << std::endl;
        return {};
    }
}

// Matches PaddleOCR official GetMiniBoxes: sort by X, assign TL/TR/BL/BR by Y
std::array<cv::Point2f, 4> RTOCRDetector::GetMiniBoxes(const cv::RotatedRect& rect) {
    cv::Point2f vertices[4];
    rect.points(vertices);
    // Sort all 4 points by x-coordinate ascending
    std::sort(vertices, vertices + 4,
              [](const cv::Point2f& a, const cv::Point2f& b) { return a.x < b.x; });
    // Left two (indices 0,1): smaller y = top-left, larger y = bottom-left
    cv::Point2f topLeft, bottomLeft;
    if (vertices[0].y <= vertices[1].y) {
        topLeft = vertices[0];
        bottomLeft = vertices[1];
    } else {
        topLeft = vertices[1];
        bottomLeft = vertices[0];
    }
    // Right two (indices 2,3): smaller y = top-right, larger y = bottom-right
    cv::Point2f topRight, bottomRight;
    if (vertices[2].y <= vertices[3].y) {
        topRight = vertices[2];
        bottomRight = vertices[3];
    } else {
        topRight = vertices[3];
        bottomRight = vertices[2];
    }
    // Order: [TL, TR, BR, BL] (clockwise from top-left)
    return { topLeft, topRight, bottomRight, bottomLeft };
}

// Matches PaddleOCR official box_score_fast: mean prob value inside the 4-point polygon
float RTOCRDetector::BoxScoreFast(const cv::Mat& probMap,
                                  const std::array<cv::Point2f, 4>& box) {
    int h = probMap.rows;
    int w = probMap.cols;
    // Get bounding rectangle with proper clamping (matches PaddleOCR official)
    float minX = std::min({box[0].x, box[1].x, box[2].x, box[3].x});
    float maxX = std::max({box[0].x, box[1].x, box[2].x, box[3].x});
    float minY = std::min({box[0].y, box[1].y, box[2].y, box[3].y});
    float maxY = std::max({box[0].y, box[1].y, box[2].y, box[3].y});
    int xmin = std::clamp(static_cast<int>(std::floor(minX)), 0, w - 1);
    int xmax = std::clamp(static_cast<int>(std::ceil(maxX)), 0, w - 1);
    int ymin = std::clamp(static_cast<int>(std::floor(minY)), 0, h - 1);
    int ymax = std::clamp(static_cast<int>(std::ceil(maxY)), 0, h - 1);
    if (xmin >= xmax || ymin >= ymax) return 0.0f;
    // Rasterize the box polygon (shifted into ROI coordinates) into a mask,
    // then average the probability map under that mask.
    cv::Mat mask = cv::Mat::zeros(ymax - ymin + 1, xmax - xmin + 1, CV_8UC1);
    std::vector<cv::Point> pts(4);
    for (int j = 0; j < 4; j++) {
        pts[j] = cv::Point(static_cast<int>(box[j].x) - xmin,
                           static_cast<int>(box[j].y) - ymin);
    }
    std::vector<std::vector<cv::Point>> polys = { pts };
    cv::fillPoly(mask, polys, cv::Scalar(1));
    cv::Mat roiMap = probMap(cv::Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1));
    return static_cast<float>(cv::mean(roiMap, mask)[0]);
}

// Matches PaddleOCR official unclip: expand 4-point box using Clipper with JT_ROUND
// Uses integer coordinates for Clipper (matching PaddleOCR/ONNX version exactly)
std::vector<cv::Point2f> RTOCRDetector::UnclipPolygon(const std::array<cv::Point2f, 4>& box,
                                                      float unclipRatio) {
    // Compute area using Shoelace formula and perimeter
    float area = 0.0f;
    float perimeter = 0.0f;
    for (int i = 0; i < 4; i++) {
        int j = (i + 1) % 4;
        area += box[i].x * box[j].y - box[j].x * box[i].y;
        float dx = box[j].x - box[i].x;
        float dy = box[j].y - box[i].y;
        perimeter += std::sqrt(dx * dx + dy * dy);
    }
    area = std::abs(area) * 0.5f;
    if (perimeter < 1.0f) return {};
    // Offset distance per the DB paper: area * ratio / perimeter
    float distance = area * unclipRatio / perimeter;
    ClipperLib::Path clipperPath;
    for (int i = 0; i < 4; i++) {
        clipperPath.push_back({ static_cast<ClipperLib::cInt>(box[i].x),
                                static_cast<ClipperLib::cInt>(box[i].y) });
    }
    ClipperLib::ClipperOffset offset;
    offset.AddPath(clipperPath, ClipperLib::jtRound, ClipperLib::etClosedPolygon);
    ClipperLib::Paths solution;
    offset.Execute(solution, distance);
    if (solution.empty() || solution[0].empty()) return {};
    std::vector<cv::Point2f> result;
    for (auto& p : solution[0]) {
        result.push_back(cv::Point2f(static_cast<float>(p.X), static_cast<float>(p.Y)));
    }
    return result;
}

/// Releases the engine: returns pooled engines to the shared pool, otherwise
/// just drops the owning reference. Never throws (swallows all exceptions).
RTOCRDetector::~RTOCRDetector() {
    try {
        if (m_usingSharedPool) {
            EnginePoolManager::instance().release(m_poolKey);
            m_engine.reset();
            m_usingSharedPool = false;
        } else if (m_engine) {
            m_engine.reset();
        }
    } catch (...) {}
}

} // namespace rtocr
} // namespace ANSCENTER