#pragma once #include "ONNXOCRTypes.h" #include "ONNXEngine.h" #include #include #include namespace ANSCENTER { namespace onnxocr { class ONNXOCRRecognizer : public BasicOrtHandler { public: explicit ONNXOCRRecognizer(const std::string& onnx_path, unsigned int num_threads = 1); explicit ONNXOCRRecognizer(const std::string& onnx_path, const OrtHandlerOptions& options, unsigned int num_threads = 1); ~ONNXOCRRecognizer() override = default; // Load character dictionary (must be called before Recognize) bool LoadDictionary(const std::string& dictPath); // Recognize text from a single cropped text image TextLine Recognize(const cv::Mat& croppedImage); // Batch recognition for multiple cropped images. // Crops are grouped into a small set of fixed width buckets and // submitted to ORT as [N,3,imgH_,bucketW] tensors so cuDNN sees // shape-stable inputs and can reuse algorithms across calls. std::vector RecognizeBatch(const std::vector& croppedImages); // Pre-warm cuDNN/TRT for every bucket width by running dummy // inferences. Idempotent — no-op if already warmed up. void Warmup(); private: Ort::Value transform(const cv::Mat& mat) override; Ort::Value transformBatch(const std::vector& images) override; // Round resizedW up to the next bucket width (capped at imgMaxW_). // Used by both Recognize() and RecognizeBatch() so cuDNN only ever // sees a small finite set of input shapes. int RoundUpToBucket(int resizedW) const; // Run a single [N,3,imgH_,bucketW] inference and CTC-decode each row. void RunBatchAtWidth(const std::vector& crops, const std::vector& origIndices, int bucketW, std::vector& out); // CTC greedy decode TextLine CTCDecode(const float* outputData, int seqLen, int numClasses); std::vector keys_; int imgH_ = kRecImgH; int imgMaxW_ = kRecImgMaxW; std::mutex _mutex; bool _warmedUp = false; }; } // namespace onnxocr } // namespace ANSCENTER