Files
ANSCORE/ANSOCR/ANSONNXOCR/ONNXOCRClassifier.cpp

108 lines
3.9 KiB
C++

#include "ONNXOCRClassifier.h"
#include <opencv2/imgproc.hpp>
#include <iostream>
#include <algorithm>
#include <cmath>
namespace ANSCENTER {
namespace onnxocr {
ONNXOCRClassifier::ONNXOCRClassifier(const std::string& onnx_path, unsigned int num_threads)
: BasicOrtHandler(onnx_path, num_threads) {
}
Ort::Value ONNXOCRClassifier::transform(const cv::Mat& mat) {
cv::Mat resized;
// Direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
// No aspect ratio preservation — matches PaddleOCR official ResizeImage
cv::resize(mat, resized, cv::Size(kClsImageW, kClsImageH));
resized.convertTo(resized, CV_32FC3);
// PP-LCNet uses ImageNet normalization (same as detection)
auto data = NormalizeAndPermute(resized);
input_values_handler.assign(data.begin(), data.end());
return Ort::Value::CreateTensor<float>(
*memory_info_handler, input_values_handler.data(), input_values_handler.size(),
input_node_dims.data(), input_node_dims.size());
}
Ort::Value ONNXOCRClassifier::transformBatch(const std::vector<cv::Mat>& images) {
// Not used - classifier processes single images in Classify() loop
if (!images.empty()) {
return transform(images[0]);
}
return Ort::Value(nullptr);
}
void ONNXOCRClassifier::Classify(std::vector<cv::Mat>& img_list,
std::vector<int>& cls_labels,
std::vector<float>& cls_scores,
float cls_thresh) {
std::lock_guard<std::mutex> lock(_mutex);
cls_labels.clear();
cls_scores.clear();
if (!ort_session || img_list.empty()) return;
cls_labels.resize(img_list.size(), 0);
cls_scores.resize(img_list.size(), 0.0f);
// Process one image at a time (dynamic shapes)
for (size_t i = 0; i < img_list.size(); i++) {
if (img_list[i].empty()) continue;
try {
// Preprocess: direct resize to 80x160 (PP-LCNet_x1_0_textline_ori)
// No aspect ratio preservation — matches PaddleOCR official ResizeImage
cv::Mat resized;
cv::resize(img_list[i], resized, cv::Size(kClsImageW, kClsImageH));
resized.convertTo(resized, CV_32FC3);
// PP-LCNet uses ImageNet normalization (same as detection)
auto inputData = NormalizeAndPermute(resized);
std::array<int64_t, 4> inputShape = { 1, 3, kClsImageH, kClsImageW };
Ort::Value inputTensor = Ort::Value::CreateTensor<float>(
*memory_info_handler, inputData.data(), inputData.size(),
inputShape.data(), inputShape.size());
auto outputTensors = ort_session->Run(
Ort::RunOptions{ nullptr },
input_node_names.data(), &inputTensor, 1,
output_node_names.data(), num_outputs);
float* outData = outputTensors[0].GetTensorMutableData<float>();
auto outShape = outputTensors[0].GetTensorTypeAndShapeInfo().GetShape();
int numClasses = (outShape.size() > 1) ? static_cast<int>(outShape[1]) : 2;
// Find argmax and use raw output value as score
// PaddleOCR v5 models include softmax, so output values are probabilities
// Matches PaddleOCR official: score = preds[i, argmax_idx]
int maxIdx = 0;
float maxVal = outData[0];
for (int c = 1; c < numClasses; c++) {
if (outData[c] > maxVal) {
maxVal = outData[c];
maxIdx = c;
}
}
cls_labels[i] = maxIdx;
cls_scores[i] = maxVal;
}
catch (const Ort::Exception& e) {
std::cerr << "[ONNXOCRClassifier] Inference failed for image " << i
<< ": " << e.what() << std::endl;
cls_labels[i] = 0;
cls_scores[i] = 0.0f;
}
}
}
} // namespace onnxocr
} // namespace ANSCENTER