// ANSCORE/engines/ONNXEngine/ONNXEngine.h
#pragma once
#ifndef ONNXEngine_H
#define ONNXEngine_H
#include <string>
#include <vector>
#include <iostream>
#include <typeinfo>
#include <deque>
#include <unordered_map>
#include "onnxruntime_cxx_api.h"
#include "opencv2/opencv.hpp"
#include "EPLoader.h" // brings in EngineType via ANSLicenseHelper
#define LITEORT_CHAR wchar_t
#ifdef ENGINE_EXPORTS
#define ONNXENGINE_API __declspec(dllexport)
#else
#define ONNXENGINE_API __declspec(dllimport)
#endif
namespace ANSCENTER {
// ====================================================================
// types
// ====================================================================
namespace types {
// Compile-time guard used by BoundingBoxType: the coordinate type (T1)
// must be an arithmetic, trivially-copyable, standard-layout type and
// the score type (T2) must be floating point. Instantiating it with
// anything else fails the build with a readable message.
// NOTE: the double-underscore name is reserved by the standard, but it
// is part of this header's public surface (called from BoundingBoxType
// and convert_type()), so only the local template-parameter names were
// de-reserved here.
template<typename T1 = float, typename T2 = float>
static inline void __assert_type()
{
    static_assert(
        std::is_standard_layout_v<T1> && std::is_trivially_copyable_v<T1>
        && std::is_standard_layout_v<T2> && std::is_trivially_copyable_v<T2>
        && std::is_floating_point_v<T2>
        && (std::is_integral_v<T1> || std::is_floating_point_v<T1>),
        "unsupported type.");
}
// Generic detection box with coordinates of type T1 and a confidence
// score of type T2. Geometry helpers (iou_of, width, ...) are defined
// out-of-line; convert_type() is implemented at the bottom of this
// header. Member order is unchanged (layout matters across the DLL).
template<typename T1 = float, typename T2 = float>
struct BoundingBoxType
{
    using value_type = T1;
    using score_type = T2;
    value_type x1, y1, x2, y2;   // top-left / bottom-right corners
    score_type score;            // detection confidence
    const char* label_text;      // non-owning class-label string
    unsigned int label;          // numeric class id
    bool flag;                   // true once the box holds valid data
    // Cast this box to a BoundingBoxType with different field types.
    template<typename O1, typename O2 = score_type>
    BoundingBoxType<O1, O2> convert_type() const;
    // Intersection-over-union against another box (defined in .cpp).
    template<typename O1, typename O2 = score_type>
    value_type iou_of(const BoundingBoxType<O1, O2>& other) const;
    value_type width() const;
    value_type height() const;
    value_type area() const;
    ::cv::Rect rect() const;
    ::cv::Point2i tl() const;
    ::cv::Point2i rb() const;
    BoundingBoxType()
        : x1(0), y1(0), x2(0), y2(0),
          score(0), label_text(nullptr), label(0), flag(false)
    {
        types::__assert_type<value_type, score_type>();
    }
};
// Explicit instantiations of the commonly used specialisations.
template class BoundingBoxType<int, float>;
template class BoundingBoxType<float, float>;
template class BoundingBoxType<double, double>;
using Boxi = BoundingBoxType<int, float>;
using Boxf = BoundingBoxType<float, float>;
using Boxd = BoundingBoxType<double, double>;
// A set of 2-D landmark points; `flag` is true once `points` is filled.
struct LandmarksType {
    std::vector<cv::Point2f> points;
    bool flag;
    LandmarksType() { flag = false; }
};
typedef LandmarksType Landmarks;
// Alias for call sites that want to be explicit about dimensionality.
typedef Landmarks Landmarks2D;
// A set of 3-D landmark points; `flag` is true once `points` is filled.
struct Landmarks3DType {
    std::vector<cv::Point3f> points;
    bool flag;
    Landmarks3DType() { flag = false; }
};
typedef Landmarks3DType Landmarks3D;
// A detection box paired with its landmark set (e.g. a face box plus
// its key points).
struct BoxfWithLandmarksType {
    Boxf box;
    Landmarks landmarks;
    bool flag;
    BoxfWithLandmarksType() { flag = false; }
};
typedef BoxfWithLandmarksType BoxfWithLandmarks;
// Euler rotation angles (yaw/pitch/roll), e.g. for head pose.
// All fields are zero-initialised so a default-constructed instance
// never exposes indeterminate values (previously only `flag` was set).
typedef struct EulerAnglesType {
    float yaw, pitch, roll;
    bool flag; // true once the angles hold valid data
    EulerAnglesType() : yaw(0.f), pitch(0.f), roll(0.f), flag(false) {}
} EulerAngles;
// Emotion-classification result. All fields are initialised so a
// default-constructed instance never exposes an indeterminate score,
// label, or dangling text pointer (previously only `flag` was set).
typedef struct EmotionsType {
    float score;        // confidence of the predicted emotion
    unsigned int label; // numeric class id
    const char* text;   // non-owning label string
    bool flag;          // true once the result holds valid data
    EmotionsType() : score(0.f), label(0), text(nullptr), flag(false) {}
} Emotions;
// Age-estimation result. All fields are initialised so a
// default-constructed instance never exposes indeterminate values
// (previously only `flag` was set).
typedef struct AgeType {
    float age;                    // point estimate
    unsigned int age_interval[2]; // [low, high] bracket for the estimate
    float interval_prob;          // probability mass inside the bracket
    bool flag;                    // true once the result holds valid data
    AgeType() : age(0.f), age_interval{0, 0}, interval_prob(0.f), flag(false) {}
} Age;
// Gender-classification result. All fields are initialised so a
// default-constructed instance never exposes an indeterminate score,
// label, or dangling text pointer (previously only `flag` was set).
typedef struct GenderType {
    float score;        // confidence of the predicted class
    unsigned int label; // numeric class id
    const char* text;   // non-owning label string
    bool flag;          // true once the result holds valid data
    GenderType() : score(0.f), label(0), text(nullptr), flag(false) {}
} Gender;
// Face-embedding result. `dim` is zero-initialised so a
// default-constructed instance never exposes an indeterminate size
// (previously only `flag` was set).
typedef struct FaceContentType {
    std::vector<float> embedding; // feature vector
    unsigned int dim;             // embedding length; presumably equals embedding.size() — confirm in .cpp
    bool flag;                    // true once the embedding is valid
    FaceContentType() : dim(0), flag(false) {}
} FaceContent;
// Semantic-segmentation result. class_mat/color_mat presumably hold
// the per-pixel label map and its colourised rendering — confirm in
// the .cpp; names_map maps class id -> class name.
struct SegmentContentType {
    cv::Mat class_mat;
    cv::Mat color_mat;
    std::unordered_map<int, std::string> names_map;
    bool flag;
    SegmentContentType() { flag = false; }
};
typedef SegmentContentType SegmentContent;
// Image-matting result mats.
struct MattingContentType {
    cv::Mat fgr_mat;   // foreground (fgr)
    cv::Mat pha_mat;   // alpha matte (pha)
    cv::Mat merge_mat; // merged/composited output — see .cpp for exact blend
    bool flag;         // true once the mats are valid
    MattingContentType() { flag = false; }
};
typedef MattingContentType MattingContent;
// Single-mask segmentation output; aliased below for hair/head/etc.
struct SegmentationMaskContentType {
    cv::Mat mask;
    bool flag; // true once `mask` holds valid data
    SegmentationMaskContentType() { flag = false; }
};
typedef SegmentationMaskContentType SegmentationMaskContent;
// Classification result: scores/texts/labels are presumably parallel
// arrays describing the i-th prediction — confirm in the .cpp.
struct ImageNetContentType {
    std::vector<float> scores;
    std::vector<const char*> texts; // non-owning label strings
    std::vector<unsigned int> labels;
    bool flag; // true once the vectors hold valid data
    ImageNetContentType() { flag = false; }
};
typedef ImageNetContentType ImageNetContent;
// Generic alias for non-ImageNet classifiers.
typedef ImageNetContent ClassificationContent;
// Style-transfer output image.
struct StyleContentType {
    cv::Mat mat;
    bool flag; // true once `mat` holds valid data
    StyleContentType() { flag = false; }
};
typedef StyleContentType StyleContent;
// Super-resolution output image.
struct SuperResolutionContentType {
    cv::Mat mat;
    bool flag; // true once `mat` holds valid data
    SuperResolutionContentType() { flag = false; }
};
typedef SuperResolutionContentType SuperResolutionContent;
// Face-parsing output: a label map plus a merged visualisation.
struct FaceParsingContentType {
    cv::Mat label;
    cv::Mat merge;
    bool flag; // true once the mats hold valid data
    FaceParsingContentType() { flag = false; }
};
typedef FaceParsingContentType FaceParsingContent;
// Task-specific aliases over the single-mask segmentation result.
typedef SegmentationMaskContent HairSegContent;
typedef SegmentationMaskContent HeadSegContent;
typedef SegmentationMaskContent FaceHairSegContent;
typedef SegmentationMaskContent PortraitSegContent;
} // namespace types
// ====================================================================
// utils
// ====================================================================
namespace utils {
namespace transform {
// Tensor memory layout for the create_tensor* helpers:
// CHW = planar (ONNX convention), HWC = interleaved (OpenCV convention).
enum { CHW = 0, HWC = 1 };
// Copy one image into an ORT tensor with the given dims.
// `tensor_value_handler` appears to receive the float storage backing
// the returned Ort::Value (so it must outlive the tensor) — confirm
// in the .cpp.
Ort::Value create_tensor(
const cv::Mat& mat,
const std::vector<int64_t>& tensor_dims,
const Ort::MemoryInfo& memory_info_handler,
std::vector<float>& tensor_value_handler,
unsigned int data_format = CHW);
// Batched variant: packs all of `batch_mats` into one tensor.
Ort::Value create_tensor_batch(
const std::vector<cv::Mat>& batch_mats,
const std::vector<int64_t>& tensor_dims,
const Ort::MemoryInfo& memory_info_handler,
std::vector<float>& tensor_value_handler,
unsigned int data_format = CHW);
// Packs a clip of frames into a 5-D (video) tensor.
Ort::Value create_video_tensor_5d(
const std::deque<cv::Mat>& frames,
const std::vector<int64_t>& tensor_dims,
const Ort::MemoryInfo& memory_info_handler,
std::vector<float>& tensor_value_handler);
// Mean/scale normalisation helpers, scalar or per-channel; the
// in-place variants modify their argument. Exact formula in the .cpp.
cv::Mat normalize(const cv::Mat& mat, float mean, float scale);
cv::Mat normalize(const cv::Mat& mat, const float mean[3], const float scale[3]);
void normalize(const cv::Mat& inmat, cv::Mat& outmat, float mean, float scale);
void normalize_inplace(cv::Mat& mat_inplace, float mean, float scale);
void normalize_inplace(cv::Mat& mat_inplace, const float mean[3], const float scale[3]);
} // namespace transform
} // namespace utils
// ====================================================================
// Helpers
// ====================================================================
inline static std::string OrtCompatiableGetInputName(
size_t index, OrtAllocator* allocator, Ort::Session* ort_session)
{
return std::string(ort_session->GetInputNameAllocated(index, allocator).get());
}
inline static std::string OrtCompatiableGetOutputName(
size_t index, OrtAllocator* allocator, Ort::Session* ort_session)
{
return std::string(ort_session->GetOutputNameAllocated(index, allocator).get());
}
// ====================================================================
// OrtHandlerOptions
//
// Per-session knobs for the ORT execution providers, used e.g. by OCR
// sub-models that need TRT EP and the full cuDNN workspace. A
// default-constructed instance is identical to the legacy behavior
// (CUDA EP only, minimal cuDNN workspace).
//
// Options are grouped by target backend. A field set for one backend
// is silently ignored by every other backend — e.g.
// `trtProfileMinShapes` only affects TensorRT EP (NVIDIA); DirectML
// and OpenVINO don't read it.
//
// When adding a new backend optimization:
// - put the new field in the correct backend section below
// - NEVER reuse an NVIDIA field for AMD/Intel tuning
// - update the matching Build*OcrOptions() helper in
// PaddleOCRV5Engine.cpp to populate it
//
// The NVIDIA section is considered locked — it's been tuned end-to-end
// for the ANSALPR pipeline and should not change unless fixing a
// specific NVIDIA-observable regression.
// ====================================================================
struct OrtHandlerOptions {
// ----------------------------------------------------------------
// NVIDIA (CUDA EP + TensorRT EP) — LOCKED
//
// These fields only have effect when the resolved execution
// provider is CUDA EP or TensorRT EP. DirectML (AMD), OpenVINO
// (Intel), and CPU EP silently ignore every field below. Do not
// repurpose them for other backends.
// ----------------------------------------------------------------
// Try to attach TensorRT EP before CUDA EP. Falls back to CUDA EP
// automatically if TRT EP creation or session creation fails.
// Engines are cached on disk for fast reload.
bool preferTensorRT = false;
// Use the largest cuDNN conv workspace. cuDNN can then pick fast
// algorithms (Winograd, implicit-precomp-GEMM with big workspaces).
// Defaults off because some deployments share VRAM with TRT engines
// and need the minimal-workspace mode to avoid OOM.
bool useMaxCudnnWorkspace = false;
// Where to cache built TRT engines. Empty → default
// %TEMP%/ANSCENTER/TRTEngineCache. Only used when preferTensorRT.
std::string trtEngineCacheDir;
// FP16 builds for TRT EP. Recommended for inference; ignored if
// preferTensorRT is false.
bool trtFP16 = true;
// Dynamic-shape profile for TRT EP. When set, TRT builds ONE
// engine that handles every input shape in the [min..max] range
// instead of rebuilding per unique shape. Critical for models
// that see many (batch_size, spatial) combinations at runtime.
//
// Format: "input_name:d0xd1xd2xd3[,input2:...]"
// e.g. "x:1x3x48x320" for batch=1, C=3, H=48, W=320
//
// All three fields must be set together. An empty min implies
// no profile (fall back to static-shape-per-unique-input mode).
std::string trtProfileMinShapes;
std::string trtProfileOptShapes;
std::string trtProfileMaxShapes;
// ----------------------------------------------------------------
// Intel (OpenVINO EP) — OPEN FOR OPTIMIZATION
//
// Currently unused. Future Intel-specific tuning (cache_dir for
// kernel cache, explicit device selection, INT8 routing, etc.)
// should add fields here and wire them through the OpenVINO
// branch of initialize_handler(). Do NOT put Intel logic inside
// TryAppendCUDA or TryAppendTensorRT.
// ----------------------------------------------------------------
// (Intel fields go here — none yet)
// ----------------------------------------------------------------
// AMD (DirectML EP / MIGraphX EP) — OPEN FOR OPTIMIZATION
//
// Currently unused. Future AMD-specific tuning (graph optimization
// gate for RDNA3+, MIGraphX cache dir on Linux, etc.) should add
// fields here and wire them through the DirectML branch of
// initialize_handler(). Do NOT put AMD logic inside TryAppendCUDA
// or TryAppendTensorRT.
// ----------------------------------------------------------------
// (AMD fields go here — none yet)
};
// ====================================================================
// BasicOrtHandler
// ====================================================================
// Base class for every ORT-backed model wrapper in this header. Holds
// the Ort environment / session / memory-info as raw owning pointers
// (created in initialize_handler(); presumably released in the
// destructor — confirm in the .cpp; candidates for std::unique_ptr in
// a future cleanup). Non-copyable.
// NOTE(review): exporting a class with std::string/std::vector members
// from a DLL triggers MSVC C4251 — all consumers must share this
// DLL's CRT/STL.
class ONNXENGINE_API BasicOrtHandler
{
protected:
// Cached model-I/O metadata. The const char* vectors presumably
// point into the matching std::string vectors (the `_` suffixed
// ones own the storage) — confirm in initialize_handler().
const char* input_name = nullptr;
std::vector<const char*> input_node_names;
std::vector<std::string> input_node_names_;
std::vector<int64_t> input_node_dims;
std::size_t input_tensor_size = 1;
std::vector<float> input_values_handler; // reusable input tensor buffer
std::vector<const char*> output_node_names;
std::vector<std::string> output_node_names_;
std::vector<std::vector<int64_t>> output_node_dims;
int num_outputs = 1;
Ort::Env* ort_env = nullptr; // ← pointer, no in-class init
Ort::Session* ort_session = nullptr;
Ort::MemoryInfo* memory_info_handler = nullptr;
std::wstring onnx_path_w; // ← owns the wstring storage
const LITEORT_CHAR* onnx_path = nullptr; // ← points into onnx_path_w
const char* log_id = nullptr;
protected:
const unsigned int num_threads;
EngineType m_engineType; // resolved execution-provider type
// Per-session high-perf options. Default = legacy behavior.
OrtHandlerOptions m_handlerOptions;
protected:
// Default: hardware auto-detection via ANSLicenseHelper through EPLoader
explicit BasicOrtHandler(const std::string& _onnx_path,
unsigned int _num_threads = 1);
// Explicit engine override per-session
explicit BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
unsigned int _num_threads = 1);
// Engine override + per-session high-perf options (TRT EP, max
// cuDNN workspace, etc.). Used by OCR sub-models that need
// shape-stable, high-throughput inference.
explicit BasicOrtHandler(const std::string& _onnx_path,
EngineType engineType,
const OrtHandlerOptions& options,
unsigned int _num_threads = 1);
// Auto-detect engine via EPLoader, but with high-perf options.
explicit BasicOrtHandler(const std::string& _onnx_path,
const OrtHandlerOptions& options,
unsigned int _num_threads = 1);
virtual ~BasicOrtHandler();
// Non-copyable: the class raw-owns ORT handles.
BasicOrtHandler(const BasicOrtHandler&) = delete;
BasicOrtHandler& operator=(const BasicOrtHandler&) = delete;
public:
// Resolved EP type (after EPLoader fallback). Subclasses use this
// to branch on actual EP at inference time.
EngineType getEngineType() const { return m_engineType; }
// Spin up a tiny CPU-only ORT session just long enough to read
// the name of the model's first input, then tear it down. Used
// by callers that need to build TRT profile-shape strings
// (which require the input name) BEFORE the real session is
// created. Returns an empty string on failure.
static std::string QueryModelInputName(const std::string& onnxPath);
private:
// Creates env/session/memory-info and caches I/O metadata.
void initialize_handler();
protected:
// Per-model preprocessing: convert input image(s) into the model's
// input tensor. Implemented by each subclass.
virtual Ort::Value transform(const cv::Mat& mat) = 0;
virtual Ort::Value transformBatch(const std::vector<cv::Mat>& images) = 0;
// EP-specific session option builders; the bool presumably reports
// whether the provider was appended to `opts` — confirm in .cpp.
bool TryAppendCUDA(Ort::SessionOptions& opts);
bool TryAppendTensorRT(Ort::SessionOptions& opts);
bool TryAppendDirectML(Ort::SessionOptions& opts);
bool TryAppendOpenVINO(Ort::SessionOptions& opts);
};
// ====================================================================
// SCRFD — face detection
// ====================================================================
// SCRFD face detector (boxes plus optional landmarks per face).
class SCRFD : public BasicOrtHandler
{
public:
explicit SCRFD(const std::string& _onnx_path,unsigned int _num_threads = 1);
explicit SCRFD(const std::string& _onnx_path,EngineType engineType,unsigned int _num_threads = 1);
~SCRFD() override = default;
// Detect faces in `mat`. Results (box + landmarks) go into
// `detected_boxes_kps`; candidates below `score_threshold` are
// dropped, NMS uses `iou_threshold`, and at most `topk` detections
// are kept.
void detect(const cv::Mat& mat,
std::vector<types::BoxfWithLandmarks>& detected_boxes_kps,
float score_threshold = 0.3f,
float iou_threshold = 0.45f,
unsigned int topk = 400);
private:
// Anchor/grid center (cx, cy) and the stride of its feature map.
typedef struct { float cx, cy, stride; } SCRFDPoint;
// Aspect-preserving resize parameters: scale ratio and x/y padding.
typedef struct { float ratio; int dw, dh; bool flag; } SCRFDScaleParams;
// Input normalisation constants: (pixel - 127.5) * (1/128).
const float mean_vals[3] = { 127.5f, 127.5f, 127.5f };
const float scale_vals[3] = { 1.f / 128.f, 1.f / 128.f, 1.f / 128.f };
unsigned int fmc = 3; // feature-map count (one per FPN stride below)
bool use_kps = false; // presumably set when the model emits landmark outputs — confirm in .cpp
unsigned int num_anchors = 2; // anchors per spatial location
std::vector<int> feat_stride_fpn = { 8, 16, 32 }; // strides of the FPN levels
// Cached centers per stride, regenerated via generate_points().
std::unordered_map<int, std::vector<SCRFDPoint>> center_points;
bool center_points_is_update = false;
static constexpr unsigned int nms_pre = 1000; // candidate cap before NMS
static constexpr unsigned int max_nms = 30000; // global cap on NMS input
Ort::Value transform(const cv::Mat& mat_rs) override;
Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
void initial_context();
// Resize keeping aspect ratio and pad to the target size; fills
// `scale_params` for mapping detections back to the original image.
void resize_unscale(const cv::Mat& mat, cv::Mat& mat_rs,
int target_height, int target_width,
SCRFDScaleParams& scale_params);
// Precompute grid centers for every FPN stride at this input size.
void generate_points(int target_height, int target_width);
// Decode raw output tensors into boxes (+landmarks when use_kps).
void generate_bboxes_kps(const SCRFDScaleParams& scale_params,
std::vector<types::BoxfWithLandmarks>& bbox_kps_collection,
std::vector<Ort::Value>& output_tensors,
float score_threshold,
float img_height, float img_width);
// Decode one stride's outputs (box-only model).
void generate_bboxes_single_stride(
const SCRFDScaleParams& scale_params,
Ort::Value& score_pred, Ort::Value& bbox_pred,
unsigned int stride, float score_threshold,
float img_height, float img_width,
std::vector<types::BoxfWithLandmarks>& bbox_kps_collection);
// Decode one stride's outputs (box + landmarks model).
void generate_bboxes_kps_single_stride(
const SCRFDScaleParams& scale_params,
Ort::Value& score_pred, Ort::Value& bbox_pred, Ort::Value& kps_pred,
unsigned int stride, float score_threshold,
float img_height, float img_width,
std::vector<types::BoxfWithLandmarks>& bbox_kps_collection);
// IoU-based non-maximum suppression over the decoded detections.
void nms_bboxes_kps(std::vector<types::BoxfWithLandmarks>& input,
std::vector<types::BoxfWithLandmarks>& output,
float iou_threshold, unsigned int topk);
};
// ====================================================================
// GlintArcFace — face recognition
// ====================================================================
// ArcFace face-embedding extractor (glint variant). Produces a
// types::FaceContent embedding per aligned face crop.
class GlintArcFace : public BasicOrtHandler
{
public:
    // Auto-detect the execution provider via EPLoader.
    explicit GlintArcFace(const std::string& _onnx_path,
                          unsigned int _num_threads = 1)
        : BasicOrtHandler(_onnx_path, _num_threads) {}
    // Force a specific execution provider for this session.
    explicit GlintArcFace(const std::string& _onnx_path,
                          EngineType engineType,
                          unsigned int _num_threads = 1)
        : BasicOrtHandler(_onnx_path, engineType, _num_threads) {}
    ~GlintArcFace() override = default;
    // Extract the embedding for a single face crop.
    void detect(const cv::Mat& mat, types::FaceContent& face_content);
    // Batched variant of detect().
    void detectBatch(const std::vector<cv::Mat>& images,
                     std::vector<types::FaceContent>& face_contents);
private:
    // Normalisation constants used by transform(): presumably
    // (pixel - 127.5) * (1/127.5) -> [-1, 1] — confirm in .cpp.
    static constexpr float mean_val = 127.5f;
    static constexpr float scale_val = 1.f / 127.5f;
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
};
// ====================================================================
// GlintCosFace — face recognition
// ====================================================================
// CosFace face-embedding extractor (glint variant). Same interface as
// GlintArcFace; produces a types::FaceContent embedding per crop.
class GlintCosFace : public BasicOrtHandler
{
public:
    // Auto-detect the execution provider via EPLoader.
    explicit GlintCosFace(const std::string& _onnx_path,
                          unsigned int _num_threads = 1)
        : BasicOrtHandler(_onnx_path, _num_threads) {}
    // Force a specific execution provider for this session.
    explicit GlintCosFace(const std::string& _onnx_path,
                          EngineType engineType,
                          unsigned int _num_threads = 1)
        : BasicOrtHandler(_onnx_path, engineType, _num_threads) {}
    ~GlintCosFace() override = default;
    // Extract the embedding for a single face crop.
    void detect(const cv::Mat& mat, types::FaceContent& face_content);
    // Batched variant of detect().
    void detectBatch(const std::vector<cv::Mat>& images,
                     std::vector<types::FaceContent>& face_contents);
private:
    // Normalisation constants used by transform(): presumably
    // (pixel - 127.5) * (1/127.5) -> [-1, 1] — confirm in .cpp.
    static constexpr float mean_val = 127.5f;
    static constexpr float scale_val = 1.f / 127.5f;
    Ort::Value transform(const cv::Mat& mat) override;
    Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
};
// ====================================================================
// MOVINET — action recognition
// ====================================================================
// MoViNet action-recognition wrapper: a clip of frames is packed into
// one 5-D tensor and classified into (class index, score).
class MOVINET : public BasicOrtHandler
{
public:
explicit MOVINET(const std::string& _onnx_path,
unsigned int _num_threads = 1);
// Override the default clip geometry (temporal x W x H x C).
explicit MOVINET(const std::string& _onnx_path,
int _temporal, int _width, int _height, int _channels = 3,
unsigned int _num_threads = 1);
// Force a specific execution provider.
explicit MOVINET(const std::string& _onnx_path,
EngineType engineType,
unsigned int _num_threads = 1);
// Engine override + custom clip geometry.
explicit MOVINET(const std::string& _onnx_path,
EngineType engineType,
int _temporal, int _width, int _height, int _channels = 3,
unsigned int _num_threads = 1);
~MOVINET() override = default;
// Classify a clip of frames; out_result is (class index, score).
void inference(const std::deque<cv::Mat>& frames,
std::pair<int, float>& out_result);
private:
// Expected clip geometry for the input tensor.
struct InputConfig {
int temporal = 16; // frames per clip
int width = 172;
int height = 172;
int channels = 3;
} input_params;
struct OutputConfig {
int num_classes = 2; // size of the model's score vector
} output_params;
std::string _MoviNetInputName; // resolved model input name
std::string _MoviNetOutputName; // resolved model output name
std::vector<float> input_tensor_values; // reusable tensor buffer
// Reads the model's I/O names into the two strings above.
void init_io_names();
// Packs the frame deque into the 5-D input tensor.
Ort::Value transform(const std::deque<cv::Mat>& frames);
// Converts raw output scores into (class index, score) — presumably
// an argmax; confirm in the .cpp.
std::pair<int, float> post_processing(const float* pOutput);
// Required by BasicOrtHandler pure virtuals
Ort::Value transform(const cv::Mat& mat) override;
Ort::Value transformBatch(const std::vector<cv::Mat>& images) override;
};
// ====================================================================
// BoundingBoxType template implementations
// ====================================================================
// Copy this box into a BoundingBoxType with different coordinate and
// score types, casting each numeric field. The label_text pointer is
// shared (not duplicated), and `flag`/`label` carry over unchanged.
template<typename T1, typename T2>
template<typename O1, typename O2>
inline ANSCENTER::types::BoundingBoxType<O1, O2>
ANSCENTER::types::BoundingBoxType<T1, T2>::convert_type() const
{
    // Both the source and destination type pairs must satisfy the
    // compile-time constraints.
    types::__assert_type<O1, O2>();
    types::__assert_type<value_type, score_type>();
    BoundingBoxType<O1, O2> converted;
    converted.label_text = label_text;
    converted.label = label;
    converted.flag = flag;
    converted.score = static_cast<O2>(score);
    converted.x1 = static_cast<O1>(x1);
    converted.y1 = static_cast<O1>(y1);
    converted.x2 = static_cast<O1>(x2);
    converted.y2 = static_cast<O1>(y2);
    return converted;
}
} // namespace ANSCENTER
#endif // ONNXEngine_H