ANSCORE/modules/ANSOCR/ANSOdOCR.cpp

#include "ANSOdOCR.h"
#include "Utility.h"
#include <opencv2/highgui.hpp>
#include <omp.h>
#include "ANSYOLOOD.h"
#include "ANSTENSORRTOD.h"
namespace ANSCENTER {
	bool ANSODOCR::Initialize(std::string licenseKey, OCRModelConfig modelConfig,
		const std::string& modelZipFilePath, const std::string& modelZipPassword, int engineMode) {
		try
		{
			bool result = ANSOCRBase::Init(licenseKey, modelConfig, modelZipFilePath, modelZipPassword, engineMode);
			if (!result) return false;

			// Check if detector and ocr model files exist
			_modelConfig.detectionModelFile = CreateFilePath(_modelFolder, "lpd.onnx");
			_modelConfig.recognizerModelFile = CreateFilePath(_modelFolder, "ocr.onnx");

			if (!FileExist(_modelConfig.detectionModelFile)) {
				this->_logger.LogFatal("ANSODOCR::Initialize", "Invalid detector model file", __FILE__, __LINE__);
				_licenseValid = false;
				return false;
			}

			if (!FileExist(_modelConfig.recognizerModelFile)) {
				this->_logger.LogFatal("ANSODOCR::Initialize", "Invalid OCR recognizer model file", __FILE__, __LINE__);
				_licenseValid = false;
				return false;
			}
			try {

				// Check the hardware type
				engineType = ANSCENTER::ANSLicenseHelper::CheckHardwareInformation();//  EngineType::CPU;//
				if (engineType == ANSCENTER::EngineType::NVIDIA_GPU) {
					// Use TensorRT YoloV11
					this->_lpDetector = std::make_unique <ANSCENTER::TENSORRTOD>();
					this->_ocrDetector = std::make_unique<ANSCENTER::TENSORRTOD>();

				}
				else {
					// Use ONNX YoloV11
					this->_lpDetector = std::make_unique <ANSCENTER::YOLOOD>();
					this->_ocrDetector = std::make_unique<ANSCENTER::YOLOOD>();
				}

				// Run initialization
				_ocrModelConfig.detectionScoreThreshold = modelConfig.clsThreshold;
				_lpdmodelConfig.detectionScoreThreshold = modelConfig.detectionBoxThreshold;
				if (_lpdmodelConfig.detectionScoreThreshold < 0.25)_lpdmodelConfig.detectionScoreThreshold = 0.25;
				if (_ocrModelConfig.detectionScoreThreshold < 0.25)_ocrModelConfig.detectionScoreThreshold = 0.25;
				_lpdmodelConfig.modelConfThreshold = 0.5;
				_lpdmodelConfig.modelMNSThreshold = 0.5;
				_ocrModelConfig.modelConfThreshold = 0.5;
				_ocrModelConfig.modelMNSThreshold = 0.5;

				if (!this->_lpDetector->LoadModelFromFolder("", _lpdmodelConfig, "lpd", "lpd.names", _modelFolder, _lpdLabels)) {
					return false;
				}

				if (!this->_ocrDetector->LoadModelFromFolder("", _ocrModelConfig, "ocr", "ocr.names", _modelFolder, _ocrLabels)) {
					return false;
				}
				return _isInitialized;
			}
			catch (...) {
				_licenseValid = false;
				this->_logger.LogFatal("ANSODOCR::Initialize", "Failed to create OCR objects", __FILE__, __LINE__);
				return false;
			}
		}
		catch (std::exception& e) {
			// Handle any other exception that occurs during initialization
			this->_logger.LogFatal("ANSODOCR::Initialize", e.what(), __FILE__, __LINE__);
			_licenseValid = false;
			return false;
		}
	}

	std::vector<ANSCENTER::OCRObject> ANSODOCR::RunInference(const cv::Mat& input) {
		std::vector<ANSCENTER::OCRObject> output;
		if (input.empty()) return output;
		if ((input.cols < 10) || (input.rows < 10)) return output;
		return RunInference(input, "OCRCPUCAM");
	}

	std::vector<ANSCENTER::OCRObject> ANSODOCR::RunInference(const cv::Mat& input, std::string cameraId) {
		std::lock_guard<std::mutex> lock(_mutex);
		std::vector<ANSCENTER::OCRObject> OCRObjects;
		OCRObjects.clear();

		if (!_licenseValid) {
			this->_logger.LogError("ANSODOCR::RunInference", "Invalid License", __FILE__, __LINE__);
			return OCRObjects;
		}
		if (!_isInitialized) {
			this->_logger.LogError("ANSODOCR::RunInference", "Model is not initialized", __FILE__, __LINE__);
			return OCRObjects;
		}
		if (input.empty() || input.cols < 10 || input.rows < 10) {
			this->_logger.LogError("ANSODOCR::RunInference", "Input image is invalid or too small", __FILE__, __LINE__);
			return OCRObjects;
		}

		try {
			// Convert grayscale to BGR if necessary
			cv::Mat im;
			if (input.channels() == 1) {
				cv::cvtColor(input, im, cv::COLOR_GRAY2BGR);
			}
			else {
				im = input.clone();
			}

			// Check ppocr instance
			if (!this->_ocrDetector) {
				this->_logger.LogFatal("ANSODOCR::RunInference", "PPOCR instance is null", __FILE__, __LINE__);
				return OCRObjects;
			}

			std::vector<PaddleOCR::OCRPredictResult> res_ocr = ppocr->ocr(im);

			for (size_t n = 0; n < res_ocr.size(); ++n) {
				if (res_ocr[n].box.size() != 4) {
					this->_logger.LogError("ANSODOCR::RunInference", "Invalid OCR box size", __FILE__, __LINE__);
					continue;
				}

				cv::Point rook_points[4];
				for (size_t m = 0; m < 4; ++m) {
					rook_points[m] = cv::Point(
						static_cast<int>(res_ocr[n].box[m][0]),
						static_cast<int>(res_ocr[n].box[m][1])
					);
				}

				int x = std::max(0, rook_points[0].x);
				int y = std::max(0, rook_points[0].y);
				int width = rook_points[1].x - rook_points[0].x;
				int height = rook_points[2].y - rook_points[1].y;

				// Clamp width and height
				width = std::max(1, std::min(im.cols - x, width));
				height = std::max(1, std::min(im.rows - y, height));

				// Skip invalid boxes
				if (width <= 1 || height <= 1) {
					this->_logger.LogError("ANSODOCR::RunInference", "Invalid bounding box dimension", __FILE__, __LINE__);
					continue;
				}

				ANSCENTER::OCRObject ocrObject;
				ocrObject.box = cv::Rect(x, y, width, height);
				ocrObject.classId = res_ocr[n].cls_label;
				ocrObject.confidence = res_ocr[n].score;
				ocrObject.className = res_ocr[n].text;
				ocrObject.extraInfo = "cls label: " + std::to_string(res_ocr[n].cls_label)
					+ "; cls score: " + std::to_string(res_ocr[n].cls_score);
				ocrObject.cameraId = cameraId;

				OCRObjects.push_back(ocrObject);
			}

			im.release();
		}
		catch (const std::exception& e) {
			this->_logger.LogFatal("ANSODOCR::RunInference", e.what(), __FILE__, __LINE__);
		}
		catch (...) {
			this->_logger.LogFatal("ANSODOCR::RunInference", "Unknown exception occurred", __FILE__, __LINE__);
		}

		return OCRObjects;
	}


	std::vector<ANSCENTER::OCRObject> ANSODOCR::RunInference(const cv::Mat& input, std::vector<cv::Rect> Bbox) {
		std::lock_guard<std::mutex> lock(_mutex);
		std::vector<ANSCENTER::OCRObject> OCRObjects;
		OCRObjects.clear();
		if (!_licenseValid) {
			this->_logger.LogError("ANSODOCR::RunInference", "Invalid License", __FILE__, __LINE__);
			return OCRObjects;
		}
		if (!_isInitialized) {
			this->_logger.LogError("ANSODOCR::RunInference", "Model is not initialized", __FILE__, __LINE__);
			return OCRObjects;
		}
		try {
			if (input.empty()) {
				this->_logger.LogError("ANSODOCR::RunInference", "Input image is empty", __FILE__, __LINE__);
				return OCRObjects;
			}
			if ((input.cols < 10) || (input.rows < 10)) return OCRObjects;
			if (Bbox.size() > 0) {
				// Convert grayscale images to 3-channel BGR if needed
				cv::Mat frame;
				if (input.channels() == 1) {
					cv::cvtColor(input, frame, cv::COLOR_GRAY2BGR);
				}
				else {
					frame = input.clone();
				}
				int fWidth = frame.cols;
				int fHeight = frame.rows;
				for (std::vector<cv::Rect>::iterator it = Bbox.begin(); it != Bbox.end(); it++) {
					int x1, y1, x2, y2, width, height;
					x1 = (*it).x;
					y1 = (*it).y;
					x2 = (*it).x + (*it).width;
					y2 = (*it).y + (*it).height;
					x1 = std::max(0, x1);
					y1 = std::max(0, y1);
					width = std::min(fWidth - x1, (*it).width);
					height = std::min(fHeight - y1, (*it).height);
					if ((x1 >= 0) && (y1 >= 0) && (width >= 5) && (height >= 5))
					{
						// Get cropped objects
						cv::Rect objectPos(x1, y1, width, height);
						cv::Mat croppedObject = frame(objectPos);
						std::vector<ANSCENTER::OCRObject> OCRTempObjects;
						OCRTempObjects.clear();
						OCRTempObjects = RunInference(croppedObject);
						if (OCRTempObjects.size() > 0) {
							for (int i = 0; i < OCRTempObjects.size(); i++) {
								ANSCENTER::OCRObject detectionObject;
								detectionObject = OCRTempObjects[i];
								// Correct bounding box position as the croppedObject x,y will be orignial (0,0)
								detectionObject.box.x = OCRTempObjects[i].box.x + x1;
								detectionObject.box.y = OCRTempObjects[i].box.y + y1;
								detectionObject.box.width = OCRTempObjects[i].box.width;
								detectionObject.box.height = OCRTempObjects[i].box.height;

								detectionObject.box.x = std::max(0, detectionObject.box.x);
								detectionObject.box.y = std::max(0, detectionObject.box.y);
								detectionObject.box.width = std::min(fWidth - detectionObject.box.x, detectionObject.box.width);
								detectionObject.box.height = std::min(fHeight - detectionObject.box.y, detectionObject.box.height);

								OCRObjects.push_back(detectionObject);
							}
						}
					}
				}
			}
			else {
				// Convert grayscale images to 3-channel BGR if needed
				cv::Mat frame;
				if (input.channels() == 1) {
					cv::cvtColor(input, frame, cv::COLOR_GRAY2BGR);
				}
				else {
					frame = input.clone();
				}
				std::vector<PaddleOCR::OCRPredictResult>  res_ocr = ppocr->ocr(frame);
				if (res_ocr.size() > 0) {
					for (int n = 0; n < res_ocr.size(); n++) { // number of detections
						cv::Point rook_points[4];
						for (int m = 0; m < res_ocr[n].box.size(); m++) {
							rook_points[m] =
								cv::Point(int(res_ocr[n].box[m][0]), int(res_ocr[n].box[m][1]));
						}
						ANSCENTER::OCRObject ocrObject;
						ocrObject.box.x = rook_points[0].x;
						ocrObject.box.y = rook_points[0].y;
						ocrObject.box.width = rook_points[1].x - rook_points[0].x;
						ocrObject.box.height = rook_points[2].y - rook_points[1].y;

						ocrObject.box.x = std::max(0, ocrObject.box.x);
						ocrObject.box.y = std::max(0, ocrObject.box.y);
						ocrObject.box.width = std::min(frame.cols - ocrObject.box.x, ocrObject.box.width);
						ocrObject.box.height = std::min(frame.rows - ocrObject.box.y, ocrObject.box.height);

						ocrObject.classId = res_ocr[n].cls_label;
						ocrObject.confidence = res_ocr[n].score;
						ocrObject.className = res_ocr[n].text;
						std::string extraInformation = "cls label:" +
							std::to_string(res_ocr[n].cls_label) +
							";" +
							"cls score:" + std::to_string(res_ocr[n].cls_score);
						ocrObject.extraInfo = extraInformation;
						// Add extra information for cls score  cls label
						OCRObjects.push_back(ocrObject);
					}
				}
				frame.release();
				return OCRObjects;
			}
			return OCRObjects;
		}
		catch (std::exception& e) {
			this->_logger.LogFatal("ANSODOCR::RunInference", e.what(), __FILE__, __LINE__);
			return OCRObjects;
		}
	}

	std::vector<ANSCENTER::OCRObject> ANSODOCR::RunInference(const cv::Mat& input, std::vector<cv::Rect> Bbox, std::string cameraId) {
		std::lock_guard<std::mutex> lock(_mutex);
		std::vector<ANSCENTER::OCRObject> OCRObjects;
		OCRObjects.clear();
		if (!_licenseValid) {
			this->_logger.LogError("ANSODOCR::RunInference", "Invalid License", __FILE__, __LINE__);
			return OCRObjects;
		}
		if (!_isInitialized) {
			this->_logger.LogError("ANSODOCR::RunInference", "Model is not initialized", __FILE__, __LINE__);
			return OCRObjects;
		}
		try {
			if (input.empty()) {
				this->_logger.LogError("ANSODOCR::RunInference", "Input image is empty", __FILE__, __LINE__);
				return OCRObjects;
			}
			if ((input.cols < 10) || (input.rows < 10)) return OCRObjects;
			if (Bbox.size() > 0) {
				// Convert grayscale images to 3-channel BGR if needed
				cv::Mat frame;
				if (input.channels() == 1) {
					cv::cvtColor(input, frame, cv::COLOR_GRAY2BGR);
				}
				else {
					frame = input.clone();
				}
				int fWidth = frame.cols;
				int fHeight = frame.rows;
				for (std::vector<cv::Rect>::iterator it = Bbox.begin(); it != Bbox.end(); it++) {
					int x1, y1, x2, y2, width, height;
					x1 = (*it).x;
					y1 = (*it).y;
					x2 = (*it).x + (*it).width;
					y2 = (*it).y + (*it).height;
					x1 = std::max(0, x1);
					y1 = std::max(0, y1);
					width = std::min(fWidth - x1, (*it).width);
					height = std::min(fHeight - y1, (*it).height);
					if ((x1 >= 0) && (y1 >= 0) && (width >= 5) && (height >= 5)) {
						// Get cropped objects
						cv::Rect objectPos(x1, y1, width, height);
						cv::Mat croppedObject = frame(objectPos);
						std::vector<ANSCENTER::OCRObject> OCRTempObjects;
						OCRTempObjects.clear();
						OCRTempObjects = RunInference(croppedObject);
						if (OCRTempObjects.size() > 0) {
							for (int i = 0; i < OCRTempObjects.size(); i++) {
								ANSCENTER::OCRObject detectionObject;
								detectionObject = OCRTempObjects[i];
								// Correct bounding box position as the croppedObject x,y will be orignial (0,0)
								detectionObject.box.x = OCRTempObjects[i].box.x + x1;
								detectionObject.box.y = OCRTempObjects[i].box.y + y1;
								detectionObject.box.width = OCRTempObjects[i].box.width;
								detectionObject.box.height = OCRTempObjects[i].box.height;
								detectionObject.box.x = std::max(0, detectionObject.box.x);
								detectionObject.box.y = std::max(0, detectionObject.box.y);
								detectionObject.box.width = std::min(fWidth - detectionObject.box.x, detectionObject.box.width);
								detectionObject.box.height = std::min(fHeight - detectionObject.box.y, detectionObject.box.height);
								detectionObject.cameraId = cameraId;
								OCRObjects.push_back(detectionObject);
							}
						}
					}
				}
			}
			else {
				auto im = input.clone();
				std::vector<PaddleOCR::OCRPredictResult>  res_ocr = ppocr->ocr(im);
				if (res_ocr.size() > 0) {
					for (int n = 0; n < res_ocr.size(); n++) { // number of detections
						cv::Point rook_points[4];
						for (int m = 0; m < res_ocr[n].box.size(); m++) {
							rook_points[m] =
								cv::Point(int(res_ocr[n].box[m][0]), int(res_ocr[n].box[m][1]));
						}
						ANSCENTER::OCRObject ocrObject;
						ocrObject.box.x = rook_points[0].x;
						ocrObject.box.y = rook_points[0].y;
						ocrObject.box.width = rook_points[1].x - rook_points[0].x;
						ocrObject.box.height = rook_points[2].y - rook_points[1].y;
						ocrObject.box.x = std::max(0, ocrObject.box.x);
						ocrObject.box.y = std::max(0, ocrObject.box.y);
						ocrObject.box.width = std::min(im.cols - ocrObject.box.x, ocrObject.box.width);
						ocrObject.box.height = std::min(im.rows - ocrObject.box.y, ocrObject.box.height);
						ocrObject.classId = res_ocr[n].cls_label;
						ocrObject.confidence = res_ocr[n].score;
						ocrObject.className = res_ocr[n].text;
						std::string extraInformation = "cls label:" +
							std::to_string(res_ocr[n].cls_label) +
							";" +
							"cls score:" + std::to_string(res_ocr[n].cls_score);
						ocrObject.extraInfo = extraInformation;
						ocrObject.cameraId = cameraId;
						// Add extra information for cls score  cls label
						OCRObjects.push_back(ocrObject);
					}
				}
				im.release();
				return OCRObjects;
			}
			return OCRObjects;
		}
		catch (std::exception& e) {
			this->_logger.LogFatal("ANSODOCR::RunInference", e.what(), __FILE__, __LINE__);
			return OCRObjects;
		}
	}
	ANSODOCR::~ANSODOCR() {
		try {
			Destroy();
		}
		catch (std::exception& e) {
			this->_logger.LogFatal("ANSODOCR::~ANSODOCR()", e.what(), __FILE__, __LINE__);
		}
		this->ANSOCRBase::~ANSOCRBase();
	}
	bool ANSODOCR::Destroy() {
		try {
			if (this->_ocrDetector) this->_ocrDetector.reset();
			if (this->_lpDetector) this->_lpDetector.reset();
			return true;
		}
		catch (std::exception& e) {
			this->_logger.LogFatal("ANSODOCR::Destroy", e.what(), __FILE__, __LINE__);
			return false;
		}
	}
}