Support UTF8 to UTF16 LE

2026-03-31 14:10:21 +11:00
parent 0c24096c80
commit 70be68d0fc
14 changed files with 790 additions and 11 deletions
--- a/modules/ANSOCR/dllmain.cpp
+++ b/modules/ANSOCR/dllmain.cpp
@@ -422,6 +422,123 @@ extern "C" ANSOCR_API int SetANSOCRALPRFormat(ANSCENTER::ANSOCRBase** Handle, co
 	}
 }

+// Unicode conversion utilities for LabVIEW wrapper classes
+
+// Returns the value (0-15) of hexadecimal digit `c`, or -1 if `c` is not a hex digit.
+static int ansocrHexDigitValue(char c) {
+	if (c >= '0' && c <= '9') return c - '0';
+	if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+	if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+	return -1;
+}
+
+// Appends one 16-bit code unit to `out` in little-endian byte order.
+static void ansocrAppendUtf16Unit(std::string& out, uint16_t unit) {
+	out += static_cast<char>(unit & 0xFF);
+	out += static_cast<char>((unit >> 8) & 0xFF);
+}
+
+// Parses a `\uXXXX` escape at the start of `s`, where `remaining` bytes are readable.
+// Returns true and stores the code unit only when all four digits are hexadecimal;
+// otherwise returns false so the caller treats the backslash as ordinary text.
+static bool ansocrParseUnicodeEscape(const char* s, int remaining, uint16_t& unit) {
+	if (remaining < 6 || s[0] != '\\' || s[1] != 'u') return false;
+	unsigned value = 0;
+	for (int k = 2; k < 6; ++k) {
+		const int digit = ansocrHexDigitValue(s[k]);
+		if (digit < 0) return false;
+		value = (value << 4) | static_cast<unsigned>(digit);
+	}
+	unit = static_cast<uint16_t>(value);
+	return true;
+}
+
+// Decodes one UTF-8 sequence at the start of `s`, where `remaining` (>= 1) bytes are readable.
+// Stores the code point in `cp` and returns the number of bytes consumed.
+// A malformed, truncated, overlong, surrogate or out-of-range sequence yields
+// U+FFFD and consumes a single byte so decoding can resynchronise.
+static int ansocrDecodeUtf8(const char* s, int remaining, uint32_t& cp) {
+	const unsigned char lead = static_cast<unsigned char>(s[0]);
+	int extra = 0;
+	uint32_t minCp = 0;
+	if (lead < 0x80) { cp = lead; return 1; }
+	else if ((lead & 0xE0) == 0xC0) { extra = 1; cp = lead & 0x1F; minCp = 0x80; }
+	else if ((lead & 0xF0) == 0xE0) { extra = 2; cp = lead & 0x0F; minCp = 0x800; }
+	else if ((lead & 0xF8) == 0xF0) { extra = 3; cp = lead & 0x07; minCp = 0x10000; }
+	else { cp = 0xFFFD; return 1; }
+	if (extra >= remaining) { cp = 0xFFFD; return 1; }
+	for (int k = 1; k <= extra; ++k) {
+		const unsigned char cont = static_cast<unsigned char>(s[k]);
+		if ((cont & 0xC0) != 0x80) { cp = 0xFFFD; return 1; }
+		cp = (cp << 6) | (cont & 0x3F);
+	}
+	if (cp < minCp || cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) { cp = 0xFFFD; return 1; }
+	return extra + 1;
+}
+
+// Converts a NUL-terminated input string to UTF-16LE bytes stored in `result`.
+// Handles both:
+// - JSON-style Unicode escapes (\uXXXX), each emitted as one UTF-16 code unit
+//   (so an escaped surrogate pair stays a surrogate pair)
+// - Raw UTF-8 encoded text, including when it is mixed with escapes
+// Pure ASCII input without any "\u" is copied through unchanged (it is NOT
+// widened to UTF-16), so callers must be prepared for either form.
+// A "\u" that is not followed by four hex digits is kept as literal text.
+//
+// Returns:
+//    1  success, `result` holds the output bytes
+//    0  empty input or the conversion produced nothing (`result` untouched)
+//   -1  null argument, oversized input, or an exception was caught
+//   -2  DSSetHandleSize failed
+extern "C" ANSOCR_API int ANSOCR_ConvertUTF8ToUTF16LE(const char* utf8Str, LStrHandle result) {
+	try {
+		if (!utf8Str || !result) return -1;
+		const size_t rawLen = strlen(utf8Str);
+		if (rawLen == 0) return 0;
+		// The output may be twice the input size and must still fit the int32 LStr count.
+		if (rawLen > 0x3FFFFFFF) return -1;
+		const int len = static_cast<int>(rawLen);
+
+		// Check if input contains \u sequences or non-ASCII bytes
+		bool hasUnicodeEscapes = false;
+		bool hasNonAscii = false;
+		for (int i = 0; i < len; i++) {
+			if ((unsigned char)utf8Str[i] >= 0x80) hasNonAscii = true;
+			if (i + 1 < len && utf8Str[i] == '\\' && utf8Str[i + 1] == 'u') hasUnicodeEscapes = true;
+		}
+
+		// Pure ASCII with no escapes — pass through directly
+		if (!hasNonAscii && !hasUnicodeEscapes) {
+			MgErr error = DSSetHandleSize(result, sizeof(int32) + len * sizeof(uChar));
+			if (error != noErr) return -2;
+			(*result)->cnt = len;
+			memcpy((*result)->str, utf8Str, len);
+			return 1;
+		}
+
+#ifdef _WIN32
+		// Raw UTF-8 without escapes — convert via Windows API
+		if (!hasUnicodeEscapes) {
+			const int wideLen = MultiByteToWideChar(CP_UTF8, 0, utf8Str, len, nullptr, 0);
+			if (wideLen <= 0) return 0;
+			std::wstring wideStr(wideLen, 0);
+			const int written = MultiByteToWideChar(CP_UTF8, 0, utf8Str, len, &wideStr[0], wideLen);
+			if (written <= 0) return 0;
+			const int size = written * (int)sizeof(wchar_t);
+			MgErr error = DSSetHandleSize(result, sizeof(int32) + size * sizeof(uChar));
+			if (error != noErr) return -2;
+			(*result)->cnt = size;
+			memcpy((*result)->str, wideStr.data(), size);
+			return 1;
+		}
+#endif
+
+		// Escapes present (or no Windows API available): decode \uXXXX escapes and
+		// UTF-8 sequences in a single pass, emitting UTF-16LE bytes.
+		std::string utf16le;
+		utf16le.reserve(static_cast<size_t>(len) * 2);
+		for (int i = 0; i < len; ) {
+			uint16_t unit = 0;
+			if (ansocrParseUnicodeEscape(utf8Str + i, len - i, unit)) {
+				ansocrAppendUtf16Unit(utf16le, unit);
+				i += 6;
+				continue;
+			}
+			uint32_t cp = 0;
+			i += ansocrDecodeUtf8(utf8Str + i, len - i, cp);
+			if (cp >= 0x10000) {
+				// Outside the BMP: encode as a high/low surrogate pair.
+				cp -= 0x10000;
+				ansocrAppendUtf16Unit(utf16le, static_cast<uint16_t>(0xD800 | (cp >> 10)));
+				ansocrAppendUtf16Unit(utf16le, static_cast<uint16_t>(0xDC00 | (cp & 0x3FF)));
+			} else {
+				ansocrAppendUtf16Unit(utf16le, static_cast<uint16_t>(cp));
+			}
+		}
+		const int size = (int)utf16le.size();
+		MgErr error = DSSetHandleSize(result, sizeof(int32) + size * sizeof(uChar));
+		if (error != noErr) return -2;
+		(*result)->cnt = size;
+		memcpy((*result)->str, utf16le.data(), size);
+		return 1;
+	}
+	catch (...) { return -1; }
+}
+
+// Converts `byteLen` bytes of UTF-16LE text into UTF-8 stored in `result`.
+//
+// Fast path: when `byteLen` is even and every code unit is below U+0080
+// (low byte < 0x80, high byte == 0x00) the low bytes are copied directly.
+// Code units U+0080..U+00FF must NOT take this path: their low byte alone is
+// not valid UTF-8, so they go through the full conversion instead.
+//
+// Otherwise the conversion uses WideCharToMultiByte (Windows only). If
+// `byteLen` is odd, the trailing byte is ignored because only whole 16-bit
+// units are converted.
+//
+// Returns:
+//    1  success, `result` holds the UTF-8 bytes
+//    0  conversion produced nothing, or no converter is available on this platform
+//   -1  null argument, non-positive `byteLen`, or an exception was caught
+//   -2  DSSetHandleSize failed
+extern "C" ANSOCR_API int ANSOCR_ConvertUTF16LEToUTF8(const unsigned char* utf16leBytes, int byteLen, LStrHandle result) {
+	try {
+		if (!utf16leBytes || byteLen <= 0 || !result) return -1;
+
+		// Only an even byte count can consist purely of 16-bit ASCII code units.
+		bool isAsciiUtf16 = (byteLen % 2 == 0);
+		for (int i = 0; isAsciiUtf16 && i < byteLen; i += 2) {
+			if (utf16leBytes[i] >= 0x80 || utf16leBytes[i + 1] != 0x00) isAsciiUtf16 = false;
+		}
+		if (isAsciiUtf16) {
+			// Extract just the low bytes (ASCII characters)
+			const int asciiLen = byteLen / 2;
+			MgErr error = DSSetHandleSize(result, sizeof(int32) + asciiLen * sizeof(uChar));
+			if (error != noErr) return -2;
+			(*result)->cnt = asciiLen;
+			for (int i = 0; i < asciiLen; i++) {
+				(*result)->str[i] = utf16leBytes[i * 2];
+			}
+			return 1;
+		}
+#ifdef _WIN32
+		const int wideLen = byteLen / (int)sizeof(wchar_t);
+		if (wideLen <= 0) return 0;
+		// Copy into a wchar_t buffer rather than casting the byte pointer, since
+		// the caller's buffer is not guaranteed to be aligned for wchar_t.
+		std::wstring wideStr(wideLen, 0);
+		memcpy(&wideStr[0], utf16leBytes, static_cast<size_t>(wideLen) * sizeof(wchar_t));
+		const int utf8Len = WideCharToMultiByte(CP_UTF8, 0, wideStr.data(), wideLen, nullptr, 0, nullptr, nullptr);
+		if (utf8Len <= 0) return 0;
+		std::string utf8Str(utf8Len, 0);
+		const int written = WideCharToMultiByte(CP_UTF8, 0, wideStr.data(), wideLen, &utf8Str[0], utf8Len, nullptr, nullptr);
+		if (written <= 0) return 0;
+		MgErr error = DSSetHandleSize(result, sizeof(int32) + written * sizeof(uChar));
+		if (error != noErr) return -2;
+		(*result)->cnt = written;
+		memcpy((*result)->str, utf8Str.data(), written);
+		return 1;
+#else
+		return 0;
+#endif
+	}
+	catch (...) { return -1; }
+}
+
 extern "C" ANSOCR_API std::string  RunInferenceImagePath(ANSCENTER::ANSOCRBase** Handle, const char* imageFilePath) {
 	if (!Handle || !*Handle) return "";
 	OCRHandleGuard guard(AcquireOCRHandle(*Handle));