Fix mixed UTF16 issue (LabVIEW) and fix ANSFR for Intel

2026-04-08 08:47:10 +10:00
parent 866e0282e2
commit a4a8caaa86
10 changed files with 594 additions and 38 deletions
--- a/modules/ANSUtilities/ANSUtilities.cpp
+++ b/modules/ANSUtilities/ANSUtilities.cpp
@@ -1003,6 +1003,131 @@ namespace ANSCENTER
 #endif
    }

+    std::vector<unsigned char> ANSUtilities::RepairLabVIEWUTF16LE(const unsigned char* data, int len) {
+        std::vector<unsigned char> repaired;  // result buffer; returned empty for null or non-positive-length input
+        if (!data || len <= 0) return repaired;
+        repaired.reserve(len + 32);  // capacity hint only: the lone-space branch is the only one that emits more bytes than it consumes
+        // Overview: one forward pass over `data`; each branch below consumes 1-4 input bytes and appends zero or more whole 16-bit units, so the result length is always even.
+        // Helper: append one 16-bit code unit (a BMP code point or one surrogate half) as two bytes, low byte first
+        auto emitU16 = [&](uint16_t cp) {
+            repaired.push_back(static_cast<unsigned char>(cp & 0xFF));
+            repaired.push_back(static_cast<unsigned char>((cp >> 8) & 0xFF));
+        };
+
+        for (int i = 0; i < len; ) {  // no increment clause: every branch advances i by the bytes it consumed
+            unsigned char b = data[i];
+
+            // --- 1. Detect embedded UTF-8 multi-byte sequences ---
+            // LabVIEW text controls may mix UTF-8 encoded characters into a
+            // UTF-16LE stream. A UTF-8 lead byte (C2-F4) followed by the right
+            // number of continuation bytes (80-BF) is treated as UTF-8, decoded,
+            // and re-encoded as UTF-16LE. These checks run before any UTF-16 pairing.
+            // NOTE(review): a genuine UTF-16LE unit such as bytes C5 80 (U+80C5) also matches the 2-byte pattern and is rewritten (to U+0140).
+            // 2-byte UTF-8: 110xxxxx 10xxxxxx  (U+0080 .. U+07FF)
+            if (b >= 0xC2 && b <= 0xDF && i + 1 < len) {
+                unsigned char b1 = data[i + 1];
+                if ((b1 & 0xC0) == 0x80) {
+                    uint32_t cp = ((b & 0x1F) << 6) | (b1 & 0x3F);
+                    emitU16(static_cast<uint16_t>(cp));
+                    i += 2;
+                    continue;
+                }
+            }
+            // 3-byte UTF-8: 1110xxxx 10xxxxxx 10xxxxxx  (U+0800 .. U+FFFF)
+            if (b >= 0xE0 && b <= 0xEF && i + 2 < len) {
+                unsigned char b1 = data[i + 1];
+                unsigned char b2 = data[i + 2];
+                if ((b1 & 0xC0) == 0x80 && (b2 & 0xC0) == 0x80) {
+                    uint32_t cp = ((b & 0x0F) << 12) | ((b1 & 0x3F) << 6) | (b2 & 0x3F);
+                    // Reject overlong encodings and surrogates; rejected bytes fall through to the UTF-16 handling below
+                    if (cp >= 0x0800 && (cp < 0xD800 || cp > 0xDFFF)) {
+                        emitU16(static_cast<uint16_t>(cp));
+                        i += 3;
+                        continue;
+                    }
+                }
+            }
+            // 4-byte UTF-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (U+10000 .. U+10FFFF)
+            if (b >= 0xF0 && b <= 0xF4 && i + 3 < len) {
+                unsigned char b1 = data[i + 1];
+                unsigned char b2 = data[i + 2];
+                unsigned char b3 = data[i + 3];
+                if ((b1 & 0xC0) == 0x80 && (b2 & 0xC0) == 0x80 && (b3 & 0xC0) == 0x80) {
+                    uint32_t cp = ((b & 0x07) << 18) | ((b1 & 0x3F) << 12)
+                                | ((b2 & 0x3F) << 6) | (b3 & 0x3F);
+                    if (cp >= 0x10000 && cp <= 0x10FFFF) {  // rejects overlong forms and values above the Unicode maximum
+                        // Outside the BMP: emit as a high/low surrogate pair
+                        cp -= 0x10000;
+                        emitU16(static_cast<uint16_t>(0xD800 + (cp >> 10)));
+                        emitU16(static_cast<uint16_t>(0xDC00 + (cp & 0x3FF)));
+                        i += 4;
+                        continue;
+                    }
+                }
+            }
+
+            // --- 2. Normal UTF-16LE pair (low byte + 0x00 high byte), copied through unchanged ---
+            if (i + 1 < len && data[i + 1] == 0x00) {
+                repaired.push_back(data[i]);
+                repaired.push_back(0x00);
+                i += 2;
+            }
+            // --- 3. Lone space byte — LabVIEW dropped the 0x00 high byte; restore it ---
+            else if (b == 0x20 && (i + 1 >= len || data[i + 1] != 0x00)) {  // parenthesised clause always holds here, since branch 2 did not match
+                repaired.push_back(0x20);
+                repaired.push_back(0x00);
+                i += 1;  // consume only the space; the next byte starts a fresh unit
+            }
+            // --- 4. Any other two bytes: taken as a UTF-16LE unit with non-zero high byte (e.g. ễ = C5 1E), copied as-is ---
+            else if (i + 1 < len) {
+                repaired.push_back(data[i]);
+                repaired.push_back(data[i + 1]);
+                i += 2;
+            }
+            // --- 5. Single leftover byte at end of input (not a space) — dropped ---
+            else {
+                i++;
+            }
+        }
+        return repaired;
+    }
+
+    bool ANSUtilities::IsValidUTF8(const unsigned char* data, int len) {  // true only for well-formed UTF-8 that contains at least one multi-byte sequence
+        if (!data || len <= 0) return false;  // null or empty input is reported as "not UTF-8"
+        bool hasMultiByte = false;  // set once any 2-, 3- or 4-byte sequence validates
+        for (int i = 0; i < len; ) {  // no increment clause: each branch advances i by the sequence length
+            unsigned char b = data[i];
+            if (b <= 0x7F) {
+                // ASCII — valid, but alone doesn't prove UTF-8
+                i++;
+            } else if (b >= 0xC2 && b <= 0xDF) {
+                // 2-byte sequence; lead bytes C0/C1 (always overlong) are excluded by the range test above
+                if (i + 1 >= len || (data[i + 1] & 0xC0) != 0x80) return false;
+                hasMultiByte = true;
+                i += 2;
+            } else if (b >= 0xE0 && b <= 0xEF) {
+                // 3-byte sequence: needs two continuation bytes, then the decoded value is range-checked
+                if (i + 2 >= len || (data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80) return false;
+                uint32_t cp = ((b & 0x0F) << 12) | ((data[i + 1] & 0x3F) << 6) | (data[i + 2] & 0x3F);
+                if (cp < 0x0800 || (cp >= 0xD800 && cp <= 0xDFFF)) return false; // overlong or surrogate
+                hasMultiByte = true;
+                i += 3;
+            } else if (b >= 0xF0 && b <= 0xF4) {
+                // 4-byte sequence: needs three continuation bytes; decoded value must lie in U+10000..U+10FFFF
+                if (i + 3 >= len || (data[i + 1] & 0xC0) != 0x80 || (data[i + 2] & 0xC0) != 0x80 || (data[i + 3] & 0xC0) != 0x80) return false;
+                uint32_t cp = ((b & 0x07) << 18) | ((data[i + 1] & 0x3F) << 12) | ((data[i + 2] & 0x3F) << 6) | (data[i + 3] & 0x3F);
+                if (cp < 0x10000 || cp > 0x10FFFF) return false;
+                hasMultiByte = true;
+                i += 4;
+            } else {
+                return false; // stray continuation byte (80-BF) or invalid lead byte (C0, C1, F5-FF)
+            }
+        }
+        // Only confirm UTF-8 if we found at least one multi-byte sequence.
+        // Pure ASCII is ambiguous (returns false here) — let the caller decide.
+        return hasMultiByte;
+    }
+
    std::string ANSUtilities::ConvertUTF16LEToUnicodeEscapes(const char* utf16leBytes, int byteLen) {
        if (!utf16leBytes || byteLen <= 0) return "";
        int offset = 0;
@@ -1013,13 +1138,18 @@ namespace ANSCENTER
            offset = 2;
        }
        int remaining = byteLen - offset;
-        if (remaining <= 0 || remaining % 2 != 0) return "";
+        if (remaining <= 0) return "";
+        // Drop trailing odd byte if present (e.g. null terminator appended by LabVIEW)
+        if (remaining % 2 != 0) remaining--;

+        int endPos = offset + remaining;  // safe end position (even-aligned)
        std::string result;
        result.reserve(remaining * 3);
-        for (int i = offset; i + 1 < byteLen; i += 2) {
+        for (int i = offset; i + 1 < endPos; i += 2) {
            uint16_t codepoint = static_cast<unsigned char>(utf16leBytes[i])
                               | (static_cast<unsigned char>(utf16leBytes[i + 1]) << 8);
+            // Pass through printable ASCII including space (0x20-0x7E)
+            // Escape control characters and non-ASCII as \uXXXX
            if (codepoint >= 0x20 && codepoint <= 0x7E) {
                result += static_cast<char>(codepoint);
            } else {
@@ -1038,10 +1168,16 @@ namespace ANSCENTER
        size_t i = 0;
        while (i < utf8Str.size()) {
            unsigned char c = static_cast<unsigned char>(utf8Str[i]);
-            if (c <= 0x7F) {
-                // ASCII byte -- pass through as-is (including \r, \n, \t, space, etc.)
+            if (c >= 0x20 && c <= 0x7E) {
+                // Printable ASCII including space (0x20-0x7E) — pass through as-is
                result += utf8Str[i];
                i++;
+            } else if (c <= 0x7F) {
+                // Control chars (0x00-0x1F), DEL (0x7F) — escape as \uXXXX
+                char buf[7];
+                snprintf(buf, sizeof(buf), "\\u%04x", c);
+                result += buf;
+                i++;
            } else {
                // Multi-byte UTF-8 sequence -- decode to Unicode codepoint
                uint32_t codepoint = 0;