Fix mixed UTF16 issue (LabVIEW) and fix ANSFR for Intel

This commit is contained in:
2026-04-08 08:47:10 +10:00
parent 866e0282e2
commit a4a8caaa86
10 changed files with 594 additions and 38 deletions

View File

@@ -1003,6 +1003,131 @@ namespace ANSCENTER
#endif
}
/// Rebuilds a clean UTF-16LE byte stream from a buffer that is mostly
/// UTF-16LE but may contain embedded UTF-8 sequences or spaces that lost
/// their 0x00 high byte.
///
/// @param data  Input bytes; may be null.
/// @param len   Number of bytes available at @p data.
/// @return      Repaired UTF-16LE bytes (no BOM is added). Empty when
///              @p data is null or @p len is not positive.
///
/// The scan is byte-oriented and heuristic. At every position the rules are
/// tried in this order:
///   1. A structurally valid UTF-8 multi-byte sequence is decoded and
///      re-encoded as UTF-16LE (a surrogate pair for U+10000 and above).
///   2. A lone 0x20 that is not followed by 0x00 becomes the pair 20 00 and
///      only one input byte is consumed.
///   3. Any other two bytes are copied through unchanged as one code unit.
///   4. A final unpaired byte is dropped (unless it is 0x20, see rule 2).
///
/// NOTE(review): because rule 1 runs first, a genuine UTF-16LE code unit
/// whose two bytes happen to look like a UTF-8 lead + continuation pair is
/// interpreted as UTF-8.
std::vector<unsigned char> ANSUtilities::RepairLabVIEWUTF16LE(const unsigned char* data, int len) {
    std::vector<unsigned char> out;
    if (data == nullptr || len <= 0) {
        return out;
    }
    out.reserve(len + 32);

    // Appends one 16-bit code unit in little-endian byte order.
    const auto appendUnit = [&out](uint16_t unit) {
        out.push_back(static_cast<unsigned char>(unit & 0xFF));
        out.push_back(static_cast<unsigned char>((unit >> 8) & 0xFF));
    };
    // True for UTF-8 continuation bytes (10xxxxxx).
    const auto isTail = [](unsigned char byte) { return (byte & 0xC0) == 0x80; };

    int pos = 0;
    while (pos < len) {
        const unsigned char lead = data[pos];
        const int avail = len - pos;  // bytes left, including `lead`

        // ---- Rule 1: embedded UTF-8 multi-byte sequence ----
        if (lead >= 0xC2 && lead <= 0xDF) {
            // 110xxxxx 10xxxxxx  (U+0080 .. U+07FF)
            if (avail >= 2 && isTail(data[pos + 1])) {
                const uint32_t cp = ((lead & 0x1F) << 6) | (data[pos + 1] & 0x3F);
                appendUnit(static_cast<uint16_t>(cp));
                pos += 2;
                continue;
            }
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            // 1110xxxx 10xxxxxx 10xxxxxx  (U+0800 .. U+FFFF)
            if (avail >= 3 && isTail(data[pos + 1]) && isTail(data[pos + 2])) {
                const uint32_t cp = ((lead & 0x0F) << 12)
                                  | ((data[pos + 1] & 0x3F) << 6)
                                  | (data[pos + 2] & 0x3F);
                const bool overlong = cp < 0x0800;
                const bool surrogate = cp >= 0xD800 && cp <= 0xDFFF;
                if (!overlong && !surrogate) {
                    appendUnit(static_cast<uint16_t>(cp));
                    pos += 3;
                    continue;
                }
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (U+10000 .. U+10FFFF)
            if (avail >= 4 && isTail(data[pos + 1]) && isTail(data[pos + 2])
                && isTail(data[pos + 3])) {
                const uint32_t cp = ((lead & 0x07) << 18)
                                  | ((data[pos + 1] & 0x3F) << 12)
                                  | ((data[pos + 2] & 0x3F) << 6)
                                  | (data[pos + 3] & 0x3F);
                if (cp >= 0x10000 && cp <= 0x10FFFF) {
                    // Outside the BMP: split into a high/low surrogate pair.
                    const uint32_t offset = cp - 0x10000;
                    appendUnit(static_cast<uint16_t>(0xD800 + (offset >> 10)));
                    appendUnit(static_cast<uint16_t>(0xDC00 + (offset & 0x3FF)));
                    pos += 4;
                    continue;
                }
            }
        }

        // ---- Rules 2-4: treat the input as UTF-16LE ----
        if (avail >= 2) {
            const unsigned char next = data[pos + 1];
            if (lead == 0x20 && next != 0x00) {
                // Space whose 0x00 high byte is missing: restore it and
                // re-examine `next` on the following iteration.
                out.push_back(0x20);
                out.push_back(0x00);
                pos += 1;
            } else {
                // Ordinary code unit (ASCII + 0x00, or a non-ASCII pair
                // such as C5 1E): copy both bytes verbatim.
                out.push_back(lead);
                out.push_back(next);
                pos += 2;
            }
        } else {
            // Last byte with no partner: keep it only if it is a space.
            if (lead == 0x20) {
                out.push_back(0x20);
                out.push_back(0x00);
            }
            pos += 1;
        }
    }
    return out;
}
/// Reports whether a buffer is well-formed UTF-8 that contains at least one
/// multi-byte sequence.
///
/// @param data  Bytes to inspect; may be null.
/// @param len   Number of bytes available at @p data.
/// @return      true only if every byte belongs to a valid sequence AND at
///              least one non-ASCII (multi-byte) sequence was seen.
///              false for null/empty input, for any malformed sequence
///              (bad lead byte, truncated sequence, bad continuation byte,
///              overlong 3/4-byte form, encoded surrogate, value above
///              U+10FFFF) and for pure-ASCII input, which is ambiguous and
///              left for the caller to classify.
bool ANSUtilities::IsValidUTF8(const unsigned char* data, int len) {
    if (data == nullptr || len <= 0) {
        return false;
    }

    bool sawMultiByte = false;
    int pos = 0;
    while (pos < len) {
        const unsigned char lead = data[pos];

        // Single-byte ASCII: always acceptable, but proves nothing.
        if (lead <= 0x7F) {
            ++pos;
            continue;
        }

        // Classify the lead byte: how many continuation bytes follow, the
        // payload bits carried by the lead, and the smallest code point the
        // form may legally encode (anything lower is an overlong encoding).
        int tailCount = 0;
        uint32_t cp = 0;
        uint32_t minCp = 0;
        if (lead >= 0xC2 && lead <= 0xDF) {
            tailCount = 1;
            cp = lead & 0x1F;
            minCp = 0x80;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            tailCount = 2;
            cp = lead & 0x0F;
            minCp = 0x0800;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            tailCount = 3;
            cp = lead & 0x07;
            minCp = 0x10000;
        } else {
            // Stray continuation byte (80-BF) or a byte that can never
            // start a sequence (C0, C1, F5-FF).
            return false;
        }

        // The whole sequence must fit inside the buffer.
        if (len - pos <= tailCount) {
            return false;
        }

        // Each trailing byte must be 10xxxxxx; fold its 6 payload bits in.
        for (int k = 1; k <= tailCount; ++k) {
            const unsigned char tail = data[pos + k];
            if ((tail & 0xC0) != 0x80) {
                return false;
            }
            cp = (cp << 6) | (tail & 0x3F);
        }

        // Reject overlong forms, values past U+10FFFF and UTF-16 surrogates.
        const bool isSurrogate = cp >= 0xD800 && cp <= 0xDFFF;
        if (cp < minCp || cp > 0x10FFFF || isSurrogate) {
            return false;
        }

        sawMultiByte = true;
        pos += tailCount + 1;
    }

    // Valid throughout; confirm UTF-8 only when something non-ASCII was seen.
    return sawMultiByte;
}
std::string ANSUtilities::ConvertUTF16LEToUnicodeEscapes(const char* utf16leBytes, int byteLen) {
if (!utf16leBytes || byteLen <= 0) return "";
int offset = 0;
@@ -1013,13 +1138,18 @@ namespace ANSCENTER
offset = 2;
}
int remaining = byteLen - offset;
if (remaining <= 0 || remaining % 2 != 0) return "";
if (remaining <= 0) return "";
// Drop trailing odd byte if present (e.g. null terminator appended by LabVIEW)
if (remaining % 2 != 0) remaining--;
int endPos = offset + remaining; // safe end position (even-aligned)
std::string result;
result.reserve(remaining * 3);
for (int i = offset; i + 1 < byteLen; i += 2) {
for (int i = offset; i + 1 < endPos; i += 2) {
uint16_t codepoint = static_cast<unsigned char>(utf16leBytes[i])
| (static_cast<unsigned char>(utf16leBytes[i + 1]) << 8);
// Pass through printable ASCII including space (0x20-0x7E)
// Escape control characters and non-ASCII as \uXXXX
if (codepoint >= 0x20 && codepoint <= 0x7E) {
result += static_cast<char>(codepoint);
} else {
@@ -1038,10 +1168,16 @@ namespace ANSCENTER
size_t i = 0;
while (i < utf8Str.size()) {
unsigned char c = static_cast<unsigned char>(utf8Str[i]);
if (c <= 0x7F) {
// ASCII byte -- pass through as-is (including \r, \n, \t, space, etc.)
if (c >= 0x20 && c <= 0x7E) {
// Printable ASCII including space (0x20-0x7E) — pass through as-is
result += utf8Str[i];
i++;
} else if (c <= 0x7F) {
// Control chars (0x00-0x1F), DEL (0x7F) — escape as \uXXXX
char buf[7];
snprintf(buf, sizeof(buf), "\\u%04x", c);
result += buf;
i++;
} else {
// Multi-byte UTF-8 sequence -- decode to Unicode codepoint
uint32_t codepoint = 0;