Fix mixed UTF16 issue (LabVIEW) and fix ANSFR for Intel
This commit is contained in:
@@ -1003,6 +1003,131 @@ namespace ANSCENTER
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Rebuilds a consistent UTF-16LE byte stream from a buffer that is mostly
/// UTF-16LE but may contain embedded UTF-8 sequences and space characters
/// whose 0x00 high byte is missing (a corruption pattern attributed to
/// LabVIEW text controls).
///
/// @param data  Input bytes; may be null.
/// @param len   Number of bytes readable at `data`.
/// @return      Repaired UTF-16LE bytes (no BOM is added). Empty when `data`
///              is null or `len` is not positive.
///
/// The scan is heuristic and works on raw byte patterns:
///   1. A well-formed UTF-8 multi-byte sequence (lead C2-F4 followed by the
///      required 80-BF continuation bytes, not overlong, not a surrogate,
///      not above U+10FFFF) is decoded and re-emitted as UTF-16LE.
///   2. Otherwise a byte pair is copied through unchanged, unless the first
///      byte is 0x20 and the second is non-zero.
///   3. A 0x20 that is not followed by 0x00 is emitted as the pair 20 00 and
///      only that single byte is consumed.
///   4. Any other byte left alone at the very end is discarded.
std::vector<unsigned char> ANSUtilities::RepairLabVIEWUTF16LE(const unsigned char* data, int len) {
    std::vector<unsigned char> out;
    if (data == nullptr || len < 1) {
        return out;
    }
    out.reserve(len + 32);

    // Appends one 16-bit code unit, low byte first.
    const auto putUnit = [&out](uint16_t unit) {
        out.push_back(static_cast<unsigned char>(unit & 0xFF));
        out.push_back(static_cast<unsigned char>((unit >> 8) & 0xFF));
    };
    // True when the byte at `idx` has the UTF-8 continuation form 10xxxxxx.
    const auto isCont = [data](int idx) -> bool {
        return (data[idx] & 0xC0) == 0x80;
    };
    // Low six payload bits of the byte at `idx`.
    const auto payload = [data](int idx) -> uint32_t {
        return static_cast<uint32_t>(data[idx] & 0x3F);
    };

    int pos = 0;
    while (pos < len) {
        const unsigned char lead = data[pos];
        const int avail = len - pos;  // bytes readable starting at pos

        // ---- Embedded UTF-8 detection -------------------------------------
        // `used` stays 0 unless a complete, acceptable sequence starts here.
        int used = 0;
        uint32_t cp = 0;

        if (lead >= 0xC2 && lead <= 0xDF) {
            // 110xxxxx 10xxxxxx  ->  U+0080 .. U+07FF
            if (avail >= 2 && isCont(pos + 1)) {
                cp = (static_cast<uint32_t>(lead & 0x1F) << 6) | payload(pos + 1);
                used = 2;
            }
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            // 1110xxxx 10xxxxxx 10xxxxxx  ->  U+0800 .. U+FFFF
            if (avail >= 3 && isCont(pos + 1) && isCont(pos + 2)) {
                const uint32_t value = (static_cast<uint32_t>(lead & 0x0F) << 12)
                                     | (payload(pos + 1) << 6)
                                     | payload(pos + 2);
                const bool overlong = value < 0x0800;
                const bool surrogate = value >= 0xD800 && value <= 0xDFFF;
                if (!overlong && !surrogate) {
                    cp = value;
                    used = 3;
                }
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  ->  U+10000 .. U+10FFFF
            if (avail >= 4 && isCont(pos + 1) && isCont(pos + 2) && isCont(pos + 3)) {
                const uint32_t value = (static_cast<uint32_t>(lead & 0x07) << 18)
                                     | (payload(pos + 1) << 12)
                                     | (payload(pos + 2) << 6)
                                     | payload(pos + 3);
                if (value >= 0x10000 && value <= 0x10FFFF) {
                    cp = value;
                    used = 4;
                }
            }
        }

        if (used != 0) {
            if (cp >= 0x10000) {
                // Supplementary plane: split into a high/low surrogate pair.
                const uint32_t offset = cp - 0x10000;
                putUnit(static_cast<uint16_t>(0xD800 + (offset >> 10)));
                putUnit(static_cast<uint16_t>(0xDC00 + (offset & 0x3FF)));
            } else {
                putUnit(static_cast<uint16_t>(cp));
            }
            pos += used;
            continue;
        }

        // ---- UTF-16LE handling --------------------------------------------
        const bool hasNext = avail >= 2;
        if (hasNext && (data[pos + 1] == 0x00 || lead != 0x20)) {
            // Regular code unit: either an ASCII-range pair (xx 00) or a
            // non-ASCII pair such as C5 1E. Copy both bytes verbatim.
            out.push_back(lead);
            out.push_back(data[pos + 1]);
            pos += 2;
        } else if (lead == 0x20) {
            // Space whose 0x00 high byte is missing (next byte is non-zero or
            // absent): restore the high byte and consume only the space.
            out.push_back(0x20);
            out.push_back(0x00);
            pos += 1;
        } else {
            // Single leftover byte at the end of the buffer: drop it.
            ++pos;
        }
    }
    return out;
}
|
||||
|
||||
/// Reports whether a buffer is well-formed UTF-8 that contains at least one
/// multi-byte sequence.
///
/// @param data  Input bytes; may be null.
/// @param len   Number of bytes readable at `data`.
/// @return      true only if every byte belongs to a valid UTF-8 sequence AND
///              at least one 2-, 3- or 4-byte sequence was seen. Returns false
///              for null/empty input, for any malformed or truncated sequence,
///              and for pure 7-bit input (which is ambiguous, so the decision
///              is left to the caller).
///
/// Rejected as malformed: lead bytes 80-C1 and F5-FF, missing or invalid
/// continuation bytes, overlong 3- and 4-byte forms, encoded surrogates
/// (U+D800..U+DFFF) and values above U+10FFFF.
bool ANSUtilities::IsValidUTF8(const unsigned char* data, int len) {
    if (data == nullptr || len < 1) {
        return false;
    }

    // Low six payload bits of the byte at `idx`.
    const auto payload = [data](int idx) -> uint32_t {
        return static_cast<uint32_t>(data[idx] & 0x3F);
    };

    bool sawMultiByte = false;
    int pos = 0;
    while (pos < len) {
        const unsigned char lead = data[pos];

        // 7-bit bytes are always acceptable but prove nothing on their own.
        if (lead < 0x80) {
            ++pos;
            continue;
        }

        // Classify the lead byte into the length of the sequence it opens.
        int seqLen = 0;
        if (lead >= 0xC2 && lead <= 0xDF) {
            seqLen = 2;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            seqLen = 3;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            seqLen = 4;
        } else {
            return false;  // stray continuation byte, C0/C1, or F5-FF
        }

        // The whole sequence must fit in the buffer ...
        if (len - pos < seqLen) {
            return false;
        }
        // ... and every trailing byte must look like 10xxxxxx.
        for (int k = 1; k < seqLen; ++k) {
            if ((data[pos + k] & 0xC0) != 0x80) {
                return false;
            }
        }

        // Range checks on the decoded code point (2-byte forms starting at
        // C2 cannot be overlong, so they need none).
        if (seqLen == 3) {
            const uint32_t cp = (static_cast<uint32_t>(lead & 0x0F) << 12)
                              | (payload(pos + 1) << 6)
                              | payload(pos + 2);
            const bool overlong = cp < 0x0800;
            const bool surrogate = cp >= 0xD800 && cp <= 0xDFFF;
            if (overlong || surrogate) {
                return false;
            }
        } else if (seqLen == 4) {
            const uint32_t cp = (static_cast<uint32_t>(lead & 0x07) << 18)
                              | (payload(pos + 1) << 12)
                              | (payload(pos + 2) << 6)
                              | payload(pos + 3);
            if (cp < 0x10000 || cp > 0x10FFFF) {
                return false;
            }
        }

        sawMultiByte = true;
        pos += seqLen;
    }

    // Well-formed throughout; confirm UTF-8 only if something non-ASCII was seen.
    return sawMultiByte;
}
|
||||
|
||||
std::string ANSUtilities::ConvertUTF16LEToUnicodeEscapes(const char* utf16leBytes, int byteLen) {
|
||||
if (!utf16leBytes || byteLen <= 0) return "";
|
||||
int offset = 0;
|
||||
@@ -1013,13 +1138,18 @@ namespace ANSCENTER
|
||||
offset = 2;
|
||||
}
|
||||
int remaining = byteLen - offset;
|
||||
if (remaining <= 0 || remaining % 2 != 0) return "";
|
||||
if (remaining <= 0) return "";
|
||||
// Drop trailing odd byte if present (e.g. null terminator appended by LabVIEW)
|
||||
if (remaining % 2 != 0) remaining--;
|
||||
|
||||
int endPos = offset + remaining; // safe end position (even-aligned)
|
||||
std::string result;
|
||||
result.reserve(remaining * 3);
|
||||
for (int i = offset; i + 1 < byteLen; i += 2) {
|
||||
for (int i = offset; i + 1 < endPos; i += 2) {
|
||||
uint16_t codepoint = static_cast<unsigned char>(utf16leBytes[i])
|
||||
| (static_cast<unsigned char>(utf16leBytes[i + 1]) << 8);
|
||||
// Pass through printable ASCII including space (0x20-0x7E)
|
||||
// Escape control characters and non-ASCII as \uXXXX
|
||||
if (codepoint >= 0x20 && codepoint <= 0x7E) {
|
||||
result += static_cast<char>(codepoint);
|
||||
} else {
|
||||
@@ -1038,10 +1168,16 @@ namespace ANSCENTER
|
||||
size_t i = 0;
|
||||
while (i < utf8Str.size()) {
|
||||
unsigned char c = static_cast<unsigned char>(utf8Str[i]);
|
||||
if (c <= 0x7F) {
|
||||
// ASCII byte -- pass through as-is (including \r, \n, \t, space, etc.)
|
||||
if (c >= 0x20 && c <= 0x7E) {
|
||||
// Printable ASCII including space (0x20-0x7E) — pass through as-is
|
||||
result += utf8Str[i];
|
||||
i++;
|
||||
} else if (c <= 0x7F) {
|
||||
// Control chars (0x00-0x1F), DEL (0x7F) — escape as \uXXXX
|
||||
char buf[7];
|
||||
snprintf(buf, sizeof(buf), "\\u%04x", c);
|
||||
result += buf;
|
||||
i++;
|
||||
} else {
|
||||
// Multi-byte UTF-8 sequence -- decode to Unicode codepoint
|
||||
uint32_t codepoint = 0;
|
||||
|
||||
Reference in New Issue
Block a user