Add more Unicode APIs
This commit is contained in:
@@ -1031,6 +1031,54 @@ namespace ANSCENTER
|
||||
return result;
|
||||
}
|
||||
|
||||
/// Converts a UTF-8 string to text in which every decoded multi-byte
/// character is written as a lowercase-hex `\uXXXX` escape.
///
/// Behaviour:
///  - Bytes <= 0x7F are copied unchanged (this includes '\\' itself, so a
///    backslash already present in the input is NOT escaped).
///  - Code points up to U+FFFF become one `\uXXXX` escape.
///  - Code points U+10000..U+10FFFF become a UTF-16 surrogate pair,
///    `\uD8xx\uDCxx` (written in lowercase hex).
///  - Malformed input (invalid lead byte, bad or missing continuation byte,
///    or a sequence decoding above U+10FFFF) is handled byte-wise: the
///    offending lead byte is copied through raw and scanning resumes at the
///    next byte.
///  - Overlong encodings and UTF-8-encoded surrogates (U+D800..U+DFFF) are
///    not rejected; they are decoded and escaped like any other code point.
///
/// @param utf8Str Input bytes, interpreted as UTF-8.
/// @return The escaped string; empty when the input is empty.
std::string ANSUtilities::ConvertUTF8ToUnicodeEscapes(const std::string& utf8Str) {
    if (utf8Str.empty()) return "";

    std::string result;
    result.reserve(utf8Str.size() * 2);

    size_t i = 0;
    while (i < utf8Str.size()) {
        const unsigned char c = static_cast<unsigned char>(utf8Str[i]);

        if (c <= 0x7F) {
            // ASCII byte -- pass through as-is (including \r, \n, \t, space, etc.)
            result += utf8Str[i];
            i++;
            continue;
        }

        // Multi-byte UTF-8 sequence -- classify the lead byte.
        uint32_t codepoint = 0;
        size_t seqLen = 0;  // 0 means "invalid lead byte"
        if ((c & 0xE0) == 0xC0)      { codepoint = c & 0x1F; seqLen = 2; }
        else if ((c & 0xF0) == 0xE0) { codepoint = c & 0x0F; seqLen = 3; }
        else if ((c & 0xF8) == 0xF0) { codepoint = c & 0x07; seqLen = 4; }

        // The whole sequence must fit in the remaining input before any
        // continuation byte is read.
        bool valid = (seqLen != 0) && (seqLen <= utf8Str.size() - i);
        for (size_t j = 1; valid && j < seqLen; j++) {
            const unsigned char b = static_cast<unsigned char>(utf8Str[i + j]);
            if ((b & 0xC0) != 0x80) {
                valid = false;  // not a 10xxxxxx continuation byte
            } else {
                codepoint = (codepoint << 6) | (b & 0x3F);
            }
        }

        // A 4-byte lead can encode values up to 0x1FFFFF, but Unicode ends at
        // U+10FFFF. Anything larger cannot be expressed as a surrogate pair
        // (the high half would overflow past 0xDBFF), so treat it as invalid.
        if (valid && codepoint > 0x10FFFF) {
            valid = false;
        }

        if (!valid) {
            // Copy the offending byte through raw and resynchronise on the next one.
            result += utf8Str[i];
            i++;
            continue;
        }

        if (codepoint <= 0xFFFF) {
            char buf[7];  // "\uXXXX" + NUL
            snprintf(buf, sizeof(buf), "\\u%04x", static_cast<unsigned int>(codepoint));
            result += buf;
        } else {
            // Surrogate pair for codepoints above U+FFFF
            const uint32_t offset = codepoint - 0x10000;  // now at most 0xFFFFF (20 bits)
            const unsigned int high = 0xD800u + static_cast<unsigned int>(offset >> 10);
            const unsigned int low  = 0xDC00u + static_cast<unsigned int>(offset & 0x3FF);
            char buf[14];  // "\uXXXX\uXXXX" + NUL
            snprintf(buf, sizeof(buf), "\\u%04x\\u%04x", high, low);
            result += buf;
        }
        i += seqLen;
    }
    return result;
}
|
||||
|
||||
std::string ANSUtilities::ConvertUnicodeEscapesToUTF8(const std::string& escapedStr) {
|
||||
if (escapedStr.empty()) return "";
|
||||
// First decode \uXXXX to UTF-16LE, then convert to UTF-8
|
||||
@@ -1052,5 +1100,39 @@ namespace ANSCENTER
|
||||
}
|
||||
return ConvertUTF16LEToUTF8(utf16le.data(), (int)utf16le.size());
|
||||
}
|
||||
|
||||
/// Doubles the backslash of every `\u` pair in the input, i.e. rewrites each
/// two-character sequence backslash + 'u' as backslash + backslash + 'u'.
/// All other characters are copied unchanged. The hex digits following the
/// `\u` are not inspected.
///
/// @param str Text that may contain `\u` sequences.
/// @return A copy of `str` with each `\u` replaced by `\\u`; empty for empty input.
std::string ANSUtilities::DoubleEscapeUnicode(const std::string& str) {
    std::string escaped;
    if (str.empty()) {
        return escaped;
    }
    escaped.reserve(str.size() + str.size() / 4);

    std::string::size_type cursor = 0;
    for (;;) {
        // Locate the next backslash-'u' pair at or after the cursor.
        const std::string::size_type hit = str.find("\\u", cursor);
        if (hit == std::string::npos) {
            break;
        }
        // Copy the untouched text before the match, then the doubled form.
        escaped.append(str, cursor, hit - cursor);
        escaped.append("\\\\u");
        cursor = hit + 2;  // resume right after the consumed "\u"
    }
    // Whatever follows the last match is copied verbatim.
    escaped.append(str, cursor, std::string::npos);
    return escaped;
}
|
||||
|
||||
std::string ANSUtilities::ConvertUTF8ToDoubleEscapedUnicode(const std::string& utf8Str) {
|
||||
return DoubleEscapeUnicode(ConvertUTF8ToUnicodeEscapes(utf8Str));
|
||||
}
|
||||
|
||||
/// Collapses every three-character sequence backslash + backslash + 'u' into
/// the two-character sequence backslash + 'u'. All other characters are
/// copied unchanged. Matching is leftmost-first and non-overlapping.
///
/// @param str Text that may contain `\\u` sequences.
/// @return A copy of `str` with each `\\u` replaced by `\u`; empty for empty input.
std::string ANSUtilities::UnescapeDoubleEscapedUnicode(const std::string& str) {
    std::string collapsed;
    if (str.empty()) {
        return collapsed;
    }
    collapsed.reserve(str.size());

    std::string::size_type cursor = 0;
    for (;;) {
        // Locate the next backslash-backslash-'u' triple at or after the cursor.
        const std::string::size_type hit = str.find("\\\\u", cursor);
        if (hit == std::string::npos) {
            break;
        }
        // Copy the untouched text before the match, then the single-escaped form.
        collapsed.append(str, cursor, hit - cursor);
        collapsed.append("\\u");
        cursor = hit + 3;  // resume right after the consumed "\\u"
    }
    // Whatever follows the last match is copied verbatim.
    collapsed.append(str, cursor, std::string::npos);
    return collapsed;
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user