From c6808fefa63bea9ff2df41af6de7f9017ef3a3de Mon Sep 17 00:00:00 2001 From: Samuel Gomes <47574584+a740g@users.noreply.github.com> Date: Thu, 6 Jun 2024 03:54:26 +0530 Subject: [PATCH] Extend UTF32 class to handle UTF-16 naked and BOM LE/BE strings --- internal/c/parts/video/font/font.cpp | 41 ++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/internal/c/parts/video/font/font.cpp b/internal/c/parts/video/font/font.cpp index 324678a90..09324606a 100644 --- a/internal/c/parts/video/font/font.cpp +++ b/internal/c/parts/video/font/font.cpp @@ -41,7 +41,10 @@ extern const uint8_t charset8x16[256][16][8]; void pset_and_clip(int32_t x, int32_t y, uint32_t col); /// @brief A simple class that manages conversions from various encodings to UTF32 -struct UTF32 { +class UTF32 { + static const uint32_t MAX_UNICODE_CODEPOINT = 0x10FFFFu; + + public: std::u32string string; // UTF32 string UTF32 &operator=(const UTF32 &) = delete; @@ -74,14 +77,42 @@ struct UTF32 { return string.size(); } - /// @brief Converts an UTF-16 string to UTF-32 - /// @param str The UTF-16 string + /// @brief Converts an UTF-16 LE/BE string (with BOM or naked) to UTF-32 + /// @param str The UTF-16 string. If no BOM is present, little-endian is assumed /// @param len The size of the string in bytes /// @return The number of codepoints that were converted size_t ConvertUTF16(const uint8_t *str, size_t len) { try { - string = std::wstring_convert, char32_t>().from_bytes( - (const char *)str, (const char *)str + len); + if (len > 2) { + // Detect BOM + if (str[0] == 0xFF && str[1] == 0xFE) { + // Little-endian + string = std::wstring_convert< + std::codecvt_utf16(std::codecvt_mode::consume_header | std::codecvt_mode::little_endian)>, + char32_t>() + .from_bytes((const char *)str, (const char *)str + len); + } else if (str[0] == 0xFE && str[1] == 0xFF) { + // Default is big-endian + string = + std::wstring_convert, char32_t>().from_bytes( + (const char *)str, (const char *)str + len); + } else { + // No BOM, assuming little-endian by default + string = std::wstring_convert< + std::codecvt_utf16(std::codecvt_mode::consume_header | std::codecvt_mode::little_endian)>, + char32_t>() + .from_bytes((const char *)str, (const char *)str + len); + } + } else { + // Short string, assuming little-endian by default + string = std::wstring_convert< + std::codecvt_utf16(std::codecvt_mode::consume_header | std::codecvt_mode::little_endian)>, + char32_t>() + .from_bytes((const char *)str, (const char *)str + len); + } } catch (...) { string.clear(); }