unicode : switch to multimap based nfd_map (#5799)

* switch to multimap-based nfd_map due to compile-time issues
* simplify multimap keys
* don't construct a new locale every time
2025-07-01 13:05:52 +00:00 · 2024-03-01 03:15:36 -06:00
parent 5cb02b4a01
commit 9600d59e01
2 changed files with 312 additions and 265 deletions
--- a/llama.cpp
+++ b/llama.cpp
@@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm {
        std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
        std::vector<uint32_t> nfd_codepoints;
        for (uint32_t code : codepoints) {
-            auto it = nfd_map.find(code);
-            if (it != nfd_map.end()) {
-                for (uint32_t c : it->second) {
-                    nfd_codepoints.push_back(c);
+            auto it = nfd_map.equal_range(code);
+            if (it.first != it.second) {
+                for (auto jt = it.first; jt != it.second; jt++) {
+                    nfd_codepoints.push_back(jt->second);
                }
            } else {
                nfd_codepoints.push_back(code);
@@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm {
    }

    uint32_t to_lower(uint32_t code) {
+        static const std::locale locale("en_US.UTF-8");
 #if defined(_WIN32)
        if (code > 0xFFFF) {
            return code;
        }
 #endif
-        return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
+        return std::tolower(wchar_t(code), locale);
    }

    bool is_ascii_punct(uint32_t code) {