llama : optimize long word tokenization with WPM (#8034)

ggml-ci
2025-08-16 05:02:58 -04:00 · 2024-06-21 08:51:28 +03:00
parent 80ea089d77
commit a927b0f3dd
2 changed files with 13 additions and 5 deletions
--- a/unicode.cpp
+++ b/unicode.cpp
@@ -596,6 +596,7 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c

 std::vector<uint32_t> unicode_cpts_from_utf8(const std::string & utf8) {
    std::vector<uint32_t> result;
+    result.reserve(utf8.size());
    size_t offset = 0;
    while (offset < utf8.size()) {
        result.push_back(unicode_cpt_from_utf8(utf8, offset));