llama : implement Unigram tokenizer needed by T5 and FLAN-T5 model families (#5763)

* llama : add T5 model architecture, tensors and model header parameters * llama : add implementation of Unigram tokenizer with SentencePiece-like text normalization using precompiled charsmap --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2025-06-28 20:25:20 +00:00 · 2024-06-25 21:14:35 +02:00
parent e6bf007744
commit 6fcbf68235
4 changed files with 586 additions and 38 deletions
--- a/unicode.cpp
+++ b/unicode.cpp
@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    return result;
 }

-static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
+uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
    assert(offset < utf8.size());
    if (!(utf8[offset + 0] & 0x80)) {
        auto result = utf8[offset + 0];