mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-28 20:25:20 +00:00
llama : implement Unigram tokenizer needed by T5 and FLAN-T5 model families (#5763)
* llama : add T5 model architecture, tensors and model header parameters * llama : add implementation of Unigram tokenizer with SentencePiece-like text normalization using precompiled charsmap --------- Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
This commit is contained in:
@ -23,7 +23,7 @@ static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||
return result;
|
||||
}
|
||||
|
||||
static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset) {
|
||||
assert(offset < utf8.size());
|
||||
if (!(utf8[offset + 0] & 0x80)) {
|
||||
auto result = utf8[offset + 0];
|
||||
|
Reference in New Issue
Block a user