llama : support BailingMoE (Ling) (#12634)

2025-07-12 06:09:18 +00:00 · 2025-03-30 22:21:03 +02:00
parent 4663bd353c
commit 2c3f8b850a
13 changed files with 404 additions and 0 deletions
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@ -407,6 +407,13 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                    "(?=(\\d{3})+(?!\\d))",
                };
                break;
+            case LLAMA_VOCAB_PRE_TYPE_BAILINGMOE:
+                regex_exprs = {
+                    // original regex from tokenizer.json
+                    // "'(?i:[sdmt]|ll|ve|re)|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+"
+                    "'(?:[sSdDmMtT]|[lL][lL]|[vV][eE]|[rR][eE])|[^\\r\\n\\p{L}\\p{N}]?+\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]++[\\r\\n]*|\\s*[\\r\\n]|\\s+(?!\\S)|\\s+",
+                };
+                break;
            default:
                // default regex for BPE tokenization pre-processing
                regex_exprs = {
@ -1619,6 +1626,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                tokenizer_pre == "trillion") {
                pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
                clean_spaces = false;
+            } else if (
+                tokenizer_pre == "bailingmoe") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_BAILINGMOE;
+                clean_spaces = false;
            } else {
                throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
            }