From 00d53800e00bb22a26bf710fa6bd1150e412cc1d Mon Sep 17 00:00:00 2001
From: compilade
Date: Mon, 24 Mar 2025 06:47:24 -0400
Subject: [PATCH] llama-vocab : add SuperBPE pre-tokenizer (#12532)

---
 convert_hf_to_gguf.py        |  3 +++
 convert_hf_to_gguf_update.py |  1 +
 include/llama.h              |  1 +
 src/llama-vocab.cpp          | 10 ++++++++++
 4 files changed, 15 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index d21edce16..d9fa57027 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -705,6 +705,9 @@ class Model:
         if chkhsh == "ccc2ef013c104be7bae2965776d611e1d7a8a2a9c547dd93a682c9a9fc80352e":
             # ref: https://huggingface.co/Xenova/gpt-4o
             res = "gpt-4o"
+        if chkhsh == "7dec86086fcc38b66b7bc1575a160ae21cf705be7718b9d5598190d7c12db76f":
+            # ref: https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k
+            res = "superbpe"

         if res is None:
             logger.warning("\n")
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 07d3ce0e4..ca90cf592 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -110,6 +110,7 @@ models = [
     {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
     {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
     {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o", },
+    {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k", },
 ]
diff --git a/include/llama.h b/include/llama.h
index 6a44be404..25a9f8278 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -107,6 +107,7 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_MINERVA        = 27,
         LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM  = 28,
         LLAMA_VOCAB_PRE_TYPE_GPT4O          = 29,
+        LLAMA_VOCAB_PRE_TYPE_SUPERBPE       = 30,
     };

     enum llama_rope_type {
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index a708d8b88..2ddc8108f 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -400,6 +400,12 @@ struct llm_tokenizer_bpe : llm_tokenizer {
                     "[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))*((?=[\\p{L}])([^A-Z]))+(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|[^\\r\\n\\p{L}\\p{N}]?((?=[\\p{L}])([^a-z]))+((?=[\\p{L}])([^A-Z]))*(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])?|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                 };
                 break;
+            case LLAMA_VOCAB_PRE_TYPE_SUPERBPE:
+                regex_exprs = {
+                    "\\p{N}+",
+                    "(?=(\\d{3})+(?!\\d))",
+                };
+                break;
             default:
                 // default regex for BPE tokenization pre-processing
                 regex_exprs = {
@@ -1604,6 +1610,10 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "gpt-4o") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_GPT4O;
             clean_spaces = false;
+        } else if (
+                tokenizer_pre == "superbpe") {
+            pre_type = LLAMA_VOCAB_PRE_TYPE_SUPERBPE;
+            clean_spaces = false;
         } else {
             throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
         }
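
Note on the chkhsh guard in the first hunk: convert_hf_to_gguf.py recognizes a tokenizer by tokenizing a fixed probe string and hashing the resulting token ids, so any behavioural change in pre-tokenization changes the hash. A minimal sketch of that check, assuming the transformers library is installed; the probe text below is a placeholder (the real chktxt is defined in convert_hf_to_gguf_update.py and is elided here):

    from hashlib import sha256
    from transformers import AutoTokenizer

    chktxt = "..."  # placeholder: the real probe string lives in convert_hf_to_gguf_update.py

    tokenizer = AutoTokenizer.from_pretrained("UW/OLMo2-8B-SuperBPE-t180k")
    chktok = tokenizer.encode(chktxt)                  # token ids for the probe text
    chkhsh = sha256(str(chktok).encode()).hexdigest()  # hash compared in get_vocab_base_pre
    # with the script's real probe text, this should print the hash checked above
    print(chkhsh)

In practice the hash entries are regenerated by running convert_hf_to_gguf_update.py (with a Hugging Face token) rather than written by hand; the new models-list entry in the second hunk is what makes that script pick up the SuperBPE tokenizer.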
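
Why two regex_exprs for SuperBPE: llama.cpp applies the expressions in sequence, each further splitting the pieces produced by the previous one, so the first pattern isolates runs of digits and the second splits each run right-to-left into groups of three. A minimal sketch of the intended behaviour in plain Python (stdlib re, Python 3.7+ so that split works on zero-width matches); note this is an approximation, since re has no \p{N} and \d (Unicode category Nd) stands in for the slightly broader \p{N}, and the capture group is made non-capturing so re.split does not echo it into the result:

    import re

    def superbpe_digit_split(text: str) -> list[str]:
        # stage 1, mirroring "\p{N}+": isolate runs of digits
        pieces = [p for p in re.split(r"(\d+)", text) if p]
        # stage 2, mirroring "(?=(\d{3})+(?!\d))": split a digit run at every
        # position where a positive multiple of three digits remains ahead
        out: list[str] = []
        for piece in pieces:
            out += [p for p in re.split(r"(?=(?:\d{3})+(?!\d))", piece) if p]
        return out

    print(superbpe_digit_split("pi is 3141592653"))
    # ['pi is ', '3', '141', '592', '653']

This keeps numbers in at-most-3-digit chunks aligned to the right, as GPT-4o-style tokenizers do, so e.g. 1234567 splits as 1|234|567 rather than 123|456|7.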