#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import logging
import os
import pathlib
import re
import requests
import sys
import json
import shutil
import argparse

from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")

sess = requests.Session()

convert_py_pth = pathlib.Path("convert_hf_to_gguf.py")
convert_py = convert_py_pth.read_text(encoding="utf-8")
hf_token_pth = pathlib.Path.home() / ".cache" / "huggingface" / "token"
hf_token = hf_token_pth.read_text(encoding="utf-8").strip() if hf_token_pth.exists() else None


class TOKENIZER_TYPE(IntEnum):
    SPM = auto()
    BPE = auto()
    WPM = auto()
    UGM = auto()


DOC_STRING = """
This script downloads the tokenizer models of the specified models from Huggingface and
generates the get_vocab_base_pre() function for convert_hf_to_gguf.py

/!\\ It is intended to be used by contributors and is not meant to be run by end users

This is necessary in order to analyze the type of pre-tokenizer used by the model and
provide the necessary information to llama.cpp via the GGUF header in order to implement
the same pre-tokenizer.

ref: https://github.com/ggml-org/llama.cpp/pull/6920

Instructions:

- Add a new model to the "models" list
- Run the script with your huggingface token
  (by default, the token will be read from ~/.cache/huggingface/token)
- The convert_hf_to_gguf.py script will have had its get_vocab_base_pre() function updated
- Update llama.cpp with the new pre-tokenizer if necessary
"""
# TODO: generate tokenizer tests for llama.cpp

parser = argparse.ArgumentParser(description=DOC_STRING, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
    "--full", action="store_true",
    help="download full list of models - make sure you have access to all of them",
)
parser.add_argument(
    "hf_token",
    help="optional HF token",
    nargs="?",
)
args = parser.parse_args()

hf_token = args.hf_token if args.hf_token is not None else hf_token

if hf_token is None:
    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
    sys.exit(1)
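
# Example invocations (illustrative; run from the directory that contains convert_hf_to_gguf.py):
#
#   python3 convert_hf_to_gguf_update.py <hf_token>          # process only models not yet in convert_hf_to_gguf.py
#   python3 convert_hf_to_gguf_update.py --full <hf_token>   # process the full "models" list
#
# The positional <hf_token> may be omitted when ~/.cache/huggingface/token exists.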

# TODO: this string has to exercise as much pre-tokenizer functionality as possible
#       will be updated with time - contributions welcome
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n  \n   \n    \n     \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'

# TODO: add models here, base models preferred
models = [
    {"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf"},
    {"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B"},
    {"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct"},
    {"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base"},
    {"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base"},
    {"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b"},
    {"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5"},
    {"name": "falcon3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/Falcon3-7B-Base"},
    {"name": "bert-bge-large", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/BAAI/bge-large-zh-v1.5"},
    {"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b"},
    {"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b"},
    {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2"},
    {"name": "stablelm2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b"},
    {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base"},
    {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01"},
    {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B"},
    {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf"},
    {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base"},
    {"name": "jina-v1-en", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-reranker-v1-tiny-en"},
    {"name": "jina-v2-en", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en"},  # WPM!
    {"name": "jina-v2-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es"},
    {"name": "jina-v2-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de"},
    {"name": "smaug-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/abacusai/Smaug-Llama-3-70B-Instruct"},
    {"name": "poro-chat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Poro-34B-chat"},
    {"name": "jina-v2-code", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-code"},
    {"name": "viking", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LumiOpen/Viking-7B"},  # Also used for Viking 13B and 33B
    {"name": "gemma", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2b"},
    {"name": "gemma-2", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/google/gemma-2-9b"},
    {"name": "jais", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/core42/jais-13b"},
    {"name": "t5", "tokt": TOKENIZER_TYPE.UGM, "repo": "https://huggingface.co/google-t5/t5-small"},
    {"name": "codeshell", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/WisdomShell/CodeShell-7B"},
    {"name": "tekken", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistralai/Mistral-Nemo-Base-2407"},
    {"name": "smollm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/HuggingFaceTB/SmolLM-135M"},
    {"name": "bloom", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigscience/bloom"},
    {"name": "gpt3-finnish", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/TurkuNLP/gpt3-finnish-small"},
    {"name": "exaone", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-3.0-7.8B-Instruct"},
    {"name": "phi-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/microsoft/phi-2"},
    {"name": "chameleon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/facebook/chameleon-7b"},
    {"name": "roberta-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sentence-transformers/stsb-roberta-base"},
    {"name": "gigachat", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ai-sage/GigaChat-20B-A3B-instruct"},
    {"name": "megrez", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Infinigence/Megrez-3B-Instruct"},
    {"name": "deepseek-v3", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-V3"},
    {"name": "deepseek-r1-qwen", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"},
    {"name": "gpt-4o", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Xenova/gpt-4o"},
    {"name": "superbpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/UW/OLMo2-8B-SuperBPE-t180k"},
    {"name": "trillion", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/trillionlabs/Trillion-7B-preview"},
    {"name": "bailingmoe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-lite"},
    {"name": "llama4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct"},
    {"name": "pixtral", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b"},
    {"name": "seed-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base"},
]

# some models are known to be broken upstream, so we will skip them as exceptions
pre_computed_hashes = [
    # chatglm-bpe has 2 hashes, why?
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "b6e8e1518dc4305be2fe39c313ed643381c4da5db34a98f6a04c093f8afbe99b"},
    {"name": "chatglm-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-chat", "chkhsh": "81d72c7348a9f0ebe86f23298d37debe0a5e71149e29bd283904c02262b27516"},
    {"name": "glm4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/THUDM/glm-4-9b-hf", "chkhsh": "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2"},
    {"name": "minerva-7b", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/sapienzanlp/Minerva-7B-base-v1.0", "chkhsh": "1431a23e583c97432bc230bff598d103ddb5a1f89960c8f1d1051aaa944d0b35"},
]
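
# Entries in pre_computed_hashes carry a "chkhsh" field; for these, the stored hash is used
# as-is later in the script instead of loading the tokenizer and hashing CHK_TXT.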


def download_file_with_auth(url, token, save_path):
    headers = {"Authorization": f"Bearer {token}"}
    response = sess.get(url, headers=headers)
    response.raise_for_status()
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    with open(save_path, 'wb') as downloaded_file:
        downloaded_file.write(response.content)
    logger.info(f"File {save_path} downloaded successfully")


def download_model(model):
    name = model["name"]
    repo = model["repo"]
    tokt = model["tokt"]

    os.makedirs(f"models/tokenizers/{name}", exist_ok=True)

    files = ["config.json", "tokenizer.json", "tokenizer_config.json"]

    if name == "gpt-4o":
        # Xenova/gpt-4o is tokenizer-only, it does not contain config.json
        files = ["tokenizer.json", "tokenizer_config.json"]

    if tokt == TOKENIZER_TYPE.SPM:
        files.append("tokenizer.model")

    if tokt == TOKENIZER_TYPE.UGM:
        files.append("spiece.model")

    if os.path.isdir(repo):
        # If repo is a path on the file system, copy the directory
        for file in files:
            src_path = os.path.join(repo, file)
            dst_path = f"models/tokenizers/{name}/{file}"
            if os.path.isfile(dst_path):
                logger.info(f"{name}: File {dst_path} already exists - skipping")
                continue
            if os.path.isfile(src_path):
                shutil.copy2(src_path, dst_path)
                logger.info(f"{name}: Copied {src_path} to {dst_path}")
            else:
                logger.warning(f"{name}: Source file {src_path} does not exist")
    else:
        # If repo is a URL, download the files
        for file in files:
            save_path = f"models/tokenizers/{name}/{file}"
            if os.path.isfile(save_path):
                logger.info(f"{name}: File {save_path} already exists - skipping")
                continue
            download_file_with_auth(f"{repo}/resolve/main/{file}", hf_token, save_path)


# get list of existing models and chkhsh from the convert_hf_to_gguf.py file
# returns mapping res --> chkhsh
def get_existing_models(convert_py):
    pattern = r'if chkhsh == "([a-f0-9]{64})":\s*\n\s*.*\s*res = "([^"]+)"'
    matches = re.findall(pattern, convert_py)
    output = {}
    for chkhsh, res in matches:
        output[res] = chkhsh
    return output
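
# The pattern above matches the branches this script itself emits into convert_hf_to_gguf.py,
# e.g. (hash value purely illustrative):
#
#     if chkhsh == "0000000000000000000000000000000000000000000000000000000000000000":
#         # ref: https://huggingface.co/...
#         res = "some-model"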


existing_models = {}
all_models = models.copy()
if not args.full:
    # Filter out models that already exist in convert_hf_to_gguf.py
    existing_models = get_existing_models(convert_py)
    all_models = models.copy()
    models = [model for model in all_models if model["name"] not in existing_models]

logger.info(f"Downloading {len(models)} models...")

for model in models:
    try:
        download_model(model)
    except Exception as e:
        logger.error(f"Failed to download model {model['name']}. Error: {e}")


# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:

src_ifs = ""

for model in [*all_models, *pre_computed_hashes]:
    name = model["name"]
    tokt = model["tokt"]
    chkhsh = model.get("chkhsh")

    if tokt == TOKENIZER_TYPE.SPM or tokt == TOKENIZER_TYPE.UGM:
        continue

    # Skip if the tokenizer folder does not exist or there are other download issues previously
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    if chkhsh is not None:
        # if the model has a pre-computed hash, use it
        logger.info(f"Using pre-computed hash for model {name}: {chkhsh}")
    elif name in existing_models:
        # if the model already exists in convert_hf_to_gguf.py, skip compute hash
        chkhsh = existing_models[name]
    else:
        # otherwise, compute the hash of the tokenizer
        try:
            logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
            if name == "t5":
                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
            else:
                tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
        except OSError as e:
            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
            continue  # Skip to the next model if the tokenizer can't be loaded

        chktok = tokenizer.encode(CHK_TXT)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

        logger.info(f"model: {name}")
        logger.info(f"tokt: {tokt}")
        logger.info(f"repo: {model['repo']}")
        logger.info(f"chktok: {chktok}")
        logger.info(f"chkhsh: {chkhsh}")

        # print the "pre_tokenizer" content from the tokenizer.json
        with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
            cfg = json.load(f)
            normalizer = cfg["normalizer"]
            logger.info("normalizer: " + json.dumps(normalizer, indent=4))
            pre_tokenizer = cfg["pre_tokenizer"]
            logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
            if "ignore_merges" in cfg["model"]:
                logger.info("ignore_merges: " + json.dumps(cfg["model"]["ignore_merges"], indent=4))

        logger.info("")

    src_ifs += f"        if chkhsh == \"{chkhsh}\":\n"
    src_ifs += f"            # ref: {model['repo']}\n"
    src_ifs += f"            res = \"{name}\"\n"
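
# Note: the branches above are emitted with 8/12 spaces of indentation because they are spliced
# verbatim into the body of the generated get_vocab_base_pre() method (see src_func below);
# each branch maps a chkhsh (sha256 over the token ids produced for CHK_TXT) to a pre-tokenizer name.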

src_func = f"""
    def get_vocab_base_pre(self, tokenizer) -> str:
        # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
        # is specific for the BPE pre-tokenizer used by the model
        # we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
        # use in llama.cpp to implement the same pre-tokenizer

        chktxt = {repr(CHK_TXT)}

        chktok = tokenizer.encode(chktxt)
        chkhsh = sha256(str(chktok).encode()).hexdigest()

        logger.debug(f"chktok: {{chktok}}")
        logger.debug(f"chkhsh: {{chkhsh}}")

        res = None

        # NOTE: if you get an error here, you need to update the convert_hf_to_gguf_update.py script
        #       or pull the latest version of the model from Huggingface
        #       don't edit the hashes manually!
{src_ifs}
        if res is None:
            logger.warning("\\n")
            logger.warning("**************************************************************************************")
            logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
            logger.warning("**          There are 2 possible reasons for this:")
            logger.warning("**          - the model has not been added to convert_hf_to_gguf_update.py yet")
            logger.warning("**          - the pre-tokenization config has changed upstream")
            logger.warning("**          Check your model files and convert_hf_to_gguf_update.py and update them accordingly.")
            logger.warning("** ref:     https://github.com/ggml-org/llama.cpp/pull/6920")
            logger.warning("**")
            logger.warning(f"** chkhsh:  {{chkhsh}}")
            logger.warning("**************************************************************************************")
            logger.warning("\\n")

            raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")

        logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
        logger.debug(f"chkhsh: {{chkhsh}}")

        return res
"""

convert_py = re.sub(
    r"(# Marker: Start get_vocab_base_pre)(.+?)( +# Marker: End get_vocab_base_pre)",
    lambda m: m.group(1) + src_func + m.group(3),
    convert_py,
    flags=re.DOTALL | re.MULTILINE,
)
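
# The substitution relies on two marker comments that must already exist in convert_hf_to_gguf.py:
# "# Marker: Start get_vocab_base_pre" and an indented "# Marker: End get_vocab_base_pre".
# Everything between them is replaced with the freshly generated src_func above.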

convert_py_pth.write_text(convert_py, encoding="utf-8")

logger.info("+++ convert_hf_to_gguf.py was updated")

# generate tests for each tokenizer model
tests = [
    "ied 4 ½ months",
    "Äpfel",
    "",
    " ",
    "  ",
    "   ",
    "\t",
    "\n",
    "\n\n",
    "\n\n\n",
    "\t\n",
    "Hello world",
    " Hello world",
    "Hello World",
    " Hello World",
    " Hello World!",
    "Hello, world!",
    " Hello, world!",
    " this is 🦙.cpp",
    "w048 7tuijk dsdfhu",
    "нещо на Български",
    "កាន់តែពិសេសអាចខលចេញ",
    "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
    "Hello",
    " Hello",
    "  Hello",
    "   Hello",
    "    Hello",
    "    Hello\n    Hello",
    " (",
    "\n =",
    "' era",
    "Hello, y'all! How are you 😁 ?我想在apple工作1314151天～",
    "!!!!!!",
    "3",
    "33",
    "333",
    "3333",
    "33333",
    "333333",
    "3333333",
    "33333333",
    "333333333",
    "Cửa Việt",  # llama-bpe fails on this
    " discards",
    CHK_TXT,
]

# write the tests to ./models/ggml-vocab-{name}.gguf.inp
# the format is:
#
#   test0
#   __ggml_vocab_test__
#   test1
#   __ggml_vocab_test__
#   ...
#
# for each model, encode all tests and write the results in ./models/ggml-vocab-{name}.gguf.out
# for each test, write the resulting tokens on a separate line
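# e.g. one line of the .out file looks like " 1 15043 3186": the token ids separated by spaces,
# each prefixed with a single space (the ids shown here are purely illustrative)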

for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # Skip if the tokenizer folder does not exist or there are other download issues previously
    if not os.path.exists(f"models/tokenizers/{name}"):
        logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
        continue

    # create the tokenizer
    try:
        if name == "t5":
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
        else:
            tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except OSError as e:
        logger.error(f"Failed to load tokenizer for model {name}. Error: {e}")
        continue  # Skip this model and continue with the next one in the loop

    if not os.path.exists(f"models/ggml-vocab-{name}.gguf"):
        logger.info(f"Skip vocab files for model {name}, no GGUF file found")
        continue

    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
        for text in tests:
            f.write(f"{text}")
            f.write("\n__ggml_vocab_test__\n")

    with open(f"models/ggml-vocab-{name}.gguf.out", "w") as f:
        for text in tests:
            res = tokenizer.encode(text, add_special_tokens=False)
            for r in res:
                f.write(f" {r}")
            f.write("\n")

    logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")

# generate commands for creating vocab files
logger.info("\nRun the following commands to generate the vocab files for testing:\n")

for model in models:
    name = model["name"]
    print(f"python3 convert_hf_to_gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")  # noqa: NP100

logger.info("\n")