mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-10 10:47:51 -04:00
ci : check that pre-tokenizer hashes are up-to-date (#15032)
* torch is not required for convert_hf_to_gguf_update * add --check-missing parameter * check that pre-tokenizer hashes are up-to-date
This commit is contained in:
45
.github/workflows/pre-tokenizer-hashes.yml
vendored
Normal file
45
.github/workflows/pre-tokenizer-hashes.yml
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
name: Check Pre-Tokenizer Hashes
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- 'convert_hf_to_gguf.py'
|
||||
- 'convert_hf_to_gguf_update.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- 'convert_hf_to_gguf.py'
|
||||
- 'convert_hf_to_gguf_update.py'
|
||||
|
||||
jobs:
|
||||
pre-tokenizer-hashes:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v5
|
||||
with:
|
||||
python-version: '3.11'
|
||||
|
||||
- name: Install Python dependencies
|
||||
run: |
|
||||
python3 -m venv .venv
|
||||
.venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
|
||||
|
||||
- name: Update pre-tokenizer hashes
|
||||
run: |
|
||||
cp convert_hf_to_gguf.py /tmp
|
||||
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
|
||||
|
||||
- name: Check if committed pre-tokenizer hashes matches generated version
|
||||
run: |
|
||||
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
|
||||
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
|
||||
echo "Differences found:"
|
||||
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
|
||||
exit 1
|
||||
fi
|
||||
echo "Model pre-tokenizer hashes are up to date."
|
@@ -59,6 +59,10 @@ parser.add_argument(
|
||||
"--full", action="store_true",
|
||||
help="download full list of models - make sure you have access to all of them",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--check-missing", action="store_true",
|
||||
help="only check for missing pre-tokenizer hashes",
|
||||
)
|
||||
parser.add_argument(
|
||||
"hf_token",
|
||||
help="optional HF token",
|
||||
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
|
||||
if hf_token is None:
|
||||
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
||||
|
||||
if args.check_missing and args.full:
|
||||
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
|
||||
args.check_missing = False
|
||||
|
||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||
# will be updated with time - contributions welcome
|
||||
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||
@@ -222,12 +230,13 @@ if not args.full:
|
||||
all_models = models.copy()
|
||||
models = [model for model in all_models if model["name"] not in existing_models]
|
||||
|
||||
logging.info(f"Downloading {len(models)} models...")
|
||||
for model in models:
|
||||
try:
|
||||
download_model(model)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
if not args.check_missing:
|
||||
logging.info(f"Downloading {len(models)} models...")
|
||||
for model in models:
|
||||
try:
|
||||
download_model(model)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||
|
||||
|
||||
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
||||
|
@@ -1,7 +1 @@
|
||||
-r ./requirements-convert_legacy_llama.txt
|
||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
||||
torch~=2.2.1; platform_machine != "s390x"
|
||||
|
||||
# torch s390x packages can only be found from nightly builds
|
||||
--extra-index-url https://download.pytorch.org/whl/nightly
|
||||
torch>=0.0.0.dev0; platform_machine == "s390x"
|
||||
|
Reference in New Issue
Block a user