Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-08-10 18:54:09 -04:00.
ci : check that pre-tokenizer hashes are up-to-date (#15032)
* torch is not required for convert_hf_to_gguf_update
* add --check-missing parameter
* check that pre-tokenizer hashes are up-to-date
This commit adds one new (vendored) normal file, 45 lines:
.github/workflows/pre-tokenizer-hashes.yml
@@ -0,0 +1,45 @@
|
|||||||
|
name: Check Pre-Tokenizer Hashes
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- 'convert_hf_to_gguf.py'
|
||||||
|
- 'convert_hf_to_gguf_update.py'
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- 'convert_hf_to_gguf.py'
|
||||||
|
- 'convert_hf_to_gguf_update.py'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
pre-tokenizer-hashes:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Install Python dependencies
|
||||||
|
run: |
|
||||||
|
python3 -m venv .venv
|
||||||
|
.venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
|
||||||
|
|
||||||
|
- name: Update pre-tokenizer hashes
|
||||||
|
run: |
|
||||||
|
cp convert_hf_to_gguf.py /tmp
|
||||||
|
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
|
||||||
|
|
||||||
|
- name: Check if committed pre-tokenizer hashes matches generated version
|
||||||
|
run: |
|
||||||
|
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
|
||||||
|
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||||
|
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
|
||||||
|
echo "Differences found:"
|
||||||
|
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Model pre-tokenizer hashes are up to date."
|
@@ -59,6 +59,10 @@ parser.add_argument(
|
|||||||
"--full", action="store_true",
|
"--full", action="store_true",
|
||||||
help="download full list of models - make sure you have access to all of them",
|
help="download full list of models - make sure you have access to all of them",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--check-missing", action="store_true",
|
||||||
|
help="only check for missing pre-tokenizer hashes",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"hf_token",
|
"hf_token",
|
||||||
help="optional HF token",
|
help="optional HF token",
|
||||||
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
|
|||||||
if hf_token is None:
|
if hf_token is None:
|
||||||
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
||||||
|
|
||||||
|
if args.check_missing and args.full:
|
||||||
|
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
|
||||||
|
args.check_missing = False
|
||||||
|
|
||||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||||
# will be updated with time - contributions welcome
|
# will be updated with time - contributions welcome
|
||||||
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||||
@@ -222,12 +230,13 @@ if not args.full:
|
|||||||
all_models = models.copy()
|
all_models = models.copy()
|
||||||
models = [model for model in all_models if model["name"] not in existing_models]
|
models = [model for model in all_models if model["name"] not in existing_models]
|
||||||
|
|
||||||
logging.info(f"Downloading {len(models)} models...")
|
if not args.check_missing:
|
||||||
for model in models:
|
logging.info(f"Downloading {len(models)} models...")
|
||||||
try:
|
for model in models:
|
||||||
download_model(model)
|
try:
|
||||||
except Exception as e:
|
download_model(model)
|
||||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||||
|
|
||||||
|
|
||||||
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
||||||
|
@@ -1,7 +1 @@
|
|||||||
-r ./requirements-convert_legacy_llama.txt
|
-r ./requirements-convert_legacy_llama.txt
|
||||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
|
||||||
torch~=2.2.1; platform_machine != "s390x"
|
|
||||||
|
|
||||||
# torch s390x packages can only be found from nightly builds
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/nightly
|
|
||||||
torch>=0.0.0.dev0; platform_machine == "s390x"
|
|
||||||
|
Reference in New Issue
Block a user