diff --git a/.github/workflows/pre-tokenizer-hashes.yml b/.github/workflows/pre-tokenizer-hashes.yml
new file mode 100644
index 000000000..dff998e23
--- /dev/null
+++ b/.github/workflows/pre-tokenizer-hashes.yml
@@ -0,0 +1,45 @@
+name: Check Pre-Tokenizer Hashes
+
+on:
+  push:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+  pull_request:
+    paths:
+      - 'convert_hf_to_gguf.py'
+      - 'convert_hf_to_gguf_update.py'
+
+jobs:
+  pre-tokenizer-hashes:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.11'
+
+      - name: Install Python dependencies
+        run: |
+          python3 -m venv .venv
+          .venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
+
+      - name: Update pre-tokenizer hashes
+        run: |
+          cp convert_hf_to_gguf.py /tmp
+          .venv/bin/python convert_hf_to_gguf_update.py --check-missing
+
+      - name: Check if committed pre-tokenizer hashes matches generated version
+        run: |
+          if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
+            echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
+            echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
+            echo "Differences found:"
+            diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
+            exit 1
+          fi
+          echo "Model pre-tokenizer hashes are up to date."
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 5e21c1f47..211b81ff3 100755
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -59,6 +59,10 @@ parser.add_argument(
     "--full", action="store_true",
     help="download full list of models - make sure you have access to all of them",
 )
+parser.add_argument(
+    "--check-missing", action="store_true",
+    help="only check for missing pre-tokenizer hashes",
+)
 parser.add_argument(
     "hf_token",
     help="optional HF token",
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
 if hf_token is None:
     logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
 
+if args.check_missing and args.full:
+    logger.warning("Downloading full list of models requested, ignoring --check-missing!")
+    args.check_missing = False
+
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
 CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
@@ -222,12 +230,13 @@ if not args.full:
     all_models = models.copy()
     models = [model for model in all_models if model["name"] not in existing_models]
 
-logging.info(f"Downloading {len(models)} models...")
-for model in models:
-    try:
-        download_model(model)
-    except Exception as e:
-        logger.error(f"Failed to download model {model['name']}. Error: {e}")
+if not args.check_missing:
+    logging.info(f"Downloading {len(models)} models...")
+    for model in models:
+        try:
+            download_model(model)
+        except Exception as e:
+            logger.error(f"Failed to download model {model['name']}. Error: {e}")
 
 
 # generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
diff --git a/requirements/requirements-convert_hf_to_gguf_update.txt b/requirements/requirements-convert_hf_to_gguf_update.txt
index 431c596c1..afe2747d4 100644
--- a/requirements/requirements-convert_hf_to_gguf_update.txt
+++ b/requirements/requirements-convert_hf_to_gguf_update.txt
@@ -1,7 +1 @@
 -r ./requirements-convert_legacy_llama.txt
---extra-index-url https://download.pytorch.org/whl/cpu
-torch~=2.2.1; platform_machine != "s390x"
-
-# torch s390x packages can only be found from nightly builds
---extra-index-url https://download.pytorch.org/whl/nightly
-torch>=0.0.0.dev0; platform_machine == "s390x"