Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-08-10 18:54:09 -04:00.
ci : check that pre-tokenizer hashes are up-to-date (#15032)
* torch is not required for convert_hf_to_gguf_update
* add --check-missing parameter
* check that pre-tokenizer hashes are up-to-date
This commit adds one new (vendored) normal file, 45 lines:
.github/workflows/pre-tokenizer-hashes.yml
@@ -0,0 +1,45 @@
|
|||||||
|
name: Check Pre-Tokenizer Hashes
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
paths:
|
||||||
|
- 'convert_hf_to_gguf.py'
|
||||||
|
- 'convert_hf_to_gguf_update.py'
|
||||||
|
pull_request:
|
||||||
|
paths:
|
||||||
|
- 'convert_hf_to_gguf.py'
|
||||||
|
- 'convert_hf_to_gguf_update.py'
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
pre-tokenizer-hashes:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- name: Checkout repository
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: '3.11'
|
||||||
|
|
||||||
|
- name: Install Python dependencies
|
||||||
|
run: |
|
||||||
|
python3 -m venv .venv
|
||||||
|
.venv/bin/pip install -r requirements/requirements-convert_hf_to_gguf_update.txt
|
||||||
|
|
||||||
|
- name: Update pre-tokenizer hashes
|
||||||
|
run: |
|
||||||
|
cp convert_hf_to_gguf.py /tmp
|
||||||
|
.venv/bin/python convert_hf_to_gguf_update.py --check-missing
|
||||||
|
|
||||||
|
- name: Check if committed pre-tokenizer hashes matches generated version
|
||||||
|
run: |
|
||||||
|
if ! diff -q convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py; then
|
||||||
|
echo "Model pre-tokenizer hashes (in convert_hf_to_gguf.py) do not match generated hashes (from convert_hf_to_gguf_update.py)."
|
||||||
|
echo "To fix: run ./convert_hf_to_gguf_update.py and commit the updated convert_hf_to_gguf.py along with your changes"
|
||||||
|
echo "Differences found:"
|
||||||
|
diff convert_hf_to_gguf.py /tmp/convert_hf_to_gguf.py || true
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
echo "Model pre-tokenizer hashes are up to date."
|
@@ -59,6 +59,10 @@ parser.add_argument(
|
|||||||
"--full", action="store_true",
|
"--full", action="store_true",
|
||||||
help="download full list of models - make sure you have access to all of them",
|
help="download full list of models - make sure you have access to all of them",
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--check-missing", action="store_true",
|
||||||
|
help="only check for missing pre-tokenizer hashes",
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"hf_token",
|
"hf_token",
|
||||||
help="optional HF token",
|
help="optional HF token",
|
||||||
@@ -70,6 +74,10 @@ hf_token = args.hf_token if args.hf_token is not None else hf_token
|
|||||||
if hf_token is None:
|
if hf_token is None:
|
||||||
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
|
||||||
|
|
||||||
|
if args.check_missing and args.full:
|
||||||
|
logger.warning("Downloading full list of models requested, ignoring --check-missing!")
|
||||||
|
args.check_missing = False
|
||||||
|
|
||||||
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
|
||||||
# will be updated with time - contributions welcome
|
# will be updated with time - contributions welcome
|
||||||
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
CHK_TXT = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
|
||||||
@@ -222,12 +230,13 @@ if not args.full:
|
|||||||
all_models = models.copy()
|
all_models = models.copy()
|
||||||
models = [model for model in all_models if model["name"] not in existing_models]
|
models = [model for model in all_models if model["name"] not in existing_models]
|
||||||
|
|
||||||
logging.info(f"Downloading {len(models)} models...")
|
if not args.check_missing:
|
||||||
for model in models:
|
logging.info(f"Downloading {len(models)} models...")
|
||||||
try:
|
for model in models:
|
||||||
download_model(model)
|
try:
|
||||||
except Exception as e:
|
download_model(model)
|
||||||
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"Failed to download model {model['name']}. Error: {e}")
|
||||||
|
|
||||||
|
|
||||||
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
# generate the source code for the convert_hf_to_gguf.py:get_vocab_base_pre() function:
|
||||||
|
@@ -1,7 +1 @@
|
|||||||
-r ./requirements-convert_legacy_llama.txt
|
-r ./requirements-convert_legacy_llama.txt
|
||||||
--extra-index-url https://download.pytorch.org/whl/cpu
|
|
||||||
torch~=2.2.1; platform_machine != "s390x"
|
|
||||||
|
|
||||||
# torch s390x packages can only be found from nightly builds
|
|
||||||
--extra-index-url https://download.pytorch.org/whl/nightly
|
|
||||||
torch>=0.0.0.dev0; platform_machine == "s390x"
|
|
||||||
|
Reference in New Issue
Block a user