convert : make hf token optional (#14717)

* make hf token optional

* fail if we can't get necessary tokenizer config
Sigbjørn Skjæret
2025-07-16 23:17:43 +02:00
committed by GitHub
parent 496957e1cb
commit 19e5943d9e

convert_hf_to_gguf_update.py

@@ -7,7 +7,6 @@ import pathlib
 import re
 import requests
-import sys
 import json
 import shutil
 import argparse
@@ -69,8 +68,7 @@ args = parser.parse_args()
 hf_token = args.hf_token if args.hf_token is not None else hf_token
 if hf_token is None:
-    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
-    sys.exit(1)
+    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
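
The hunk above turns a hard exit into a warning: the script can now refresh tokenizer hashes for public models with no credentials at all, and only gated repositories still need a token. A minimal sketch of the resolution order this implies (CLI argument first, then the cached token file); the helper name resolve_hf_token is illustrative, not part of the script:

    import logging
    import pathlib

    logger = logging.getLogger("convert_hf_to_gguf_update")

    def resolve_hf_token(cli_token):
        # An explicit --hf-token argument wins over the cached token.
        if cli_token is not None:
            return cli_token
        token_path = pathlib.Path.home() / ".cache" / "huggingface" / "token"
        if token_path.is_file():
            return token_path.read_text().strip()
        # No token anywhere: warn and continue; gated models will fail later with 401/403.
        logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
        return None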
@@ -151,7 +149,7 @@ pre_computed_hashes = [
 
 def download_file_with_auth(url, token, save_path):
-    headers = {"Authorization": f"Bearer {token}"}
+    headers = {"Authorization": f"Bearer {token}"} if token else None
     response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
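
This one-line change is what actually makes anonymous downloads work: requests treats headers=None the same as passing no extra headers, so the GET goes out unauthenticated and raise_for_status() only fails (401/403) for gated repositories. A self-contained sketch of the helper under that reading; the final file write is assumed from the surrounding script, not shown in this hunk:

    import os
    import requests

    sess = requests.Session()

    def download_file_with_auth(url, token, save_path):
        # Attach an Authorization header only when a token is available;
        # headers=None means the request is sent with session defaults only.
        headers = {"Authorization": f"Bearer {token}"} if token else None
        response = sess.get(url, headers=headers)
        response.raise_for_status()  # 401/403 here usually means a gated repo and no/invalid token
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, "wb") as f:  # assumed body of the real helper
            f.write(response.content)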
@@ -250,10 +248,9 @@ for model in [*pre_computed_hashes, *all_models]:
     else:
         # otherwise, compute the hash of the tokenizer
 
-        # Skip if the tokenizer folder does not exist or there are other download issues previously
-        if not os.path.exists(f"models/tokenizers/{name}"):
-            logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-            continue
+        # Fail if the tokenizer folder with config does not exist or there are other download issues previously
+        if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+            raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")
 
         try:
             logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
@@ -261,9 +258,8 @@ for model in [*pre_computed_hashes, *all_models]:
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
             else:
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-        except OSError as e:
-            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-            continue  # Skip to the next model if the tokenizer can't be loaded
+        except Exception as e:
+            raise OSError(f"Error loading tokenizer for model {name}.") from e
 
         chktok = tokenizer.encode(CHK_TXT)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
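
Two details in this last hunk are easy to miss: the handler widens from OSError to Exception (tokenizer loading can also raise e.g. ValueError on an unrecognized or broken config), and raise ... from e chains the original error onto the new OSError, so the traceback still shows the root cause. A minimal sketch of the pattern together with the hash step that follows it; CHK_TXT is shortened here for illustration, while the real script uses a long pre-tokenizer exercise string:

    from hashlib import sha256
    from transformers import AutoTokenizer

    CHK_TXT = "Hello World!"  # stand-in for the script's pre-tokenizer test string
    name = "llama-bpe"        # example tokenizer name

    try:
        tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
    except Exception as e:
        # "from e" preserves the original exception as __cause__, so the output
        # shows both the OSError and the underlying reason loading failed.
        raise OSError(f"Error loading tokenizer for model {name}.") from e

    chktok = tokenizer.encode(CHK_TXT)  # token ids for the check string
    chkhsh = sha256(str(chktok).encode()).hexdigest()  # fingerprint identifying the pre-tokenizer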