convert : make hf token optional (#14717)
* make hf token optional

* fail if we can't get necessary tokenizer config
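In short: the update script no longer hard-fails when no Hugging Face token is available. It warns and proceeds anonymously, and instead raises a hard error later if a required tokenizer config could not be fetched. Judging by the hunk contexts, the file being patched is llama.cpp's convert_hf_to_gguf_update.py. A minimal sketch of the intended flow, with illustrative names rather than the script's actual structure:

import os

import requests


def read_cached_hf_token():
    # Hypothetical helper: mirrors the fallback the log message describes,
    # reading ~/.cache/huggingface/token and returning None when absent.
    path = os.path.expanduser("~/.cache/huggingface/token")
    try:
        with open(path, encoding="utf-8") as f:
            return f.read().strip() or None
    except OSError:
        return None


def fetch(url, token):
    # Without a token the Authorization header is omitted entirely;
    # public repos still work, gated ones fail via raise_for_status().
    headers = {"Authorization": f"Bearer {token}"} if token else None
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    return response.content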
@@ -7,7 +7,6 @@ import pathlib
 import re
 
 import requests
-import sys
 import json
 import shutil
 import argparse
@@ -69,8 +68,7 @@ args = parser.parse_args()
 hf_token = args.hf_token if args.hf_token is not None else hf_token
 
 if hf_token is None:
-    logger.error("HF token is required. Please provide it as an argument or set it in ~/.cache/huggingface/token")
-    sys.exit(1)
+    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
 
 # TODO: this string has to exercise as much pre-tokenizer functionality as possible
 # will be updated with time - contributions welcome
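A missing token is now only a warning, and since the removed sys.exit(1) call was the script's sole use of sys, the import at the top can go too (the first hunk). A self-contained sketch of the warn-and-continue pattern, assuming a module-level logger like the script's:

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert_hf_to_gguf_update")  # name assumed

hf_token = None  # e.g. no --hf-token argument and no cached token file
if hf_token is None:
    logger.warning("HF token not found. You can provide it as an argument or set it in ~/.cache/huggingface/token")
# execution simply continues: public tokenizers download anonymously,
# gated models fail later with a hard error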
@@ -151,7 +149,7 @@ pre_computed_hashes = [
 
 
 def download_file_with_auth(url, token, save_path):
-    headers = {"Authorization": f"Bearer {token}"}
+    headers = {"Authorization": f"Bearer {token}"} if token else None
    response = sess.get(url, headers=headers)
     response.raise_for_status()
     os.makedirs(os.path.dirname(save_path), exist_ok=True)
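This works because requests treats headers=None exactly like omitting the argument, so the anonymous path sends no Authorization header at all rather than a bogus "Bearer None". A quick illustration, with a placeholder token value:

import requests

sess = requests.Session()
for token in ("hf_xxx", None):  # "hf_xxx" is a placeholder, not a real token
    # headers=None merges to "no extra headers" inside requests
    headers = {"Authorization": f"Bearer {token}"} if token else None
    r = sess.get("https://huggingface.co/api/models", headers=headers)
    print(token, r.status_code)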
@@ -250,10 +248,9 @@ for model in [*pre_computed_hashes, *all_models]:
     else:
         # otherwise, compute the hash of the tokenizer
 
-        # Skip if the tokenizer folder does not exist or there are other download issues previously
-        if not os.path.exists(f"models/tokenizers/{name}"):
-            logger.warning(f"Directory for tokenizer {name} not found. Skipping...")
-            continue
+        # Fail if the tokenizer folder with config does not exist or there are other download issues previously
+        if not os.path.isfile(f"models/tokenizers/{name}/tokenizer_config.json"):
+            raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")
 
         try:
             logger.info(f"Loading tokenizer from {f'models/tokenizers/{name}'}...")
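Checking for tokenizer_config.json is stricter than checking the directory: a previously failed download can leave an empty models/tokenizers/<name> folder that os.path.exists() would wrongly accept, and raising instead of continue means a stale hash list can no longer be produced silently. A small sketch of the tightened check:

import os

name = "llama-bpe"  # illustrative entry; the script loops over its model list
config_path = f"models/tokenizers/{name}/tokenizer_config.json"
# isfile() on the config rejects empty or partially downloaded folders
# that a bare exists() check on the directory would let through
if not os.path.isfile(config_path):
    raise OSError(f"Config for tokenizer {name} not found. The model may not exist or is not accessible with the provided token.")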
@@ -261,9 +258,8 @@ for model in [*pre_computed_hashes, *all_models]:
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}", use_fast=False)
             else:
                 tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-        except OSError as e:
-            logger.error(f"Error loading tokenizer for model {name}. The model may not exist or is not accessible with the provided token. Error: {e}")
-            continue  # Skip to the next model if the tokenizer can't be loaded
+        except Exception as e:
+            raise OSError(f"Error loading tokenizer for model {name}.") from e
 
         chktok = tokenizer.encode(CHK_TXT)
         chkhsh = sha256(str(chktok).encode()).hexdigest()
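Catching the broader Exception and re-raising as OSError with "from e" replaces log-and-skip with a hard failure while preserving the root cause: chaining keeps the original traceback attached as __cause__. A runnable demonstration of the pattern:

def load_or_fail(name):
    try:
        raise ValueError("pretend AutoTokenizer.from_pretrained() failure")
    except Exception as e:
        # chained re-raise: the ValueError above stays visible as __cause__
        raise OSError(f"Error loading tokenizer for model {name}.") from e


try:
    load_or_fail("llama-bpe")  # illustrative model name
except OSError as err:
    print(err, "| caused by:", repr(err.__cause__))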