convert : avoid calls to tokenizer.added_tokens_decoder (#12473)
tokenizer.added_tokens_decoder builds and returns a fresh dict on every call, which is relatively slow (~0.04 s per call on average) and results in massive slowdowns when a model has a huge number of added tokens. Caching the dict in a local variable before the vocabulary loop avoids the repeated rebuilds.
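For context, a minimal timing sketch (not part of the commit) of why caching the property pays off. It assumes any HuggingFace tokenizer with added tokens; the checkpoint name is a hypothetical placeholder.

import time
from transformers import AutoTokenizer

# "some/model" is a placeholder; use any checkpoint with many added tokens.
tokenizer = AutoTokenizer.from_pretrained("some/model")

# Slow: added_tokens_decoder is a property that rebuilds a fresh dict
# on every access, so each lookup pays the full construction cost.
start = time.perf_counter()
for _ in range(1000):
    _ = tokenizer.added_tokens_decoder
print(f"property access x1000: {time.perf_counter() - start:.2f}s")

# Fast: fetch the dict once and reuse the local reference, as the
# hunks below do before the per-token loop.
start = time.perf_counter()
added_tokens_decoder = tokenizer.added_tokens_decoder
for _ in range(1000):
    _ = added_tokens_decoder
print(f"cached reference x1000: {time.perf_counter() - start:.2f}s")

At roughly 0.04 s per property access, a conversion that touches the decoder once per added token saves on the order of seconds to minutes for large added vocabularies.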
@@ -529,6 +529,8 @@ class Model:
         reverse_vocab = {id_: encoded_tok for encoded_tok, id_ in tokenizer.vocab.items()}
         added_vocab = tokenizer.get_added_vocab()
 
+        added_tokens_decoder = tokenizer.added_tokens_decoder
+
         for i in range(vocab_size):
             if i not in reverse_vocab:
                 tokens.append(f"[PAD{i}]")
@@ -538,13 +540,13 @@ class Model:
             if token in added_vocab:
                 # The tokenizer in llama.cpp assumes the CONTROL and USER_DEFINED tokens are pre-normalized.
                 # To avoid unexpected issues - we make sure to normalize non-normalized tokens
-                if not tokenizer.added_tokens_decoder[i].normalized:
+                if not added_tokens_decoder[i].normalized:
                     previous_token = token
                     token = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
                     if previous_token != token:
                         logger.info(f"{repr(previous_token)} is encoded and decoded back to {repr(token)} using AutoTokenizer")
 
-                if tokenizer.added_tokens_decoder[i].special or self.does_token_look_special(token):
+                if added_tokens_decoder[i].special or self.does_token_look_special(token):
                     toktypes.append(gguf.TokenType.CONTROL)
                 else:
                     # NOTE: this was added for Gemma.
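As a side note, the normalize-by-round-trip trick in the second hunk can be exercised on its own. A minimal sketch, assuming a hypothetical checkpoint and added token:

from transformers import AutoTokenizer

# Placeholder checkpoint; any tokenizer whose normalizer rewrites text works.
tokenizer = AutoTokenizer.from_pretrained("some/model")

token = "<my_added_token>"  # hypothetical added token with normalized=False
# Encoding and then decoding the text runs it through the tokenizer's own
# normalizer, producing the pre-normalized form llama.cpp expects.
normalized = tokenizer.decode(tokenizer.encode(token, add_special_tokens=False))
if normalized != token:
    print(f"{token!r} is encoded and decoded back to {normalized!r}")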