Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425)

* Update brute force test: add_special
* Update brute force test: default values for add_bos_token and add_eos_token
* Enable rtrim when pre-inserting BOS

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Revert "server : fix test regexes"
This commit is contained in:
jaime-m-p
2024-05-21 14:39:48 +02:00
committed by GitHub
parent 917dc8cfa6
commit d7e852c1bc
5 changed files with 28 additions and 23 deletions

View File

@@ -1749,7 +1749,7 @@ class Phi3MiniModel(Model):
token_id = int(token_id)
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert(tokens[token_id] == token)
assert tokens[token_id] == token
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1765,7 +1765,7 @@ class Phi3MiniModel(Model):
token_id = int(foken_data["id"])
token = foken_data["content"].encode("utf-8")
if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
assert(tokens[token_id] == token)
assert tokens[token_id] == token
tokens[token_id] = token
scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED