vocab : fix ugm tokenizer precision (#13743)

commit c3a2624339 (parent ffd0eae60b)
Author: Sigbjørn Skjæret
Date:   2025-05-24 12:29:09 +02:00
Committed by: GitHub

@@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session {
         }
         // initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
-        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
+        std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
         // at the beginning tokenization score is zero
         tokenization_results[0] = { vocab.token_unk(), 0, 0 };
@@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session {
                 const double challenger_score = current_best.score_sum + token_score;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { token_id, input_offset, challenger_score };
                     current_champ = challenger;
                 }
             }
@@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session {
                 prefix_offset = input_offset + n_utf8_code_units;
                 struct best_tokenization & current_champ = tokenization_results[prefix_offset];
                 if (challenger_score > current_champ.score_sum) {
-                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
+                    struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
                     current_champ = challenger;
                 }
             }
@@ -1007,7 +1007,7 @@ private:
     struct best_tokenization {
         llama_token token_id;
         size_t input_offset;
-        float score_sum;
+        double score_sum;
     };

     struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
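
For context, here is a minimal standalone sketch (illustrative only, not llama.cpp code; the path scores are made-up values) of the failure mode this commit fixes. The UGM tokenizer accumulates per-token log-probabilities into score_sum and compares candidate tokenizations by that sum; computing the sum in double but narrowing the stored champion back to float discards precision on every update, so two paths whose true scores differ below float's roughly 7 significant decimal digits become indistinguishable:

#include <cstdio>

int main() {
    // Two hypothetical accumulated path scores (sums of token log-probs)
    // that differ only far below float's precision at this magnitude.
    const double path_a = -12345.678901234;
    const double path_b = -12345.678901999;  // genuinely worse path

    // Old behavior: the running sum was narrowed to float when stored.
    const float a_f = (float) path_a;
    const float b_f = (float) path_b;

    std::printf("double compare, a > b: %d\n", path_a > path_b);  // prints 1
    std::printf("float  compare, a > b: %d\n", a_f > b_f);        // prints 0 (both round to the same float)
    return 0;
}

Keeping score_sum as double end to end, and initializing with -DBL_MAX rather than -FLT_MAX so the sentinel matches the wider type, preserves the comparison at the same precision the sum is computed in.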