mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-28 20:25:20 +00:00
vocab : fix ugm tokenizer precision (#13743)
This commit is contained in:
@ -835,7 +835,7 @@ struct llm_tokenizer_ugm_session {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// initialize score_sum to -FLT_MAX so it will be always lower than sums of token scores
|
// initialize score_sum to -DBL_MAX so it will always be lower than sums of token scores
|
||||||
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -FLT_MAX});
|
std::vector<struct best_tokenization> tokenization_results(input_len + 1, {vocab.token_unk(), 0, -DBL_MAX});
|
||||||
// at the beginning tokenization score is zero
|
// at the beginning tokenization score is zero
|
||||||
tokenization_results[0] = { vocab.token_unk(), 0, 0 };
|
tokenization_results[0] = { vocab.token_unk(), 0, 0 };
|
||||||
|
|
||||||
@ -867,7 +867,7 @@ struct llm_tokenizer_ugm_session {
|
|||||||
const double challenger_score = current_best.score_sum + token_score;
|
const double challenger_score = current_best.score_sum + token_score;
|
||||||
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
||||||
if (challenger_score > current_champ.score_sum) {
|
if (challenger_score > current_champ.score_sum) {
|
||||||
struct best_tokenization challenger = { token_id, input_offset, (float) challenger_score };
|
struct best_tokenization challenger = { token_id, input_offset, challenger_score };
|
||||||
current_champ = challenger;
|
current_champ = challenger;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -881,7 +881,7 @@ struct llm_tokenizer_ugm_session {
|
|||||||
prefix_offset = input_offset + n_utf8_code_units;
|
prefix_offset = input_offset + n_utf8_code_units;
|
||||||
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
struct best_tokenization & current_champ = tokenization_results[prefix_offset];
|
||||||
if (challenger_score > current_champ.score_sum) {
|
if (challenger_score > current_champ.score_sum) {
|
||||||
struct best_tokenization challenger = { vocab.token_unk(), input_offset, (float) challenger_score };
|
struct best_tokenization challenger = { vocab.token_unk(), input_offset, challenger_score };
|
||||||
current_champ = challenger;
|
current_champ = challenger;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1007,7 +1007,7 @@ private:
|
|||||||
struct best_tokenization {
|
struct best_tokenization {
|
||||||
llama_token token_id;
|
llama_token token_id;
|
||||||
size_t input_offset;
|
size_t input_offset;
|
||||||
float score_sum;
|
double score_sum;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
|
struct normalization_result normalize_prefix(const std::string & input, size_t input_offset) {
|
||||||
|
Reference in New Issue
Block a user