Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-06-26 19:55:04 +00:00)
speculative : do not discard the last drafted token
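The two hunks below reorder the draft loop in common_speculative_gen_draft: the p_min confidence check now runs only after the drafted token has been accepted by the sampler and appended to the result, so the last token, the one that falls below p_min, is returned to the caller instead of being thrown away. A third hunk relaxes the server's lower bound on speculative.n_min from 2 to 0 to match.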
@@ -252,11 +252,6 @@ llama_tokens common_speculative_gen_draft(
         // add drafted token for each sequence
         const llama_token id = cur_p->data[0].id;
 
-        // only collect very high-confidence draft tokens
-        if (cur_p->data[0].p < params.p_min) {
-            break;
-        }
-
         common_sampler_accept(smpl, id, true);
 
         result.push_back(id);
@@ -265,6 +260,11 @@ llama_tokens common_speculative_gen_draft(
             break;
         }
 
+        // only collect very high-confidence draft tokens
+        if (cur_p->data[0].p < params.p_min) {
+            break;
+        }
+
         common_batch_add(batch, id, n_past + i + 1, { 0 }, true);
 
         // evaluate the drafted tokens on the draft model
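For context, here is the draft loop as it reads after this change, pieced together from the two hunks above. The loop header and the sampling calls (common_batch_clear, common_sampler_sample, common_sampler_get_candidates) are not part of the diff and are reconstructed from the common sampler API, so treat this as a sketch of the control flow rather than the exact upstream source; the setup of smpl, ctx, batch, n_past and result is elided.

// Sketch of the reordered draft loop (assumed to live in common/speculative.cpp).
for (int i = 0; i < params.n_draft; ++i) {
    common_batch_clear(batch);

    common_sampler_sample(smpl, ctx, 0, true);

    const auto * cur_p = common_sampler_get_candidates(smpl);

    // add drafted token for each sequence
    const llama_token id = cur_p->data[0].id;

    // the sampled token is now accepted and recorded unconditionally ...
    common_sampler_accept(smpl, id, true);

    result.push_back(id);

    if (params.n_draft <= (int) result.size()) {
        break;
    }

    // ... and a sub-p_min token only stops further drafting; previously
    // this check ran first, so the token never reached the result
    if (cur_p->data[0].p < params.p_min) {
        break;
    }

    common_batch_add(batch, id, n_past + i + 1, { 0 }, true);

    // evaluate the drafted token on the draft model
    llama_decode(ctx, batch);
}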
@@ -274,7 +274,7 @@ struct server_task {
         params.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
 
         params.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
-        params.speculative.n_min = std::max(params.speculative.n_min, 2);
+        params.speculative.n_min = std::max(params.speculative.n_min, 0);
         params.speculative.n_max = std::max(params.speculative.n_max, 0);
 
         // Use OpenAI API logprobs only if n_probs wasn't provided