mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-06-28 20:25:20 +00:00)
server : fix speculative decoding with context shift
ggml-ci
@@ -2325,7 +2325,7 @@ struct server_context {
                 llama_token id = slot.sampled;
 
                 struct common_speculative_params params_spec;
-                params_spec.n_draft = slot.params.speculative.n_max;
+                params_spec.n_draft = std::min(slot.params.speculative.n_max, slot.n_ctx - slot.n_past - 1);
                 params_spec.n_reuse = llama_n_ctx(slot.ctx_dft) - slot.params.speculative.n_max;
                 params_spec.p_min = slot.params.speculative.p_min;
 
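The change caps the requested draft length so that the already-used context (n_past), the next sampled token, and the draft tokens together never exceed the slot's context size; previously a draft of up to speculative.n_max tokens could overflow the window after a context shift. Below is a minimal standalone sketch of that clamp. The SlotSketch struct and clamp_n_draft helper are illustrative stand-ins for the real server slot state, not llama.cpp API.

#include <algorithm>
#include <cstdio>

// Simplified stand-in for the server slot fields used by the fix.
struct SlotSketch {
    int n_ctx;        // context size available to this slot
    int n_past;       // tokens already stored in the context
    int n_draft_max;  // configured slot.params.speculative.n_max
};

// Cap the draft so that n_past + 1 (the sampled token) + n_draft
// never exceeds the slot's context size, mirroring the std::min in the diff.
static int clamp_n_draft(const SlotSketch & slot) {
    return std::min(slot.n_draft_max, slot.n_ctx - slot.n_past - 1);
}

int main() {
    // Near the end of the context the draft shrinks instead of overflowing:
    // min(16, 4096 - 4090 - 1) = 5.
    SlotSketch slot = { /*n_ctx=*/4096, /*n_past=*/4090, /*n_draft_max=*/16 };
    printf("n_draft = %d\n", clamp_n_draft(slot)); // prints 5, not 16
    return 0;
}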