mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-28 12:25:03 +00:00
speculative : update default params
This commit is contained in:
@@ -178,10 +178,10 @@ struct common_params_speculative {
|
|||||||
|
|
||||||
int32_t n_ctx = 0; // draft context size
|
int32_t n_ctx = 0; // draft context size
|
||||||
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
|
||||||
int32_t n_min = 5; // minimum number of draft tokens to use for speculative decoding
|
int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding
|
||||||
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
|
||||||
float p_split = 0.1f; // speculative decoding split probability
|
float p_split = 0.1f; // speculative decoding split probability
|
||||||
float p_min = 0.9f; // minimum speculative decoding probability (greedy)
|
float p_min = 0.75f; // minimum speculative decoding probability (greedy)
|
||||||
|
|
||||||
struct cpu_params cpuparams;
|
struct cpu_params cpuparams;
|
||||||
struct cpu_params cpuparams_batch;
|
struct cpu_params cpuparams_batch;
|
||||||
|
@@ -9,7 +9,7 @@ struct common_speculative_params {
|
|||||||
int n_draft = 16; // max drafted tokens
|
int n_draft = 16; // max drafted tokens
|
||||||
int n_reuse = 256;
|
int n_reuse = 256;
|
||||||
|
|
||||||
float p_min = 0.9f; // min probability required to accept a token in the draft
|
float p_min = 0.75f; // min probability required to accept a token in the draft
|
||||||
};
|
};
|
||||||
|
|
||||||
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
|
struct common_speculative * common_speculative_init(struct llama_context * ctx_dft);
|
||||||
|
Reference in New Issue
Block a user