Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-06-27 03:55:20 +00:00)
server : add server parameters for draft model cache type (#13782)
Co-authored-by: aa956 <27946957+aa956@users.noreply.github.com>
@@ -1969,10 +1969,8 @@ struct server_context {
             params_dft.n_ctx        = params_base.speculative.n_ctx == 0 ? params_base.n_ctx / params_base.n_parallel : params_base.speculative.n_ctx;
             params_dft.n_gpu_layers = params_base.speculative.n_gpu_layers;
             params_dft.n_parallel   = 1;
-
-            // force F16 KV cache for the draft model for extra performance
-            params_dft.cache_type_k = GGML_TYPE_F16;
-            params_dft.cache_type_v = GGML_TYPE_F16;
+            params_dft.cache_type_k = params_base.speculative.cache_type_k;
+            params_dft.cache_type_v = params_base.speculative.cache_type_v;

             llama_init_dft = common_init_from_params(params_dft);
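The change stops hard-coding the draft model's KV cache to F16 and instead passes through whatever cache types the user configured in the speculative parameters. Below is a minimal, self-contained C++ sketch of the string-to-type mapping such server parameters would need; the parse_cache_type helper, the cache_type enum, and the flag names in the comments are illustrative assumptions, not code from llama.cpp.

// Minimal sketch, not llama.cpp code: map a user-facing cache-type
// string (e.g. from a hypothetical --cache-type-k-draft flag) to an
// enum standing in for ggml's tensor types.
#include <cstdio>
#include <stdexcept>
#include <string>

enum class cache_type { f16, q8_0, q4_0 }; // stand-in for ggml_type values

static cache_type parse_cache_type(const std::string & s) {
    if (s == "f16")  return cache_type::f16;
    if (s == "q8_0") return cache_type::q8_0;
    if (s == "q4_0") return cache_type::q4_0;
    throw std::invalid_argument("unsupported KV cache type: " + s);
}

int main() {
    // Before this commit the draft model's KV cache was forced to F16;
    // after it, the types below would come from the speculative params.
    const cache_type k = parse_cache_type("q8_0");
    const cache_type v = parse_cache_type("f16");
    std::printf("draft KV cache: k=%d v=%d\n", (int) k, (int) v);
    return 0;
}

A quantized draft cache trades memory for some draft accuracy, and because the target model verifies every draft token in speculative decoding, the choice affects only the acceptance rate, never the final output distribution.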