mirror of
https://github.com/ggml-org/llama.cpp.git
llama : remove logits_all flag + reorder llama_context_params
ggml-ci
@@ -1096,7 +1096,6 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.n_threads         = params.cpuparams.n_threads;
     cparams.n_threads_batch   = params.cpuparams_batch.n_threads == -1 ?
                                 params.cpuparams.n_threads : params.cpuparams_batch.n_threads;
-    cparams.logits_all        = false;
     cparams.embeddings        = params.embedding;
     cparams.rope_scaling_type = params.rope_scaling_type;
     cparams.rope_freq_base    = params.rope_freq_base;
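With logits_all removed from llama_context_params, callers that relied on common_context_params_to_llama() setting it now request logits per position through llama_batch.logits, as the deprecated comment in the next hunk suggests. A minimal sketch of that pattern, assuming an existing llama_context, a token array, and the llama_batch helpers from the current C API:

    #include "llama.h"

    // Sketch: request logits for every position of a prompt (the old logits_all behavior),
    // now expressed per token via llama_batch.logits.
    static int decode_with_all_logits(struct llama_context * ctx, const llama_token * tokens, int32_t n_tokens) {
        struct llama_batch batch = llama_batch_init(n_tokens, /*embd*/ 0, /*n_seq_max*/ 1);

        for (int32_t i = 0; i < n_tokens; ++i) {
            batch.token   [i]    = tokens[i];
            batch.pos     [i]    = i;
            batch.n_seq_id[i]    = 1;
            batch.seq_id  [i][0] = 0;
            batch.logits  [i]    = true; // per-token request replaces cparams.logits_all
        }
        batch.n_tokens = n_tokens;

        const int ret = llama_decode(ctx, batch);
        if (ret == 0) {
            // logits for any flagged position i are available via llama_get_logits_ith(ctx, i)
            const float * last = llama_get_logits_ith(ctx, n_tokens - 1);
            (void) last;
        }

        llama_batch_free(batch);
        return ret;
    }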
@@ -351,19 +351,17 @@ extern "C" {
         enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
         enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]

-        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
-        // TODO: move at the end of the struct
-        bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
-        bool embeddings;  // if true, extract embeddings (together with logits)
-        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        bool no_perf;     // whether to measure performance timings
-
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted
         // currently works only with CPU execution
         ggml_abort_callback abort_callback;
         void *              abort_callback_data;
+
+        // Keep the booleans together and at the end of the struct to avoid misalignment during copy-by-value.
+        bool embeddings;  // if true, extract embeddings (together with logits)
+        bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+        bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
+        bool no_perf;     // whether to measure performance timings
     };

     // model quantization parameters
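The relocated comment about keeping the booleans together at the end of the struct concerns layout stability when llama_context_params is copied by value across the C API boundary. A hypothetical compile-time check of the new ordering (not part of this patch; struct and field names as in llama.h):

    #include <stddef.h>

    #include "llama.h"

    // Hypothetical layout check, not in the patch: after the reorder the bool block
    // trails the callback members, so this would fail if a later change moved it back.
    _Static_assert(offsetof(struct llama_context_params, embeddings) >
                   offsetof(struct llama_context_params, abort_callback_data),
                   "booleans are expected to follow abort_callback_data");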
@@ -1851,13 +1851,12 @@ llama_context_params llama_context_default_params() {
         /*.cb_eval_user_data   =*/ nullptr,
         /*.type_k              =*/ GGML_TYPE_F16,
         /*.type_v              =*/ GGML_TYPE_F16,
-        /*.logits_all          =*/ false,
+        /*.abort_callback      =*/ nullptr,
+        /*.abort_callback_data =*/ nullptr,
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
         /*.no_perf             =*/ true,
-        /*.abort_callback      =*/ nullptr,
-        /*.abort_callback_data =*/ nullptr,
     };

     return result;
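Callers keep the same pattern with the reordered defaults: start from llama_context_default_params() and override fields before creating the context. A brief sketch, assuming a loaded model; llama_init_from_model is assumed as the context-creation entry point and is not part of this diff:

    #include "llama.h"

    // Sketch: there is no logits_all field to set anymore; everything else is unchanged.
    static struct llama_context * make_ctx(struct llama_model * model) {
        struct llama_context_params cparams = llama_context_default_params();
        cparams.embeddings = false;   // same as the default above
        cparams.no_perf    = false;   // override the default of true if timings are wanted

        return llama_init_from_model(model, cparams); // assumed context-creation entry point
    }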