context : restore preemptive sched reset when LLAMA_SET_ROWS=0 (#14870)
ggml-ci
@@ -105,7 +105,7 @@ llama_context::llama_context(
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;

         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
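For readers unfamiliar with the gate, the logic above is a plain getenv/atoi pattern: an unset variable and LLAMA_SET_ROWS=0 both leave the flag false, and so does any value that does not parse as a non-zero integer. A minimal standalone sketch of the same idiom (not the llama.cpp source, just an illustration):

    #include <cstdio>
    #include <cstdlib>

    int main() {
        // Mirrors the gate above: unset, "0", or non-numeric -> false,
        // any non-zero integer -> true.
        const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;

        printf("supports_set_rows = %s\n", supports_set_rows ? "true" : "false");
        return 0;
    }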
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }

+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
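The comment in the restored block carries the rationale: ggml_backend_sched_reset() is host-side bookkeeping, so issuing it before the output is fetched (which synchronizes with the backend) lets that CPU work proceed while the device is still finishing the graph. A toy model of the ordering, with std::async standing in for the device and sleeps standing in for the two workloads (illustrative only, none of this is llama.cpp API):

    #include <chrono>
    #include <cstdio>
    #include <future>
    #include <thread>

    static void device_compute() {     // stand-in for the device finishing the graph
        std::this_thread::sleep_for(std::chrono::milliseconds(50));
    }

    static void sched_reset() {        // stand-in for the CPU-side scheduler reset
        std::this_thread::sleep_for(std::chrono::milliseconds(20));
    }

    int main() {
        const auto t0 = std::chrono::steady_clock::now();

        auto device = std::async(std::launch::async, device_compute);
        sched_reset();                 // runs while the "device" is still busy
        device.wait();                 // the sync point (fetching the output)

        const auto ms = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now() - t0).count();
        printf("elapsed ~%lld ms: ~50 rather than ~70, because the reset overlapped\n",
               (long long) ms);
        return 0;
    }

Performing the reset after the wait would serialize the two workloads; issuing it first is what the restored code buys.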
@@ -1229,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();

+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }
@@ -287,6 +287,10 @@ private:

     bool has_evaluated_once = false;

+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us = 0;
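As the new header comment notes, the member is a temporary escape hatch tied to the ggml_set_rows() migration tracked in PR #14285. To exercise the restored path, one would presumably run any tool that creates a llama_context with the variable cleared, along the lines of LLAMA_SET_ROWS=0 ./llama-cli ... (hypothetical invocation; the exact binary and flags are not specified by this commit).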