context : restore preemptive sched reset when LLAMA_SET_ROWS=0 (#14870)

ggml-ci
Author: Georgi Gerganov
Date: 2025-07-25 14:28:06 +03:00
Committed by: GitHub
Parent: 749e0d27f0
Commit: c1dbea752a

2 changed files with 17 additions and 1 deletion
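
With LLAMA_SET_ROWS=0 the KV-cache path that relies on ggml_set_rows() is disabled, and this commit restores the preemptive ggml_backend_sched_reset() at the end of encode()/decode() in that mode, so the CPU work done by the reset overlaps with computation still in flight on the device. To exercise the restored path from a test program, the variable must be set before the context is created, since the flag is read once in the constructor. A minimal example using POSIX setenv() (hypothetical test scaffolding, not part of the commit):

#include <cstdlib>

int main() {
    // Take the preemptive-reset branch: disable the ggml_set_rows() path.
    // Must run before the llama_context constructor reads the variable.
    setenv("LLAMA_SET_ROWS", "0", /*overwrite=*/1);

    // ... load the model, create the context, and decode as usual ...
    return 0;
}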

src/llama-context.cpp

@@ -105,7 +105,7 @@ llama_context::llama_context(
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -1229,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }
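
Both hunks follow the same pattern: launch the graph, perform the CPU-only scheduler reset while the device is still busy, and synchronize afterwards. A minimal sketch of that ordering using the public ggml-backend.h scheduler API; the wrapper function and variable names are assumptions, not llama.cpp code:

#include "ggml-backend.h"

// Sketch: sched holds the graph's backend assignments, gf is an already-built graph.
static void compute_then_reset(ggml_backend_sched_t sched, struct ggml_cgraph * gf,
                               bool supports_set_rows) {
    // start the computation; device backends run asynchronously
    ggml_backend_sched_graph_compute_async(sched, gf);

    if (!supports_set_rows) {
        // CPU-side bookkeeping only, overlapping with the in-flight device work
        ggml_backend_sched_reset(sched);
    }

    // block until all backends have finished (in llama.cpp this happens later,
    // when the caller actually reads the outputs)
    ggml_backend_sched_synchronize(sched);
}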

src/llama-context.h

@@ -287,6 +287,10 @@ private:
     bool has_evaluated_once = false;
 
+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us  = 0;
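
In the header, what was previously a constructor-local const bool becomes a class member, so the value read once from the environment can be consulted again in encode() and decode(). A self-contained sketch of that read-once, query-later pattern (the class name is hypothetical):

#include <cstdlib>

struct set_rows_flag {
    bool supports_set_rows = false; // mirrors the member added above

    set_rows_flag() {
        // read LLAMA_SET_ROWS once at construction; unset means disabled
        const char * env = std::getenv("LLAMA_SET_ROWS");
        supports_set_rows = env ? (std::atoi(env) != 0) : false;
    }

    // encode()/decode() consult the cached value on every call
    bool needs_preemptive_reset() const { return !supports_set_rows; }
};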