diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index a91d157e2..84f9ccab4 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -105,7 +105,7 @@ llama_context::llama_context(
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -1229,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }
 
diff --git a/src/llama-context.h b/src/llama-context.h
index fdbe61207..5c3a1c098 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -287,6 +287,10 @@ private:
     bool has_evaluated_once = false;
 
+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us = 0;
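
The diff promotes the LLAMA_SET_ROWS toggle from a constructor-local variable to a member so that encode() and decode() can consult it later, and, when the toggle is off, performs the scheduler reset before waiting on the backend so the CPU-side reset work overlaps with in-flight device computation. Below is a minimal standalone sketch of that pattern, not llama.cpp code: the Context type and the reset_scheduler()/sync_backend() helpers are hypothetical placeholders standing in for ggml_backend_sched_reset() and the implicit synchronization.

// sketch.cpp - illustrates parsing an env-var toggle once, storing it as a
// member, and using it to gate an early reset before the backend sync.
#include <cstdio>
#include <cstdlib>

struct Context {
    bool supports_set_rows = false;

    Context() {
        // Parse LLAMA_SET_ROWS once at construction; default to false when unset.
        const char * env = getenv("LLAMA_SET_ROWS");
        supports_set_rows = env ? (atoi(env) != 0) : false;
    }

    void decode() {
        // ... graph computation would be submitted to the device here ...

        if (!supports_set_rows) {
            // Reset CPU-side scheduler state before blocking on the device,
            // so this work overlaps with the device computation.
            reset_scheduler();
        }

        // Wait for the device (in llama.cpp this happens implicitly when the
        // model output is read back).
        sync_backend();
    }

    // Hypothetical stand-ins for ggml_backend_sched_reset() and the sync.
    void reset_scheduler() { printf("scheduler reset\n"); }
    void sync_backend()    { printf("backend synchronized\n"); }
};

int main() {
    Context ctx;
    ctx.decode();
    return 0;
}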