context : restore preemptive sched reset when LLAMA_SET_ROWS=0 (#14870)

ggml-ci
Author: Georgi Gerganov
Date: 2025-07-25 14:28:06 +03:00
Committed by: GitHub
Parent: 749e0d27f0
Commit: c1dbea752a

2 changed files with 17 additions and 1 deletion
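
With LLAMA_SET_ROWS=0 the KV-cache path that relies on ggml_set_rows() is disabled, and this commit restores the preemptive ggml_backend_sched_reset() at the end of encode()/decode() in that mode, so the CPU work done by the reset overlaps with computation still in flight on the device. To exercise the restored path from a test program, the variable must be set before the context is created, since the flag is read once in the constructor. A minimal example using POSIX setenv() (hypothetical test scaffolding, not part of the commit):

#include <cstdlib>

int main() {
    // Take the preemptive-reset branch: disable the ggml_set_rows() path.
    // Must run before the llama_context constructor reads the variable.
    setenv("LLAMA_SET_ROWS", "0", /*overwrite=*/1);

    // ... load the model, create the context, and decode as usual ...
    return 0;
}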

src/llama-context.cpp

@@ -105,7 +105,7 @@ llama_context::llama_context(
     {
         const char * LLAMA_SET_ROWS = getenv("LLAMA_SET_ROWS");
-        const bool supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
+        supports_set_rows = LLAMA_SET_ROWS ? (atoi(LLAMA_SET_ROWS) != 0) : false;
 
         if (!supports_set_rows && !cparams.kv_unified) {
             LLAMA_LOG_WARN("%s: non-unified KV cache requires ggml_set_rows() - forcing unified KV cache\n", __func__);
@@ -899,6 +899,12 @@ int llama_context::encode(const llama_batch & batch_inp) {
         }
     }
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     // TODO: hacky solution
     if (model.arch == LLM_ARCH_T5 && t_embd) {
         //cross.t_embd = t_embd;
@@ -1229,6 +1235,12 @@ int llama_context::decode(const llama_batch & batch_inp) {
     // wait for the computation to finish (automatically done when obtaining the model output)
     //synchronize();
 
+    if (!supports_set_rows) {
+        // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
+        // overlap with device computation.
+        ggml_backend_sched_reset(sched.get());
+    }
+
     return 0;
 }
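
Both hunks follow the same pattern: launch the graph, perform the CPU-only scheduler reset while the device is still busy, and synchronize afterwards. A minimal sketch of that ordering using the public ggml-backend.h scheduler API; the wrapper function and variable names are assumptions, not llama.cpp code:

#include "ggml-backend.h"

// Sketch: sched holds the graph's backend assignments, gf is an already-built graph.
static void compute_then_reset(ggml_backend_sched_t sched, struct ggml_cgraph * gf,
                               bool supports_set_rows) {
    // start the computation; device backends run asynchronously
    ggml_backend_sched_graph_compute_async(sched, gf);

    if (!supports_set_rows) {
        // CPU-side bookkeeping only, overlapping with the in-flight device work
        ggml_backend_sched_reset(sched);
    }

    // block until all backends have finished (in llama.cpp this happens later,
    // when the caller actually reads the outputs)
    ggml_backend_sched_synchronize(sched);
}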

src/llama-context.h

@@ -287,6 +287,10 @@ private:
     bool has_evaluated_once = false;
 
+    // env: LLAMA_SET_ROWS (temporary)
+    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
+    bool supports_set_rows = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us  = 0;
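
In the header, what was previously a constructor-local const bool becomes a class member, so the value read once from the environment can be consulted again in encode() and decode(). A self-contained sketch of that read-once, query-later pattern (the class name is hypothetical):

#include <cstdlib>

struct set_rows_flag {
    bool supports_set_rows = false; // mirrors the member added above

    set_rows_flag() {
        // read LLAMA_SET_ROWS once at construction; unset means disabled
        const char * env = std::getenv("LLAMA_SET_ROWS");
        supports_set_rows = env ? (std::atoi(env) != 0) : false;
    }

    // encode()/decode() consult the cached value on every call
    bool needs_preemptive_reset() const { return !supports_set_rows; }
};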