llama : reuse compute graphs (#14482)

* llama : reuse compute graphs ggml-ci * llama-bench : add graph reuse parameter ggml-ci * cont : remove the parameter and the sched resets ggml-ci * graph : rename update() to can_reuse() ggml-ci * params : remove is_same() ggml-ci * graph : set res->params in llm_graph_context constructor ggml-ci * graph : avoid set_max_nodes in llm_graph_result ggml-ci * kv-cache : reuse llama_context's graph result instance ggml-ci * context : reset the previous graph result upon memory updates ggml-ci * batch : llama_ubatch now carries its data instead of pointing to balloc ggml-ci * merge : fix build ggml-ci * graph : fix can_reuse() checks when flash-attention is disabled * graph : move llm_graph_result impl in source file + debug env ggml-ci
2025-08-27 18:49:40 -04:00 · 2025-07-17 19:08:33 +03:00
parent 086cf81e88
commit 01612b7409
12 changed files with 548 additions and 289 deletions
--- a/src/llama-kv-cache-unified.h
+++ b/src/llama-kv-cache-unified.h
@@ -154,6 +154,9 @@ public:

    uint32_t get_n_kv() const;

+    // TODO: temporary
+    bool get_supports_set_rows() const;
+
    // get views of the current state of the cache
    ggml_tensor * get_k(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
    ggml_tensor * get_v(ggml_context * ctx, int32_t il, uint32_t n_kv, const slot_info & sinfo) const;
@@ -227,7 +230,7 @@ private:

    // env: LLAMA_SET_ROWS (temporary)
    // ref: https://github.com/ggml-org/llama.cpp/pull/14285
-    int supports_set_rows = false;
+    bool supports_set_rows = false;

    const llama_swa_type swa_type = LLAMA_SWA_TYPE_NONE;

@@ -270,15 +273,13 @@ private:
                          float   freq_base,
                          float   freq_scale) const;

-    llm_graph_result_ptr build_graph_shift(
-            const llama_cparams & cparams,
-                   ggml_context * ctx,
-                    ggml_cgraph * gf) const;
+    ggml_cgraph * build_graph_shift(
+               llm_graph_result * res,
+                  llama_context * lctx) const;

-    llm_graph_result_ptr build_graph_defrag(
-            const llama_cparams & cparams,
-                   ggml_context * ctx,
-                    ggml_cgraph * gf,
+    ggml_cgraph * build_graph_defrag(
+               llm_graph_result * res,
+                  llama_context * lctx,
              const defrag_info & dinfo) const;

    struct cell_ranges_t {
@@ -340,6 +341,9 @@ public:

    uint32_t get_n_kv() const;

+    // TODO: temporary
+    bool get_supports_set_rows() const;
+
    // get views of the current state of the cache
    ggml_tensor * get_k(ggml_context * ctx, int32_t il) const;
    ggml_tensor * get_v(ggml_context * ctx, int32_t il) const;