From 6378112cb5c91125f32bcf35e7f556ee6be40fb9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 23 Feb 2025 19:39:22 +0200
Subject: [PATCH] graph : remove the build_kv_... API from llama_graph_i

ggml-ci
---
 src/llama-context.cpp | 19 +++++++++++++++++++
 src/llama-context.h   | 47 ++++++++++++++++++++++++++++++++---------------
 src/llama-graph.cpp   | 18 ------------------
 src/llama-graph.h     |  9 ---------
 4 files changed, 50 insertions(+), 43 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index d98f4662c..5ad1e2a61 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1842,6 +1842,25 @@ ggml_tensor * llama_context::build_attn(
     return cur;
 }
 
+void llama_context::build_kv_self_shift(
+        ggml_context * ctx0,
+         ggml_cgraph * gf) {
+    GGML_UNUSED(ctx0);
+    GGML_UNUSED(gf);
+
+    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
+}
+
+void llama_context::build_kv_self_defrag(
+        ggml_context * ctx0,
+         ggml_cgraph * gf) {
+    GGML_UNUSED(ctx0);
+    GGML_UNUSED(gf);
+
+    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
+}
+
+
 //
 // perf
 //
diff --git a/src/llama-context.h b/src/llama-context.h
index 3e9baabfb..09c8f4842 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -171,7 +171,7 @@ protected:
     // graph
     //
 
-    // zero-out inputs and create the ctx_context for the compute graph
+    // zero-out inputs and create the ctx_compute for the compute graph
     virtual ggml_cgraph * graph_init();
 
     // TODO: add encode/decode graphs
@@ -187,73 +187,74 @@ protected:
 
     ggml_context_ptr ctx_compute;
 
+public:
     //
-    // graph build API (generic)
+    // graph build
     //
 
     virtual void build_cb(
              ggml_tensor * cur,
               const char * name,
       const llama_ubatch & ubatch,
-                     int   il);
+                     int   il) override;
 
     // apply control vector for layer il
     virtual ggml_tensor * build_cvec(
             ggml_context * ctx0,
              ggml_tensor * cur,
-                     int   il);
+                     int   il) override;
 
     // do mat_mul, while optionally apply lora
     virtual ggml_tensor * build_lora_mm(
             ggml_context * ctx0,
              ggml_tensor * w,
-             ggml_tensor * cur);
+             ggml_tensor * cur) override;
 
     // do mat_mul_id, while optionally apply lora
     virtual ggml_tensor * build_lora_mm_id(
             ggml_context * ctx0,
              ggml_tensor * w,   // struct ggml_tensor * as
              ggml_tensor * cur, // struct ggml_tensor * b
-             ggml_tensor * ids);
+             ggml_tensor * ids) override;
 
-    virtual ggml_tensor * build_rope_factors(int il);
+    virtual ggml_tensor * build_rope_factors(int il) override;
 
     virtual ggml_tensor * build_rope_shift(
             ggml_context * ctx0,
              ggml_tensor * cur,
              ggml_tensor * shift,
              ggml_tensor * factors,
-     ggml_backend_buffer * bbuf);
+     ggml_backend_buffer * bbuf) override;
 
     virtual ggml_tensor * build_inp_embd(
             ggml_context * ctx0,
              ggml_tensor * tok_embd,
-      const llama_ubatch & ubatch);
+      const llama_ubatch & ubatch) override;
 
     virtual ggml_tensor * build_inp_pos(
             ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
 
     virtual ggml_tensor * build_inp_pos_bucket(
             ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
 
     virtual ggml_tensor * build_inp_out_ids(
-            ggml_context * ctx0);
+            ggml_context * ctx0) override;
 
     virtual ggml_tensor * build_inp_mean(
             ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
 
     virtual ggml_tensor * build_inp_cls(
             ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
 
     virtual void build_attn_inp(
             ggml_context * ctx0,
                  int32_t   n_tokens,
                     bool   causal,
-                    bool   swa);
+                    bool   swa) override;
 
     virtual ggml_tensor * build_attn(
             ggml_context * ctx0,
@@ -266,7 +267,17 @@ protected:
              ggml_tensor * kq_b,
                  int32_t   n_tokens,
                    float   kq_scale,
-                     int   il);
+                     int   il) override;
+
+protected:
+    virtual void build_kv_self_shift(
+            ggml_context * ctx0,
+             ggml_cgraph * gf);
+
+    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
+    virtual void build_kv_self_defrag(
+            ggml_context * ctx0,
+             ggml_cgraph * gf);
 
 public:
     //
@@ -434,6 +445,7 @@ protected:
 
     virtual ggml_cgraph * graph_init() override;
 
+public:
     //
     // graph build
     //
@@ -463,6 +475,7 @@
                    float   kq_scale,
                      int   il) override;
 
+protected:
     virtual void build_kv_self_shift(
             ggml_context * ctx0,
              ggml_cgraph * gf) override;
@@ -548,6 +561,7 @@ protected:
 
     virtual ggml_cgraph * graph_init() override;
 
+public:
     //
     // graph build
    //
@@ -600,6 +614,7 @@
       const llama_ubatch & ubatch,
                      int   il) override;
 
+protected:
     //
     // state save/load
     //
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 3ac96908d..25922260d 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -32,24 +32,6 @@ ggml_tensor * llama_graph_i::build_attn(
     return nullptr;
 }
 
-void llama_graph_i::build_kv_self_shift(
-        ggml_context * ctx0,
-         ggml_cgraph * gf) {
-    GGML_UNUSED(ctx0);
-    GGML_UNUSED(gf);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-}
-
-void llama_graph_i::build_kv_self_defrag(
-        ggml_context * ctx0,
-         ggml_cgraph * gf) {
-    GGML_UNUSED(ctx0);
-    GGML_UNUSED(gf);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-}
-
 ggml_tensor * llama_graph_i::build_inp_self_k_shift(
         ggml_context * ctx0) {
     GGML_UNUSED(ctx0);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5df90e76d..3433caf63 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -117,15 +117,6 @@ public:
                    float   kq_scale,
                      int   il);
 
-    virtual void build_kv_self_shift(
-            ggml_context * ctx0,
-             ggml_cgraph * gf);
-
-    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    virtual void build_kv_self_defrag(
-            ggml_context * ctx0,
-             ggml_cgraph * gf);
-
     virtual ggml_tensor * build_inp_self_k_shift(
             ggml_context * ctx0);
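
The overall shape of the change: the two build_kv_self_* virtuals leave the abstract llama_graph_i interface (declared in src/llama-graph.h, stubbed in src/llama-graph.cpp) and become protected virtuals on llama_context, which now owns the "not implemented" logging stubs; the KV-cache-aware context keeps overriding them. A minimal compilable sketch of the pattern, with hypothetical class names standing in for the real ones:

#include <cstdio>

// Abstract graph interface: after this patch it no longer declares
// the KV-cache graph builders.
struct graph_i {
    virtual ~graph_i() = default;
};

// Base context: now owns the KV builders, with default stubs that log
// instead of asserting, mirroring llama_context::build_kv_self_shift/defrag.
struct context_base : graph_i {
protected:
    virtual void build_kv_self_shift() {
        std::fprintf(stderr, "%s: not implemented\n", __func__);
    }
    virtual void build_kv_self_defrag() {
        std::fprintf(stderr, "%s: not implemented\n", __func__);
    }
};

// KV-cache-aware context: overrides the stubs with real graph building.
struct context_kv_self : context_base {
protected:
    void build_kv_self_shift()  override { /* build the K-shift graph */ }
    void build_kv_self_defrag() override { /* build the defrag graph */ }
};

The effect is that callers can no longer reach the KV-cache builders through the generic graph interface; they become an implementation detail of the contexts that actually own a cache, which is what the public:/protected: shuffling in llama-context.h enforces.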
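
The comment kept on build_kv_self_defrag states the compaction policy: find holes from the beginning of the cache and fill them by moving data from the end. A hedged sketch of just that policy on a toy cell array (hypothetical kv_cell type; the real defrag moves K/V cache data via the compute graph):

#include <vector>

// Toy stand-in for one KV cache slot (illustration only).
struct kv_cell {
    bool used = false;
};

// Fill holes from the front by moving occupied cells in from the back,
// so the used cells end up as one contiguous prefix.
static void defrag(std::vector<kv_cell> & cells) {
    size_t head = 0;            // first candidate hole
    size_t tail = cells.size(); // one past the last candidate source
    while (head < tail) {
        if (cells[head].used) {
            ++head;
            continue;
        }
        // skip free cells at the end of the cache
        while (tail > head && !cells[tail - 1].used) {
            --tail;
        }
        if (tail == head) {
            break; // nothing left to move
        }
        cells[head] = cells[--tail]; // move data from the end into the hole
        cells[tail] = kv_cell{};     // the vacated slot becomes free
        ++head;
    }
}

int main() {
    std::vector<kv_cell> cells(8);
    cells[1].used = cells[2].used = cells[6].used = true;
    defrag(cells); // now cells[0..2] are used, the rest are free
}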
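
On the build_lora_mm/build_lora_mm_id declarations: "do mat_mul, while optionally apply lora" refers to the standard low-rank-adapter formulation y = W·x + scale · B·(A·x). A plain sketch of that arithmetic (naive dense math for illustration, nothing from ggml):

#include <vector>

using vec = std::vector<float>;
using mat = std::vector<vec>; // row-major: M[row][col]

static vec matvec(const mat & M, const vec & x) {
    vec y(M.size(), 0.0f);
    for (size_t r = 0; r < M.size(); ++r)
        for (size_t c = 0; c < x.size(); ++c)
            y[r] += M[r][c] * x[c];
    return y;
}

// mat_mul with an optional LoRA correction: if the (A, B) adapter pair is
// present, add scale * B*(A*x) on top of the base product W*x.
static vec lora_mm(const mat & W, const vec & x,
                   const mat * A, const mat * B, float scale) {
    vec y = matvec(W, x);
    if (A && B) {
        const vec ax  = matvec(*A, x);  // down-project to the adapter rank
        const vec bax = matvec(*B, ax); // up-project back to the output dim
        for (size_t i = 0; i < y.size(); ++i) {
            y[i] += scale * bax[i];
        }
    }
    return y;
}

int main() {
    const mat W = {{1, 0}, {0, 1}};  // 2x2 base weight (identity)
    const mat A = {{1, 1}};          // rank-1 down-projection (1x2)
    const mat B = {{0.5f}, {0.5f}};  // rank-1 up-projection (2x1)
    const vec x = {2, 3};
    const vec y = lora_mm(W, x, &A, &B, 1.0f); // y = {4.5, 5.5}
}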