From 6378112cb5c91125f32bcf35e7f556ee6be40fb9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Sun, 23 Feb 2025 19:39:22 +0200
Subject: [PATCH] graph : remove the build_kv_... API from llama_graph_i

ggml-ci
---
 src/llama-context.cpp | 19 +++++++++++++++++++
 src/llama-context.h   | 47 ++++++++++++++++++++++++++++++++---------------
 src/llama-graph.cpp   | 18 ------------------
 src/llama-graph.h     |  9 ---------
 4 files changed, 50 insertions(+), 43 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index d98f4662c..5ad1e2a61 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -1842,6 +1842,25 @@ ggml_tensor * llama_context::build_attn(
     return cur;
 }
 
+void llama_context::build_kv_self_shift(
+        ggml_context * ctx0,
+         ggml_cgraph * gf) {
+    GGML_UNUSED(ctx0);
+    GGML_UNUSED(gf);
+
+    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
+}
+
+void llama_context::build_kv_self_defrag(
+        ggml_context * ctx0,
+         ggml_cgraph * gf) {
+    GGML_UNUSED(ctx0);
+    GGML_UNUSED(gf);
+
+    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
+}
+
+
 //
 // perf
 //
diff --git a/src/llama-context.h b/src/llama-context.h
index 3e9baabfb..09c8f4842 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -171,7 +171,7 @@ protected:
     // graph
     //
 
-    // zero-out inputs and create the ctx_context for the compute graph
+    // zero-out inputs and create the ctx_compute for the compute graph
     virtual ggml_cgraph * graph_init();
 
     // TODO: add encode/decode graphs
@@ -187,73 +187,74 @@ protected:
 
     ggml_context_ptr ctx_compute;
 
+public:
     //
-    // graph build API (generic)
+    // graph build
     //
 
     virtual void build_cb(
              ggml_tensor * cur,
               const char * name,
       const llama_ubatch & ubatch,
-                     int   il);
+                     int   il) override;
 
     // apply control vector for layer il
     virtual ggml_tensor * build_cvec(
             ggml_context * ctx0,
              ggml_tensor * cur,
-                     int   il);
+                     int   il) override;
 
     // do mat_mul, while optionally apply lora
     virtual ggml_tensor * build_lora_mm(
             ggml_context * ctx0,
              ggml_tensor * w,
-             ggml_tensor * cur);
+             ggml_tensor * cur) override;
 
     // do mat_mul_id, while optionally apply lora
     virtual ggml_tensor * build_lora_mm_id(
             ggml_context * ctx0,
              ggml_tensor * w,   // struct ggml_tensor * as
              ggml_tensor * cur, // struct ggml_tensor * b
-             ggml_tensor * ids);
+             ggml_tensor * ids) override;
 
-    virtual ggml_tensor * build_rope_factors(int il);
+    virtual ggml_tensor * build_rope_factors(int il) override;
 
     virtual ggml_tensor * build_rope_shift(
             ggml_context * ctx0,
              ggml_tensor * cur,
              ggml_tensor * shift,
              ggml_tensor * factors,
-     ggml_backend_buffer * bbuf);
+     ggml_backend_buffer * bbuf) override;
 
     virtual ggml_tensor * build_inp_embd(
             ggml_context * ctx0,
              ggml_tensor * tok_embd,
-      const llama_ubatch & ubatch);
+      const llama_ubatch & ubatch) override;
 
     virtual ggml_tensor * build_inp_pos(
             ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
 
     virtual ggml_tensor * build_inp_pos_bucket(
             ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
 
     virtual ggml_tensor * build_inp_out_ids(
-            ggml_context * ctx0);
+            ggml_context * ctx0) override;
 
     virtual ggml_tensor * build_inp_mean(
             ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
 
     virtual ggml_tensor * build_inp_cls(
             ggml_context * ctx0,
-                 int32_t   n_tokens);
+                 int32_t   n_tokens) override;
 
     virtual void build_attn_inp(
             ggml_context * ctx0,
                  int32_t   n_tokens,
                     bool   causal,
-                    bool   swa);
+                    bool   swa) override;
 
     virtual ggml_tensor * build_attn(
             ggml_context * ctx0,
@@ -266,7 +267,17 @@ protected:
              ggml_tensor * kq_b,
                  int32_t   n_tokens,
                    float   kq_scale,
-                     int   il);
+                     int   il) override;
+
+protected:
+    virtual void build_kv_self_shift(
+            ggml_context * ctx0,
+             ggml_cgraph * gf);
+
+    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
+    virtual void build_kv_self_defrag(
+            ggml_context * ctx0,
+             ggml_cgraph * gf);
 
 public:
     //
@@ -434,6 +445,7 @@ protected:
 
     virtual ggml_cgraph * graph_init() override;
 
+public:
     //
     // graph build
     //
@@ -463,6 +475,7 @@
                    float   kq_scale,
                      int   il) override;
 
+protected:
     virtual void build_kv_self_shift(
             ggml_context * ctx0,
              ggml_cgraph * gf) override;
@@ -548,6 +561,7 @@ protected:
 
     virtual ggml_cgraph * graph_init() override;
 
+public:
     //
     // graph build
    //
@@ -600,6 +614,7 @@
       const llama_ubatch & ubatch,
                      int   il) override;
 
+protected:
     //
     // state save/load
     //
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 3ac96908d..25922260d 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -32,24 +32,6 @@ ggml_tensor * llama_graph_i::build_attn(
     return nullptr;
 }
 
-void llama_graph_i::build_kv_self_shift(
-        ggml_context * ctx0,
-         ggml_cgraph * gf) {
-    GGML_UNUSED(ctx0);
-    GGML_UNUSED(gf);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-}
-
-void llama_graph_i::build_kv_self_defrag(
-        ggml_context * ctx0,
-         ggml_cgraph * gf) {
-    GGML_UNUSED(ctx0);
-    GGML_UNUSED(gf);
-
-    LLAMA_LOG_ERROR("%s: not implemented\n", __func__);
-}
-
 ggml_tensor * llama_graph_i::build_inp_self_k_shift(
         ggml_context * ctx0) {
     GGML_UNUSED(ctx0);
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5df90e76d..3433caf63 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -117,15 +117,6 @@ public:
                    float   kq_scale,
                      int   il);
 
-    virtual void build_kv_self_shift(
-            ggml_context * ctx0,
-             ggml_cgraph * gf);
-
-    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    virtual void build_kv_self_defrag(
-            ggml_context * ctx0,
-             ggml_cgraph * gf);
-
     virtual ggml_tensor * build_inp_self_k_shift(
             ggml_context * ctx0);
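
The overall shape of the change: the two build_kv_self_* virtuals leave the abstract llama_graph_i interface (declared in src/llama-graph.h, stubbed in src/llama-graph.cpp) and become protected virtuals on llama_context, which now owns the "not implemented" logging stubs; the KV-cache-aware context keeps overriding them. A minimal compilable sketch of the pattern, with hypothetical class names standing in for the real ones:

#include <cstdio>

// Abstract graph interface: after this patch it no longer declares
// the KV-cache graph builders.
struct graph_i {
    virtual ~graph_i() = default;
};

// Base context: now owns the KV builders, with default stubs that log
// instead of asserting, mirroring llama_context::build_kv_self_shift/defrag.
struct context_base : graph_i {
protected:
    virtual void build_kv_self_shift() {
        std::fprintf(stderr, "%s: not implemented\n", __func__);
    }
    virtual void build_kv_self_defrag() {
        std::fprintf(stderr, "%s: not implemented\n", __func__);
    }
};

// KV-cache-aware context: overrides the stubs with real graph building.
struct context_kv_self : context_base {
protected:
    void build_kv_self_shift()  override { /* build the K-shift graph */ }
    void build_kv_self_defrag() override { /* build the defrag graph */ }
};

The effect is that callers can no longer reach the KV-cache builders through the generic graph interface; they become an implementation detail of the contexts that actually own a cache, which is what the public:/protected: shuffling in llama-context.h enforces.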
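
The comment kept on build_kv_self_defrag states the compaction policy: find holes from the beginning of the cache and fill them by moving data from the end. A hedged sketch of just that policy on a toy cell array (hypothetical kv_cell type; the real defrag moves K/V cache data via the compute graph):

#include <vector>

// Toy stand-in for one KV cache slot (illustration only).
struct kv_cell {
    bool used = false;
};

// Fill holes from the front by moving occupied cells in from the back,
// so the used cells end up as one contiguous prefix.
static void defrag(std::vector<kv_cell> & cells) {
    size_t head = 0;            // first candidate hole
    size_t tail = cells.size(); // one past the last candidate source
    while (head < tail) {
        if (cells[head].used) {
            ++head;
            continue;
        }
        // skip free cells at the end of the cache
        while (tail > head && !cells[tail - 1].used) {
            --tail;
        }
        if (tail == head) {
            break; // nothing left to move
        }
        cells[head] = cells[--tail]; // move data from the end into the hole
        cells[tail] = kv_cell{};     // the vacated slot becomes free
        ++head;
    }
}

int main() {
    std::vector<kv_cell> cells(8);
    cells[1].used = cells[2].used = cells[6].used = true;
    defrag(cells); // now cells[0..2] are used, the rest are free
}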
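
On the build_lora_mm/build_lora_mm_id declarations: "do mat_mul, while optionally apply lora" refers to the standard low-rank-adapter formulation y = W·x + scale · B·(A·x). A plain sketch of that arithmetic (naive dense math for illustration, nothing from ggml):

#include <vector>

using vec = std::vector<float>;
using mat = std::vector<vec>; // row-major: M[row][col]

static vec matvec(const mat & M, const vec & x) {
    vec y(M.size(), 0.0f);
    for (size_t r = 0; r < M.size(); ++r)
        for (size_t c = 0; c < x.size(); ++c)
            y[r] += M[r][c] * x[c];
    return y;
}

// mat_mul with an optional LoRA correction: if the (A, B) adapter pair is
// present, add scale * B*(A*x) on top of the base product W*x.
static vec lora_mm(const mat & W, const vec & x,
                   const mat * A, const mat * B, float scale) {
    vec y = matvec(W, x);
    if (A && B) {
        const vec ax  = matvec(*A, x);  // down-project to the adapter rank
        const vec bax = matvec(*B, ax); // up-project back to the output dim
        for (size_t i = 0; i < y.size(); ++i) {
            y[i] += scale * bax[i];
        }
    }
    return y;
}

int main() {
    const mat W = {{1, 0}, {0, 1}};  // 2x2 base weight (identity)
    const mat A = {{1, 1}};          // rank-1 down-projection (1x2)
    const mat B = {{0.5f}, {0.5f}};  // rank-1 up-projection (2x1)
    const vec x = {2, 3};
    const vec y = lora_mm(W, x, &A, &B, 1.0f); // y = {4.5, 5.5}
}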