diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 01dd19e55..94d6d4f90 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2491,7 +2491,7 @@ void llama_context_kv_self::kv_self_update() {
 
             ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-            build_k_shift(ctx0, gf);
+            build_kv_self_shift(ctx0, gf);
 
             ggml_backend_sched_alloc_graph(sched.get(), gf);
 
@@ -2520,7 +2520,7 @@ void llama_context_kv_self::kv_self_update() {
 
         ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        build_defrag(ctx0, gf);
+        build_kv_self_defrag(ctx0, gf);
 
         ggml_backend_sched_alloc_graph(sched.get(), gf);
 
@@ -2762,7 +2762,7 @@ ggml_tensor * llama_context_kv_self::build_attn_qkv(
     return cur;
 }
 
-ggml_tensor * llama_context_kv_self::build_soft_max_ext(
+ggml_tensor * llama_context_kv_self::build_attn_soft_max(
         ggml_context * ctx0,
         ggml_tensor * kq,
         float kq_scale) {
@@ -2771,7 +2771,7 @@ ggml_tensor * llama_context_kv_self::build_soft_max_ext(
     return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias);
 }
 
-void llama_context_kv_self::build_k_shift(
+void llama_context_kv_self::build_kv_self_shift(
        ggml_context * ctx0,
        ggml_cgraph * graph) {
     const auto & n_ctx = cparams.n_ctx;
@@ -2843,7 +2843,7 @@ void llama_context_kv_self::build_k_shift(
     }
 }
 
-void llama_context_kv_self::build_defrag(
+void llama_context_kv_self::build_kv_self_defrag(
        ggml_context * ctx0,
        ggml_cgraph * graph) {
     const auto & hparams = model.hparams;
@@ -2860,7 +2860,7 @@ void llama_context_kv_self::build_defrag(
     // number of cells moved
     uint32_t n_moves = 0;
 
-    // each move requires 6*n_layer tensors (see build_defrag)
+    // each move requires 6*n_layer tensors (see build_kv_self_defrag)
     //  - source view, destination view, copy operation
     //  - x2 for keys and values
     //const uint32_t max_moves = model.max_nodes()/(6*n_layer);
diff --git a/src/llama-context.h b/src/llama-context.h
index e3483228d..7a10f84bd 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -379,17 +379,17 @@ public:
             int il,
             bool worst_case) override;
 
-    virtual ggml_tensor * build_soft_max_ext(
+    virtual ggml_tensor * build_attn_soft_max(
             ggml_context * ctx0,
             ggml_tensor * kq,
             float kq_scale) override;
 
-    virtual void build_k_shift(
+    virtual void build_kv_self_shift(
             ggml_context * ctx0,
             ggml_cgraph * graph) override;
 
     // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    virtual void build_defrag(
+    virtual void build_kv_self_defrag(
             ggml_context * ctx0,
             ggml_cgraph * graph) override;
 
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5267d53da..d60b57491 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -92,17 +92,17 @@ public:
             int il,
             bool worst_case) = 0;
 
-    virtual ggml_tensor * build_soft_max_ext(
+    virtual ggml_tensor * build_attn_soft_max(
             ggml_context * ctx0,
             ggml_tensor * kq,
             float kq_scale) = 0;
 
-    virtual void build_k_shift(
+    virtual void build_kv_self_shift(
             ggml_context * ctx0,
             ggml_cgraph * graph) = 0;
 
     // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    virtual void build_defrag(
+    virtual void build_kv_self_defrag(
             ggml_context * ctx0,
             ggml_cgraph * graph) = 0;
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index ba11f1e15..543e78d2b 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -4251,18 +4251,18 @@ struct llm_build_context {
         return cur;
     }
 
-    struct ggml_cgraph * build_k_shift() {
+    struct ggml_cgraph * build_kv_self_shift() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        lgf.build_k_shift(ctx0, gf);
+        lgf.build_kv_self_shift(ctx0, gf);
 
         return gf;
     }
 
-    struct ggml_cgraph * build_defrag() {
+    struct ggml_cgraph * build_kv_self_defrag() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
 
-        lgf.build_defrag(ctx0, gf);
+        lgf.build_kv_self_defrag(ctx0, gf);
 
         return gf;
     }
 
@@ -5638,7 +5638,7 @@ struct llm_build_context {
         cb(kq, "kq", il);
 
         //kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
-        kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
+        kq = lgf.build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
         cb(kq, "kq_soft_max_ext", il);
 
         struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
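
Reviewer note: the renames above are mechanical, but the defrag builder deserves a word. As the header comment says, build_kv_self_defrag finds holes from the beginning of the KV cache and fills them by moving data from the end, and each move costs 6*n_layer graph nodes (K and V each need a source view, a destination view, and a copy op), which is why the move count is capped at model.max_nodes()/(6*n_layer). A minimal standalone sketch of that planning step, assuming a hypothetical `used` bitmap and `plan_defrag` helper that are not part of llama.cpp:

#include <cstdint>
#include <vector>

struct kv_move { uint32_t src; uint32_t dst; };

// Walk inward from both ends: fill each hole found at the front with the
// last used cell from the back, stopping once the cache is compact or the
// graph-node budget (max_moves = max_nodes/(6*n_layer)) is exhausted.
static std::vector<kv_move> plan_defrag(const std::vector<bool> & used, uint32_t max_moves) {
    std::vector<kv_move> moves;
    uint32_t lo = 0;
    uint32_t hi = (uint32_t) used.size();
    while ((uint32_t) moves.size() < max_moves) {
        while (lo < hi && used[lo])      { lo++; } // first hole from the front
        while (hi > lo && !used[hi - 1]) { hi--; } // last used cell from the back
        if (lo + 1 >= hi) {
            break; // no hole left before the last used cell
        }
        moves.push_back({ hi - 1, lo }); // move the tail cell into the hole
        lo++;
        hi--;
    }
    return moves;
}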
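Similarly, the renamed build_attn_soft_max is a thin wrapper over ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias), as the hunk at 2771 shows. For intuition only, an unfused reference of what that fused op computes per row of KQ when f_max_alibi_bias is 0.0f (an assumption; with a non-zero max bias the real kernel also folds per-head ALiBi slopes into the bias term), with soft_max_row being a made-up name:

#include <algorithm>
#include <cmath>

// Reference only, not a ggml API:
// out[i] = softmax(kq[i]*scale + mask[i]) over one row of n attention scores.
static void soft_max_row(const float * kq, const float * mask, float scale, float * out, int n) {
    float vmax = -INFINITY;
    for (int i = 0; i < n; i++) {
        out[i] = kq[i]*scale + (mask ? mask[i] : 0.0f); // scaled logit plus causal/padding mask
        vmax = std::max(vmax, out[i]);
    }
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        out[i] = std::exp(out[i] - vmax); // subtract the row max for numerical stability
        sum += out[i];
    }
    for (int i = 0; i < n; i++) {
        out[i] /= sum;
    }
}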