From 5eae8e5183f80a8b669757bde7b26cec05923081 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 12 Feb 2025 13:32:02 +0200
Subject: [PATCH] context : move build_rope_factors to base class

ggml-ci
---
 src/llama-context.cpp | 172 +++++++++++++++++++++---------------------
 src/llama-context.h   |  19 +++--
 src/llama.cpp         |  14 ++--
 3 files changed, 104 insertions(+), 101 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index f0d8bdaba..b29c98af6 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -57,6 +57,10 @@ uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }
 
+uint32_t llama_context::n_ctx_per_seq() const {
+    return cparams.n_ctx / cparams.n_seq_max;
+}
+
 uint32_t llama_context::n_batch() const {
     return cparams.n_batch;
 }
@@ -122,8 +126,8 @@ void llama_context::synchronize() {
 }
 
 void llama_context::attach_threadpool(
-        ggml_threadpool_t   threadpool,
-        ggml_threadpool_t   threadpool_batch) {
+        ggml_threadpool_t threadpool,
+        ggml_threadpool_t threadpool_batch) {
     this->threadpool       = threadpool;
     this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
 }
@@ -202,6 +206,86 @@ llama_perf_context_data llama_context::perf_get_data() const {
     return data;
 }
 
+ggml_tensor * llama_context::build_cvec(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        int il) {
+    return cvec.apply_to(ctx0, cur, il);
+}
+
+ggml_tensor * llama_context::build_lora_mm(
+        ggml_context * ctx0,
+        ggml_tensor * w,
+        ggml_tensor * cur) {
+    struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
+
+    for (const auto & lora : loras) {
+        struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
+        if (lw == nullptr) {
+            continue;
+        }
+
+        const float adapter_scale = lora.second;
+        const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
+
+        struct ggml_tensor * ab_cur = ggml_mul_mat(
+            ctx0, lw->b,
+            ggml_mul_mat(ctx0, lw->a, cur)
+        );
+
+        ab_cur = ggml_scale(ctx0, ab_cur, scale);
+        res = ggml_add(ctx0, res, ab_cur);
+    }
+
+    return res;
+}
+
+ggml_tensor * llama_context::build_lora_mm_id(
+        ggml_context * ctx0,
+        ggml_tensor * w,
+        ggml_tensor * cur,
+        ggml_tensor * ids) {
+    struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
+    for (const auto & lora : loras) {
+        struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
+        if (lw == nullptr) {
+            continue;
+        }
+
+        const float alpha = lora.first->alpha;
+        const float rank = (float) lw->b->ne[0];
+        const float scale = alpha ? lora.second * alpha / rank : lora.second;
+
+        struct ggml_tensor * ab_cur = ggml_mul_mat_id(
+            ctx0, lw->b,
+            ggml_mul_mat_id(ctx0, lw->a, cur, ids),
+            ids
+        );
+
+        ab_cur = ggml_scale(ctx0, ab_cur, scale);
+        res = ggml_add(ctx0, res, ab_cur);
+    }
+
+    return res;
+}
+
+ggml_tensor * llama_context::build_rope_factors(int il) {
+    const auto & hparams = model.hparams;
+
+    // choose long/short freq factors based on the context size
+    const auto n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
+    if (model.layers[il].rope_freqs != nullptr) {
+        return model.layers[il].rope_freqs;
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+        return model.layers[il].rope_long;
+    }
+
+    return model.layers[il].rope_short;
+}
+
 void llama_context::perf_reset() {
     t_start_us = ggml_time_us();
     t_eval_us = n_eval = 0;
@@ -217,7 +301,7 @@ llama_context_unified::llama_context_unified(
         const llama_context_params & params,
         build_graph_callback && cb_build_graph) :
     llama_context(model),
-    cb_build_graph(std::move(cb_build_graph)){
+    cb_build_graph(std::move(cb_build_graph)) {
 
     const auto & hparams = model.hparams;
 
@@ -1825,69 +1909,6 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) {
     return n_outputs_max;
 }
 
-ggml_tensor * llama_context::build_cvec(
-        ggml_context * ctx0,
-        ggml_tensor * cur,
-        int il) {
-    return cvec.apply_to(ctx0, cur, il);
-}
-
-ggml_tensor * llama_context::build_lora_mm(
-        ggml_context * ctx0,
-        ggml_tensor * w,
-        ggml_tensor * cur) {
-    struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
-
-    for (const auto & lora : loras) {
-        struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
-        if (lw == nullptr) {
-            continue;
-        }
-
-        const float adapter_scale = lora.second;
-        const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
-
-        struct ggml_tensor * ab_cur = ggml_mul_mat(
-            ctx0, lw->b,
-            ggml_mul_mat(ctx0, lw->a, cur)
-        );
-
-        ab_cur = ggml_scale(ctx0, ab_cur, scale);
-        res = ggml_add(ctx0, res, ab_cur);
-    }
-
-    return res;
-}
-
-ggml_tensor * llama_context::build_lora_mm_id(
-        ggml_context * ctx0,
-        ggml_tensor * w,
-        ggml_tensor * cur,
-        ggml_tensor * ids) {
-    struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
-    for (const auto & lora : loras) {
-        struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
-        if (lw == nullptr) {
-            continue;
-        }
-
-        const float alpha = lora.first->alpha;
-        const float rank = (float) lw->b->ne[0];
-        const float scale = alpha ? lora.second * alpha / rank : lora.second;
-
-        struct ggml_tensor * ab_cur = ggml_mul_mat_id(
-            ctx0, lw->b,
-            ggml_mul_mat_id(ctx0, lw->a, cur, ids),
-            ids
-        );
-
-        ab_cur = ggml_scale(ctx0, ab_cur, scale);
-        res = ggml_add(ctx0, res, ab_cur);
-    }
-
-    return res;
-}
-
 void llama_context_unified::kv_self_update() {
     auto & kv = kv_self;
 
@@ -2189,23 +2210,6 @@ ggml_tensor * llama_context_unified::build_soft_max_ext(
     return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias);
 }
 
-ggml_tensor * llama_context_unified::get_rope_factors(int il) {
-    const auto & hparams = model.hparams;
-
-    // choose long/short freq factors based on the context size
-    const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
-
-    if (model.layers[il].rope_freqs != nullptr) {
-        return model.layers[il].rope_freqs;
-    }
-
-    if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
-        return model.layers[il].rope_long;
-    }
-
-    return model.layers[il].rope_short;
-}
-
 ggml_tensor * llama_context_unified::build_inp_embd(
         ggml_context * ctx0,
         ggml_tensor * tok_embd,
@@ -2327,7 +2331,7 @@ void llama_context_unified::build_k_shift(
         const int64_t n_head_kv    = hparams.n_head_kv(il);
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
 
-        struct ggml_tensor * rope_factors = get_rope_factors(il);
+        struct ggml_tensor * rope_factors = build_rope_factors(il);
 
         struct ggml_tensor * k =
             ggml_view_3d(ctx0, kv_self.k_l[il],
diff --git a/src/llama-context.h b/src/llama-context.h
index 8ec7d3e2b..dd1030388 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -23,10 +23,11 @@ struct llama_context {
     const llama_model & get_model() const;
     const llama_cparams & get_cparams() const;
 
-    virtual uint32_t n_ctx()     const;
-    virtual uint32_t n_batch()   const;
-    virtual uint32_t n_ubatch()  const;
-    virtual uint32_t n_seq_max() const = 0;
+    virtual uint32_t n_ctx()         const;
+    virtual uint32_t n_ctx_per_seq() const;
+    virtual uint32_t n_batch()       const;
+    virtual uint32_t n_ubatch()      const;
+    virtual uint32_t n_seq_max()     const = 0;
 
     virtual uint32_t n_threads() const;
     virtual uint32_t n_threads_batch() const;
@@ -126,6 +127,8 @@ struct llama_context {
             ggml_tensor * cur, // struct ggml_tensor * b
             ggml_tensor * ids);
 
+    virtual ggml_tensor * build_rope_factors(int il);
+
     // graph build API (context-specific)
 
     virtual ggml_tensor * build_inp_embd(
@@ -182,8 +185,6 @@ struct llama_context {
             ggml_tensor * kq,
             float kq_scale) = 0;
 
-    virtual ggml_tensor * get_rope_factors(int il) = 0;
-
     virtual void build_k_shift(
             ggml_context * ctx0,
             ggml_cgraph * graph) = 0;
@@ -342,7 +343,7 @@ class llama_context_unified : public llama_context {
 public:
     struct batch_manager;
 
-    // TODO: tmp until llama-model starts implementing the graph build function
+    // TODO: tmp until llama_model starts implementing the graph build function
     typedef std::function build_graph_callback;
 
     llama_context_unified(
@@ -496,8 +497,6 @@ public:
             ggml_tensor * kq,
             float kq_scale) override;
 
-    virtual ggml_tensor * get_rope_factors(int il) override;
-
     virtual void build_k_shift(
             ggml_context * ctx0,
             ggml_cgraph * graph) override;
@@ -601,7 +600,7 @@ public:
     virtual size_t state_get_data(      uint8_t * dst, size_t size) override;
     virtual size_t state_set_data(const uint8_t * src, size_t size) override;
 
-    virtual size_t state_seq_get_size(llama_seq_id seq_id) override;
+    virtual size_t state_seq_get_size(llama_seq_id seq_id)                                   override;
     virtual size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size) override;
     virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override;
 
diff --git a/src/llama.cpp b/src/llama.cpp
index c568f8d15..9e37b0cd4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -685,7 +685,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -857,7 +857,7 @@ struct llm_build_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -2999,7 +2999,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 struct ggml_tensor* attn_norm_output = build_norm(inpL,
                         model.layers[il].attn_norm,
@@ -3706,7 +3706,7 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
-            struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+            struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
             // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
@@ -4480,7 +4480,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -5373,7 +5373,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -6572,7 +6572,7 @@ struct llm_build_context {
             // self-attention
            {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
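
Note (illustration only, not part of the patch): build_rope_factors(), now in the llama_context base class, prefers a layer's explicit rope_freqs tensor and otherwise picks the long or short frequency factors by comparing the per-sequence context (n_ctx / n_seq_max) against hparams.n_ctx_orig_yarn. A minimal standalone C++ sketch of that selection, using stand-in fields instead of the real llama.cpp types:

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the per-layer rope tensors referenced in the patch
    // (model.layers[il].rope_freqs / rope_long / rope_short).
    struct layer_rope_factors {
        const char * rope_freqs; // explicit per-layer factors, may be null
        const char * rope_long;  // long-context factors
        const char * rope_short; // short-context factors
    };

    // Mirrors the branch structure of llama_context::build_rope_factors(il).
    static const char * pick_rope_factors(const layer_rope_factors & layer,
            uint32_t n_ctx, uint32_t n_seq_max, uint32_t n_ctx_orig_yarn) {
        const uint32_t n_ctx_per_seq = n_ctx / n_seq_max;

        if (layer.rope_freqs != nullptr) {
            return layer.rope_freqs;
        }
        return n_ctx_per_seq > n_ctx_orig_yarn ? layer.rope_long : layer.rope_short;
    }

    int main() {
        const layer_rope_factors layer = { nullptr, "rope_long", "rope_short" };
        // 131072-token context on one sequence vs. an 8192-token original YaRN context
        printf("%s\n", pick_rope_factors(layer, 131072, 1, 8192)); // prints "rope_long"
        printf("%s\n", pick_rope_factors(layer,   4096, 1, 8192)); // prints "rope_short"
    }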
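
A second illustration (also not part of the patch): in build_lora_mm_id() the LoRA delta B*(A*x) is scaled by adapter_scale * alpha / rank when the adapter stores a non-zero alpha, and by adapter_scale alone otherwise, where rank is taken from lw->b->ne[0]. The scale computation in isolation, with plain floats instead of ggml tensors:

    #include <cstdio>

    // 'adapter_scale' corresponds to lora.second and 'rank' to lw->b->ne[0] in the patch.
    static float lora_scale(float adapter_scale, float alpha, float rank) {
        // alpha == 0 means no alpha was stored; fall back to the raw adapter scale
        return alpha != 0.0f ? adapter_scale * alpha / rank : adapter_scale;
    }

    int main() {
        printf("%.2f\n", lora_scale(1.0f, 16.0f, 8.0f)); // 2.00 (alpha/rank = 2)
        printf("%.2f\n", lora_scale(0.5f,  0.0f, 8.0f)); // 0.50 (no alpha: raw scale)
    }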