mirror of https://github.com/ggml-org/llama.cpp.git
@@ -329,6 +329,17 @@ public:
     const llama_memory_hybrid_context * mctx;
 };
 
+// TODO: remove this when ggml_scale_add is implemented
+class llm_graph_input_one : public llm_graph_input_i {
+public:
+    llm_graph_input_one() {}
+    virtual ~llm_graph_input_one() = default;
+
+    void set_input(const llama_ubatch *) override;
+
+    ggml_tensor * one = nullptr; // F32
+};
+
 //
 // llm_graph_result
 //
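This hunk only declares llm_graph_input_one; the matching set_input definition is not part of it. A minimal sketch of what that override plausibly does, assuming `one` is a 1-element F32 tensor allocated on a backend buffer that just needs to be filled with the constant 1.0f:

    void llm_graph_input_one::set_input(const llama_ubatch *) {
        // the ubatch is irrelevant here: this input is a constant
        GGML_ASSERT(one != nullptr && ggml_nelements(one) == 1);
        const float f_one = 1.0f;
        ggml_backend_tensor_set(one, &f_one, 0, sizeof(f_one));
    }

With only ggml_scale (a pure multiply) available, a graph that needs something like `x*s + 1` has to materialize the constant and broadcast it through ggml_add; a fused ggml_scale_add op would make the extra graph input unnecessary, which is what the TODO refers to.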
@@ -589,14 +600,15 @@ struct llm_graph_context {
 
     llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
 
+    // note: if k_cur or v_cur are not provided, they will not be stored in the memory
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified_iswa * inp,
             ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
             ggml_tensor * kq_b,
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale,
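The added note makes k_cur and v_cur optional: a caller can pass nullptr for both to compute attention over whatever the memory already holds, without appending new K/V entries. A hypothetical call site for illustration only (the tensor names, `store_kv`, and any arguments after kq_scale are assumptions, not shown in this hunk):

    ggml_tensor * cur = build_attn(inp, gf,
            model.layers[il].wo, model.layers[il].bo,  // output projection
            q_cur,                                     // always required
            store_kv ? k_cur : nullptr,                // nullptr -> K not written to memory
            store_kv ? v_cur : nullptr,                // nullptr -> V not written to memory
            nullptr,                                   // kq_b: no attention bias
            nullptr,                                   // v_mla: not an MLA model
            kq_scale, il);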