llama : Support llama 4 text-only (#12791)

* llama4 conversion
* initial support, no chat template
* clean up a bit
* fix tokenizer conversion
* correct hparams
* try this
* fix shexp
* ffn_inp_normed
* chat template
* clean up model conversion
* add_bos
* add scale_before_ffn
* fix order
* weight_before_ffn
* llm_graph_input_attn_temp
* add chunk attn mask
* build_inp_attn_scale()
* add comment about ggml_repeat
* clarify comments
* fix build
2025-06-27 20:05:20 +00:00 · 2025-04-07 23:06:44 +02:00
parent 82974011f3
commit 1466621e73
17 changed files with 532 additions and 22 deletions
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -100,6 +100,23 @@ public:
    const int64_t n_pos_per_token = 1;
 };

+// temperature tuning, used by llama4: graph input exposing the attn_scale tensor together with the two scalar parameters stored below
+class llm_graph_input_attn_temp : public llm_graph_input_i {
+public:
+    llm_graph_input_attn_temp(int64_t n_pos_per_token, uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale) // copies all three arguments into the const members below
+        : n_pos_per_token(n_pos_per_token), n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
+    virtual ~llm_graph_input_attn_temp() = default;
+
+    void set_input(const llama_ubatch * ubatch) override; // defined out of line; NOTE(review): presumably fills attn_scale from the ubatch - confirm in llama-graph.cpp
+
+    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]; starts out as nullptr, nothing in this header assigns it
+
+    const int64_t n_pos_per_token = 1; // the "= 1" default never takes effect here: the only constructor always initializes this member
+
+    const uint32_t n_attn_temp_floor_scale; // fixed at construction; NOTE(review): exact role in the scale formula is not visible here - confirm in set_input
+    const float    f_attn_temp_scale; // fixed at construction; NOTE(review): exact role in the scale formula is not visible here - confirm in set_input
+};
+
 class llm_graph_input_pos_bucket : public llm_graph_input_i {
 public:
    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
@@ -470,6 +487,7 @@ struct llm_graph_context {

    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
    ggml_tensor * build_inp_pos() const;
+    ggml_tensor * build_inp_attn_scale() const; // NOTE(review): presumably creates the llm_graph_input_attn_temp input and returns its attn_scale tensor - confirm in llama-graph.cpp
    ggml_tensor * build_inp_out_ids() const;
    ggml_tensor * build_inp_mean() const;
    ggml_tensor * build_inp_cls() const;