metal : fuse add, mul + add tests (#14596)

ggml-ci
2025-07-29 05:33:37 -04:00 · 2025-07-18 20:37:26 +03:00
parent 9fb1042ce6
commit bf9087f59a
8 changed files with 578 additions and 172 deletions
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -907,20 +907,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
        cb(cur, "ffn_moe_weighted", il);
    }

+    ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
+
+    assert(n_expert_used > 0);
+
+    // order the views before the adds
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
+        cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
+
+        ggml_build_forward_expand(gf, cur_experts[i]);
+    }
+
    // aggregate experts
    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
    //       to avoid potentially a large number of add nodes during warmup
    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
-    ggml_tensor * moe_out = nullptr;
-    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
-        ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
-                experts->nb[2], i*experts->nb[1]);
+    ggml_tensor * moe_out = cur_experts[0];

-        if (i == 0) {
-            moe_out = cur_expert;
-        } else {
-            moe_out = ggml_add(ctx0, moe_out, cur_expert);
-        }
+    for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
+        moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
    }

    if (hparams.n_expert_used == 1) {