mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-07-29 05:33:37 -04:00)
graph : avoid huge warm-up graphs for MoE models (#14753)
* graph : avoid huge warm-up graphs for MoE models

ggml-ci

* cont : bump max nodes to 8x model tensors
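For context on the second bullet, the graph node budget is sized relative to the number of model tensors. Below is a minimal sketch of an "8x model tensors" sizing rule; the helper name graph_max_nodes, the lower bound, and the exact types are illustrative assumptions, not the actual llama.cpp implementation:

    #include <algorithm>
    #include <cstdint>

    // hypothetical sketch of the "bump max nodes to 8x model tensors" idea from
    // the commit message; the function name and the floor value are assumptions
    static uint32_t graph_max_nodes(uint32_t n_model_tensors) {
        // allow up to 8 graph nodes per model tensor, but never below a fixed floor
        return std::max<uint32_t>(8192u, 8u*n_model_tensors);
    }

Tying the budget to the tensor count keeps the preallocated graph proportional to the model size rather than relying on a single global constant.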
@@ -906,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    //       to avoid potentially a large number of add nodes during warmup
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 
@@ -918,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
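Read together, the two hunks change the expert-aggregation loop so it iterates over hparams.n_expert_used (fixed by the model hyper-parameters) instead of the runtime n_expert_used, which during warm-up can be much larger and would emit one add node per expert per layer. The following is a simplified, self-contained sketch of that loop after the change; only the ggml_view_2d and ggml_cont lines come from the visible hunks, while the helper signature and the ggml_add accumulation (which lies between the two hunks) are assumptions for illustration:

    #include "ggml.h"

    // hypothetical standalone helper mirroring the aggregation step of
    // build_moe_ffn after the change; names follow the diff where shown
    static ggml_tensor * aggregate_experts(
            ggml_context * ctx0,
            ggml_tensor  * experts,   // selected expert outputs, viewed per expert
            int64_t        n_embd,
            int64_t        n_tokens,
            uint32_t       n_expert_used) {  // hparams.n_expert_used in the diff
        ggml_tensor * moe_out = nullptr;

        for (uint32_t i = 0; i < n_expert_used; ++i) {
            // 2D view of the i-th selected expert output: [n_embd, n_tokens]
            ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                    experts->nb[2], i*experts->nb[1]);

            // each iteration past the first adds one ggml_add node to the graph,
            // so bounding the trip count bounds the warm-up graph size
            // (the accumulation itself is assumed; it is not part of the hunks)
            moe_out = moe_out == nullptr ? cur_expert
                                         : ggml_add(ctx0, moe_out, cur_expert);
        }

        if (n_expert_used == 1) {
            // avoid returning a non-contiguous tensor
            moe_out = ggml_cont(ctx0, moe_out);
        }

        return moe_out;
    }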