mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-07-29 05:33:37 -04:00
@@ -907,20 +907,25 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
|
||||
cb(cur, "ffn_moe_weighted", il);
|
||||
}
|
||||
|
||||
ggml_tensor * cur_experts[LLAMA_MAX_EXPERTS] = { nullptr };
|
||||
|
||||
assert(n_expert_used > 0);
|
||||
|
||||
// order the views before the adds
|
||||
for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
|
||||
cur_experts[i] = ggml_view_2d(ctx0, experts, n_embd, n_tokens, experts->nb[2], i*experts->nb[1]);
|
||||
|
||||
ggml_build_forward_expand(gf, cur_experts[i]);
|
||||
}
|
||||
|
||||
// aggregate experts
|
||||
// note: here we explicitly use hparams.n_expert_used instead of n_expert_used
|
||||
// to avoid potentially a large number of add nodes during warmup
|
||||
// ref: https://github.com/ggml-org/llama.cpp/pull/14753
|
||||
ggml_tensor * moe_out = nullptr;
|
||||
for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
|
||||
ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
|
||||
experts->nb[2], i*experts->nb[1]);
|
||||
ggml_tensor * moe_out = cur_experts[0];
|
||||
|
||||
if (i == 0) {
|
||||
moe_out = cur_expert;
|
||||
} else {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_expert);
|
||||
}
|
||||
for (uint32_t i = 1; i < hparams.n_expert_used; ++i) {
|
||||
moe_out = ggml_add(ctx0, moe_out, cur_experts[i]);
|
||||
}
|
||||
|
||||
if (hparams.n_expert_used == 1) {
|
||||
|
Reference in New Issue
Block a user