Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-07-30 14:13:57 -04:00)
graph : avoid huge warm-up graphs for MoE models (#14753)
* graph : avoid huge warm-up graphs for MoE models

ggml-ci

* cont : bump max nodes to 8x model tensors
@@ -1312,7 +1312,7 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
 //
 
 uint32_t llama_context::graph_max_nodes() const {
-    return std::max<uint32_t>(65536u, 5u*model.n_tensors());
+    return std::max<uint32_t>(1024u, 8u*model.n_tensors());
 }
 
 llm_graph_result * llama_context::get_gf_res_reserve() const {
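To see what the new budget does in practice, here is a small standalone comparison (plain C++; the tensor counts are hypothetical, not taken from real models) of the old max(65536, 5*n_tensors) and the new max(1024, 8*n_tensors) heuristics: small models no longer reserve the 65536-node floor, while very large models get a bigger per-tensor budget.

// Standalone illustration (hypothetical tensor counts): old vs. new node budget.
#include <algorithm>
#include <cstdint>
#include <cstdio>

static uint32_t max_nodes_old(uint32_t n_tensors) {
    return std::max<uint32_t>(65536u, 5u*n_tensors);
}

static uint32_t max_nodes_new(uint32_t n_tensors) {
    return std::max<uint32_t>(1024u, 8u*n_tensors);
}

int main() {
    // hypothetical counts: a small dense model, a mid-size model, a large MoE model
    for (uint32_t n_tensors : {300u, 1500u, 20000u}) {
        std::printf("n_tensors = %5u : old = %6u, new = %6u\n",
                n_tensors, max_nodes_old(n_tensors), max_nodes_new(n_tensors));
    }
    return 0;
}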
@@ -906,8 +906,11 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     // aggregate experts
+    // note: here we explicitly use hparams.n_expert_used instead of n_expert_used
+    //       to avoid potentially a large number of add nodes during warmup
+    //       ref: https://github.com/ggml-org/llama.cpp/pull/14753
     ggml_tensor * moe_out = nullptr;
-    for (int i = 0; i < n_expert_used; ++i) {
+    for (uint32_t i = 0; i < hparams.n_expert_used; ++i) {
         ggml_tensor * cur_expert = ggml_view_2d(ctx0, experts, n_embd, n_tokens,
                 experts->nb[2], i*experts->nb[1]);
 
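Why the loop bound matters: per the new comment, the local n_expert_used can apparently exceed hparams.n_expert_used during warm-up (when a worst-case graph is reserved), and every iteration after the first appends one ggml_add node to the graph. A standalone sketch (plain C++, no ggml; the expert counts are made up) of how the add-node count tracks the loop bound:

// Standalone sketch: counts the add nodes produced by the aggregation loop above.
#include <cstdint>
#include <cstdio>

static uint32_t aggregation_add_nodes(uint32_t loop_bound) {
    uint32_t add_nodes = 0;
    for (uint32_t i = 0; i < loop_bound; ++i) {
        if (i > 0) {
            // corresponds to: moe_out = ggml_add(ctx0, moe_out, cur_expert);
            ++add_nodes;
        }
    }
    return add_nodes;
}

int main() {
    // hypothetical values: experts routed per token vs. total experts in the model
    std::printf("loop over n_expert_used =   8 -> %u add nodes\n", aggregation_add_nodes(8));
    std::printf("loop over n_expert      = 256 -> %u add nodes\n", aggregation_add_nodes(256));
    return 0;
}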
@@ -918,7 +921,7 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         }
     }
 
-    if (n_expert_used == 1) {
+    if (hparams.n_expert_used == 1) {
         // avoid returning a non-contiguous tensor
         moe_out = ggml_cont(ctx0, moe_out);
     }
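On the ggml_cont in the single-expert branch: a 2-D view taken with the outer-dimension stride (experts->nb[2]) generally has rows that are not adjacent in memory, so it is not contiguous and callers that expect a packed tensor need an explicit copy. A minimal sketch below, with an assumed shape of [8, 2, 4] and setup that is illustrative rather than the llama.cpp call site:

// Minimal sketch (assumed shapes, not the llama.cpp call site): a ggml_view_2d
// taken with the outer stride is non-contiguous; ggml_cont yields a packed copy.
#include <cstdio>
#include "ggml.h"

int main() {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024,
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    // stand-in for the experts output: [n_embd, n_expert_used, n_tokens] = [8, 2, 4]
    struct ggml_tensor * experts = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 2, 4);

    // rows of expert 0 are selected with the token stride nb[2], so consecutive
    // rows of the view are two expert-rows apart in memory -> not contiguous
    struct ggml_tensor * view = ggml_view_2d(ctx, experts, 8, 4,
            experts->nb[2], 0*experts->nb[1]);

    // ggml_cont adds a copy op whose result has a packed (contiguous) layout;
    // only layout metadata is inspected here, no graph is evaluated
    struct ggml_tensor * compact = ggml_cont(ctx, view);

    std::printf("view contiguous:    %d\n", (int) ggml_is_contiguous(view));    // 0
    std::printf("compact contiguous: %d\n", (int) ggml_is_contiguous(compact)); // 1

    ggml_free(ctx);
    return 0;
}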