Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (#13386)

David Huang
2025-05-11 20:18:39 +08:00
committed by GitHub
parent 3eac209319
commit 7f323a589f
11 changed files with 57 additions and 9 deletions


@@ -853,7 +853,7 @@ int main(void) {
         backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());

         ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
-            backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false);
+            backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true);

         printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i]));
         printf("  Device description: %s\n", ggml_backend_dev_description(devs[i]));