Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (#13386)

David Huang
2025-05-11 20:18:39 +08:00
committed by GitHub
parent 3eac209319
commit 7f323a589f
11 changed files with 57 additions and 9 deletions

@ -363,6 +363,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
};
// model quantization parameters
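
For illustration, here is a minimal sketch of how a caller could disable the new behavior programmatically instead of passing --no-op-offload on the command line. Only the op_offload field comes from this diff; the surrounding llama.cpp C API calls (llama_backend_init, llama_model_load_from_file, llama_init_from_model, etc.) and the model path are assumed for the example and are not part of this commit.

#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_backend_init();

    // Load a model with default parameters ("model.gguf" is a placeholder path).
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Create a context with host tensor op offloading disabled; this field is
    // what the new --no-op-offload CLI flag controls.
    struct llama_context_params cparams = llama_context_default_params();
    cparams.op_offload = false;

    struct llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_model_free(model);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}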