Add --no-op-offload to improve -ot pp perf in MoE models like llama4 400B (#13386)

David Huang
2025-05-11 20:18:39 +08:00
committed by GitHub
parent 3eac209319
commit 7f323a589f
11 changed files with 57 additions and 9 deletions

@ -363,6 +363,7 @@ extern "C" {
bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
bool no_perf; // whether to measure performance timings
bool op_offload; // whether to offload host tensor operations to device
};
// model quantization parameters
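
For illustration, here is a minimal sketch of how a caller could disable the new behavior programmatically instead of passing --no-op-offload on the command line. Only the op_offload field comes from this diff; the surrounding llama.cpp C API calls (llama_backend_init, llama_model_load_from_file, llama_init_from_model, etc.) and the model path are assumed for the example and are not part of this commit.

#include "llama.h"
#include <stdio.h>

int main(void) {
    llama_backend_init();

    // Load a model with default parameters ("model.gguf" is a placeholder path).
    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_model_load_from_file("model.gguf", mparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Create a context with host tensor op offloading disabled; this field is
    // what the new --no-op-offload CLI flag controls.
    struct llama_context_params cparams = llama_context_default_params();
    cparams.op_offload = false;

    struct llama_context * ctx = llama_init_from_model(model, cparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to create context\n");
        llama_model_free(model);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_model_free(model);
    llama_backend_free();
    return 0;
}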