From 7f323a589f8684c0eb722e7309074cb5eac0c8b5 Mon Sep 17 00:00:00 2001
From: David Huang <1969802+hjc4869@users.noreply.github.com>
Date: Sun, 11 May 2025 20:18:39 +0800
Subject: [PATCH] Add `--no-op-offload` to improve `-ot` pp perf in MoE models
 like llama4 400B (#13386)

---
 common/arg.cpp                    |  7 +++++++
 common/common.cpp                 |  1 +
 common/common.h                   |  1 +
 ggml/include/ggml-backend.h       |  4 ++--
 ggml/src/ggml-backend.cpp         |  8 +++++--
 include/llama.h                   |  1 +
 src/llama-context.cpp             |  4 +++-
 src/llama-cparams.h               |  1 +
 tests/test-opt.cpp                |  2 +-
 tools/llama-bench/llama-bench.cpp | 35 +++++++++++++++++++++++++++++--
 tools/mtmd/clip.cpp               |  2 +-
 11 files changed, 57 insertions(+), 9 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index e0f1d998f..a1fd4c965 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2437,6 +2437,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--no-op-offload"},
+        string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
+        [](common_params & params) {
+            params.no_op_offload = true;
+        }
+    ));
     add_opt(common_arg(
         {"--lora"}, "FNAME",
         "path to LoRA adapter (can be repeated to use multiple adapters)",
diff --git a/common/common.cpp b/common/common.cpp
index bd20af233..710bf1fe2 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1113,6 +1113,7 @@ struct llama_context_params common_context_params_to_llama(const common_params &
     cparams.offload_kqv       = !params.no_kv_offload;
     cparams.flash_attn        = params.flash_attn;
     cparams.no_perf           = params.no_perf;
+    cparams.op_offload        = !params.no_op_offload;

     if (params.reranking) {
         cparams.embeddings = true;
diff --git a/common/common.h b/common/common.h
index d051d4ec9..e15356b12 100644
--- a/common/common.h
+++ b/common/common.h
@@ -332,6 +332,7 @@ struct common_params {
     bool no_kv_offload     = false; // disable KV offloading
     bool warmup            = true;  // warmup run
     bool check_tensors     = false; // validate tensor data
+    bool no_op_offload     = false; // globally disable offload host tensor operations to device

     bool single_turn       = false; // single turn chat conversation

diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
index ea2c1a402..778927f68 100644
--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
@@ -248,7 +248,7 @@ extern "C" {
     // preferrably to run on the same backend as the buffer
     ggml_backend_buffer_set_usage(buf_weights, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

-    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false);
+    sched = ggml_backend_sched_new({backend_gpu, backend_gpu2, backend_cpu}, NULL, num_backends, GGML_DEFAULT_GRAPH_SIZE, false, true);

     // initialize buffers from a max size graph (optional)
     reserve_graph = build_graph(sched, max_batch_size);
@@ -289,7 +289,7 @@ extern "C" {
     typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data);

     // Initialize a backend scheduler, backends with low index are given priority over backends with high index
-    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel);
+    GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size, bool parallel, bool op_offload);
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);

     // Initialize backend buffers from a measure graph
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
index c36b5abfb..6f69d895f 100644
--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
@@ -674,6 +674,8 @@ struct ggml_backend_sched {
     char * context_buffer;
     size_t context_buffer_size;

+    bool op_offload;
+
     int debug;
 };

@@ -766,7 +768,7 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
         if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
             int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
             // check if a backend with higher prio wants to offload the op
-            if (src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
+            if (sched->op_offload && src_backend_id == sched->n_backends - 1 && ggml_backend_buffer_is_host(src->buffer)) {
                 for (int b = 0; b < src_backend_id; b++) {
                     if (ggml_backend_supports_op(sched->backends[b], tensor) && ggml_backend_offload_op(sched->backends[b], tensor)) {
                         SET_CAUSE(tensor, "1.off");
@@ -1452,7 +1454,8 @@ ggml_backend_sched_t ggml_backend_sched_new(
         ggml_backend_buffer_type_t * bufts,
         int n_backends,
         size_t graph_size,
-        bool parallel) {
+        bool parallel,
+        bool op_offload) {
     GGML_ASSERT(n_backends > 0);
     GGML_ASSERT(n_backends <= GGML_SCHED_MAX_BACKENDS);
     GGML_ASSERT(ggml_backend_dev_type(ggml_backend_get_device(backends[n_backends - 1])) == GGML_BACKEND_DEVICE_TYPE_CPU);
@@ -1497,6 +1500,7 @@ ggml_backend_sched_t ggml_backend_sched_new(
     }

     sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
+    sched->op_offload = op_offload;

     ggml_backend_sched_reset(sched);

diff --git a/include/llama.h b/include/llama.h
index 7d5f9d559..6c6d377f8 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -363,6 +363,7 @@ extern "C" {
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
         bool no_perf;     // whether to measure performance timings
+        bool op_offload;  // whether to offload host tensor operations to device
     };

     // model quantization parameters
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index fd64622b8..a12849f0e 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -93,6 +93,7 @@ llama_context::llama_context(
     }

     cparams.n_ubatch = std::min(cparams.n_batch, params.n_ubatch == 0 ? params.n_batch : params.n_ubatch);
+    cparams.op_offload = params.op_offload;

     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;

@@ -243,7 +244,7 @@ llama_context::llama_context(
             }
         }

-        sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel));
+        sched.reset(ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), max_nodes, pipeline_parallel, cparams.op_offload));

         if (pipeline_parallel) {
             LLAMA_LOG_INFO("%s: pipeline parallelism enabled (n_copies=%d)\n", __func__, ggml_backend_sched_get_n_copies(sched.get()));
@@ -1871,6 +1872,7 @@ llama_context_params llama_context_default_params() {
         /*.offload_kqv                 =*/ true,
         /*.flash_attn                  =*/ false,
         /*.no_perf                     =*/ true,
+        /*.op_offload                  =*/ true,
     };

     return result;
diff --git a/src/llama-cparams.h b/src/llama-cparams.h
index 30e550f02..246fa5777 100644
--- a/src/llama-cparams.h
+++ b/src/llama-cparams.h
@@ -30,6 +30,7 @@ struct llama_cparams {
     bool flash_attn;
     bool no_perf;
     bool warmup;
+    bool op_offload;

     enum llama_pooling_type pooling_type;

diff --git a/tests/test-opt.cpp b/tests/test-opt.cpp
index f90c92b4b..1bc160511 100644
--- a/tests/test-opt.cpp
+++ b/tests/test-opt.cpp
@@ -853,7 +853,7 @@ int main(void) {
         backends_modded.insert(backends_modded.end(), backends.begin(), backends.end());

         ggml_backend_sched_t backend_sched = ggml_backend_sched_new(
-            backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false);
+            backends_modded.data(), nullptr, backends_modded.size(), GGML_DEFAULT_GRAPH_SIZE, false, true);

         printf("Backend %zu/%zu: %s\n", i + 1, dev_count, ggml_backend_dev_name(devs[i]));
         printf("  Device description: %s\n", ggml_backend_dev_description(devs[i]));
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index 078659429..5d26b506b 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -219,6 +219,7 @@ struct cmd_params {
     std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
+    std::vector<bool> no_op_offload;
     ggml_numa_strategy numa;
     int reps;
     ggml_sched_priority prio;
@@ -253,6 +254,7 @@ static const cmd_params cmd_params_defaults = {
     /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
     /* use_mmap             */ { true },
     /* embeddings           */ { false },
+    /* no_op_offload        */ { false },
     /* numa                 */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps                 */ 5,
     /* prio                 */ GGML_SCHED_PRIO_NORMAL,
@@ -311,6 +313,7 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.embeddings, ",").c_str());
     printf("  -ts, --tensor-split <ts0/ts1/..>          (default: 0)\n");
     printf("  -ot --override-tensors <tensor name pattern>=<buffer type>;...       (default: disabled)\n");
+    printf("  -nopo, --no-op-offload <0|1>              (default: 0)\n");
     printf("  -r, --repetitions <n>                     (default: %d)\n", cmd_params_defaults.reps);
     printf("  --prio <0|1|2|3>                          (default: %d)\n", cmd_params_defaults.prio);
     printf("  --delay <0...N> (seconds)                 (default: %d)\n", cmd_params_defaults.delay);
@@ -588,6 +591,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = string_split<bool>(argv[i], split_delim);
             params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
+        } else if (arg == "-nopo" || arg == "--no-op-offload") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<bool>(argv[i], split_delim);
+            params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
         } else if (arg == "-ts" || arg == "--tensor-split") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -794,6 +804,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.embeddings.empty()) {
         params.embeddings = cmd_params_defaults.embeddings;
     }
+    if (params.no_op_offload.empty()) {
+        params.no_op_offload = cmd_params_defaults.no_op_offload;
+    }
     if (params.n_threads.empty()) {
         params.n_threads = cmd_params_defaults.n_threads;
     }
@@ -833,6 +846,7 @@ struct cmd_params_instance {
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
     bool embeddings;
+    bool no_op_offload;

     llama_model_params to_llama_mparams() const {
         llama_model_params mparams = llama_model_default_params();
@@ -902,6 +916,7 @@ struct cmd_params_instance {
         cparams.offload_kqv = !no_kv_offload;
         cparams.flash_attn  = flash_attn;
         cparams.embeddings  = embeddings;
+        cparams.op_offload  = !no_op_offload;

         return cparams;
     }
@@ -921,6 +936,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
     for (const auto & embd : params.embeddings)
+    for (const auto & nopo : params.no_op_offload)
     for (const auto & nb : params.n_batch)
     for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
@@ -959,6 +975,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
+                /* .no_op_offload= */ nopo,
             };
             instances.push_back(instance);
         }
@@ -990,6 +1007,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
+                /* .no_op_offload= */ nopo,
             };
             instances.push_back(instance);
         }
@@ -1021,6 +1039,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .tensor_buft_overrides = */ ot,
                 /* .use_mmap      = */ mmp,
                 /* .embeddings    = */ embd,
+                /* .no_op_offload= */ nopo,
             };
             instances.push_back(instance);
         }
@@ -1056,6 +1075,7 @@ struct test {
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
     bool embeddings;
+    bool no_op_offload;
     int n_prompt;
     int n_gen;
     int n_depth;
@@ -1089,6 +1109,7 @@ struct test {
         tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap = inst.use_mmap;
         embeddings = inst.embeddings;
+        no_op_offload = inst.no_op_offload;
         n_prompt = inst.n_prompt;
         n_gen = inst.n_gen;
         n_depth = inst.n_depth;
@@ -1134,7 +1155,7 @@ struct test {
             "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
             "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
             "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
-            "use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
+            "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
             "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
         };
         return fields;
@@ -1146,7 +1167,7 @@ struct test {
         if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
             field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
             field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
-            field == "avg_ns" || field == "stddev_ns") {
+            field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") {
             return INT;
         }
         if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
@@ -1222,6 +1243,7 @@ struct test {
             tensor_buft_overrides_str,
             std::to_string(use_mmap),
             std::to_string(embeddings),
+            std::to_string(no_op_offload),
             std::to_string(n_prompt),
             std::to_string(n_gen),
             std::to_string(n_depth),
@@ -1404,6 +1426,9 @@ struct markdown_printer : public printer {
         if (field == "test") {
             return 15;
         }
+        if (field == "no_op_offload") {
+            return 4;
+        }

         int width = std::max((int) field.length(), 10);

@@ -1435,6 +1460,9 @@ struct markdown_printer : public printer {
         if (field == "embeddings") {
             return "embd";
         }
+        if (field == "no_op_offload") {
+            return "nopo";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -1503,6 +1531,9 @@ struct markdown_printer : public printer {
         if (params.embeddings.size() > 1 || params.embeddings != cmd_params_defaults.embeddings) {
             fields.emplace_back("embeddings");
         }
+        if (params.no_op_offload.size() > 1 || params.no_op_offload != cmd_params_defaults.no_op_offload) {
+            fields.emplace_back("no_op_offload");
+        }
         fields.emplace_back("test");
         fields.emplace_back("t/s");

diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index 735dfe7f7..3f11c301a 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -383,7 +383,7 @@ struct clip_ctx {
         backend_buft.push_back(ggml_backend_get_default_buffer_type(backend_cpu));

         sched.reset(
-            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false)
+            ggml_backend_sched_new(backend_ptrs.data(), backend_buft.data(), backend_ptrs.size(), 8192, false, true)
         );
     }