From ec428b02c347767f24c78111309e3f30d2ada289 Mon Sep 17 00:00:00 2001
From: Diego Devesa
Date: Mon, 4 Aug 2025 16:05:36 -0700
Subject: [PATCH] llama : add --n-cpu-moe option (#15077)

* llama : add --n-cpu-moe option

Keeps the MoE weights of the first N layers in the CPU
---
 common/arg.cpp | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index a02db0b0a..013616cc3 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
-                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
             }
         }
     ));
     add_opt(common_arg(
-        {"--cpu-moe"},
-        "use CPU for Mixture of Experts (MoE) weights",
+        {"--cpu-moe", "-cmoe"},
+        "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
+    add_opt(common_arg(
+        {"--n-cpu-moe", "-ncmoe"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                // keep strings alive and avoid leaking memory by storing them in a static vector
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
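
For reference, a minimal standalone sketch of how a per-layer pattern generated by --n-cpu-moe selects tensors. The patch's patterns are unanchored, consistent with substring-style regex matching, so std::regex_search is used here to mirror that; the tensor names, the variable names (layer0, names), and the main() driver are illustrative assumptions, not code from the patch or from llama.cpp itself:

// Sketch (not part of the patch): demonstrates how a pattern like the one
// built with string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i) matches
// MoE tensor names of the blk.<n>.ffn_{up,down,gate}_exps form.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    // pattern for layer 0, as --n-cpu-moe would generate for i = 0
    const std::regex layer0("blk\\.0\\.ffn_(up|down|gate)_exps");

    // illustrative tensor names, assumed for this sketch
    const std::vector<std::string> names = {
        "blk.0.ffn_up_exps.weight",   // matches  -> overridden to the CPU buffer type
        "blk.0.ffn_down_exps.weight", // matches  -> overridden to the CPU buffer type
        "blk.0.attn_q.weight",        // no match -> default placement
        "blk.1.ffn_up_exps.weight",   // layer 1  -> not matched by the layer-0 pattern
    };

    for (const std::string & name : names) {
        printf("%-28s %s\n", name.c_str(),
               std::regex_search(name, layer0) ? "CPU" : "default");
    }
    return 0;
}

In use, the new flag composes with GPU offload: for example, --n-gpu-layers 99 --n-cpu-moe 10 (layer counts here are placeholders) offloads the model while keeping the expert weights of the first 10 layers in host memory.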