diff --git a/common/arg.cpp b/common/arg.cpp
index 7744fd6c4..0a4a15e7f 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2095,6 +2095,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.no_kv_offload = true;
         }
     ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+    add_opt(common_arg(
+        {"-nr", "--no-repack"},
+        "disable weight repacking",
+        [](common_params & params) {
+            params.no_extra_bufts = true;
+        }
+    ).set_env("LLAMA_ARG_NO_REPACK"));
     add_opt(common_arg(
         {"-ctk", "--cache-type-k"}, "TYPE",
         string_format(
diff --git a/common/common.cpp b/common/common.cpp
index d8c4d988b..c6962d1d1 100644
--- a/common/common.cpp
+++ b/common/common.cpp
@@ -1122,6 +1122,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.use_mmap        = params.use_mmap;
     mparams.use_mlock       = params.use_mlock;
     mparams.check_tensors   = params.check_tensors;
+    mparams.use_extra_bufts = !params.no_extra_bufts;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;
diff --git a/common/common.h b/common/common.h
index f5acf37ff..45cf07b15 100644
--- a/common/common.h
+++ b/common/common.h
@@ -359,6 +359,7 @@ struct common_params {
     bool warmup            = true;  // warmup run
     bool check_tensors     = false; // validate tensor data
     bool no_op_offload     = false; // globally disable offload host tensor operations to device
+    bool no_extra_bufts    = false; // disable extra buffer types (used for weight repacking)
 
     bool single_turn = false; // single turn chat conversation
 
diff --git a/include/llama.h b/include/llama.h
index 1a51e74a8..2cbe18d8c 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -284,10 +284,11 @@ extern "C" {
         const struct llama_model_kv_override * kv_overrides;
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
-        bool vocab_only;    // only load the vocabulary, no weights
-        bool use_mmap;      // use mmap if possible
-        bool use_mlock;     // force system to keep model in RAM
-        bool check_tensors; // validate model tensor data
+        bool vocab_only;      // only load the vocabulary, no weights
+        bool use_mmap;        // use mmap if possible
+        bool use_mlock;       // force system to keep model in RAM
+        bool check_tensors;   // validate model tensor data
+        bool use_extra_bufts; // use extra buffer types (used for weight repacking)
     };
 
     // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 56c2ecd4c..3983a6932 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -290,7 +290,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
 }
 
 // CPU: ACCEL -> GPU host -> CPU extra -> CPU
-static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices) {
+static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & devices, bool use_extra_bufts) {
     buft_list_t buft_list;
 
     // add ACCEL buffer types
@@ -319,21 +319,22 @@ static buft_list_t make_cpu_buft_list(const std::vector<ggml_backend_dev_t> & de
         }
     }
 
-    // add extra buffer types, only if no GPU device is present
-    // ref: https://github.com/ggml-org/llama.cpp/issues/12481#issuecomment-2743136094
-    auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    if (cpu_dev == nullptr) {
-        throw std::runtime_error(format("%s: no CPU backend found", __func__));
-    }
+    // add extra buffer types
+    if (use_extra_bufts) {
+        auto * cpu_dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
+        if (cpu_dev == nullptr) {
+            throw std::runtime_error(format("%s: no CPU backend found", __func__));
+        }
 
-    auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
-    auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
-        ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
-    if (ggml_backend_dev_get_extra_bufts_fn) {
-        ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
-        while (extra_bufts && *extra_bufts) {
-            buft_list.emplace_back(cpu_dev, *extra_bufts);
-            ++extra_bufts;
+        auto * cpu_reg = ggml_backend_dev_backend_reg(cpu_dev);
+        auto ggml_backend_dev_get_extra_bufts_fn = (ggml_backend_dev_get_extra_bufts_t)
+            ggml_backend_reg_get_proc_address(cpu_reg, "ggml_backend_dev_get_extra_bufts");
+        if (ggml_backend_dev_get_extra_bufts_fn) {
+            ggml_backend_buffer_type_t * extra_bufts = ggml_backend_dev_get_extra_bufts_fn(cpu_dev);
+            while (extra_bufts && *extra_bufts) {
+                buft_list.emplace_back(cpu_dev, *extra_bufts);
+                ++extra_bufts;
+            }
         }
     }
 
@@ -1839,7 +1840,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
     LLAMA_LOG_INFO("%s: loading model tensors, this can take a while... (mmap = %s)\n", __func__, ml.use_mmap ? "true" : "false");
 
     // build a list of buffer types for the CPU and GPU devices
-    pimpl->cpu_buft_list = make_cpu_buft_list(devices);
+    pimpl->cpu_buft_list = make_cpu_buft_list(devices, params.use_extra_bufts);
     for (auto * dev : devices) {
         buft_list_t buft_list = make_gpu_buft_list(dev, split_mode, tensor_split);
         // add CPU buffer types as a fallback
@@ -2044,7 +2045,13 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 for (const auto * overrides = ml.tensor_buft_overrides; overrides->pattern != nullptr; ++overrides) {
                     std::regex pattern(overrides->pattern);
                     if (std::regex_search(tensor_name, pattern)) {
-                        buft = overrides->buft;
+                        if (overrides->buft == ggml_backend_cpu_buffer_type()) {
+                            // when overriding to a CPU buffer, consider the extra buffer types
+                            buft = select_weight_buft(hparams, t_meta, op, pimpl->cpu_buft_list);
+                        } else {
+                            buft = overrides->buft;
+                        }
+
                         LLAMA_LOG_DEBUG("tensor %s (%zu MiB %s) buffer type overridden to %s\n",
                                 tensor_name.c_str(),
                                 ggml_nbytes(t_meta) / 1024 / 1024, ggml_type_name(t_meta->type),
@@ -17839,6 +17846,7 @@ llama_model_params llama_model_default_params() {
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
         /*.check_tensors               =*/ false,
+        /*.use_extra_bufts             =*/ true,
     };
 
 #ifdef GGML_USE_METAL