llama-bench : add defrag-thold, check for invalid ranges (#13487)

This commit is contained in:
Diego Devesa
2025-05-12 15:31:37 -07:00
committed by GitHub
parent f0d46ef157
commit cf0a43bb64
3 changed files with 49 additions and 15 deletions

View File

@@ -345,7 +345,7 @@ extern "C" {
         float    yarn_beta_fast;   // YaRN low correction dim
         float    yarn_beta_slow;   // YaRN high correction dim
         uint32_t yarn_orig_ctx;    // YaRN original context size
-        float    defrag_thold;     // defragment the KV cache if holes/size > thold, < 0 disabled (default)
+        float    defrag_thold;     // defragment the KV cache if holes/size > thold, <= 0 disabled (default)
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;

View File

@@ -43,12 +43,13 @@ test parameters:
   -ub, --ubatch-size <n>              (default: 512)
   -ctk, --cache-type-k <t>            (default: f16)
   -ctv, --cache-type-v <t>            (default: f16)
-  -t, --threads <n>                   (default: 16)
+  -dt, --defrag-thold <f>             (default: -1)
+  -t, --threads <n>                   (default: system dependent)
   -C, --cpu-mask <hex,hex>            (default: 0x0)
   --cpu-strict <0|1>                  (default: 0)
   --poll <0...100>                    (default: 50)
   -ngl, --n-gpu-layers <n>            (default: 99)
-  -rpc, --rpc <rpc_servers>           (default: )
+  -rpc, --rpc <rpc_servers>           (default: none)
   -sm, --split-mode <none|layer|row>  (default: layer)
   -mg, --main-gpu <i>                 (default: 0)
   -nkvo, --no-kv-offload <0|1>        (default: 0)
@@ -62,7 +63,7 @@ test parameters:
 Multiple values can be given for each parameter by separating them with ','
 or by specifying the parameter multiple times. Ranges can be given as
-'start-end' or 'start-end+step' or 'start-end*mult'.
+'first-last' or 'first-last+step' or 'first-last*mult'.
 ```
 llama-bench can perform three types of tests:

View File

@@ -211,6 +211,8 @@ static std::vector<int> parse_int_range(const std::string & s) {
     for (int i = first; i <= last;) {
         result.push_back(i);
+        int prev_i = i;
         if (op == '+') {
             i += step;
         } else if (op == '*') {
@@ -218,6 +220,10 @@ static std::vector<int> parse_int_range(const std::string & s) {
         } else {
             throw std::invalid_argument("invalid range format");
         }
+        if (i <= prev_i) {
+            throw std::invalid_argument("invalid range");
+        }
     }
     search_start = match.suffix().first;
 }
@@ -239,6 +245,7 @@ struct cmd_params {
     std::vector<int> n_ubatch;
     std::vector<ggml_type> type_k;
     std::vector<ggml_type> type_v;
+    std::vector<float> defrag_thold;
     std::vector<int> n_threads;
     std::vector<std::string> cpu_mask;
     std::vector<bool> cpu_strict;
@@ -274,6 +281,7 @@ static const cmd_params cmd_params_defaults = {
     /* n_ubatch     */ { 512 },
     /* type_k       */ { GGML_TYPE_F16 },
     /* type_v       */ { GGML_TYPE_F16 },
+    /* defrag_thold */ { -1.0f },
     /* n_threads    */ { cpu_get_num_math() },
     /* cpu_mask     */ { "0x0" },
     /* cpu_strict   */ { false },
@@ -335,6 +343,8 @@ static void print_usage(int /* argc */, char ** argv) {
            join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str());
     printf("  -ctv, --cache-type-v <t>            (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str());
+    printf("  -dt, --defrag-thold <f>             (default: %s)\n",
+           join(cmd_params_defaults.defrag_thold, ",").c_str());
     printf("  -t, --threads <n>                   (default: %s)\n",
            join(cmd_params_defaults.n_threads, ",").c_str());
     printf("  -C, --cpu-mask <hex,hex>            (default: %s)\n",
@@ -368,7 +378,7 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(
         "Multiple values can be given for each parameter by separating them with ','\n"
         "or by specifying the parameter multiple times. Ranges can be given as\n"
-        "'start-end' or 'start-end+step' or 'start-end*mult'.\n");
+        "'first-last' or 'first-last+step' or 'first-last*mult'.\n");
 }

 static ggml_type ggml_type_from_name(const std::string & s) {
@@ -519,6 +529,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+        } else if (arg == "-dt" || arg == "--defrag-thold") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = string_split<float>(argv[i], split_delim);
+            params.defrag_thold.insert(params.defrag_thold.end(), p.begin(), p.end());
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -825,6 +842,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.type_v.empty()) {
         params.type_v = cmd_params_defaults.type_v;
     }
+    if (params.defrag_thold.empty()) {
+        params.defrag_thold = cmd_params_defaults.defrag_thold;
+    }
     if (params.n_gpu_layers.empty()) {
         params.n_gpu_layers = cmd_params_defaults.n_gpu_layers;
     }
@@ -883,6 +903,7 @@ struct cmd_params_instance {
     int n_ubatch;
     ggml_type type_k;
     ggml_type type_v;
+    float defrag_thold;
     int n_threads;
     std::string cpu_mask;
     bool cpu_strict;
@@ -959,15 +980,16 @@ struct cmd_params_instance {
     llama_context_params to_llama_cparams() const {
         llama_context_params cparams = llama_context_default_params();
         cparams.n_ctx = n_prompt + n_gen + n_depth;
         cparams.n_batch = n_batch;
         cparams.n_ubatch = n_ubatch;
         cparams.type_k = type_k;
         cparams.type_v = type_v;
+        cparams.defrag_thold = defrag_thold;
         cparams.offload_kqv = !no_kv_offload;
         cparams.flash_attn = flash_attn;
         cparams.embeddings = embeddings;
         cparams.op_offload = !no_op_offload;
         return cparams;
     }
@@ -992,6 +1014,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
     for (const auto & nub : params.n_ubatch)
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
+    for (const auto & defrag_thold : params.defrag_thold)
     for (const auto & nkvo : params.no_kv_offload)
     for (const auto & fa : params.flash_attn)
     for (const auto & nt : params.n_threads)
@@ -1012,6 +1035,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
             /* .n_ubatch     = */ nub,
             /* .type_k       = */ tk,
             /* .type_v       = */ tv,
+            /* .defrag_thold = */ defrag_thold,
             /* .n_threads    = */ nt,
             /* .cpu_mask     = */ cm,
             /* .cpu_strict   = */ cs,
@@ -1044,6 +1068,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
             /* .n_ubatch     = */ nub,
             /* .type_k       = */ tk,
             /* .type_v       = */ tv,
+            /* .defrag_thold = */ defrag_thold,
             /* .n_threads    = */ nt,
             /* .cpu_mask     = */ cm,
             /* .cpu_strict   = */ cs,
@@ -1076,6 +1101,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_params & params) {
             /* .n_ubatch     = */ nub,
             /* .type_k       = */ tk,
             /* .type_v       = */ tv,
+            /* .defrag_thold = */ defrag_thold,
             /* .n_threads    = */ nt,
             /* .cpu_mask     = */ cm,
             /* .cpu_strict   = */ cs,
@@ -1117,6 +1143,7 @@ struct test {
     int poll;
     ggml_type type_k;
     ggml_type type_v;
+    float defrag_thold;
     int n_gpu_layers;
     llama_split_mode split_mode;
     int main_gpu;
@@ -1151,6 +1178,7 @@ struct test {
         poll = inst.poll;
         type_k = inst.type_k;
         type_v = inst.type_v;
+        defrag_thold = inst.defrag_thold;
         n_gpu_layers = inst.n_gpu_layers;
         split_mode = inst.split_mode;
         main_gpu = inst.main_gpu;
@@ -1206,6 +1234,7 @@ struct test {
         "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
         "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
         "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
+        "defrag_thold",
         "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time",
         "avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
     };
@@ -1225,7 +1254,7 @@ struct test {
             field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
-        if (field == "avg_ts" || field == "stddev_ts") {
+        if (field == "avg_ts" || field == "stddev_ts" || field == "defrag_thold") {
             return FLOAT;
         }
         return STRING;
@@ -1292,6 +1321,7 @@ struct test {
             std::to_string(flash_attn),
             tensor_split_str,
             tensor_buft_overrides_str,
+            std::to_string(defrag_thold),
             std::to_string(use_mmap),
             std::to_string(embeddings),
             std::to_string(no_op_offload),
@@ -1558,6 +1588,9 @@ struct markdown_printer : public printer {
         if (params.type_v.size() > 1 || params.type_v != cmd_params_defaults.type_v) {
             fields.emplace_back("type_v");
         }
+        if (params.defrag_thold.size() > 1 || params.defrag_thold != cmd_params_defaults.defrag_thold) {
+            fields.emplace_back("defrag_thold");
+        }
         if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) {
             fields.emplace_back("main_gpu");
         }