mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-26 19:55:04 +00:00
llama-bench : accept ranges for integer parameters (#13410)
This commit is contained in:
@ -20,10 +20,20 @@ Performance testing tool for llama.cpp.
|
||||
## Syntax
|
||||
|
||||
```
|
||||
usage: ./llama-bench [options]
|
||||
usage: llama-bench [options]
|
||||
|
||||
options:
|
||||
-h, --help
|
||||
--numa <distribute|isolate|numactl> numa mode (default: disabled)
|
||||
-r, --repetitions <n> number of times to repeat each test (default: 5)
|
||||
--prio <0|1|2|3> process/thread priority (default: 0)
|
||||
--delay <0...N> (seconds) delay between each test (default: 0)
|
||||
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
|
||||
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
|
||||
-v, --verbose verbose output
|
||||
--progress print test progress indicators
|
||||
|
||||
test parameters:
|
||||
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
|
||||
-p, --n-prompt <n> (default: 512)
|
||||
-n, --n-gen <n> (default: 128)
|
||||
@ -33,7 +43,7 @@ options:
|
||||
-ub, --ubatch-size <n> (default: 512)
|
||||
-ctk, --cache-type-k <t> (default: f16)
|
||||
-ctv, --cache-type-v <t> (default: f16)
|
||||
-t, --threads <n> (default: 8)
|
||||
-t, --threads <n> (default: 16)
|
||||
-C, --cpu-mask <hex,hex> (default: 0x0)
|
||||
--cpu-strict <0|1> (default: 0)
|
||||
--poll <0...100> (default: 50)
|
||||
@ -44,17 +54,15 @@ options:
|
||||
-nkvo, --no-kv-offload <0|1> (default: 0)
|
||||
-fa, --flash-attn <0|1> (default: 0)
|
||||
-mmp, --mmap <0|1> (default: 1)
|
||||
--numa <distribute|isolate|numactl> (default: disabled)
|
||||
-embd, --embeddings <0|1> (default: 0)
|
||||
-ts, --tensor-split <ts0/ts1/..> (default: 0)
|
||||
-r, --repetitions <n> (default: 5)
|
||||
--prio <0|1|2|3> (default: 0)
|
||||
--delay <0...N> (seconds) (default: 0)
|
||||
-o, --output <csv|json|jsonl|md|sql> (default: md)
|
||||
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
|
||||
-v, --verbose (default: 0)
|
||||
-ot --override-tensors <tensor name pattern>=<buffer type>;...
|
||||
(default: disabled)
|
||||
-nopo, --no-op-offload <0|1> (default: 0)
|
||||
|
||||
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.
|
||||
Multiple values can be given for each parameter by separating them with ','
|
||||
or by specifying the parameter multiple times. Ranges can be given as
|
||||
'start-end' or 'start-end+step' or 'start-end*mult'.
|
||||
```
|
||||
|
||||
llama-bench can perform three types of tests:
|
||||
|
@ -195,6 +195,40 @@ static std::string pair_str(const std::pair<int, int> & p) {
|
||||
return buf;
|
||||
}
|
||||
|
||||
// Parses a comma-separated list of non-negative integers and integer ranges.
//
// Each element has the form  first[-last[(+|*)step]]:
//   "8"       -> 8
//   "1-4"     -> 1,2,3,4          (implicit "+1")
//   "0-32+8"  -> 0,8,16,24,32     (additive step)
//   "1-64*4"  -> 1,4,16,64        (multiplicative step)
// Elements are expanded in the order given and concatenated into one vector.
// An element whose first value is greater than its last value contributes
// nothing.
//
// Throws std::invalid_argument if the string does not match the grammar, or
// if a step cannot advance the range ("+0", "*1", "*0", or "*N" starting from
// 0) - without that check the expansion loop would never terminate.
// std::stoi throws std::out_of_range for numbers that do not fit in an int.
static std::vector<int> parse_int_range(const std::string & s) {
    // first[-last[(+|*)step]]
    // built once: constructing a std::regex is expensive and the pattern is constant
    static const std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");

    std::smatch                 match;
    std::string::const_iterator search_start(s.cbegin());
    std::vector<int>            result;

    // each successful search consumes one element (including its trailing ',');
    // '^' anchors at search_start because the range passed in begins there
    while (std::regex_search(search_start, s.cend(), match, range_regex)) {
        const int  first = std::stoi(match[1]);
        const int  last  = match[2].matched ? std::stoi(match[2]) : first;
        const char op    = match[3].matched ? match[3].str()[0] : '+';
        const int  step  = match[4].matched ? std::stoi(match[4]) : 1;

        for (int i = first; i <= last;) {
            result.push_back(i);

            // compute the next value in 64 bits: neither int + int nor int * int
            // can overflow a long long, so there is no signed-overflow UB here
            long long next = 0;
            if (op == '+') {
                next = static_cast<long long>(i) + step;
            } else if (op == '*') {
                next = static_cast<long long>(i) * step;
            } else {
                // the character class in the regex also admits '|'
                throw std::invalid_argument("invalid range format");
            }

            // a step that does not move forward would loop forever
            if (next <= i) {
                throw std::invalid_argument("invalid range");
            }

            // stop before narrowing: next may exceed INT_MAX once it is past last
            if (next > last) {
                break;
            }
            i = static_cast<int>(next);
        }

        search_start = match.suffix().first;
    }

    // anything left over did not match the grammar
    if (search_start != s.cend()) {
        throw std::invalid_argument("invalid range format");
    }

    return result;
}
|
||||
|
||||
struct cmd_params {
|
||||
std::vector<std::string> model;
|
||||
std::vector<int> n_prompt;
|
||||
@ -251,7 +285,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* no_kv_offload */ { false },
|
||||
/* flash_attn */ { false },
|
||||
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
|
||||
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} },
|
||||
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
|
||||
/* use_mmap */ { true },
|
||||
/* embeddings */ { false },
|
||||
/* no_op_offload */ { false },
|
||||
@ -270,13 +304,29 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help\n");
|
||||
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
|
||||
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
|
||||
cmd_params_defaults.reps);
|
||||
printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
|
||||
cmd_params_defaults.prio);
|
||||
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
|
||||
cmd_params_defaults.delay);
|
||||
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
|
||||
output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
|
||||
output_format_str(cmd_params_defaults.output_format_stderr));
|
||||
printf(" -v, --verbose verbose output\n");
|
||||
printf(" --progress print test progress indicators\n");
|
||||
printf("\n");
|
||||
printf("test parameters:\n");
|
||||
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
|
||||
printf(" -p, --n-prompt <n> (default: %s)\n",
|
||||
join(cmd_params_defaults.n_prompt, ",").c_str());
|
||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
printf(" -pg <pp,tg> (default: %s)\n",
|
||||
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
||||
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
|
||||
printf(" -d, --n-depth <n> (default: %s)\n",
|
||||
join(cmd_params_defaults.n_depth, ",").c_str());
|
||||
printf(" -b, --batch-size <n> (default: %s)\n",
|
||||
join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
||||
@ -308,25 +358,17 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
join(cmd_params_defaults.flash_attn, ",").c_str());
|
||||
printf(" -mmp, --mmap <0|1> (default: %s)\n",
|
||||
join(cmd_params_defaults.use_mmap, ",").c_str());
|
||||
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
|
||||
printf(" -embd, --embeddings <0|1> (default: %s)\n",
|
||||
join(cmd_params_defaults.embeddings, ",").c_str());
|
||||
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
|
||||
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n");
|
||||
printf(" -nopo, --no-op-offload <i> (default: 0)\n");
|
||||
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
|
||||
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
|
||||
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
|
||||
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
|
||||
output_format_str(cmd_params_defaults.output_format));
|
||||
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
|
||||
output_format_str(cmd_params_defaults.output_format_stderr));
|
||||
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
|
||||
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
|
||||
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
|
||||
printf(" (default: disabled)\n");
|
||||
printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
|
||||
printf("\n");
|
||||
printf(
|
||||
"Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter "
|
||||
"multiple times.\n");
|
||||
"Multiple values can be given for each parameter by separating them with ','\n"
|
||||
"or by specifying the parameter multiple times. Ranges can be given as\n"
|
||||
"'start-end' or 'start-end+step' or 'start-end*mult'.\n");
|
||||
}
|
||||
|
||||
static ggml_type ggml_type_from_name(const std::string & s) {
|
||||
@ -380,6 +422,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
std::replace(arg.begin(), arg.end(), '_', '-');
|
||||
}
|
||||
|
||||
try {
|
||||
if (arg == "-h" || arg == "--help") {
|
||||
print_usage(argc, argv);
|
||||
exit(0);
|
||||
@ -395,14 +438,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
auto p = parse_int_range(argv[i]);
|
||||
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
|
||||
} else if (arg == "-n" || arg == "--n-gen") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
auto p = parse_int_range(argv[i]);
|
||||
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
|
||||
} else if (arg == "-pg") {
|
||||
if (++i >= argc) {
|
||||
@ -420,21 +463,21 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
auto p = parse_int_range(argv[i]);
|
||||
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
auto p = parse_int_range(argv[i]);
|
||||
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ub" || arg == "--ubatch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
auto p = parse_int_range(argv[i]);
|
||||
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ctk" || arg == "--cache-type-k") {
|
||||
if (++i >= argc) {
|
||||
@ -442,6 +485,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
|
||||
std::vector<ggml_type> types;
|
||||
for (const auto & t : p) {
|
||||
ggml_type gt = ggml_type_from_name(t);
|
||||
@ -461,6 +505,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
|
||||
std::vector<ggml_type> types;
|
||||
for (const auto & t : p) {
|
||||
ggml_type gt = ggml_type_from_name(t);
|
||||
@ -479,7 +524,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
auto p = parse_int_range(argv[i]);
|
||||
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
|
||||
} else if (arg == "-C" || arg == "--cpu-mask") {
|
||||
if (++i >= argc) {
|
||||
@ -500,14 +545,14 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
auto p = parse_int_range(argv[i]);
|
||||
params.poll.insert(params.poll.end(), p.begin(), p.end());
|
||||
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
auto p = parse_int_range(argv[i]);
|
||||
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
|
||||
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
|
||||
if (++i >= argc) {
|
||||
@ -521,6 +566,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
auto p = string_split<std::string>(argv[i], split_delim);
|
||||
|
||||
std::vector<llama_split_mode> modes;
|
||||
for (const auto & m : p) {
|
||||
llama_split_mode mode;
|
||||
@ -545,7 +591,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.main_gpu = string_split<int>(argv[i], split_delim);
|
||||
params.main_gpu = parse_int_range(argv[i]);
|
||||
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@ -557,9 +603,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
} else {
|
||||
}
|
||||
std::string value(argv[i]);
|
||||
/**/ if (value == "distribute" || value == "") {
|
||||
if (value == "distribute" || value == "") {
|
||||
params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
|
||||
} else if (value == "isolate") {
|
||||
params.numa = GGML_NUMA_STRATEGY_ISOLATE;
|
||||
@ -569,7 +615,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (arg == "-fa" || arg == "--flash-attn") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@ -739,7 +784,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
} catch (const std::exception & e) {
|
||||
fprintf(stderr, "error: %s\n", e.what());
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (invalid_param) {
|
||||
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
|
||||
print_usage(argc, argv);
|
||||
|
Reference in New Issue
Block a user