llama-bench : accept ranges for integer parameters (#13410)

This commit is contained in:
Diego Devesa
2025-05-12 13:08:22 +02:00
committed by GitHub
parent a71a4075cd
commit 22cdab343b
2 changed files with 404 additions and 345 deletions

View File

@ -20,10 +20,20 @@ Performance testing tool for llama.cpp.
## Syntax ## Syntax
``` ```
usage: ./llama-bench [options] usage: llama-bench [options]
options: options:
-h, --help -h, --help
--numa <distribute|isolate|numactl> numa mode (default: disabled)
-r, --repetitions <n> number of times to repeat each test (default: 5)
--prio <0|1|2|3> process/thread priority (default: 0)
--delay <0...N> (seconds) delay between each test (default: 0)
-o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
-v, --verbose verbose output
--progress print test progress indicators
test parameters:
-m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf) -m, --model <filename> (default: models/7B/ggml-model-q4_0.gguf)
-p, --n-prompt <n> (default: 512) -p, --n-prompt <n> (default: 512)
-n, --n-gen <n> (default: 128) -n, --n-gen <n> (default: 128)
@ -33,7 +43,7 @@ options:
-ub, --ubatch-size <n> (default: 512) -ub, --ubatch-size <n> (default: 512)
-ctk, --cache-type-k <t> (default: f16) -ctk, --cache-type-k <t> (default: f16)
-ctv, --cache-type-v <t> (default: f16) -ctv, --cache-type-v <t> (default: f16)
-t, --threads <n> (default: 8) -t, --threads <n> (default: 16)
-C, --cpu-mask <hex,hex> (default: 0x0) -C, --cpu-mask <hex,hex> (default: 0x0)
--cpu-strict <0|1> (default: 0) --cpu-strict <0|1> (default: 0)
--poll <0...100> (default: 50) --poll <0...100> (default: 50)
@ -44,17 +54,15 @@ options:
-nkvo, --no-kv-offload <0|1> (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0)
-fa, --flash-attn <0|1> (default: 0) -fa, --flash-attn <0|1> (default: 0)
-mmp, --mmap <0|1> (default: 1) -mmp, --mmap <0|1> (default: 1)
--numa <distribute|isolate|numactl> (default: disabled)
-embd, --embeddings <0|1> (default: 0) -embd, --embeddings <0|1> (default: 0)
-ts, --tensor-split <ts0/ts1/..> (default: 0) -ts, --tensor-split <ts0/ts1/..> (default: 0)
-r, --repetitions <n> (default: 5) -ot --override-tensors <tensor name pattern>=<buffer type>;...
--prio <0|1|2|3> (default: 0) (default: disabled)
--delay <0...N> (seconds) (default: 0) -nopo, --no-op-offload <0|1> (default: 0)
-o, --output <csv|json|jsonl|md|sql> (default: md)
-oe, --output-err <csv|json|jsonl|md|sql> (default: none)
-v, --verbose (default: 0)
Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times. Multiple values can be given for each parameter by separating them with ','
or by specifying the parameter multiple times. Ranges can be given as
'start-end' or 'start-end+step' or 'start-end*mult'.
``` ```
llama-bench can perform three types of tests: llama-bench can perform three types of tests:

View File

@ -195,6 +195,40 @@ static std::string pair_str(const std::pair<int, int> & p) {
return buf; return buf;
} }
static std::vector<int> parse_int_range(const std::string & s) {
// first[-last[(+|*)step]]
std::regex range_regex(R"(^(\d+)(?:-(\d+)(?:([\+|\*])(\d+))?)?(?:,|$))");
std::smatch match;
std::string::const_iterator search_start(s.cbegin());
std::vector<int> result;
while (std::regex_search(search_start, s.cend(), match, range_regex)) {
int first = std::stoi(match[1]);
int last = match[2].matched ? std::stoi(match[2]) : first;
char op = match[3].matched ? match[3].str()[0] : '+';
int step = match[4].matched ? std::stoi(match[4]) : 1;
for (int i = first; i <= last;) {
result.push_back(i);
if (op == '+') {
i += step;
} else if (op == '*') {
i *= step;
} else {
throw std::invalid_argument("invalid range format");
}
}
search_start = match.suffix().first;
}
if (search_start != s.cend()) {
throw std::invalid_argument("invalid range format");
}
return result;
}
struct cmd_params { struct cmd_params {
std::vector<std::string> model; std::vector<std::string> model;
std::vector<int> n_prompt; std::vector<int> n_prompt;
@ -251,7 +285,7 @@ static const cmd_params cmd_params_defaults = {
/* no_kv_offload */ { false }, /* no_kv_offload */ { false },
/* flash_attn */ { false }, /* flash_attn */ { false },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) }, /* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{{nullptr,nullptr}} }, /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
/* use_mmap */ { true }, /* use_mmap */ { true },
/* embeddings */ { false }, /* embeddings */ { false },
/* no_op_offload */ { false }, /* no_op_offload */ { false },
@ -270,13 +304,29 @@ static void print_usage(int /* argc */, char ** argv) {
printf("\n"); printf("\n");
printf("options:\n"); printf("options:\n");
printf(" -h, --help\n"); printf(" -h, --help\n");
printf(" --numa <distribute|isolate|numactl> numa mode (default: disabled)\n");
printf(" -r, --repetitions <n> number of times to repeat each test (default: %d)\n",
cmd_params_defaults.reps);
printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n",
cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n",
cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> output format printed to stdout (default: %s)\n",
output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf("\n");
printf("test parameters:\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
printf(" -p, --n-prompt <n> (default: %s)\n", printf(" -p, --n-prompt <n> (default: %s)\n",
join(cmd_params_defaults.n_prompt, ",").c_str()); join(cmd_params_defaults.n_prompt, ",").c_str());
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
printf(" -pg <pp,tg> (default: %s)\n", printf(" -pg <pp,tg> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str()); join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str()); printf(" -d, --n-depth <n> (default: %s)\n",
join(cmd_params_defaults.n_depth, ",").c_str());
printf(" -b, --batch-size <n> (default: %s)\n", printf(" -b, --batch-size <n> (default: %s)\n",
join(cmd_params_defaults.n_batch, ",").c_str()); join(cmd_params_defaults.n_batch, ",").c_str());
printf(" -ub, --ubatch-size <n> (default: %s)\n", printf(" -ub, --ubatch-size <n> (default: %s)\n",
@ -308,25 +358,17 @@ static void print_usage(int /* argc */, char ** argv) {
join(cmd_params_defaults.flash_attn, ",").c_str()); join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", printf(" -mmp, --mmap <0|1> (default: %s)\n",
join(cmd_params_defaults.use_mmap, ",").c_str()); join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", printf(" -embd, --embeddings <0|1> (default: %s)\n",
join(cmd_params_defaults.embeddings, ",").c_str()); join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n"); printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;... (default: disabled)\n"); printf(" -ot --override-tensors <tensor name pattern>=<buffer type>;...\n");
printf(" -nopo, --no-op-offload <i> (default: 0)\n"); printf(" (default: disabled)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps); printf(" -nopo, --no-op-offload <0|1> (default: 0)\n");
printf(" --prio <0|1|2|3> (default: %d)\n", cmd_params_defaults.prio);
printf(" --delay <0...N> (seconds) (default: %d)\n", cmd_params_defaults.delay);
printf(" -o, --output <csv|json|jsonl|md|sql> (default: %s)\n",
output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> (default: %s)\n",
output_format_str(cmd_params_defaults.output_format_stderr));
printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0");
printf(" --progress (default: %s)\n", cmd_params_defaults.progress ? "1" : "0");
printf("\n"); printf("\n");
printf( printf(
"Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter " "Multiple values can be given for each parameter by separating them with ','\n"
"multiple times.\n"); "or by specifying the parameter multiple times. Ranges can be given as\n"
"'start-end' or 'start-end+step' or 'start-end*mult'.\n");
} }
static ggml_type ggml_type_from_name(const std::string & s) { static ggml_type ggml_type_from_name(const std::string & s) {
@ -380,186 +422,190 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
std::replace(arg.begin(), arg.end(), '_', '-'); std::replace(arg.begin(), arg.end(), '_', '-');
} }
if (arg == "-h" || arg == "--help") { try {
print_usage(argc, argv); if (arg == "-h" || arg == "--help") {
exit(0); print_usage(argc, argv);
} else if (arg == "-m" || arg == "--model") { exit(0);
if (++i >= argc) { } else if (arg == "-m" || arg == "--model") {
invalid_param = true; if (++i >= argc) {
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.model.insert(params.model.end(), p.begin(), p.end());
} else if (arg == "-p" || arg == "--n-prompt") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
} else if (arg == "-n" || arg == "--n-gen") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
} else if (arg == "-pg") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], ',');
if (p.size() != 2) {
invalid_param = true;
break;
}
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
} else if (arg == "-d" || arg == "--n-depth") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
} else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<int>(argv[i], split_delim);
params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true; invalid_param = true;
break; break;
} }
types.push_back(gt); auto p = string_split<std::string>(argv[i], split_delim);
} params.model.insert(params.model.end(), p.begin(), p.end());
if (invalid_param) { } else if (arg == "-p" || arg == "--n-prompt") {
break; if (++i >= argc) {
}
params.type_k.insert(params.type_k.end(), types.begin(), types.end());
} else if (arg == "-ctv" || arg == "--cache-type-v") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true; invalid_param = true;
break; break;
} }
types.push_back(gt); auto p = parse_int_range(argv[i]);
} params.n_prompt.insert(params.n_prompt.end(), p.begin(), p.end());
if (invalid_param) { } else if (arg == "-n" || arg == "--n-gen") {
break; if (++i >= argc) {
} invalid_param = true;
params.type_v.insert(params.type_v.end(), types.begin(), types.end()); break;
} else if (arg == "-t" || arg == "--threads") { }
if (++i >= argc) { auto p = parse_int_range(argv[i]);
invalid_param = true; params.n_gen.insert(params.n_gen.end(), p.begin(), p.end());
break; } else if (arg == "-pg") {
} if (++i >= argc) {
auto p = string_split<int>(argv[i], split_delim); invalid_param = true;
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end()); break;
} else if (arg == "-C" || arg == "--cpu-mask") { }
if (++i >= argc) { auto p = string_split<std::string>(argv[i], ',');
invalid_param = true; if (p.size() != 2) {
break; invalid_param = true;
} break;
auto p = string_split<std::string>(argv[i], split_delim); }
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end()); params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
} else if (arg == "--cpu-strict") { } else if (arg == "-d" || arg == "--n-depth") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = string_split<bool>(argv[i], split_delim); auto p = parse_int_range(argv[i]);
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end()); params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
} else if (arg == "--poll") { } else if (arg == "-b" || arg == "--batch-size") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = string_split<int>(argv[i], split_delim); auto p = parse_int_range(argv[i]);
params.poll.insert(params.poll.end(), p.begin(), p.end()); params.n_batch.insert(params.n_batch.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") { } else if (arg == "-ub" || arg == "--ubatch-size") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
auto p = string_split<int>(argv[i], split_delim); auto p = parse_int_range(argv[i]);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); params.n_ubatch.insert(params.n_ubatch.end(), p.begin(), p.end());
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { } else if (arg == "-ctk" || arg == "--cache-type-k") {
if (++i >= argc) { if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
params.rpc_servers.push_back(argv[i]); auto p = string_split<std::string>(argv[i], split_delim);
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) { std::vector<ggml_type> types;
invalid_param = true; for (const auto & t : p) {
break; ggml_type gt = ggml_type_from_name(t);
} if (gt == GGML_TYPE_COUNT) {
auto p = string_split<std::string>(argv[i], split_delim); invalid_param = true;
std::vector<llama_split_mode> modes; break;
for (const auto & m : p) { }
llama_split_mode mode; types.push_back(gt);
if (m == "none") { }
mode = LLAMA_SPLIT_MODE_NONE; if (invalid_param) {
} else if (m == "layer") { break;
mode = LLAMA_SPLIT_MODE_LAYER; }
} else if (m == "row") { params.type_k.insert(params.type_k.end(), types.begin(), types.end());
mode = LLAMA_SPLIT_MODE_ROW; } else if (arg == "-ctv" || arg == "--cache-type-v") {
} else { if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<ggml_type> types;
for (const auto & t : p) {
ggml_type gt = ggml_type_from_name(t);
if (gt == GGML_TYPE_COUNT) {
invalid_param = true;
break;
}
types.push_back(gt);
}
if (invalid_param) {
break;
}
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = parse_int_range(argv[i]);
params.n_threads.insert(params.n_threads.end(), p.begin(), p.end());
} else if (arg == "-C" || arg == "--cpu-mask") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
params.cpu_mask.insert(params.cpu_mask.end(), p.begin(), p.end());
} else if (arg == "--cpu-strict") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.cpu_strict.insert(params.cpu_strict.end(), p.begin(), p.end());
} else if (arg == "--poll") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = parse_int_range(argv[i]);
params.poll.insert(params.poll.end(), p.begin(), p.end());
} else if (arg == "-ngl" || arg == "--n-gpu-layers") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = parse_int_range(argv[i]);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
} else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers.push_back(argv[i]);
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<std::string>(argv[i], split_delim);
std::vector<llama_split_mode> modes;
for (const auto & m : p) {
llama_split_mode mode;
if (m == "none") {
mode = LLAMA_SPLIT_MODE_NONE;
} else if (m == "layer") {
mode = LLAMA_SPLIT_MODE_LAYER;
} else if (m == "row") {
mode = LLAMA_SPLIT_MODE_ROW;
} else {
invalid_param = true;
break;
}
modes.push_back(mode);
}
if (invalid_param) {
break;
}
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
} else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.main_gpu = parse_int_range(argv[i]);
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true; invalid_param = true;
break; break;
} }
modes.push_back(mode);
}
if (invalid_param) {
break;
}
params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end());
} else if (arg == "-mg" || arg == "--main-gpu") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.main_gpu = string_split<int>(argv[i], split_delim);
} else if (arg == "-nkvo" || arg == "--no-kv-offload") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto p = string_split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
} else {
std::string value(argv[i]); std::string value(argv[i]);
/**/ if (value == "distribute" || value == "") { if (value == "distribute" || value == "") {
params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE;
} else if (value == "isolate") { } else if (value == "isolate") {
params.numa = GGML_NUMA_STRATEGY_ISOLATE; params.numa = GGML_NUMA_STRATEGY_ISOLATE;
@ -569,177 +615,182 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
invalid_param = true; invalid_param = true;
break; break;
} }
} } else if (arg == "-fa" || arg == "--flash-attn") {
} else if (arg == "-fa" || arg == "--flash-attn") { if (++i >= argc) {
if (++i >= argc) { invalid_param = true;
invalid_param = true; break;
break; }
} auto p = string_split<bool>(argv[i], split_delim);
auto p = string_split<bool>(argv[i], split_delim); params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end()); } else if (arg == "-mmp" || arg == "--mmap") {
} else if (arg == "-mmp" || arg == "--mmap") { if (++i >= argc) {
if (++i >= argc) { invalid_param = true;
invalid_param = true; break;
break; }
} auto p = string_split<bool>(argv[i], split_delim);
auto p = string_split<bool>(argv[i], split_delim); params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end());
params.use_mmap.insert(params.use_mmap.end(), p.begin(), p.end()); } else if (arg == "-embd" || arg == "--embeddings") {
} else if (arg == "-embd" || arg == "--embeddings") { if (++i >= argc) {
if (++i >= argc) { invalid_param = true;
invalid_param = true; break;
break; }
} auto p = string_split<bool>(argv[i], split_delim);
auto p = string_split<bool>(argv[i], split_delim); params.embeddings.insert(params.embeddings.end(), p.begin(), p.end());
params.embeddings.insert(params.embeddings.end(), p.begin(), p.end()); } else if (arg == "-nopo" || arg == "--no-op-offload") {
} else if (arg == "-nopo" || arg == "--no-op-offload") { if (++i >= argc) {
if (++i >= argc) { invalid_param = true;
invalid_param = true; break;
break; }
} auto p = string_split<bool>(argv[i], split_delim);
auto p = string_split<bool>(argv[i], split_delim); params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end());
params.no_op_offload.insert(params.no_op_offload.end(), p.begin(), p.end()); } else if (arg == "-ts" || arg == "--tensor-split") {
} else if (arg == "-ts" || arg == "--tensor-split") { if (++i >= argc) {
if (++i >= argc) { invalid_param = true;
invalid_param = true; break;
break; }
} for (auto ts : string_split<std::string>(argv[i], split_delim)) {
for (auto ts : string_split<std::string>(argv[i], split_delim)) { // split string by ; and /
// split string by ; and / const std::regex regex{ R"([;/]+)" };
const std::regex regex{ R"([;/]+)" }; std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 };
std::sregex_token_iterator it{ ts.begin(), ts.end(), regex, -1 }; std::vector<std::string> split_arg{ it, {} };
std::vector<std::string> split_arg{ it, {} }; GGML_ASSERT(split_arg.size() <= llama_max_devices());
GGML_ASSERT(split_arg.size() <= llama_max_devices());
std::vector<float> tensor_split(llama_max_devices()); std::vector<float> tensor_split(llama_max_devices());
for (size_t i = 0; i < llama_max_devices(); ++i) { for (size_t i = 0; i < llama_max_devices(); ++i) {
if (i < split_arg.size()) { if (i < split_arg.size()) {
tensor_split[i] = std::stof(split_arg[i]); tensor_split[i] = std::stof(split_arg[i]);
} else { } else {
tensor_split[i] = 0.0f; tensor_split[i] = 0.0f;
}
}
params.tensor_split.push_back(tensor_split);
}
} else if (arg == "-ot" || arg == "--override-tensor") {
if (++i >= argc) {
invalid_param = true;
break;
}
auto value = argv[i];
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list;
if (buft_list.empty()) {
// enumerate all the devices and add their buffer types to the list
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
auto * dev = ggml_backend_dev_get(i);
auto * buft = ggml_backend_dev_buffer_type(dev);
if (buft) {
buft_list[ggml_backend_buft_name(buft)] = buft;
}
} }
} }
params.tensor_split.push_back(tensor_split); auto override_group_span_len = std::strcspn(value, ",");
} bool last_group = false;
} else if (arg == "-ot" || arg == "--override-tensor") { do {
if (++i >= argc) { if (override_group_span_len == 0) {
invalid_param = true; // Adds an empty override-tensors for an empty span
break; params.tensor_buft_overrides.push_back({{}});
} if (value[override_group_span_len] == '\0') {
auto value = argv[i]; value = &value[override_group_span_len];
/* static */ std::map<std::string, ggml_backend_buffer_type_t> buft_list; last_group = true;
if (buft_list.empty()) { } else {
// enumerate all the devices and add their buffer types to the list value = &value[override_group_span_len + 1];
for (size_t i = 0; i < ggml_backend_dev_count(); ++i) { override_group_span_len = std::strcspn(value, ",");
auto * dev = ggml_backend_dev_get(i); }
auto * buft = ggml_backend_dev_buffer_type(dev); continue;
if (buft) {
buft_list[ggml_backend_buft_name(buft)] = buft;
} }
} // Stamps null terminators into the argv
} // value for this option to avoid the
auto override_group_span_len = std::strcspn(value, ","); // memory leak present in the implementation
bool last_group = false; // over in arg.cpp. Acceptable because we
do { // only parse these args once in this program.
if (override_group_span_len == 0) { auto override_group = value;
// Adds an empty override-tensors for an empty span
params.tensor_buft_overrides.push_back({{}});
if (value[override_group_span_len] == '\0') { if (value[override_group_span_len] == '\0') {
value = &value[override_group_span_len]; value = &value[override_group_span_len];
last_group = true; last_group = true;
} else { } else {
value[override_group_span_len] = '\0';
value = &value[override_group_span_len + 1]; value = &value[override_group_span_len + 1];
override_group_span_len = std::strcspn(value, ",");
} }
continue; std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
} auto override_span_len = std::strcspn(override_group, ";");
// Stamps null terminators into the argv while (override_span_len > 0) {
// value for this option to avoid the auto override = override_group;
// memory leak present in the implementation if (override_group[override_span_len] != '\0') {
// over in arg.cpp. Acceptable because we override_group[override_span_len] = '\0';
// only parse these args once in this program. override_group = &override_group[override_span_len + 1];
auto override_group = value; } else {
if (value[override_group_span_len] == '\0') { override_group = &override_group[override_span_len];
value = &value[override_group_span_len];
last_group = true;
} else {
value[override_group_span_len] = '\0';
value = &value[override_group_span_len + 1];
}
std::vector<llama_model_tensor_buft_override> group_tensor_buft_overrides{};
auto override_span_len = std::strcspn(override_group, ";");
while (override_span_len > 0) {
auto override = override_group;
if (override_group[override_span_len] != '\0') {
override_group[override_span_len] = '\0';
override_group = &override_group[override_span_len + 1];
} else {
override_group = &override_group[override_span_len];
}
auto tensor_name_span_len = std::strcspn(override, "=");
if (tensor_name_span_len >= override_span_len) {
invalid_param = true;
break;
}
override[tensor_name_span_len] = '\0';
auto tensor_name = override;
auto buffer_type = &override[tensor_name_span_len + 1];
if (buft_list.find(buffer_type) == buft_list.end()) {
printf("Available buffer types:\n");
for (const auto & it : buft_list) {
printf(" %s\n", ggml_backend_buft_name(it.second));
} }
invalid_param = true; auto tensor_name_span_len = std::strcspn(override, "=");
if (tensor_name_span_len >= override_span_len) {
invalid_param = true;
break;
}
override[tensor_name_span_len] = '\0';
auto tensor_name = override;
auto buffer_type = &override[tensor_name_span_len + 1];
if (buft_list.find(buffer_type) == buft_list.end()) {
printf("Available buffer types:\n");
for (const auto & it : buft_list) {
printf(" %s\n", ggml_backend_buft_name(it.second));
}
invalid_param = true;
break;
}
group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)});
override_span_len = std::strcspn(override_group, ";");
}
if (invalid_param) {
break; break;
} }
group_tensor_buft_overrides.push_back({tensor_name, buft_list.at(buffer_type)}); group_tensor_buft_overrides.push_back({nullptr,nullptr});
override_span_len = std::strcspn(override_group, ";"); params.tensor_buft_overrides.push_back(group_tensor_buft_overrides);
} override_group_span_len = std::strcspn(value, ",");
if (invalid_param) { } while (!last_group);
} else if (arg == "-r" || arg == "--repetitions") {
if (++i >= argc) {
invalid_param = true;
break; break;
} }
group_tensor_buft_overrides.push_back({nullptr,nullptr}); params.reps = std::stoi(argv[i]);
params.tensor_buft_overrides.push_back(group_tensor_buft_overrides); } else if (arg == "--prio") {
override_group_span_len = std::strcspn(value, ","); if (++i >= argc) {
} while (!last_group); invalid_param = true;
} else if (arg == "-r" || arg == "--repetitions") { break;
if (++i >= argc) { }
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
} else if (arg == "--delay") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.delay = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
break;
}
invalid_param = !output_format_from_str(argv[i], params.output_format);
} else if (arg == "-oe" || arg == "--output-err") {
if (++i >= argc) {
invalid_param = true;
break;
}
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--progress") {
params.progress = true;
} else {
invalid_param = true; invalid_param = true;
break; break;
} }
params.reps = std::stoi(argv[i]); } catch (const std::exception & e) {
} else if (arg == "--prio") { fprintf(stderr, "error: %s\n", e.what());
if (++i >= argc) {
invalid_param = true;
break;
}
params.prio = (enum ggml_sched_priority) std::stoi(argv[i]);
} else if (arg == "--delay") {
if (++i >= argc) {
invalid_param = true;
break;
}
params.delay = std::stoi(argv[i]);
} else if (arg == "-o" || arg == "--output") {
if (++i >= argc) {
invalid_param = true;
break;
}
invalid_param = !output_format_from_str(argv[i], params.output_format);
} else if (arg == "-oe" || arg == "--output-err") {
if (++i >= argc) {
invalid_param = true;
break;
}
invalid_param = !output_format_from_str(argv[i], params.output_format_stderr);
} else if (arg == "-v" || arg == "--verbose") {
params.verbose = true;
} else if (arg == "--progress") {
params.progress = true;
} else {
invalid_param = true; invalid_param = true;
break; break;
} }
} }
if (invalid_param) { if (invalid_param) {
fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str()); fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
print_usage(argc, argv); print_usage(argc, argv);