mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-29 04:35:05 +00:00
llama-bench: add -d
depth arg (#13096)
* add depth param * update llama-bench README and add depth param * llama-bench: default params for depth arg for faster execution * Update examples/llama-bench/README.md Co-authored-by: Johannes Gäßler <johannesg@5d6.de> * fix buffer print ub * use user provided args * remove extra whitespaces --------- Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
@ -200,6 +200,7 @@ struct cmd_params {
|
||||
std::vector<int> n_prompt;
|
||||
std::vector<int> n_gen;
|
||||
std::vector<std::pair<int, int>> n_pg;
|
||||
std::vector<int> n_depth;
|
||||
std::vector<int> n_batch;
|
||||
std::vector<int> n_ubatch;
|
||||
std::vector<ggml_type> type_k;
|
||||
@ -233,6 +234,7 @@ static const cmd_params cmd_params_defaults = {
|
||||
/* n_prompt */ { 512 },
|
||||
/* n_gen */ { 128 },
|
||||
/* n_pg */ {},
|
||||
/* n_depth */ { 0 },
|
||||
/* n_batch */ { 2048 },
|
||||
/* n_ubatch */ { 512 },
|
||||
/* type_k */ { GGML_TYPE_F16 },
|
||||
@ -272,6 +274,7 @@ static void print_usage(int /* argc */, char ** argv) {
|
||||
printf(" -n, --n-gen <n> (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str());
|
||||
printf(" -pg <pp,tg> (default: %s)\n",
|
||||
join(transform_to_str(cmd_params_defaults.n_pg, pair_str), ",").c_str());
|
||||
printf(" -d, --n-depth <n> (default: %s)\n", join(cmd_params_defaults.n_depth, ",").c_str());
|
||||
printf(" -b, --batch-size <n> (default: %s)\n",
|
||||
join(cmd_params_defaults.n_batch, ",").c_str());
|
||||
printf(" -ub, --ubatch-size <n> (default: %s)\n",
|
||||
@ -409,6 +412,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
break;
|
||||
}
|
||||
params.n_pg.push_back({ std::stoi(p[0]), std::stoi(p[1]) });
|
||||
} else if (arg == "-d" || arg == "--n-depth") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
auto p = string_split<int>(argv[i], split_delim);
|
||||
params.n_depth.insert(params.n_depth.end(), p.begin(), p.end());
|
||||
} else if (arg == "-b" || arg == "--batch-size") {
|
||||
if (++i >= argc) {
|
||||
invalid_param = true;
|
||||
@ -739,6 +749,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
|
||||
if (params.n_pg.empty()) {
|
||||
params.n_pg = cmd_params_defaults.n_pg;
|
||||
}
|
||||
if (params.n_depth.empty()) {
|
||||
params.n_depth = cmd_params_defaults.n_depth;
|
||||
}
|
||||
if (params.n_batch.empty()) {
|
||||
params.n_batch = cmd_params_defaults.n_batch;
|
||||
}
|
||||
@ -801,6 +814,7 @@ struct cmd_params_instance {
|
||||
std::string model;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
int n_depth;
|
||||
int n_batch;
|
||||
int n_ubatch;
|
||||
ggml_type type_k;
|
||||
@ -880,7 +894,7 @@ struct cmd_params_instance {
|
||||
llama_context_params to_llama_cparams() const {
|
||||
llama_context_params cparams = llama_context_default_params();
|
||||
|
||||
cparams.n_ctx = n_prompt + n_gen;
|
||||
cparams.n_ctx = n_prompt + n_gen + n_depth;
|
||||
cparams.n_batch = n_batch;
|
||||
cparams.n_ubatch = n_ubatch;
|
||||
cparams.type_k = type_k;
|
||||
@ -916,6 +930,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
for (const auto & nt : params.n_threads)
|
||||
for (const auto & cm : params.cpu_mask)
|
||||
for (const auto & cs : params.cpu_strict)
|
||||
for (const auto & nd : params.n_depth)
|
||||
for (const auto & pl : params.poll) {
|
||||
for (const auto & n_prompt : params.n_prompt) {
|
||||
if (n_prompt == 0) {
|
||||
@ -925,6 +940,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ n_prompt,
|
||||
/* .n_gen = */ 0,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
@ -955,6 +971,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ 0,
|
||||
/* .n_gen = */ n_gen,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
@ -985,6 +1002,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
|
||||
/* .model = */ m,
|
||||
/* .n_prompt = */ n_pg.first,
|
||||
/* .n_gen = */ n_pg.second,
|
||||
/* .n_depth = */ nd,
|
||||
/* .n_batch = */ nb,
|
||||
/* .n_ubatch = */ nub,
|
||||
/* .type_k = */ tk,
|
||||
@ -1040,6 +1058,7 @@ struct test {
|
||||
bool embeddings;
|
||||
int n_prompt;
|
||||
int n_gen;
|
||||
int n_depth;
|
||||
std::string test_time;
|
||||
std::vector<uint64_t> samples_ns;
|
||||
|
||||
@ -1072,6 +1091,7 @@ struct test {
|
||||
embeddings = inst.embeddings;
|
||||
n_prompt = inst.n_prompt;
|
||||
n_gen = inst.n_gen;
|
||||
n_depth = inst.n_depth;
|
||||
// RFC 3339 date-time format
|
||||
time_t t = time(NULL);
|
||||
std::strftime(buf, sizeof(buf), "%FT%TZ", gmtime(&t));
|
||||
@ -1113,9 +1133,11 @@ struct test {
|
||||
"build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename",
|
||||
"model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads",
|
||||
"cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "use_mmap",
|
||||
"embeddings", "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
|
||||
"split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
|
||||
"use_mmap", "embeddings", "n_prompt", "n_gen", "test_time", "avg_ns",
|
||||
"stddev_ns", "avg_ts", "stddev_ts",
|
||||
"use_mmap", "embeddings", "n_prompt", "n_gen", "n_depth", "test_time",
|
||||
"avg_ns", "stddev_ns", "avg_ts", "stddev_ts",
|
||||
};
|
||||
return fields;
|
||||
}
|
||||
@ -1125,8 +1147,8 @@ struct test {
|
||||
static field_type get_field_type(const std::string & field) {
|
||||
if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" ||
|
||||
field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" ||
|
||||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "avg_ns" ||
|
||||
field == "stddev_ns") {
|
||||
field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" ||
|
||||
field == "avg_ns" || field == "stddev_ns") {
|
||||
return INT;
|
||||
}
|
||||
if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" ||
|
||||
@ -1204,6 +1226,7 @@ struct test {
|
||||
std::to_string(embeddings),
|
||||
std::to_string(n_prompt),
|
||||
std::to_string(n_gen),
|
||||
std::to_string(n_depth),
|
||||
test_time,
|
||||
std::to_string(avg_ns()),
|
||||
std::to_string(stdev_ns()),
|
||||
@ -1381,7 +1404,7 @@ struct markdown_printer : public printer {
|
||||
return 4;
|
||||
}
|
||||
if (field == "test") {
|
||||
return 13;
|
||||
return 15;
|
||||
}
|
||||
|
||||
int width = std::max((int) field.length(), 10);
|
||||
@ -1531,6 +1554,10 @@ struct markdown_printer : public printer {
|
||||
} else {
|
||||
snprintf(buf, sizeof(buf), "pp%d+tg%d", t.n_prompt, t.n_gen);
|
||||
}
|
||||
if (t.n_depth > 0) {
|
||||
int len = strlen(buf);
|
||||
snprintf(buf + len, sizeof(buf) - len, " @ d%d", t.n_depth);
|
||||
}
|
||||
value = buf;
|
||||
} else if (field == "t/s") {
|
||||
snprintf(buf, sizeof(buf), "%.2f ± %.2f", t.avg_ts(), t.stdev_ts());
|
||||
@ -1789,6 +1816,14 @@ int main(int argc, char ** argv) {
|
||||
for (int i = 0; i < params.reps; i++) {
|
||||
llama_kv_self_clear(ctx);
|
||||
|
||||
if (t.n_depth > 0) {
|
||||
if (params.progress) {
|
||||
fprintf(stderr, "llama-bench: benchmark %d/%zu: depth run %d/%d\n", params_idx, params_count,
|
||||
i + 1, params.reps);
|
||||
}
|
||||
test_prompt(ctx, t.n_depth, t.n_batch, t.n_threads);
|
||||
}
|
||||
|
||||
uint64_t t_start = get_time_ns();
|
||||
|
||||
if (t.n_prompt > 0) {
|
||||
|
Reference in New Issue
Block a user