mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-09-01 04:44:35 -04:00)
server : export max observed n_past value (#15361)
Add tracking for the high-watermark cache usage and make it available in the /metrics endpoint. Use case: tracking the largest cache usage needed under a realistic workload, to better understand memory requirements and to be able to adjust the cache size/quantization for the model accordingly.
commit e5155e6986
parent 21c17b5bef
committed by GitHub
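
To make the commit message's sizing use-case concrete, here is a minimal sketch (not part of this change; the model constants and per-value byte costs below are illustrative assumptions, not llama.cpp API) of turning a scraped n_past_max into a KV cache memory estimate:

    // Sketch: estimate KV cache memory from the exported n_past_max watermark.
    // Every constant here is an assumption chosen for illustration.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t n_past_max = 7168; // largest observed n_past, scraped from /metrics
        const uint64_t n_layer    = 32;   // model-dependent
        const uint64_t n_head_kv  = 8;    // model-dependent (GQA)
        const uint64_t head_dim   = 128;  // model-dependent
        const double   bytes_per_value = 2.0; // f16 cache; roughly 1.06 for q8_0, 0.56 for q4_0

        // K and V each store n_head_kv * head_dim values per layer, per cached token.
        const double bytes_per_token = 2.0 * n_layer * n_head_kv * head_dim * bytes_per_value;
        const double mib = (double) n_past_max * bytes_per_token / (1024.0 * 1024.0);

        std::printf("KV cache high watermark: ~%.0f MiB\n", mib);
        return 0;
    }

With these example numbers the watermark corresponds to roughly 896 MiB, the kind of figure one would compare against the configured context size or cache quantization type.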
@@ -1201,6 +1201,8 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_tokens_predicted_total = 0;
     uint64_t t_tokens_generation_total = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;
 
@@ -1226,6 +1228,8 @@ struct server_task_result_metrics : server_task_result {
             { "n_tokens_predicted_total", n_tokens_predicted_total },
             { "t_prompt_processing_total", t_prompt_processing_total },
 
+            { "n_past_max", n_past_max },
+
             { "n_prompt_tokens_processed", n_prompt_tokens_processed },
             { "t_prompt_processing", t_prompt_processing },
             { "n_tokens_predicted", n_tokens_predicted },
@@ -1587,6 +1591,8 @@ struct server_metrics {
     uint64_t n_tokens_predicted_total = 0;
     uint64_t t_tokens_generation_total = 0;
 
+    uint64_t n_past_max = 0;
+
     uint64_t n_prompt_tokens_processed = 0;
     uint64_t t_prompt_processing = 0;
 
@@ -1605,6 +1611,10 @@ struct server_metrics {
         n_prompt_tokens_processed += slot.n_prompt_tokens_processed;
         t_prompt_processing += slot.t_prompt_processing;
         t_prompt_processing_total += slot.t_prompt_processing;
+
+        if (slot.n_past > 0) {
+            n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+        }
     }
 
     void on_prediction(const server_slot & slot) {
@@ -1620,6 +1630,9 @@ struct server_metrics {
             if (slot.is_processing()) {
                 n_busy_slots_total++;
             }
+            if (slot.n_past > 0) {
+                n_past_max = std::max(n_past_max, (uint64_t) slot.n_past);
+            }
         }
     }
 
@@ -2875,6 +2888,8 @@ struct server_context {
            res->n_tokens_predicted_total = metrics.n_tokens_predicted_total;
            res->t_tokens_generation_total = metrics.t_tokens_generation_total;
 
+           res->n_past_max = metrics.n_past_max;
+
            res->n_prompt_tokens_processed = metrics.n_prompt_tokens_processed;
            res->t_prompt_processing = metrics.t_prompt_processing;
            res->n_tokens_predicted = metrics.n_tokens_predicted;
@@ -4077,6 +4092,10 @@ int main(int argc, char ** argv) {
                        {"name", "n_decode_total"},
                        {"help", "Total number of llama_decode() calls"},
                        {"value", res_metrics->n_decode_total}
                    }, {
+                        {"name", "n_past_max"},
+                        {"help", "Largest observed n_past."},
+                        {"value", res_metrics->n_past_max}
+                    }, {
                        {"name", "n_busy_slots_per_decode"},
                        {"help", "Average number of busy slots per llama_decode() call"},
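
For reference, the new entry is served by the /metrics endpoint next to the existing n_decode_total counter. Assuming the server's usual llamacpp: metric prefix and Prometheus text exposition (the TYPE shown simply follows the neighbouring n_decode_total entry and is an assumption), a scrape would contain lines roughly like:

    # HELP llamacpp:n_past_max Largest observed n_past.
    # TYPE llamacpp:n_past_max counter
    llamacpp:n_past_max 7168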