mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-28 20:25:20 +00:00
server: Add "tokens per second" information in the backend (#10548)
* add cmake rvv support * add timings * remove space * update readme * fix * fix code * remove empty line * add test --------- Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
This commit is contained in:
@ -650,6 +650,10 @@ static json format_final_response_oaicompat(const json & request, const json & r
|
||||
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
|
||||
}
|
||||
|
||||
if (result.contains("timings")) {
|
||||
res.push_back({"timings", json_value(result, "timings", json::object())});
|
||||
}
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
@ -740,6 +744,11 @@ static std::vector<json> format_partial_response_oaicompat(const json & result,
|
||||
{"model", modelname},
|
||||
{"object", "chat.completion.chunk"}
|
||||
};
|
||||
|
||||
if (result.contains("timings")) {
|
||||
ret.push_back({"timings", json_value(result, "timings", json::object())});
|
||||
}
|
||||
|
||||
if (!finish_reason.empty()) {
|
||||
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
|
||||
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
|
||||
|
Reference in New Issue
Block a user