mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-27 03:55:20 +00:00
server : improve error reporting (#13680)
This commit is contained in:
@@ -3366,14 +3366,29 @@ struct server_context {
|
|||||||
metrics.on_decoded(slots);
|
metrics.on_decoded(slots);
|
||||||
|
|
||||||
if (ret != 0) {
|
if (ret != 0) {
|
||||||
if (n_batch == 1 || ret < 0) {
|
{
|
||||||
// if you get here, it means the KV cache is full - try increasing it via the context size
|
std::string err;
|
||||||
SRV_ERR("failed to decode the batch: KV cache is full - try increasing it via the context size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
|
|
||||||
|
if (n_batch == 1 && ret == 1) {
|
||||||
|
err = "Context size has been exceeded.";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ret == -1) {
|
||||||
|
err = "Invalid input batch.";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ret < -1) {
|
||||||
|
err = "Compute error.";
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!err.empty()) {
|
||||||
|
SRV_ERR("%s, i = %d, n_batch = %d, ret = %d\n", err.c_str(), i, n_batch, ret);
|
||||||
for (auto & slot : slots) {
|
for (auto & slot : slots) {
|
||||||
slot.release();
|
slot.release();
|
||||||
send_error(slot, "Input prompt is too big compared to KV size. Please try increasing KV size.");
|
send_error(slot, err);
|
||||||
|
}
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
break; // break loop of n_batch
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// retry with half the batch size to try to find a free slot in the KV cache
|
// retry with half the batch size to try to find a free slot in the KV cache
|
||||||
|
Reference in New Issue
Block a user