llama : auto-batch preparation (#13845)

* llama : auto-batch

ggml-ci

* context : simplify if branching
Author:    Georgi Gerganov
Date:      2025-05-31 12:55:57 +03:00
Committer: GitHub
Parent:    51fa76f172
Commit:    3f55f781f1

5 changed files with 67 additions and 54 deletions
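
Summary: the defrag-and-retry fallback moves from the public llama_decode() wrapper down into llama_context::decode(). When preparing a batch fails to find a free KV cache slot, decode() now schedules an immediate defrag and retries the batch once, but only if kv_self_update() (which now returns a bool) reports that the cache actually changed. A return value of 1 from llama_decode() therefore means the automatic recovery has already been attempted.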


@@ -392,7 +392,7 @@ int main(int argc, char ** argv) {
             return 1;
         }
 
-        LOG_ERR("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
+        LOG_WRN("%s : failed to decode the batch, retrying with n_batch = %d\n", __func__, n_batch / 2);
 
         n_cache_miss += 1;
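
(The downgrade from LOG_ERR to LOG_WRN reflects the new semantics: with decode() defragging and retrying on its own, running out of KV cache slots is a recoverable condition that this halve-and-retry loop is expected to handle.)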


@@ -424,28 +424,33 @@ const llama_kv_cache * llama_context::get_kv_self() const {
     return kv_self;
 }
 
-void llama_context::kv_self_update() {
+bool llama_context::kv_self_update() {
     if (!memory) {
-        return;
+        return false;
     }
 
     llama_kv_cache * kv_self = static_cast<llama_kv_cache *>(memory.get());
 
-    if (kv_self->update(*this)) {
-        // if the KV cache did any computation, we have to reserve a new worst-case graph
-        const auto kv_state = kv_self->init_full();
-        if (!kv_state) {
-            throw std::runtime_error("failed to initialize KV cache");
-        }
-
-        const uint32_t n_seqs   = cparams.n_seq_max;
-        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
-
-        auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
-        if (!gf) {
-            LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
-        }
+    if (!kv_self->update(*this)) {
+        // no updates have been performed
+        return false;
     }
+
+    // if the KV cache did any computation, we have to reserve a new worst-case graph
+    const auto kv_state = kv_self->init_full();
+    if (!kv_state) {
+        throw std::runtime_error("failed to initialize KV cache");
+    }
+
+    const uint32_t n_seqs   = cparams.n_seq_max;
+    const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+
+    auto * gf = graph_reserve(n_tokens, n_seqs, n_tokens, kv_state.get());
+    if (!gf) {
+        LLAMA_LOG_ERROR("%s: failed to reserve graph after the KV cache update\n", __func__);
+    }
+
+    return true;
 }
 
 enum llama_pooling_type llama_context::pooling_type() const {
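
kv_self_update() is also flattened into early returns (the "simplify if branching" part of the commit message). Its new contract: return false when no defrag/shift work was performed, true when the cache was mutated and a new worst-case graph has been reserved. decode() relies on exactly this signal below to decide whether retrying a failed batch can possibly succeed.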
@@ -933,24 +938,44 @@ int llama_context::decode(llama_batch & inp_batch) {
     // handle any pending defrags/shifts
     kv_self_update();
 
-    auto kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
-    if (!kv_state) {
-        return -2;
-    }
+    llama_memory_state_ptr kv_state;
 
-    switch (kv_state->get_status()) {
-        case LLAMA_MEMORY_STATUS_SUCCESS:
-            {
-            } break;
-        case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
-            {
-                // not a fatal error, we can re-try with a different batch
-                return 1;
-            }
-        case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
-            {
-                return -2;
-            }
+    bool did_defrag = false;
+
+    while (true) {
+        kv_state = kv_self->init_batch(batch, cparams.n_ubatch, embd_pooled, /* logits_all */ n_outputs_all == n_tokens_all);
+        if (!kv_state) {
+            return -2;
+        }
+
+        switch (kv_state->get_status()) {
+            case LLAMA_MEMORY_STATUS_SUCCESS:
+                {
+                } break;
+            case LLAMA_MEMORY_STATUS_FAILED_PREPARE:
+                {
+                    if (!did_defrag) {
+                        did_defrag = true;
+
+                        kv_self->defrag_sched(-1.0f);
+                        if (kv_self_update()) {
+                            LLAMA_LOG_DEBUG("%s: failed to init batch of size %d, retrying after defrag\n", __func__, batch.n_tokens);
+
+                            continue;
+                        }
+                    }
+
+                    LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
+
+                    return 1;
+                }
+            case LLAMA_MEMORY_STATUS_FAILED_COMPUTE:
+                {
+                    return -2;
+                }
+        }
+
+        break;
     }
 
     // reserve output buffer
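
This loop is the heart of the change. On LLAMA_MEMORY_STATUS_FAILED_PREPARE, decode() schedules an unconditional defrag (a threshold of -1.0f is below any fragmentation value, so defrag_sched() always queues the work) and goes back to init_batch() at most once, guarded by did_defrag. If kv_self_update() returns false, the cache is unchanged, a second init_batch() would fail identically, and the function returns 1 right away, as before.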
@@ -2646,22 +2671,8 @@ int32_t llama_encode(
 int32_t llama_decode(
         llama_context * ctx,
           llama_batch   batch) {
-    int ret = ctx->decode(batch);
-
-    // defrag and try again
-    // TODO: distinguish return code when we are sure that even after defrag there is no space available
-    if (ret == 1) {
-        llama_kv_self_defrag(ctx);
-        ret = ctx->decode(batch);
-
-        if (ret == 1) {
-            LLAMA_LOG_WARN("%s: failed to find KV cache slot for batch of size %d\n", __func__, batch.n_tokens);
-
-            return ret;
-        }
-    }
-
-    if (ret != 0) {
+    const int ret = ctx->decode(batch);
+    if (ret != 0 && ret != 1) {
         LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
     }
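
The public wrapper shrinks accordingly: the return codes keep their shape, but 1 now means "no KV cache slot, even after the internal defrag retry". A minimal caller-side sketch of the backoff pattern used by the parallel example and the server above (the helper below is illustrative, not part of the library API):

#include <algorithm>

#include "llama.h"

// Hypothetical helper: feed n_tokens tokens through llama_decode() in chunks,
// halving the chunk size whenever decode returns 1 ("no KV cache slot found").
// After this commit, ret == 1 already implies that an immediate defrag was
// attempted inside decode, so shrinking the batch is the only lever left.
static int32_t decode_with_backoff(llama_context * ctx, llama_token * tokens, int32_t n_tokens, int32_t n_batch) {
    for (int32_t i = 0; i < n_tokens; ) {
        const int32_t n_eval = std::min(n_batch, n_tokens - i);

        const int32_t ret = llama_decode(ctx, llama_batch_get_one(tokens + i, n_eval));

        if (ret == 0) {
            i += n_eval;  // chunk accepted - advance to the next one
        } else if (ret == 1 && n_batch > 1) {
            n_batch /= 2; // out of KV cache space - retry with half the batch size
        } else {
            return ret;   // fatal error, or already down to single-token batches
        }
    }

    return 0;
}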


@@ -50,8 +50,9 @@ struct llama_context {
           llama_kv_cache * get_kv_self();
     const llama_kv_cache * get_kv_self() const;
 
+    // return true if the KV cache was updated
     // TODO: remove
-    void kv_self_update();
+    bool kv_self_update();
 
     enum llama_pooling_type pooling_type() const;


@@ -1809,9 +1809,10 @@ llama_pos llama_kv_cache_unified_iswa::seq_pos_max(llama_seq_id seq_id) const {
 llama_memory_state_ptr llama_kv_cache_unified_iswa::init_batch(const llama_batch & batch, uint32_t n_ubatch, bool embd_pooled, bool logits_all) {
     GGML_UNUSED(embd_pooled);
 
-    auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
-
-    // TODO: if we fail with split_simple, we should attempt split_equal
+    // TODO: if we fail with split_simple, we should attempt different splitting strategies
+    //       but to do that properly, we first have to refactor the batches to be more flexible
+    auto sbatch = llama_sbatch(batch, hparams.n_embd, true, logits_all);
+
     std::vector<llama_ubatch> ubatches;


@@ -3431,7 +3431,7 @@ struct server_context {
                     // retry with half the batch size to try to find a free slot in the KV cache
                     n_batch /= 2;
 
-                    SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size - try increasing it via the context size or enable defragmentation, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
+                    SRV_WRN("failed to find free space in the KV cache, retrying with smaller batch size, i = %d, n_batch = %d, ret = %d\n", i, n_batch, ret);
 
                     continue; // continue loop of n_batch
                 }
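
Dropping the "enable defragmentation" advice follows from the rest of the commit: decode() now forces a defrag on its own before reporting failure, so suggesting that the user enable it manually would be misleading. Halving n_batch, and ultimately a larger context, are the remedies that remain.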