llama : use n_swa + n_ubatch cells for SWA cache (#13833)

* llama : use n_swa + n_ubatch cells for SWA cache

ggml-ci

* llama : add warning about multi-sequence SWA contexts
Author:    Georgi Gerganov
Date:      2025-05-31 15:57:44 +03:00
Committed: GitHub
Parent:    c7e0a2054b
Commit:    3600cc2886

6 changed files with 24 additions and 11 deletions
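
For context, the idea of the change: the sliding-window-attention (SWA) branch of the KV cache only needs to hold the attention window plus the tokens of one in-flight micro-batch, not the full context. Below is a minimal, runnable C++ sketch of that sizing; n_swa, n_ubatch, and n_seq_max appear in the commit, but the padding granularity, the n_seq_max factor, and all concrete values are illustrative assumptions, not the verbatim implementation:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Same rounding as ggml's GGML_PAD macro.
    static uint32_t pad_to(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; }

    int main() {
        // Hypothetical values standing in for hparams/cparams fields.
        const uint32_t n_ctx     = 8192; // full context (base cache size)
        const uint32_t n_swa     = 1024; // sliding-window size of the model
        const uint32_t n_ubatch  = 512;  // micro-batch size
        const uint32_t n_seq_max = 1;    // number of parallel sequences
        const uint32_t padding   = 256;  // assumed cache padding granularity

        // Before this commit the SWA cache used the full n_ctx cells; after it,
        // only the window plus one in-flight micro-batch must stay resident.
        const uint32_t size_swa = std::min(n_ctx, pad_to(n_swa*n_seq_max + n_ubatch, padding));

        printf("SWA cache cells: %u (instead of %u)\n", size_swa, n_ctx);
        return 0;
    }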


@@ -123,6 +123,11 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
+    if (!params.swa_full && cparams.n_seq_max > 1) {
+        LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
+                __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
+    }
+
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
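
For API users who hit the new warning, a hedged usage sketch of the opt-out: the swa_full and n_seq_max fields of llama_context_params and llama_init_from_model() follow llama.h around the time of this commit, so verify them against your header version:

    #include "llama.h"

    // Create a context that runs several sequences against a SWA model
    // without the warning, by opting into the full-size SWA cache.
    llama_context * make_multi_seq_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_seq_max = 4;    // more than one parallel sequence ...
        cparams.swa_full  = true; // ... so request the full-size SWA cache
        return llama_init_from_model(model, cparams);
    }

Enabling swa_full restores full-context cache semantics for multi-sequence use, at the cost of the memory savings the smaller n_swa + n_ubatch cache provides.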