llama : use n_swa + n_ubatch cells for SWA cache (#13833)

* llama : use n_swa + n_ubatch cells for SWA cache

ggml-ci

* llama : add warning about multi-sequence SWA contexts
Author:    Georgi Gerganov
Date:      2025-05-31 15:57:44 +03:00
Committed: GitHub
Parent:    c7e0a2054b
Commit:    3600cc2886

6 changed files with 24 additions and 11 deletions
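
For context, the idea of the change: the sliding-window-attention (SWA) branch of the KV cache only needs to hold the attention window plus the tokens of one in-flight micro-batch, not the full context. Below is a minimal, runnable C++ sketch of that sizing; n_swa, n_ubatch, and n_seq_max appear in the commit, but the padding granularity, the n_seq_max factor, and all concrete values are illustrative assumptions, not the verbatim implementation:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Same rounding as ggml's GGML_PAD macro.
    static uint32_t pad_to(uint32_t x, uint32_t n) { return (x + n - 1) / n * n; }

    int main() {
        // Hypothetical values standing in for hparams/cparams fields.
        const uint32_t n_ctx     = 8192; // full context (base cache size)
        const uint32_t n_swa     = 1024; // sliding-window size of the model
        const uint32_t n_ubatch  = 512;  // micro-batch size
        const uint32_t n_seq_max = 1;    // number of parallel sequences
        const uint32_t padding   = 256;  // assumed cache padding granularity

        // Before this commit the SWA cache used the full n_ctx cells; after it,
        // only the window plus one in-flight micro-batch must stay resident.
        const uint32_t size_swa = std::min(n_ctx, pad_to(n_swa*n_seq_max + n_ubatch, padding));

        printf("SWA cache cells: %u (instead of %u)\n", size_swa, n_ctx);
        return 0;
    }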


@@ -123,6 +123,11 @@ llama_context::llama_context(
                 __func__, n_ctx_per_seq, hparams.n_ctx_train);
     }
 
+    if (!params.swa_full && cparams.n_seq_max > 1) {
+        LLAMA_LOG_WARN("%s: requested n_seq_max (%u) > 1, but swa_full is not enabled -- performance may be degraded: %s\n",
+                __func__, cparams.n_seq_max, "https://github.com/ggml-org/llama.cpp/pull/13845#issuecomment-2924800573");
+    }
+
     if (!hparams.vocab_only) {
         // GPU backends
         for (auto * dev : model.devices) {
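
For API users who hit the new warning, a hedged usage sketch of the opt-out: the swa_full and n_seq_max fields of llama_context_params and llama_init_from_model() follow llama.h around the time of this commit, so verify them against your header version:

    #include "llama.h"

    // Create a context that runs several sequences against a SWA model
    // without the warning, by opting into the full-size SWA cache.
    llama_context * make_multi_seq_ctx(llama_model * model) {
        llama_context_params cparams = llama_context_default_params();
        cparams.n_seq_max = 4;    // more than one parallel sequence ...
        cparams.swa_full  = true; // ... so request the full-size SWA cache
        return llama_init_from_model(model, cparams);
    }

Enabling swa_full restores full-context cache semantics for multi-sequence use, at the cost of the memory savings the smaller n_swa + n_ubatch cache provides.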