diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 7628cbc9b..f73d4b9bf 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -3883,6 +3883,11 @@ llama_context_recurrent::llama_context_recurrent(
 llama_context_recurrent::~llama_context_recurrent() = default;
 
 void llama_context_recurrent::reserve() {
+    // simulate full KV cache
+    kv_self.n = kv_self.size;
+
+    LLAMA_LOG_DEBUG("%s: kv_self.n = %u\n", __func__, kv_self.n);
+
     // TODO: implement recurrent-specific reserve logic
     llama_context::reserve();
 }
diff --git a/src/llama-context.h b/src/llama-context.h
index 0e55aae1c..2945cbabe 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -447,6 +447,7 @@ private:
         ggml_tensor * self_k_shift; // I32 [kv_size]
     } inp;
 
+protected:
     //
     // graph
     //
@@ -570,6 +571,7 @@ private:
         ggml_tensor * s_mask; // F32 [1, n_kv]
     } inp;
 
+protected:
     //
     // graph
     //
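
For context, the reserve() override above applies a common pattern: before the base class sizes its compute buffers, kv_self.n is forced to kv_self.size so the reservation covers a completely full KV cache rather than the (typically tiny) state at initialization time. Below is a minimal, self-contained sketch of that idea; KvCache, Context, and graph_bytes are hypothetical stand-ins for illustration, not llama.cpp types.

    // Sketch of worst-case buffer reservation via a simulated full KV cache.
    // All names here are illustrative stand-ins, not llama.cpp API.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    struct KvCache {
        uint32_t n    = 0; // cells used by the batch currently being built
        uint32_t size = 0; // total cells in the cache
    };

    struct Context {
        KvCache kv_self;

        // pretend cost model: buffer demand grows with the active cell count
        size_t graph_bytes() const {
            return static_cast<size_t>(kv_self.n) * 4096;
        }

        // size the compute buffers for the worst case, not the current state
        size_t reserve() {
            kv_self.n = kv_self.size; // simulate full KV cache
            return graph_bytes();
        }
    };

    int main() {
        Context ctx;
        ctx.kv_self.size = 4096;
        ctx.kv_self.n    = 32; // only a few cells in use at the moment

        std::printf("current state needs %zu bytes\n", ctx.graph_bytes());
        std::printf("worst case reserves %zu bytes\n", ctx.reserve());
        return 0;
    }

The two protected: hunks in llama-context.h fit the same refactor: they appear to relax member visibility so the derived recurrent context can reach the base class's graph-building state; the sketch above ignores visibility for brevity.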