// llama.cpp/src/llama-memory.h

#pragma once

#include "llama.h"

#include <memory>
#include <vector>

struct llama_ubatch;

class llama_io_write_i;
class llama_io_read_i;

// parameters describing a memory object
// NOTE(review): where and how these are consumed is not visible in this header - confirm against the memory implementations
//
// plain aggregate with no default member initializers: the fields hold indeterminate values
// unless the object is value-/aggregate-initialized by the caller
struct llama_memory_params {
    // kv cache
    // presumably the ggml data types used for storing the K and the V data - TODO confirm
    ggml_type type_k;
    ggml_type type_v;

    // use full-size SWA cache
    bool swa_full;
};

// status code reported by llama_memory_state_i::get_status()
// unscoped enum: SUCCESS is explicitly 0 and the remaining enumerators take the consecutive values 1, 2, 3
enum llama_memory_status {
    LLAMA_MEMORY_STATUS_SUCCESS = 0,
    // there is nothing to update (see llama_memory_i::init_update())
    LLAMA_MEMORY_STATUS_NO_UPDATE,
    // NOTE(review): presumably a failure while preparing vs. while computing - the code that
    //               produces these two values is not visible in this header, confirm before relying on it
    LLAMA_MEMORY_STATUS_FAILED_PREPARE,
    LLAMA_MEMORY_STATUS_FAILED_COMPUTE,
};

// helper function for combining the status of two memory states
// useful for implementing hybrid memory types (e.g. iSWA)
//
// s0, s1 - the two statuses to combine
// returns a single llama_memory_status
// NOTE(review): the combination rule (e.g. which status takes precedence) is defined in the
//               implementation and is not visible in this header
llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);

// the interface for managing the memory state during batch processing
// this interface is implemented per memory type. see:
//   - llama_kv_cache_unified_state
//   - llama_kv_cache_unified_iswa_state
//   ...
//
// the only method that can mutate the memory and the memory state is llama_memory_state_i::apply()
// (apply() is declared on this interface - llama_memory_i does not declare an apply() method)
//
// objects implementing this interface are returned, wrapped in llama_memory_state_ptr, by
// llama_memory_i::init_batch(), llama_memory_i::init_full() and llama_memory_i::init_update()
//
// TODO: rename to llama_memory_context_i ?
struct llama_memory_state_i {
    // virtual destructor: implementations are destroyed through llama_memory_state_ptr (pointer to this interface)
    virtual ~llama_memory_state_i() = default;

    // consume the current ubatch from the state and proceed to the next one
    // return false if we are done
    virtual bool next() = 0;

    // apply the memory state for the current ubatch to the memory object
    // return false on failure
    virtual bool apply() = 0;

    // returns a mutable reference to a vector of int64_t ids
    // NOTE(review): the meaning of the ids and the lifetime of the referenced vector are not
    //               visible here - presumably it is owned by the state object, confirm with implementations
    // TODO: this might get reworked in the future when refactoring llama_batch
    virtual std::vector<int64_t> & out_ids() = 0;

    // get the current ubatch
    // llama_ubatch is only forward-declared in this header, so it is handed out by reference
    virtual const llama_ubatch & get_ubatch() const = 0;

    // get the status of the memory state - used for error handling and checking if any updates would be applied
    virtual llama_memory_status get_status() const = 0;
};

// owning handle to a memory state object
using llama_memory_state_ptr = std::unique_ptr<llama_memory_state_i>;

// general concept of LLM memory
// the KV cache is a type of LLM memory, but there can be other types
//
// abstract interface: every member function below is pure virtual
struct llama_memory_i {
    // virtual destructor: implementations are destroyed through llama_memory_ptr (pointer to this interface)
    virtual ~llama_memory_i() = default;

    // split the input batch into a set of ubatches and verify that they can fit into the cache
    // return a state object containing the ubatches and KV cache state required to process them
    // check the llama_memory_state_i::get_status() for the result
    //
    // batch       - the input batch to split
    // n_ubatch    - presumably the maximum size of a single ubatch - TODO confirm
    // embd_pooled - NOTE(review): presumably signals pooled embeddings, confirm with implementations
    // logits_all  - NOTE(review): presumably requests outputs for all tokens, confirm with implementations
    virtual llama_memory_state_ptr init_batch(
            const llama_batch & batch,
            uint32_t n_ubatch,
            bool embd_pooled,
            bool logits_all) = 0;

    // simulate full cache, used for allocating worst-case compute buffers
    virtual llama_memory_state_ptr init_full() = 0;

    // prepare for any pending memory updates, such as shifts, defrags, etc.
    // status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update
    //
    // lctx     - the llama context, passed as a raw (non-owning) pointer
    // optimize - NOTE(review): exact effect is implementation-defined and not visible in this header
    virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0;

    // getters
    // NOTE(review): presumably reports whether position shifts (see seq_add) are supported - confirm
    virtual bool get_can_shift() const = 0;

    //
    // ops
    //

    // if data == true, the data buffers will also be cleared together with the metadata
    virtual void clear(bool data) = 0;

    // per-sequence operations
    // NOTE(review): the exact semantics (position range convention for p0/p1, handling of negative
    //               values, meaning of the bool returned by seq_rm) are defined by the implementations
    //               and are not visible in this header - confirm before relying on them
    virtual bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) = 0;
    virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
    virtual void seq_keep(llama_seq_id seq_id) = 0;
    virtual void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos shift) = 0;
    virtual void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) = 0;

    // NOTE(review): presumably the smallest / largest position stored for the sequence - the value
    //               returned for a sequence that is not present is not visible here
    virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;
    virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;

    //
    // state write/read
    //

    // seq_id defaults to -1
    // NOTE(review): presumably -1 selects all sequences - confirm with implementations
    // NOTE: these are virtual functions with default arguments - the default is taken from the static
    //       type used at the call site, so overriders should declare the same default (or none)
    virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;
    virtual void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1) = 0;
};

// owning handle to a memory object
using llama_memory_ptr = std::unique_ptr<llama_memory_i>;

// TODO: temporary until the llama_kv_cache is removed from the public API
// adds no members of its own on top of llama_memory_i (struct inheritance is public by default)
struct llama_kv_cache : llama_memory_i {
    virtual ~llama_kv_cache() = default;
};
llama : refactor llama_context, llama_kv_cache, llm_build_context (#12181) * llama : refactor llama_context, llama_kv_cache, llm_build_context ggml-ci * graph : don't mutate the KV cache during defrag ggml-ci * context : reduce virtuals + remove test function ggml-ci * context : move interface implementation to source file + factory ggml-ci * graph : move KV cache build functions to llama_context impl ggml-ci * graph : remove model reference from build_pooling ggml-ci * graph : remove llama_model reference ggml-ci * kv_cache : provide rope factors ggml-ci * graph : rework inputs to use only unique_ptr, remove attn input abstraction ggml-ci * context : remove llama_context_i abstraction ggml-ci * context : clean-up ggml-ci * graph : clean-up ggml-ci * llama : remove redundant keywords (struct, enum) ggml-ci * model : adapt gemma3 ggml-ci * graph : restore same attention ops as on master ggml-ci * llama : remove TODO + fix indent ggml-ci 2025-03-13 12:35:44 +02:00			`#pragma once`

			`#include "llama.h"`

kv-cache : refactor + add llama_memory_state_i (#13746) * kv-cache : simplify the "struct llama_kv_cache" interface ggml-ci * kv-cache : revert the (n_swa + n_ubatch) change (for next PR) ggml-ci * kv-cache : some comments ggml-ci * context : fix graph reserve for multiple sequences ggml-ci * kv-cache : fix typo [no ci] * kv-cache : fix find_slot() logic for free slots ggml-ci * llama : add TODO for deprecating the defrag API in the future * kv-cache : improve find_slot() using min/max seq pos info ggml-ci * llama : handle aborts and compute errors ggml-ci * memory : extract state into llama_memory_state ggml-ci * kv-cache : add comments ggml-ci * server : update batching logic to reset n_batch on successful decode * server : upon full re-processing, remove the sequence from the cache * kv-cache : add TODO for doing split_equal when split_simple fails ggml-ci 2025-05-31 10:24:04 +03:00			`#include <memory>`
			`#include <vector>`

			`struct llama_ubatch;`

memory : migrate from llama_kv_cache to more generic llama_memory (#14006) * memory : merge llama_kv_cache into llama_memory + new `llama_memory` API ggml-ci * context : fix casts ggml-ci 2025-06-05 15:29:22 +03:00			`class llama_io_write_i;`
			`class llama_io_read_i;`

kv-cache : separate recurrent vs non-recurrent impl (#12799) * kv-cache : serparate recurrent vs non-recurrent impl (wip) ggml-ci * kv-cache : init -> contructor + add llama_memory_params ggml-ci * kv-cache : fix callback reference ggml-ci * context : llama_kv_cache -> llama_memory_i ggml-ci * context : move memory creation logic to model ggml-ci * llama : remove reference of memory during encode ggml-ci * kv-cache : hide padding details in the implementation ggml-ci * kv-cache : add ubatch_next() ggml-ci * context : simplify sbatch logic ggml-ci * kv-cache : hide defrag logic in the implementation ggml-ci * context : hide kv cache details in implementation ggml-ci * build : fix ggml-ci * cont : another fix ggml-ci * kv-cache : simplify interface (wip) ggml-ci * kv-cache : use separate KV cell structs for unified/recurrent ggml-ci * kv-cache : clean-up ggml-ci * model : better llama_model::create_model() signature ggml-ci * kv-cache : fix recurrent seq_rm() ggml-ci * kv-cache : replace `struct callbacks` with `llama_model &` ggml-ci * kv-cache : replace `struct graph_params` with `llama_context &` ggml-ci * kv-cache : fix offload check ggml-ci * context : avoid passing unique_ptr ggml-ci * kv-cache : avoid using the backends from the llama_context ref #13113 ggml-ci * kv-cache : more consistent debug logs [no ci] * kv-cache : do not pass the full llama_context for kv graphs ggml-ci * kv-cache : remove comment * kv-cache : ggml_rope_ext_inplace -> ggml_rope_ext ggml-ci * kv-cache : fix recurrent multi-user case ggml-ci * memory : remove comments [no ci] 2025-05-02 17:48:36 +03:00			`struct llama_memory_params {`
			`// kv cache`
			`ggml_type type_k;`
			`ggml_type type_v;`

kv-cache : add SWA support (#13194) * kv-cache : prepare for SWA ggml-ci * kv-cache : initial iSWA implementation ggml-ci * kv-cache : rework error recovery logic ggml-ci * models : fix Phi-3 SWA parameters ggml-ci * model : adjust Granite to rope factor changes ggml-ci * server : check if context can do shifts ggml-ci * iswa : for now, always enable shifts (experiment) ggml-ci * kv-cache : simplify SWA logic ggml-ci * kv-cache : apply defrag when we fail to find slots for the batch ggml-ci * llama : update docs about llama_decode ggml-ci * kv-cache : update warning logs when no space for the batch is available ggml-ci * llama : add llama_kv_self_seq_pos_min() * kv-cache : keep track of partial SWA computes and print warnings * server : disallow use cases involving partial SWA context ggml-ci * llama : add param to control SWA cache size ggml-ci * minor : clean-up ggml-ci 2025-05-20 08:05:46 +03:00			`// use full-size SWA cache`
			`bool swa_full;`
kv-cache : separate recurrent vs non-recurrent impl (#12799) * kv-cache : serparate recurrent vs non-recurrent impl (wip) ggml-ci * kv-cache : init -> contructor + add llama_memory_params ggml-ci * kv-cache : fix callback reference ggml-ci * context : llama_kv_cache -> llama_memory_i ggml-ci * context : move memory creation logic to model ggml-ci * llama : remove reference of memory during encode ggml-ci * kv-cache : hide padding details in the implementation ggml-ci * kv-cache : add ubatch_next() ggml-ci * context : simplify sbatch logic ggml-ci * kv-cache : hide defrag logic in the implementation ggml-ci * context : hide kv cache details in implementation ggml-ci * build : fix ggml-ci * cont : another fix ggml-ci * kv-cache : simplify interface (wip) ggml-ci * kv-cache : use separate KV cell structs for unified/recurrent ggml-ci * kv-cache : clean-up ggml-ci * model : better llama_model::create_model() signature ggml-ci * kv-cache : fix recurrent seq_rm() ggml-ci * kv-cache : replace `struct callbacks` with `llama_model &` ggml-ci * kv-cache : replace `struct graph_params` with `llama_context &` ggml-ci * kv-cache : fix offload check ggml-ci * context : avoid passing unique_ptr ggml-ci * kv-cache : avoid using the backends from the llama_context ref #13113 ggml-ci * kv-cache : more consistent debug logs [no ci] * kv-cache : do not pass the full llama_context for kv graphs ggml-ci * kv-cache : remove comment * kv-cache : ggml_rope_ext_inplace -> ggml_rope_ext ggml-ci * kv-cache : fix recurrent multi-user case ggml-ci * memory : remove comments [no ci] 2025-05-02 17:48:36 +03:00			`};`

kv-cache : refactor + add llama_memory_state_i (#13746) * kv-cache : simplify the "struct llama_kv_cache" interface ggml-ci * kv-cache : revert the (n_swa + n_ubatch) change (for next PR) ggml-ci * kv-cache : some comments ggml-ci * context : fix graph reserve for multiple sequences ggml-ci * kv-cache : fix typo [no ci] * kv-cache : fix find_slot() logic for free slots ggml-ci * llama : add TODO for deprecating the defrag API in the future * kv-cache : improve find_slot() using min/max seq pos info ggml-ci * llama : handle aborts and compute errors ggml-ci * memory : extract state into llama_memory_state ggml-ci * kv-cache : add comments ggml-ci * server : update batching logic to reset n_batch on successful decode * server : upon full re-processing, remove the sequence from the cache * kv-cache : add TODO for doing split_equal when split_simple fails ggml-ci 2025-05-31 10:24:04 +03:00			`enum llama_memory_status {`
			`LLAMA_MEMORY_STATUS_SUCCESS = 0,`
kv-cache : refactor the update/defrag mechanism (#13988) * kv-cache : refactor update mechanism ggml-ci * memory : improve status handling * defrag : reset head + add comments ggml-ci * cont : minor fixes ggml-ci 2025-06-04 18:58:20 +03:00			`LLAMA_MEMORY_STATUS_NO_UPDATE,`
kv-cache : refactor + add llama_memory_state_i (#13746) * kv-cache : simplify the "struct llama_kv_cache" interface ggml-ci * kv-cache : revert the (n_swa + n_ubatch) change (for next PR) ggml-ci * kv-cache : some comments ggml-ci * context : fix graph reserve for multiple sequences ggml-ci * kv-cache : fix typo [no ci] * kv-cache : fix find_slot() logic for free slots ggml-ci * llama : add TODO for deprecating the defrag API in the future * kv-cache : improve find_slot() using min/max seq pos info ggml-ci * llama : handle aborts and compute errors ggml-ci * memory : extract state into llama_memory_state ggml-ci * kv-cache : add comments ggml-ci * server : update batching logic to reset n_batch on successful decode * server : upon full re-processing, remove the sequence from the cache * kv-cache : add TODO for doing split_equal when split_simple fails ggml-ci 2025-05-31 10:24:04 +03:00			`LLAMA_MEMORY_STATUS_FAILED_PREPARE,`
			`LLAMA_MEMORY_STATUS_FAILED_COMPUTE,`
			`};`

kv-cache : refactor the update/defrag mechanism (#13988) * kv-cache : refactor update mechanism ggml-ci * memory : improve status handling * defrag : reset head + add comments ggml-ci * cont : minor fixes ggml-ci 2025-06-04 18:58:20 +03:00			`// helper function for combining the status of two memory states`
			`// useful for implementing hybrid memory types (e.g. iSWA)`
			`llama_memory_status llama_memory_status_combine(llama_memory_status s0, llama_memory_status s1);`

kv-cache : refactor + add llama_memory_state_i (#13746) * kv-cache : simplify the "struct llama_kv_cache" interface ggml-ci * kv-cache : revert the (n_swa + n_ubatch) change (for next PR) ggml-ci * kv-cache : some comments ggml-ci * context : fix graph reserve for multiple sequences ggml-ci * kv-cache : fix typo [no ci] * kv-cache : fix find_slot() logic for free slots ggml-ci * llama : add TODO for deprecating the defrag API in the future * kv-cache : improve find_slot() using min/max seq pos info ggml-ci * llama : handle aborts and compute errors ggml-ci * memory : extract state into llama_memory_state ggml-ci * kv-cache : add comments ggml-ci * server : update batching logic to reset n_batch on successful decode * server : upon full re-processing, remove the sequence from the cache * kv-cache : add TODO for doing split_equal when split_simple fails ggml-ci 2025-05-31 10:24:04 +03:00			`// the interface for managing the memory state during batch processing`
			`// this interface is implemented per memory type. see:`
			`// - llama_kv_cache_unified_state`
			`// - llama_kv_cache_unified_iswa_state`
			`// ...`
			`//`
			`// the only method that can mutate the memory and the memory state is llama_memory_i::apply()`
			`//`
			`// TODO: rename to llama_memory_context_i ?`
memory : migrate from llama_kv_cache to more generic llama_memory (#14006) * memory : merge llama_kv_cache into llama_memory + new `llama_memory` API ggml-ci * context : fix casts ggml-ci 2025-06-05 15:29:22 +03:00			`struct llama_memory_state_i {`
kv-cache : refactor + add llama_memory_state_i (#13746) * kv-cache : simplify the "struct llama_kv_cache" interface ggml-ci * kv-cache : revert the (n_swa + n_ubatch) change (for next PR) ggml-ci * kv-cache : some comments ggml-ci * context : fix graph reserve for multiple sequences ggml-ci * kv-cache : fix typo [no ci] * kv-cache : fix find_slot() logic for free slots ggml-ci * llama : add TODO for deprecating the defrag API in the future * kv-cache : improve find_slot() using min/max seq pos info ggml-ci * llama : handle aborts and compute errors ggml-ci * memory : extract state into llama_memory_state ggml-ci * kv-cache : add comments ggml-ci * server : update batching logic to reset n_batch on successful decode * server : upon full re-processing, remove the sequence from the cache * kv-cache : add TODO for doing split_equal when split_simple fails ggml-ci 2025-05-31 10:24:04 +03:00			`virtual ~llama_memory_state_i() = default;`

			`// consume the current ubatch from the state and proceed to the next one`
			`// return false if we are done`
			`virtual bool next() = 0;`

			`// apply the memory state for the current ubatch to the memory object`
			`// return false on failure`
			`virtual bool apply() = 0;`

			`// TODO: this might get reworked in the future when refactoring llama_batch`
			`virtual std::vector<int64_t> & out_ids() = 0;`

			`// get the current ubatch`
			`virtual const llama_ubatch & get_ubatch() const = 0;`

kv-cache : refactor the update/defrag mechanism (#13988) * kv-cache : refactor update mechanism ggml-ci * memory : improve status handling * defrag : reset head + add comments ggml-ci * cont : minor fixes ggml-ci 2025-06-04 18:58:20 +03:00			`// get the status of the memory state - used for error handling and checking if any updates would be applied`
kv-cache : refactor + add llama_memory_state_i (#13746) * kv-cache : simplify the "struct llama_kv_cache" interface ggml-ci * kv-cache : revert the (n_swa + n_ubatch) change (for next PR) ggml-ci * kv-cache : some comments ggml-ci * context : fix graph reserve for multiple sequences ggml-ci * kv-cache : fix typo [no ci] * kv-cache : fix find_slot() logic for free slots ggml-ci * llama : add TODO for deprecating the defrag API in the future * kv-cache : improve find_slot() using min/max seq pos info ggml-ci * llama : handle aborts and compute errors ggml-ci * memory : extract state into llama_memory_state ggml-ci * kv-cache : add comments ggml-ci * server : update batching logic to reset n_batch on successful decode * server : upon full re-processing, remove the sequence from the cache * kv-cache : add TODO for doing split_equal when split_simple fails ggml-ci 2025-05-31 10:24:04 +03:00			`virtual llama_memory_status get_status() const = 0;`
			`};`

			`using llama_memory_state_ptr = std::unique_ptr<llama_memory_state_i>;`
memory : migrate from llama_kv_cache to more generic llama_memory (#14006) * memory : merge llama_kv_cache into llama_memory + new `llama_memory` API ggml-ci * context : fix casts ggml-ci 2025-06-05 15:29:22 +03:00
			`// general concept of LLM memory`
			`// the KV cache is a type of LLM memory, but there can be other types`
			`struct llama_memory_i {`
			`virtual ~llama_memory_i() = default;`

			`// split the input batch into a set of ubatches and verify that they can fit into the cache`
			`// return a state object containing the ubatches and KV cache state required to process them`
			`// check the llama_memory_state_i::get_status() for the result`
			`virtual llama_memory_state_ptr init_batch(`
			`const llama_batch & batch,`
			`uint32_t n_ubatch,`
			`bool embd_pooled,`
			`bool logits_all) = 0;`

			`// simulate full cache, used for allocating worst-case compute buffers`
			`virtual llama_memory_state_ptr init_full() = 0;`

			`// prepare for any pending memory updates, such as shifts, defrags, etc.`
			`// status == LLAMA_MEMORY_STATUS_NO_UPDATE if there is nothing to update`
			`virtual llama_memory_state_ptr init_update(llama_context * lctx, bool optimize) = 0;`

			`// getters`
			`virtual bool get_can_shift() const = 0;`

			`//`
			`// ops`
			`//`

llama : deprecate llama_kv_self_ API (#14030) * llama : deprecate llama_kv_self_ API ggml-ci * llama : allow llama_memory_(nullptr) ggml-ci * memory : add flag for optional data clear in llama_memory_clear ggml-ci 2025-06-06 14:11:15 +03:00			`// if data == true, the data buffers will also be cleared together with the metadata`
			`virtual void clear(bool data) = 0;`
memory : migrate from llama_kv_cache to more generic llama_memory (#14006) * memory : merge llama_kv_cache into llama_memory + new `llama_memory` API ggml-ci * context : fix casts ggml-ci 2025-06-05 15:29:22 +03:00
			`virtual bool seq_rm (llama_seq_id seq_id, llama_pos p0, llama_pos p1) = 0;`
			`virtual void seq_cp (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;`
			`virtual void seq_keep(llama_seq_id seq_id) = 0;`
			`virtual void seq_add (llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos shift) = 0;`
			`virtual void seq_div (llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) = 0;`

			`virtual llama_pos seq_pos_min(llama_seq_id seq_id) const = 0;`
			`virtual llama_pos seq_pos_max(llama_seq_id seq_id) const = 0;`

			`//`
			`// state write/read`
			`//`

			`virtual void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const = 0;`
			`virtual void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1) = 0;`
			`};`

			`using llama_memory_ptr = std::unique_ptr<llama_memory_i>;`

			`// TODO: temporary until the llama_kv_cache is removed from the public API`
			`struct llama_kv_cache : public llama_memory_i {`
			`virtual ~llama_kv_cache() = default;`
			`};`