llama : refactor llama_context, llama_kv_cache, llm_build_context (#12181)

* llama : refactor llama_context, llama_kv_cache, llm_build_context

ggml-ci

* graph : don't mutate the KV cache during defrag

ggml-ci

* context : reduce virtuals + remove test function

ggml-ci

* context : move interface implementation to source file + factory

ggml-ci

* graph : move KV cache build functions to llama_context impl

ggml-ci

* graph : remove model reference from build_pooling

ggml-ci

* graph : remove llama_model reference

ggml-ci

* kv_cache : provide rope factors

ggml-ci

* graph : rework inputs to use only unique_ptr, remove attn input abstraction

ggml-ci

* context : remove llama_context_i abstraction

ggml-ci

* context : clean-up

ggml-ci

* graph : clean-up

ggml-ci

* llama : remove redundant keywords (struct, enum)

ggml-ci

* model : adapt gemma3

ggml-ci

* graph : restore same attention ops as on master

ggml-ci

* llama : remove TODO + fix indent

ggml-ci
2025-06-27 03:55:20 +00:00 · 2025-03-13 12:35:44 +02:00
parent 2048b5913d
commit e0dbec0bc6
46 changed files with 13903 additions and 12190 deletions
--- a/common/common.cpp
+++ b/common/common.cpp
@ -955,8 +955,8 @@ struct common_init_result common_init_from_params(common_params & params) {
        return iparams;
    }
-    if (params.ctx_shift && !llama_kv_cache_can_shift(lctx)) {
+    if (params.ctx_shift && !llama_kv_self_can_shift(lctx)) {
-        LOG_WRN("%s: KV cache shifting is not supported for this model, disabling KV cache shifting\n", __func__);
+        LOG_WRN("%s: KV cache shifting is not supported for this context, disabling KV cache shifting\n", __func__);
        params.ctx_shift = false;
    }
@ -1060,7 +1060,7 @@ struct common_init_result common_init_from_params(common_params & params) {
        if (llama_model_has_decoder(model)) {
            llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch)));
        }
-        llama_kv_cache_clear(lctx);
+        llama_kv_self_clear(lctx);
        llama_synchronize(lctx);
        llama_perf_context_reset(lctx);
    }
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@ -173,7 +173,7 @@ llama_tokens common_speculative_gen_draft(
    result.reserve(params.n_draft);
    if (reuse_n == 0) {
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        prompt.clear();
    } else {
@ -192,14 +192,14 @@ llama_tokens common_speculative_gen_draft(
        }
        if (reuse_i > 0) {
-            llama_kv_cache_seq_rm (ctx, 0, 0, reuse_i);
+            llama_kv_self_seq_rm (ctx, 0, 0, reuse_i);
-            llama_kv_cache_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
+            llama_kv_self_seq_add(ctx, 0, reuse_i, -1, -reuse_i);
            prompt.erase(prompt.begin(), prompt.begin() + reuse_i);
        }
        if (reuse_n < (int) prompt.size()) {
-            llama_kv_cache_seq_rm (ctx, 0, reuse_n, -1);
+            llama_kv_self_seq_rm (ctx, 0, reuse_n, -1);
            prompt.erase(prompt.begin() + reuse_n, prompt.end());
        }
--- a/examples/batched-bench/batched-bench.cpp
+++ b/examples/batched-bench/batched-bench.cpp
@ -132,7 +132,7 @@ int main(int argc, char ** argv) {
                const auto t_pp_start = ggml_time_us();
-                llama_kv_cache_clear(ctx);
+                llama_kv_self_clear(ctx);
                if (!decode_helper(ctx, batch, ctx_params.n_batch)) {
                    LOG_ERR("%s: llama_decode() failed\n", __func__);
@ -141,7 +141,7 @@ int main(int argc, char ** argv) {
                if (is_pp_shared) {
                    for (int32_t i = 1; i < pl; ++i) {
-                        llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+                        llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
                    }
                }
--- a/examples/batched.swift/Sources/main.swift
+++ b/examples/batched.swift/Sources/main.swift
@ -116,7 +116,7 @@ if llama_decode(context, batch) != 0 {
 }
 for i in 1 ..< n_parallel {
-    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
+    llama_kv_self_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
 }
 if n_parallel > 1 {
--- a/examples/cvector-generator/cvector-generator.cpp
+++ b/examples/cvector-generator/cvector-generator.cpp
@ -342,7 +342,7 @@ static bool cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
 }
 static bool get_hidden_layers(llama_context * ctx, std::vector<llama_token> & tokens) {
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
    if (llama_decode(ctx, llama_batch_get_one(tokens.data(), tokens.size()))) {
        fprintf(stderr, "%s : failed to eval\n", __func__);
        return false;
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@ -38,7 +38,7 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
    const struct llama_model * model = llama_get_model(ctx);
    // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
    // run model
    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
--- a/examples/gritlm/gritlm.cpp
+++ b/examples/gritlm/gritlm.cpp
@ -45,7 +45,7 @@ static std::vector<std::vector<float>> encode(llama_context * ctx, const std::ve
        }
        // clear previous kv_cache values (irrelevant for embeddings)
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        llama_set_embeddings(ctx, true);
        llama_set_causal_attn(ctx, false);
@ -102,7 +102,7 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
    llama_token eos_token = llama_vocab_eos(vocab);
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
    llama_set_embeddings(ctx, false);
    llama_set_causal_attn(ctx, true);
--- a/examples/imatrix/imatrix.cpp
+++ b/examples/imatrix/imatrix.cpp
@ -495,7 +495,7 @@ static bool compute_imatrix(llama_context * ctx, const common_params & params) {
        const auto t_start = std::chrono::high_resolution_clock::now();
        // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        llama_batch batch = llama_batch_init(n_batch, 0, 1);
--- a/examples/infill/infill.cpp
+++ b/examples/infill/infill.cpp
@ -332,8 +332,8 @@ int main(int argc, char ** argv) {
                LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                    n_past, n_left, n_ctx, params.n_keep, n_discard);
-                llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
+                llama_kv_self_seq_rm (ctx, 0, params.n_keep + 1            , params.n_keep + n_discard + 1);
-                llama_kv_cache_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
+                llama_kv_self_seq_add(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard);
                n_past -= n_discard;
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@ -1578,7 +1578,7 @@ int main(int argc, char ** argv) {
        test t(inst, lmodel, ctx);
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        // cool off before the test
        if (params.delay) {
@ -1618,7 +1618,7 @@ int main(int argc, char ** argv) {
        }
        for (int i = 0; i < params.reps; i++) {
-            llama_kv_cache_clear(ctx);
+            llama_kv_self_clear(ctx);
            uint64_t t_start = get_time_ns();
--- a/examples/llama.android/llama/src/main/cpp/llama-android.cpp
+++ b/examples/llama.android/llama/src/main/cpp/llama-android.cpp
@ -194,7 +194,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
        }
        batch->logits[batch->n_tokens - 1] = true;
-        llama_kv_cache_clear(context);
+        llama_kv_self_clear(context);
        const auto t_pp_start = ggml_time_us();
        if (llama_decode(context, *batch) != 0) {
@ -206,7 +206,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
        LOGi("Benchmark text generation (tg)");
-        llama_kv_cache_clear(context);
+        llama_kv_self_clear(context);
        const auto t_tg_start = ggml_time_us();
        for (i = 0; i < tg; i++) {
@ -223,7 +223,7 @@ Java_android_llama_cpp_LLamaAndroid_bench_1model(
        const auto t_tg_end = ggml_time_us();
-        llama_kv_cache_clear(context);
+        llama_kv_self_clear(context);
        const auto t_pp = double(t_pp_end - t_pp_start) / 1000000.0;
        const auto t_tg = double(t_tg_end - t_tg_start) / 1000000.0;
@ -448,5 +448,5 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
 extern "C"
 JNIEXPORT void JNICALL
 Java_android_llama_cpp_LLamaAndroid_kv_1cache_1clear(JNIEnv *, jobject, jlong context) {
-    llama_kv_cache_clear(reinterpret_cast<llama_context *>(context));
+    llama_kv_self_clear(reinterpret_cast<llama_context *>(context));
 }
--- a/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
+++ b/examples/llama.swiftui/llama.cpp.swift/LibLlama.swift
@ -210,7 +210,7 @@ actor LlamaContext {
            }
            batch.logits[Int(batch.n_tokens) - 1] = 1 // true
-            llama_kv_cache_clear(context)
+            llama_kv_self_clear(context)
            let t_pp_start = DispatchTime.now().uptimeNanoseconds / 1000;
@ -223,7 +223,7 @@ actor LlamaContext {
            // bench text generation
-            llama_kv_cache_clear(context)
+            llama_kv_self_clear(context)
            let t_tg_start = DispatchTime.now().uptimeNanoseconds / 1000;
@ -242,7 +242,7 @@ actor LlamaContext {
            let t_tg_end = DispatchTime.now().uptimeNanoseconds / 1000;
-            llama_kv_cache_clear(context)
+            llama_kv_self_clear(context)
            let t_pp = Double(t_pp_end - t_pp_start) / 1000000.0
            let t_tg = Double(t_tg_end - t_tg_start) / 1000000.0
@ -292,7 +292,7 @@ actor LlamaContext {
    func clear() {
        tokens_list.removeAll()
        temporary_invalid_cchars.removeAll()
-        llama_kv_cache_clear(context)
+        llama_kv_self_clear(context)
    }
    private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
--- a/examples/llava/gemma3-cli.cpp
+++ b/examples/llava/gemma3-cli.cpp
@ -309,7 +309,7 @@ int main(int argc, char ** argv) {
            }
            if (line == "/clear") {
                ctx.n_past = 0;
-                llama_kv_cache_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
+                llama_kv_self_seq_rm(ctx.lctx, 0, 1, -1); // keep BOS
                LOG("Chat history cleared\n\n");
                continue;
            }
--- a/examples/lookahead/lookahead.cpp
+++ b/examples/lookahead/lookahead.cpp
@ -96,7 +96,7 @@ int main(int argc, char ** argv) {
    llama_decode(ctx, llama_batch_get_one(&inp.back(),           1));
    for (int s = 1; s < W + G + 1; ++s) {
-        llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
+        llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
    }
    const auto t_enc_end = ggml_time_us();
@ -438,17 +438,17 @@ int main(int argc, char ** argv) {
        // KV cache management
        // if no verification token matched, we simply remove all cells from this batch -> no fragmentation
-        llama_kv_cache_seq_rm(ctx, -1, n_past, -1);
+        llama_kv_self_seq_rm(ctx, -1, n_past, -1);
        if (seq_id_best != 0) {
            // if a verification token matched, we keep the best sequence and remove the rest
            // this leads to some KV cache fragmentation
-            llama_kv_cache_seq_keep(ctx, seq_id_best);
+            llama_kv_self_seq_keep(ctx, seq_id_best);
-            llama_kv_cache_seq_cp  (ctx, seq_id_best, 0, -1, -1);
+            llama_kv_self_seq_cp  (ctx, seq_id_best, 0, -1, -1);
-            llama_kv_cache_seq_rm  (ctx, seq_id_best,    -1, -1);
+            llama_kv_self_seq_rm  (ctx, seq_id_best,    -1, -1);
            for (int s = 1; s < W + G + 1; ++s) {
-                llama_kv_cache_seq_cp(ctx, 0, s, -1, -1);
+                llama_kv_self_seq_cp(ctx, 0, s, -1, -1);
            }
        }
    }
--- a/examples/lookup/lookup.cpp
+++ b/examples/lookup/lookup.cpp
@ -192,7 +192,7 @@ int main(int argc, char ** argv){
        // KV cache management
        // clean the cache of draft tokens that weren't accepted
-        llama_kv_cache_seq_rm(ctx, 0, n_past, -1);
+        llama_kv_self_seq_rm(ctx, 0, n_past, -1);
        common_batch_clear(batch_tgt);
        common_batch_add(batch_tgt, draft[0], n_past, { 0 }, true);
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -354,7 +354,7 @@ int main(int argc, char ** argv) {
        }
        // remove any "future" tokens that we might have inherited from the previous session
-        llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1);
+        llama_kv_self_seq_rm(ctx, -1, n_matching_session_tokens, -1);
    }
    LOG_DBG("recalculate the cached logits (check): embd_inp.size() %zu, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu\n",
@ -602,8 +602,8 @@ int main(int argc, char ** argv) {
                    LOG_DBG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n",
                            n_past, n_left, n_ctx, params.n_keep, n_discard);
-                    llama_kv_cache_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
+                    llama_kv_self_seq_rm (ctx, 0, params.n_keep            , params.n_keep + n_discard);
-                    llama_kv_cache_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
+                    llama_kv_self_seq_add(ctx, 0, params.n_keep + n_discard, n_past, -n_discard);
                    n_past -= n_discard;
@ -626,9 +626,9 @@ int main(int argc, char ** argv) {
                    LOG_DBG("div:   [%6d, %6d] / %6d -> [%6d, %6d]\n", ga_i + ib*bd, ga_i + ib*bd + ga_w, ga_n, (ga_i + ib*bd)/ga_n, (ga_i + ib*bd + ga_w)/ga_n);
                    LOG_DBG("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", ga_i + ib*bd + ga_w, n_past + ib*bd, dd, ga_i + ib*bd + ga_w + dd, n_past + ib*bd + dd);
-                    llama_kv_cache_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
+                    llama_kv_self_seq_add(ctx, 0, ga_i,                n_past,              ib*bd);
-                    llama_kv_cache_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
+                    llama_kv_self_seq_div(ctx, 0, ga_i + ib*bd,        ga_i + ib*bd + ga_w, ga_n);
-                    llama_kv_cache_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
+                    llama_kv_self_seq_add(ctx, 0, ga_i + ib*bd + ga_w, n_past + ib*bd,      dd);
                    n_past -= bd;
--- a/examples/parallel/parallel.cpp
+++ b/examples/parallel/parallel.cpp
@ -202,7 +202,7 @@ int main(int argc, char ** argv) {
        // assign the system KV cache to all parallel sequences
        for (int32_t i = 1; i <= n_clients; ++i) {
-            llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+            llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
        }
        LOG_INF("\n");
@ -234,9 +234,9 @@ int main(int argc, char ** argv) {
        if (batch.n_tokens == 0) {
            // all sequences have ended - clear the entire KV cache
            for (int i = 1; i <= n_clients; ++i) {
-                llama_kv_cache_seq_rm(ctx, i, -1, -1);
+                llama_kv_self_seq_rm(ctx, i, -1, -1);
                // but keep the system prompt
-                llama_kv_cache_seq_cp(ctx, 0, i, -1, -1);
+                llama_kv_self_seq_cp(ctx, 0, i, -1, -1);
            }
            LOG_INF("%s: clearing the KV cache\n", __func__);
@ -372,8 +372,8 @@ int main(int argc, char ** argv) {
                    }
                    // delete only the generated part of the sequence, i.e. keep the system prompt in the cache
-                    llama_kv_cache_seq_rm(ctx,    client.id + 1, -1, -1);
+                    llama_kv_self_seq_rm(ctx,    client.id + 1, -1, -1);
-                    llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1);
+                    llama_kv_self_seq_cp(ctx, 0, client.id + 1, -1, -1);
                    const auto t_main_end = ggml_time_us();
--- a/examples/passkey/passkey.cpp
+++ b/examples/passkey/passkey.cpp
@ -133,11 +133,11 @@ int main(int argc, char ** argv) {
            const int ib = i/n_batch - 1;
            const int bd = n_batch_grp*(n_grp - 1);
-            llama_kv_cache_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
+            llama_kv_self_seq_add (ctx, 0, n_past - n_batch,         n_past,         ib*bd);
-            llama_kv_cache_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
+            llama_kv_self_seq_div (ctx, 0, n_past - n_batch + ib*bd, n_past + ib*bd, n_grp);
-            llama_kv_cache_update  (ctx);
+            llama_kv_self_update  (ctx);
-            n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
        }
        common_batch_clear(batch);
@ -167,12 +167,12 @@ int main(int argc, char ** argv) {
        LOG_INF("%s: shifting KV cache with %d\n", __func__, n_discard);
-        llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
+        llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
-        llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
+        llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-      //llama_kv_cache_defrag (ctx);
+      //llama_kv_self_defrag (ctx);
-        llama_kv_cache_update (ctx);
+        llama_kv_self_update (ctx);
-        n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+        n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
        common_batch_clear(batch);
@ -198,12 +198,12 @@ int main(int argc, char ** argv) {
        if (n_discard > 0) {
            LOG_INF("%s: shifting KV cache with %d to free space for the answer\n", __func__, n_discard);
-            llama_kv_cache_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
+            llama_kv_self_seq_rm (ctx, 0, n_keep            , n_keep + n_discard);
-            llama_kv_cache_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
+            llama_kv_self_seq_add(ctx, 0, n_keep + n_discard, n_ctx,  -n_discard);
-          //llama_kv_cache_defrag (ctx);
+          //llama_kv_self_defrag (ctx);
-            llama_kv_cache_update (ctx);
+            llama_kv_self_update (ctx);
-            n_past = llama_kv_cache_seq_pos_max(ctx, 0) + 1;
+            n_past = llama_kv_self_seq_pos_max(ctx, 0) + 1;
        }
    }
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@ -361,7 +361,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
        const auto t_start = std::chrono::high_resolution_clock::now();
        // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        llama_batch batch = llama_batch_init(n_batch, 0, 1);
@ -547,7 +547,7 @@ static results_perplexity perplexity(llama_context * ctx, const common_params &
        const auto t_start = std::chrono::high_resolution_clock::now();
        // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
@ -924,7 +924,7 @@ static void hellaswag_score(llama_context * ctx, const common_params & params) {
            return;
        }
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@ -1203,7 +1203,7 @@ static void winogrande_score(llama_context * ctx, const common_params & params)
            return;
        }
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@ -1575,7 +1575,7 @@ static void multiple_choice_score(llama_context * ctx, const common_params & par
            return;
        }
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        // decode all tasks [i0, i1)
        if (!decode_helper(ctx, batch, batch_logits, n_batch, n_vocab)) {
@ -1765,7 +1765,7 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
        }
        // clear the KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        llama_batch batch = llama_batch_init(n_batch, 0, 1);
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@ -1,6 +1,6 @@
 #include "ggml.h"
 #include "llama.h"
-#include "llama-context.h"
+#include "llama-model.h"
 #include "common.h"
 #include <algorithm>
@ -328,7 +328,7 @@ int main(int argc, char ** argv) {
        }
    }
-    const auto & tensors = llama_internal_get_tensor_map(ctx);
+    const auto & tensors = llama_internal_get_tensor_map(model);
    // check layer tensors
    int included_layers = 0;
--- a/examples/retrieval/retrieval.cpp
+++ b/examples/retrieval/retrieval.cpp
@ -83,7 +83,7 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
 static void batch_decode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
    // clear previous kv_cache values (irrelevant for embeddings)
-    llama_kv_cache_clear(ctx);
+    llama_kv_self_clear(ctx);
    // run model
    LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
--- a/examples/run/run.cpp
+++ b/examples/run/run.cpp
@ -891,7 +891,7 @@ static int apply_chat_template(const struct common_chat_templates * tmpls, Llama
 // Function to tokenize the prompt
 static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt,
                           std::vector<llama_token> & prompt_tokens, const LlamaData & llama_data) {
-    const bool is_first = llama_get_kv_cache_used_cells(llama_data.context.get()) == 0;
+    const bool is_first = llama_kv_self_used_cells(llama_data.context.get()) == 0;
    const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
    prompt_tokens.resize(n_prompt_tokens);
@ -907,7 +907,7 @@ static int tokenize_prompt(const llama_vocab * vocab, const std::string & prompt
 // Check if we have enough space in the context to evaluate this batch
 static int check_context_size(const llama_context_ptr & ctx, const llama_batch & batch) {
    const int n_ctx      = llama_n_ctx(ctx.get());
-    const int n_ctx_used = llama_get_kv_cache_used_cells(ctx.get());
+    const int n_ctx_used = llama_kv_self_used_cells(ctx.get());
    if (n_ctx_used + batch.n_tokens > n_ctx) {
        printf(LOG_COL_DEFAULT "\n");
        printe("context size exceeded\n");
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@ -15,7 +15,7 @@ int main(int argc, char ** argv) {
        return 1;
    }
-    print_build_info();
+    common_init();
    if (params.n_predict < 0) {
        params.n_predict = 16;
@ -196,7 +196,7 @@ int main(int argc, char ** argv) {
        fprintf(stderr, "%s : seq 0 copied, %zd bytes\n", __func__, ncopy);
        // erase whole kv
-        llama_kv_cache_clear(ctx3);
+        llama_kv_self_clear(ctx3);
        fprintf(stderr, "%s : kv cache cleared\n", __func__);
        // restore kv into seq 1
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@ -2113,7 +2113,7 @@ struct server_context {
        SRV_DBG("%s", "clearing KV cache\n");
        // clear the entire KV cache
-        llama_kv_cache_clear(ctx);
+        llama_kv_self_clear(ctx);
        clean_kv_cache = false;
    }
@ -2655,8 +2655,8 @@ struct server_context {
                    res->n_tasks_deferred    = queue_tasks.queue_tasks_deferred.size();
                    res->t_start             = metrics.t_start;
-                    res->kv_cache_tokens_count = llama_get_kv_cache_token_count(ctx);
+                    res->kv_cache_tokens_count = llama_kv_self_n_tokens(ctx);
-                    res->kv_cache_used_cells   = llama_get_kv_cache_used_cells(ctx);
+                    res->kv_cache_used_cells   = llama_kv_self_used_cells(ctx);
                    res->n_prompt_tokens_processed_total = metrics.n_prompt_tokens_processed_total;
                    res->t_prompt_processing_total       = metrics.t_prompt_processing_total;
@ -2772,7 +2772,7 @@ struct server_context {
                    // Erase token cache
                    const size_t n_erased = slot->cache_tokens.size();
-                    llama_kv_cache_seq_rm(ctx, slot->id, -1, -1);
+                    llama_kv_self_seq_rm(ctx, slot->id, -1, -1);
                    slot->cache_tokens.clear();
                    auto res = std::make_unique<server_task_result_slot_erase>();
@ -2840,8 +2840,8 @@ struct server_context {
                SLT_WRN(slot, "slot context shift, n_keep = %d, n_left = %d, n_discard = %d\n", n_keep, n_left, n_discard);
-                llama_kv_cache_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
+                llama_kv_self_seq_rm (ctx, slot.id, n_keep            , n_keep + n_discard);
-                llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
+                llama_kv_self_seq_add(ctx, slot.id, n_keep + n_discard, slot.n_past,        -n_discard);
                if (slot.params.cache_prompt) {
                    for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) {
@ -3032,8 +3032,8 @@ struct server_context {
                                            const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
-                                            llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
+                                            llama_kv_self_seq_rm (ctx, slot.id, head_p, head_c);
-                                            llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
+                                            llama_kv_self_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
                                            for (size_t i = 0; i < n_match; i++) {
                                                slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];
@ -3071,9 +3071,9 @@ struct server_context {
                    }
                    // keep only the common part
-                    if (!llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1)) {
+                    if (!llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1)) {
                        // could not partially delete (likely using a non-Transformer model)
-                        llama_kv_cache_seq_rm(ctx, slot.id, -1, -1);
+                        llama_kv_self_seq_rm(ctx, slot.id, -1, -1);
                        // there is no common part left
                        slot.n_past = 0;
@ -3313,7 +3313,7 @@ struct server_context {
                slot.cache_tokens.push_back(id);
                slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
-                llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
+                llama_kv_self_seq_rm(ctx, slot.id, slot.n_past, -1);
                for (size_t i = 0; i < ids.size(); ++i) {
                    completion_token_output result;
--- a/examples/server/tests/utils.py
+++ b/examples/server/tests/utils.py
@ -302,7 +302,7 @@ class ServerPreset:
        server.model_hf_repo = "ggml-org/models"
        server.model_hf_file = "tinyllamas/stories260K.gguf"
        server.model_alias = "tinyllama-2"
-        server.n_ctx = 256
+        server.n_ctx = 512
        server.n_batch = 32
        server.n_slots = 2
        server.n_predict = 64
--- a/examples/simple-chat/simple-chat.cpp
+++ b/examples/simple-chat/simple-chat.cpp
@ -98,7 +98,7 @@ int main(int argc, char ** argv) {
    auto generate = [&](const std::string & prompt) {
        std::string response;
-        const bool is_first = llama_get_kv_cache_used_cells(ctx) == 0;
+        const bool is_first = llama_kv_self_used_cells(ctx) == 0;
        // tokenize the prompt
        const int n_prompt_tokens = -llama_tokenize(vocab, prompt.c_str(), prompt.size(), NULL, 0, is_first, true);
@ -113,7 +113,7 @@ int main(int argc, char ** argv) {
        while (true) {
            // check if we have enough space in the context to evaluate this batch
            int n_ctx = llama_n_ctx(ctx);
-            int n_ctx_used = llama_get_kv_cache_used_cells(ctx);
+            int n_ctx_used = llama_kv_self_used_cells(ctx);
            if (n_ctx_used + batch.n_tokens > n_ctx) {
                printf("\033[0m\n");
                fprintf(stderr, "context size exceeded\n");
--- a/examples/speculative-simple/speculative-simple.cpp
+++ b/examples/speculative-simple/speculative-simple.cpp
@ -217,7 +217,7 @@ int main(int argc, char ** argv) {
        {
            LOG_DBG("clear kv cache from any extra tokens, n_past = %d\n", n_past);
-            llama_kv_cache_seq_rm(ctx_tgt, 0, n_past, -1);
+            llama_kv_self_seq_rm(ctx_tgt, 0, n_past, -1);
        }
        if ((params.n_predict >= 0 && n_predict > params.n_predict) || has_eos) {
--- a/examples/speculative/speculative.cpp
+++ b/examples/speculative/speculative.cpp
@ -420,14 +420,14 @@ int main(int argc, char ** argv) {
            {
                LOG_DBG("keeping sequence %d, n_past_tgt = %d, n_past_dft = %d\n", s_keep, n_past_tgt, n_past_dft);
-                llama_kv_cache_seq_keep(ctx_dft, s_keep);
+                llama_kv_self_seq_keep(ctx_dft, s_keep);
-                llama_kv_cache_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
+                llama_kv_self_seq_cp  (ctx_dft, s_keep, 0, -1, -1);
-                llama_kv_cache_seq_keep(ctx_dft, 0);
+                llama_kv_self_seq_keep(ctx_dft, 0);
-                llama_kv_cache_seq_rm  (ctx_tgt, s_keep, n_past_tgt, -1);
+                llama_kv_self_seq_rm  (ctx_tgt, s_keep, n_past_tgt, -1);
-                llama_kv_cache_seq_keep(ctx_tgt, s_keep);
+                llama_kv_self_seq_keep(ctx_tgt, s_keep);
-                llama_kv_cache_seq_cp  (ctx_tgt, s_keep, 0, -1, -1);
+                llama_kv_self_seq_cp  (ctx_tgt, s_keep, 0, -1, -1);
-                llama_kv_cache_seq_keep(ctx_tgt, 0);
+                llama_kv_self_seq_keep(ctx_tgt, 0);
            }
            for (int s = 0; s < n_seq_dft; ++s) {
@ -444,7 +444,7 @@ int main(int argc, char ** argv) {
            common_batch_clear(batch_dft);
            common_batch_add  (batch_dft, token_id, n_past_dft, { 0 }, true);
-            llama_kv_cache_seq_rm(ctx_dft, 0, n_past_dft, -1);
+            llama_kv_self_seq_rm(ctx_dft, 0, n_past_dft, -1);
            // LOG_DBG("dft batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_dft, batch_dft).c_str());
            llama_decode(ctx_dft, batch_dft);
@ -503,8 +503,8 @@ int main(int argc, char ** argv) {
                    if (n_seq_cur < n_seq_dft && cur_p->data[f].p > p_draft_split) {
                        LOG_DBG("splitting seq %3d into %3d\n", s, n_seq_cur);
-                        llama_kv_cache_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
+                        llama_kv_self_seq_rm(ctx_dft,    n_seq_cur, -1, -1);
-                        llama_kv_cache_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
+                        llama_kv_self_seq_cp(ctx_dft, s, n_seq_cur, -1, -1);
                        // all previous tokens from this branch are now also part of the new branch
                        for (int t = 0; t < batch_tgt.n_tokens; ++t) {
@ -585,9 +585,9 @@ int main(int argc, char ** argv) {
        // evaluate the target model on the drafted tokens
        {
-            llama_kv_cache_seq_keep(ctx_tgt, 0);
+            llama_kv_self_seq_keep(ctx_tgt, 0);
            for (int s = 1; s < n_seq_dft; ++s) {
-                llama_kv_cache_seq_cp(ctx_tgt, 0, s, -1, -1);
+                llama_kv_self_seq_cp(ctx_tgt, 0, s, -1, -1);
            }
            // LOG_DBG("target batch: %s\n", LOG_BATCH_TOSTR_PRETTY(ctx_tgt, batch_tgt).c_str());
--- a/include/llama.h
+++ b/include/llama.h
@ -60,6 +60,7 @@ extern "C" {
    struct llama_model;
    struct llama_context;
    struct llama_sampler;
    struct llama_kv_cache;
    typedef int32_t llama_pos;
    typedef int32_t llama_token;
@ -469,7 +470,8 @@ extern "C" {
    DEPRECATED(LLAMA_API int32_t llama_n_vocab    (const struct llama_vocab * vocab), "use llama_vocab_n_tokens instead");
    LLAMA_API const struct llama_model * llama_get_model   (const struct llama_context * ctx);
-    LLAMA_API enum llama_pooling_type    llama_pooling_type(const struct llama_context * ctx);
+    LLAMA_API    struct llama_kv_cache * llama_get_kv_self (      struct llama_context * ctx);
    LLAMA_API  enum llama_pooling_type   llama_pooling_type(const struct llama_context * ctx); // TODO: rename to llama_get_pooling_type
    LLAMA_API const struct llama_vocab * llama_model_get_vocab(const struct llama_model * model);
    LLAMA_API enum llama_rope_type       llama_model_rope_type(const struct llama_model * model);
@ -586,7 +588,7 @@ extern "C" {
    // KV cache
    //
-    // TODO: remove llama_kv_cache_view_* API
+    // TODO: start using struct llama_kv_cache
    // Information associated with an individual cell in the KV cache view.
    struct llama_kv_cache_view_cell {
@ -641,13 +643,19 @@ extern "C" {
    // Returns the number of tokens in the KV cache (slow, use only for debug)
    // If a KV cell has multiple sequences assigned to it, it will be counted multiple times
-    LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_n_tokens(const struct llama_context * ctx);
    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_token_count(const struct llama_context * ctx),
            "use llama_kv_self_n_tokens instead");
    // Returns the number of used KV cells (i.e. have at least one sequence assigned to them)
-    LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx);
+    LLAMA_API int32_t llama_kv_self_used_cells(const struct llama_context * ctx);
    DEPRECATED(LLAMA_API int32_t llama_get_kv_cache_used_cells(const struct llama_context * ctx),
            "use llama_kv_self_used_cells instead");
    // Clear the KV cache - both cell info is erased and KV data is zeroed
-    LLAMA_API void llama_kv_cache_clear(
+    LLAMA_API void llama_kv_self_clear(
            struct llama_context * ctx);
    // Removes all tokens that belong to the specified sequence and have positions in [p0, p1)
@ -655,7 +663,7 @@ extern "C" {
    // seq_id < 0 : match any sequence
    // p0 < 0     : [0,  p1]
    // p1 < 0     : [p0, inf)
-    LLAMA_API bool llama_kv_cache_seq_rm(
+    LLAMA_API bool llama_kv_self_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -665,7 +673,7 @@ extern "C" {
    // Note that this does not allocate extra KV cache memory - it simply assigns the tokens to the new sequence
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_cp(
+    LLAMA_API void llama_kv_self_seq_cp(
            struct llama_context * ctx,
                    llama_seq_id   seq_id_src,
                    llama_seq_id   seq_id_dst,
@ -673,17 +681,17 @@ extern "C" {
                       llama_pos   p1);
    // Removes all tokens that do not belong to the specified sequence
-    LLAMA_API void llama_kv_cache_seq_keep(
+    LLAMA_API void llama_kv_self_seq_keep(
            struct llama_context * ctx,
                    llama_seq_id   seq_id);
    // Adds relative position "delta" to all tokens that belong to the specified sequence and have positions in [p0, p1)
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_add(
+    LLAMA_API void llama_kv_self_seq_add(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -693,10 +701,10 @@ extern "C" {
    // Integer division of the positions by factor of `d > 1`
    // If the KV cache is RoPEd, the KV data is updated accordingly:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
    // p0 < 0 : [0,  p1]
    // p1 < 0 : [p0, inf)
-    LLAMA_API void llama_kv_cache_seq_div(
+    LLAMA_API void llama_kv_self_seq_div(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
@ -704,24 +712,76 @@ extern "C" {
                             int   d);
    // Returns the largest position present in the KV cache for the specified sequence
-    LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
+    LLAMA_API llama_pos llama_kv_self_seq_pos_max(
            struct llama_context * ctx,
                     llama_seq_id   seq_id);
    // TODO: the llama_kv_cache_defrag and llama_kv_cache_update API tightly couples llama_context with llama_kv_cache
    //       how to avoid this?
    // Defragment the KV cache
    // This will be applied:
    //   - lazily on next llama_decode()
-    //   - explicitly with llama_kv_cache_update()
+    //   - explicitly with llama_kv_self_update()
-    LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx);
+    LLAMA_API void llama_kv_self_defrag(struct llama_context * ctx);
    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
    LLAMA_API void llama_kv_cache_update(struct llama_context * ctx);
    // Check if the context supports KV cache shifting
-    LLAMA_API bool llama_kv_cache_can_shift(struct llama_context * ctx);
+    LLAMA_API bool llama_kv_self_can_shift(const struct llama_context * ctx);
    // Apply the KV cache updates (such as K-shifts, defragmentation, etc.)
    LLAMA_API void llama_kv_self_update(struct llama_context * ctx);
    DEPRECATED(LLAMA_API void llama_kv_cache_clear(
            struct llama_context * ctx),
            "use llama_kv_self_clear instead");
    DEPRECATED(LLAMA_API bool llama_kv_cache_seq_rm(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1),
            "use llama_kv_self_seq_rm instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_seq_cp(
            struct llama_context * ctx,
                    llama_seq_id   seq_id_src,
                    llama_seq_id   seq_id_dst,
                       llama_pos   p0,
                       llama_pos   p1),
            "use llama_kv_self_seq_cp instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_seq_keep(
            struct llama_context * ctx,
                    llama_seq_id   seq_id),
            "use llama_kv_self_seq_keep instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_seq_add(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
                       llama_pos   delta),
            "use llama_kv_self_seq_add instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_seq_div(
            struct llama_context * ctx,
                    llama_seq_id   seq_id,
                       llama_pos   p0,
                       llama_pos   p1,
                             int   d),
            "use llama_kv_self_seq_div instead");
    DEPRECATED(LLAMA_API llama_pos llama_kv_cache_seq_pos_max(
            struct llama_context * ctx,
                    llama_seq_id   seq_id),
            "use llama_kv_self_seq_pos_max instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_defrag(struct llama_context * ctx),
            "use llama_kv_self_defrag instead");
    DEPRECATED(LLAMA_API bool llama_kv_cache_can_shift(const struct llama_context * ctx),
            "use llama_kv_self_can_shift instead");
    DEPRECATED(LLAMA_API void llama_kv_cache_update(struct llama_context * ctx),
            "use llama_kv_self_update instead");
    //
    // State / sessions
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -15,18 +15,21 @@ add_library(llama
            llama-chat.cpp
            llama-context.cpp
            llama-grammar.cpp
            llama-graph.cpp
            llama-hparams.cpp
            llama-impl.cpp
            llama-io.cpp
            llama-kv-cache.cpp
            llama-memory.cpp
            llama-mmap.cpp
            llama-model-loader.cpp
            llama-model.cpp
            llama-quant.cpp
            llama-sampling.cpp
            llama-vocab.cpp
            unicode.h
            unicode.cpp
            unicode-data.cpp
            unicode.cpp
            unicode.h
            )
 target_include_directories(llama PUBLIC . ../include ../common)
--- a/src/llama-adapter.cpp
+++ b/src/llama-adapter.cpp
@ -4,14 +4,13 @@
 #include "llama-mmap.h"
 #include "llama-model.h"
 #include <algorithm>
 #include <map>
 #include <cassert>
 #include <stdexcept>
 // vec
-struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
+ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    if (il < 0 || il < layer_start || il > layer_end || (size_t) il >= tensors.size()) {
        return nullptr;
    }
@ -19,7 +18,7 @@ struct ggml_tensor * llama_adapter_cvec::tensor_for(int il) const {
    return tensors[il];
 }
-struct ggml_tensor * llama_adapter_cvec::apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const {
+ggml_tensor * llama_adapter_cvec::apply_to(ggml_context * ctx, ggml_tensor * cur, int  il) const {
    ggml_tensor * layer_dir = tensor_for(il);
    if (layer_dir != nullptr) {
        cur = ggml_add(ctx, cur, layer_dir);
@ -40,7 +39,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                /*.mem_size   =*/ hparams.n_layer*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
@ -91,7 +90,7 @@ bool llama_adapter_cvec::init(const llama_model & model) {
    return true;
 }
-int32_t llama_adapter_cvec::apply(
+bool llama_adapter_cvec::apply(
        const llama_model & model,
        const float * data,
        size_t len,
@ -104,17 +103,17 @@ int32_t llama_adapter_cvec::apply(
        // disable the current control vector (but leave allocated for later)
        layer_start = -1;
        layer_end   = -1;
-        return 0;
+        return true;
    }
    if (n_embd != (int) hparams.n_embd) {
        LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__);
-        return 1;
+        return false;
    }
    if (tensors.empty()) {
        if (!init(model)) {
-            return 1;
+            return false;
        }
    }
@ -130,12 +129,12 @@ int32_t llama_adapter_cvec::apply(
        }
    }
-    return 0;
+    return true;
 }
 // lora
-llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor * w) {
+llama_adapter_lora_weight * llama_adapter_lora::get_weight(ggml_tensor * w) {
    const std::string name(w->name);
    const auto pos = ab_map.find(name);
@ -146,11 +145,11 @@ llama_adapter_lora_weight * llama_adapter_lora::get_weight(struct ggml_tensor *
    return nullptr;
 }
-static void llama_adapter_lora_init_impl(struct llama_model & model, const char * path_lora, struct llama_adapter_lora & adapter) {
+static void llama_adapter_lora_init_impl(llama_model & model, const char * path_lora, llama_adapter_lora & adapter) {
    LLAMA_LOG_INFO("%s: loading lora adapter from '%s' ...\n", __func__, path_lora);
    ggml_context * ctx_init;
-    struct gguf_init_params meta_gguf_params = {
+    gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ true,
        /* .ctx      = */ &ctx_init,
    };
@ -201,7 +200,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
        auto it = ctx_map.find(buft);
        if (it == ctx_map.end()) {
            // add a new context
-            struct ggml_init_params params = {
+            ggml_init_params params = {
                /*.mem_size   =*/ n_tensors*ggml_tensor_overhead(),
                /*.mem_buffer =*/ NULL,
                /*.no_alloc   =*/ true,
@ -264,7 +263,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
            throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
        }
-        struct ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
+        ggml_context * dev_ctx = ctx_for_buft(ggml_backend_buffer_get_type(model_tensor->buffer));
        // validate tensor shape
        if (is_token_embd) {
            // expect B to be non-transposed, A and B are flipped; see llm_build_inp_embd()
@ -281,8 +280,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
        }
        // save tensor to adapter
-        struct ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
+        ggml_tensor * tensor_a = ggml_dup_tensor(dev_ctx, w.a);
-        struct ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
+        ggml_tensor * tensor_b = ggml_dup_tensor(dev_ctx, w.b);
        ggml_set_name(tensor_a, w.a->name);
        ggml_set_name(tensor_b, w.b->name);
        adapter.ab_map[name] = llama_adapter_lora_weight(tensor_a, tensor_b);
@ -308,7 +307,7 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
    {
        llama_file gguf_file(path_lora, "rb");
        std::vector<uint8_t> read_buf;
-        auto set_tensor = [&](struct ggml_tensor * orig, struct ggml_tensor * dev) {
+        auto set_tensor = [&](ggml_tensor * orig, ggml_tensor * dev) {
            size_t offs = gguf_get_data_offset(ctx_gguf.get()) + gguf_get_tensor_offset(ctx_gguf.get(), gguf_find_tensor(ctx_gguf.get(), orig->name));
            size_t size = ggml_nbytes(orig);
            read_buf.resize(size);
@ -327,8 +326,8 @@ static void llama_adapter_lora_init_impl(struct llama_model & model, const char
    LLAMA_LOG_INFO("%s: loaded %zu tensors from lora file\n", __func__, adapter.ab_map.size()*2);
 }
-struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model, const char * path_lora) {
+llama_adapter_lora * llama_adapter_lora_init(llama_model * model, const char * path_lora) {
-    struct llama_adapter_lora * adapter = new llama_adapter_lora();
+    llama_adapter_lora * adapter = new llama_adapter_lora();
    try {
        llama_adapter_lora_init_impl(*model, path_lora, *adapter);
@ -342,6 +341,6 @@ struct llama_adapter_lora * llama_adapter_lora_init(struct llama_model * model,
    return nullptr;
 }
-void llama_adapter_lora_free(struct llama_adapter_lora * adapter) {
+void llama_adapter_lora_free(llama_adapter_lora * adapter) {
    delete adapter;
 }
--- a/src/llama-adapter.h
+++ b/src/llama-adapter.h
@ -15,11 +15,11 @@
 //
 struct llama_adapter_cvec {
-    struct ggml_tensor * tensor_for(int il) const;
+    ggml_tensor * tensor_for(int il) const;
-    struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int  il) const;
+    ggml_tensor * apply_to(ggml_context * ctx, ggml_tensor * cur, int  il) const;
-    int32_t apply(
+    bool apply(
            const llama_model & model,
            const float * data,
            size_t len,
@ -36,7 +36,7 @@ private:
    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
-    std::vector<struct ggml_tensor *> tensors; // per layer
+    std::vector<ggml_tensor *> tensors; // per layer
 };
 //
@ -44,8 +44,8 @@ private:
 //
 struct llama_adapter_lora_weight {
-    struct ggml_tensor * a = nullptr;
+    ggml_tensor * a = nullptr;
-    struct ggml_tensor * b = nullptr;
+    ggml_tensor * b = nullptr;
    // get actual scale based on rank and alpha
    float get_scale(float alpha, float adapter_scale) const {
@ -55,12 +55,12 @@ struct llama_adapter_lora_weight {
    }
    llama_adapter_lora_weight() = default;
-    llama_adapter_lora_weight(struct ggml_tensor * a, struct ggml_tensor * b) : a(a), b(b) {}
+    llama_adapter_lora_weight(ggml_tensor * a, ggml_tensor * b) : a(a), b(b) {}
 };
 struct llama_adapter_lora {
    // map tensor name to lora_a_b
-    std::unordered_map<std::string, struct llama_adapter_lora_weight> ab_map;
+    std::unordered_map<std::string, llama_adapter_lora_weight> ab_map;
    std::vector<ggml_context_ptr> ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
@ -70,5 +70,7 @@ struct llama_adapter_lora {
    llama_adapter_lora() = default;
    ~llama_adapter_lora() = default;
-    llama_adapter_lora_weight * get_weight(struct ggml_tensor * w);
+    llama_adapter_lora_weight * get_weight(ggml_tensor * w);
 };
 using llama_adapter_loras = std::unordered_map<llama_adapter_lora *, float>;
--- a/src/llama-batch.h
+++ b/src/llama-batch.h
@ -42,9 +42,9 @@ struct llama_sbatch {
    bool logits_all; // TODO: remove once lctx.logits_all is removed too
    // sorted indices into the batch
-    std::vector<size_t> ids;
+    std::vector<int64_t> ids;
    // batch indices of the output
-    std::vector<size_t> out_ids;
+    std::vector<int64_t> out_ids;
    std::vector<llama_sbatch_seq> seq;
    const llama_batch * batch = nullptr;
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
--- a/src/llama-context.h
+++ b/src/llama-context.h
@ -3,66 +3,210 @@
 #include "llama.h"
 #include "llama-batch.h"
 #include "llama-cparams.h"
-#include "llama-model.h"
+#include "llama-graph.h"
 #include "llama-kv-cache.h"
 #include "llama-adapter.h"
 #include "ggml-cpp.h"
 #include <map>
 #include <unordered_map>
 #include <vector>
-#include <set>
+
 struct llama_model;
 struct llama_kv_cache;
 class llama_io_read_i;
 class llama_io_write_i;
 struct llama_context {
-    llama_context(const llama_model & model)
+    // init scheduler and compute buffers, reserve worst-case graphs
-        : model(model)
+    llama_context(
-        , t_start_us(model.t_start_us)
+            const llama_model & model,
-        , t_load_us(model.t_load_us) {}
+                  llama_context_params params);
-    const struct llama_model & model;
+    ~llama_context();
-    struct llama_cparams      cparams;
+    void synchronize();
    struct llama_sbatch       sbatch;  // TODO: revisit if needed
    struct llama_kv_cache     kv_self;
    struct llama_adapter_cvec cvec;
-    std::unordered_map<struct llama_adapter_lora *, float> lora;
+    const llama_model & get_model() const;
-    std::vector<ggml_backend_ptr> backends;
+    uint32_t n_ctx()         const;
-    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
+    uint32_t n_ctx_per_seq() const;
    uint32_t n_batch()       const;
    uint32_t n_ubatch()      const;
    uint32_t n_seq_max()     const;
-    ggml_backend_t backend_cpu = nullptr;
+    uint32_t n_threads()       const;
    uint32_t n_threads_batch() const;
-    ggml_threadpool_t threadpool       = nullptr;
+          llama_kv_cache * get_kv_self();
-    ggml_threadpool_t threadpool_batch = nullptr;
+    const llama_kv_cache * get_kv_self() const;
-    bool has_evaluated_once = false;
+    void kv_self_update();
-    mutable int64_t t_start_us;
+    enum llama_pooling_type pooling_type() const;
    mutable int64_t t_load_us;
    mutable int64_t t_p_eval_us = 0;
    mutable int64_t t_eval_us   = 0;
-    mutable int64_t t_compute_start_us = 0;
+    float * get_logits();
-    mutable int64_t n_queued_tokens = 0;
+    float * get_logits_ith(int32_t i);
-    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+    float * get_embeddings();
-    mutable int32_t n_eval   = 0; // number of eval calls
+    float * get_embeddings_ith(int32_t i);
    float * get_embeddings_seq(llama_seq_id seq_id);
-    // host buffer for the model output (logits and embeddings)
+    void attach_threadpool(
-    ggml_backend_buffer_ptr buf_output;
+            ggml_threadpool_t threadpool,
            ggml_threadpool_t threadpool_batch);
    void detach_threadpool();
    void set_n_threads(int32_t n_threads, int32_t n_threads_batch);
    void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data);
    void set_embeddings (bool value);
    void set_causal_attn(bool value);
    void set_adapter_lora(
            llama_adapter_lora * adapter,
            float scale);
    bool rm_adapter_lora(
            llama_adapter_lora * adapter);
    void clear_adapter_lora();
    bool apply_adapter_cvec(
            const float * data,
                 size_t   len,
                int32_t   n_embd,
                int32_t   il_start,
                int32_t   il_end);
    int encode(llama_batch & inp_batch);
    int decode(llama_batch & inp_batch);
    //
    // state save/load
    //
    size_t state_get_size();
    size_t state_get_data(      uint8_t * dst, size_t size);
    size_t state_set_data(const uint8_t * src, size_t size);
    size_t state_seq_get_size(llama_seq_id seq_id);
    size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size);
    size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size);
    bool state_load_file(
            const char * filepath,
           llama_token * tokens_out,
                size_t   n_token_capacity,
                size_t * n_token_count_out);
    bool state_save_file(
            const char * filepath,
     const llama_token * tokens,
                size_t   n_token_count);
    size_t state_seq_load_file(
          llama_seq_id   seq_id,
            const char * filepath,
           llama_token * tokens_out,
                size_t   n_token_capacity,
                size_t * n_token_count_out);
    size_t state_seq_save_file(
          llama_seq_id   seq_id,
            const char * filepath,
     const llama_token * tokens,
                size_t   n_token_count);
    //
    // perf
    //
    llama_perf_context_data perf_get_data() const;
    void perf_reset();
 private:
    //
    // output
    //
    // Make sure enough space is available for outputs.
    // Returns max number of outputs for which space was reserved.
    int32_t output_reserve(int32_t n_outputs);
    // make the outputs have the same order they had in the user-provided batch
    // TODO: maybe remove this
    void output_reorder();
    //
    // graph
    //
    int32_t graph_max_nodes() const;
    // zero-out inputs and create the ctx_compute for the compute graph
    ggml_cgraph * graph_init();
    llm_graph_result_ptr graph_build(
            ggml_context * ctx,
             ggml_cgraph * gf,
      const llama_ubatch & ubatch,
          llm_graph_type   gtype);
    // returns the result of ggml_backend_sched_graph_compute_async execution
    ggml_status graph_compute(
            ggml_cgraph * gf,
                   bool   batched);
    llm_graph_cb graph_get_cb() const;
    // used by kv_self_update()
    ggml_tensor * build_rope_shift(
        ggml_context * ctx0,
        ggml_tensor * cur,
        ggml_tensor * shift,
        ggml_tensor * factors,
        ggml_backend_buffer * bbuf) const;
    llm_graph_result_ptr build_kv_self_shift(
            ggml_context * ctx0,
            ggml_cgraph * gf) const;
    llm_graph_result_ptr build_kv_self_defrag(
            ggml_context * ctx0,
            ggml_cgraph * gf) const;
    // TODO: read/write lora adapters and cvec
    size_t state_write_data(llama_io_write_i & io);
    size_t state_read_data (llama_io_read_i  & io);
    size_t state_seq_write_data(llama_io_write_i & io, llama_seq_id seq_id);
    size_t state_seq_read_data (llama_io_read_i  & io, llama_seq_id seq_id);
    //
    // members
    //
    const llama_model & model;
    llama_cparams       cparams;
    llama_adapter_cvec  cvec;
    llama_adapter_loras loras;
    llama_sbatch        sbatch;
    llama_cross cross; // TODO: tmp for handling cross-attention - need something better probably
    std::unique_ptr<llama_kv_cache_unified> kv_self;
    // TODO: remove
    bool logits_all = false;
    // decode output (2-dimensional array: [n_outputs][n_vocab])
    size_t  logits_size = 0; // capacity (of floats) for logits
    float * logits      = nullptr;
    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
    size_t  output_size = 0; // capacity (of tokens positions) for the output buffers
    int32_t n_outputs   = 0; // number of actually-used outputs in the current ubatch or last logical batch
    bool logits_all = false;
    // embeddings output (2-dimensional array: [n_outputs][n_embd])
    // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
    size_t  embd_size = 0; // capacity (of floats) for embeddings
@ -72,57 +216,47 @@ struct llama_context {
    // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
    std::map<llama_seq_id, std::vector<float>> embd_seq;
-    // whether we are computing encoder output or decoder output
+    int32_t n_outputs     = 0; // number of actually-used outputs in the current ubatch or last logical batch
-    bool is_encoding = false;
+    int32_t n_outputs_max = 0; // capacity (of tokens positions) for the output buffers
-    // TODO: find a better way to accommodate mutli-dimension position encoding methods
+    std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
    // number of position ids each token gets; 1 for each token in most cases.
    // when using m-rope, there are 3 position ids per token, representing a 3-dimensional coordinate.
    int n_pos_per_token = 1;
    // output of the encoder part of the encoder-decoder models
    std::vector<float> embd_enc;
    std::vector<std::set<llama_seq_id>> seq_ids_enc;
    // memory buffers used to evaluate the model
    std::vector<uint8_t> buf_compute_meta;
    ggml_backend_sched_ptr sched;
    ggml_backend_t backend_cpu = nullptr;
    std::vector<ggml_backend_ptr> backends;
    ggml_context_ptr ctx_compute;
    ggml_threadpool_t threadpool       = nullptr;
    ggml_threadpool_t threadpool_batch = nullptr;
    ggml_abort_callback abort_callback      = nullptr;
    void *              abort_callback_data = nullptr;
-    // input tensors
+    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;
-    struct ggml_tensor * inp_tokens;        // I32 [n_batch]
+
-    struct ggml_tensor * inp_embd;          // F32 [n_embd, n_batch]
+    // buffer types used for the compute buffer of each backend
-    struct ggml_tensor * inp_pos;           // I32 [n_batch]
+    std::vector<ggml_backend_t>             backend_ptrs;
-    struct ggml_tensor * inp_out_ids;       // I32 [n_outputs]
+    std::vector<ggml_backend_buffer_type_t> backend_buft;
-    struct ggml_tensor * inp_KQ_mask;       // F32 [kv_size, n_batch]
+
-    struct ggml_tensor * inp_KQ_mask_swa;   // F32 [kv_size, n_batch]
+    // memory buffers used to evaluate the model
-    struct ggml_tensor * inp_K_shift;       // I32 [kv_size]
+    std::vector<uint8_t> buf_compute_meta;
-    struct ggml_tensor * inp_mean;          // F32 [n_batch, n_batch]
+
-    struct ggml_tensor * inp_cls;           // I32 [n_batch]
+    // host buffer for the model output (logits and embeddings)
-    struct ggml_tensor * inp_s_copy;        // I32 [kv_size]
+    ggml_backend_buffer_ptr buf_output;
-    struct ggml_tensor * inp_s_mask;        // F32 [1, n_kv]
+
-    struct ggml_tensor * inp_s_seq;         // I32 [n_kv, n_batch]
+    bool has_evaluated_once = false;
-    struct ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
+
-    struct ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
+    // perf
-    struct ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+    mutable int64_t t_start_us  = 0;
    mutable int64_t t_load_us   = 0;
    mutable int64_t t_p_eval_us = 0;
    mutable int64_t t_eval_us   = 0;
    mutable int64_t t_compute_start_us = 0;
    mutable int64_t n_queued_tokens    = 0;
    mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
    mutable int32_t n_eval   = 0; // number of eval calls
 };
 // TODO: make these methods of llama_context
 void llama_set_k_shift(struct llama_context & lctx);
 void llama_set_s_copy(struct llama_context & lctx);
 void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
 // Make sure enough space is available for outputs.
 // Returns max number of outputs for which space was reserved.
 size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
 // make the outputs have the same order they had in the user-provided batch
 void llama_output_reorder(struct llama_context & ctx);
 // For internal test use
 // TODO: remove
 const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@ -0,0 +1,576 @@
 #pragma once
 #include "llama-arch.h"
 #include "llama-hparams.h"
 #include "llama-adapter.h"
 #include <cstdint>
 #include <vector>
 #include <memory>
 #include <set>
 #include <functional>
 struct ggml_cgraph;
 struct ggml_context;
 struct ggml_tensor;
 struct llama_ubatch;
 struct llama_cparams;
 class llama_memory_i;
 class llama_kv_cache_unified;
 // certain models (typically multi-modal) can produce different types of graphs
 // selects which kind of graph is requested; passed as the `type` argument of llama_model::build_graph()
 enum llm_graph_type {
    LLM_GRAPH_TYPE_DEFAULT,
    // NOTE(review): ENCODER/DECODER presumably select the two halves of an encoder-decoder model - confirm in build_graph()
    LLM_GRAPH_TYPE_ENCODER,
    LLM_GRAPH_TYPE_DECODER,
 };
 // operation selector passed as `type_op` to llm_graph_context::build_ffn() and build_moe_ffn()
 // NOTE(review): the names suggest the activation applied inside the feed-forward block - confirm in the implementation
 enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
    LLM_FFN_SWIGLU,
 };
 // gate arrangement selector passed as `type_gate` to llm_graph_context::build_ffn()
 enum llm_ffn_gate_type {
    // NOTE(review): presumably the gate is applied in sequence with ffn_up (as opposed to LLM_FFN_PAR) - confirm
    LLM_FFN_SEQ,
    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
 };
 // normalization selector passed as `type` to llm_graph_context::build_norm()
 // NOTE(review): RMS / GROUP presumably stand for RMS norm and group norm - confirm in build_norm()
 enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
    LLM_NORM_GROUP,
 };
 // TODO: tmp - need something better to pass the data from the encoder to the decoder
 // data handed from the encoder to the decoder
 // the cross-attention graph inputs (llm_graph_input_cross_embd, llm_graph_input_attn_cross) keep a pointer to this struct
 struct llama_cross {
    // the output embeddings from the encoder as a ggml tensor
    // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
    //       ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
    //ggml_tensor * t_embd = nullptr;
    // NOTE(review): presumably the dimensions of v_embd (embedding size x number of encoder outputs) - confirm against the writer
    int64_t n_embd = 0;
    int64_t n_enc  = 0;
    // embeddings data copied to host memory (tmp)
    std::vector<float> v_embd;
    // needed to construct the cross-attention mask in the decoder
    std::vector<std::set<llama_seq_id>> seq_ids_enc;
 };
 //
 // llm_graph_input
 //
 // interface implemented by every graph input
 // llm_graph_result::set_inputs() calls set_input() on each registered input with the current ubatch
 class llm_graph_input_i {
 public:
    // virtual: inputs are owned and destroyed through llm_graph_input_ptr (unique_ptr to this interface)
    virtual ~llm_graph_input_i() = default;
    // populate the input's tensor(s) for the given ubatch; implemented by each concrete input
    virtual void set_input(const llama_ubatch * ubatch) = 0;
 };
 using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
 // graph input holding the token-id tensor and the embedding tensor
 // NOTE(review): presumably only one of the two is populated for a given ubatch (token ids vs. raw embeddings) - confirm in set_input()
 class llm_graph_input_embd : public llm_graph_input_i {
 public:
    llm_graph_input_embd()          = default;
    virtual ~llm_graph_input_embd() = default;
    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
 };
 // graph input holding the positions tensor
 class llm_graph_input_pos : public llm_graph_input_i {
 public:
    // stores n_pos_per_token as an immutable member
    llm_graph_input_pos(int64_t n_pos_per_token) : n_pos_per_token(n_pos_per_token) {}
    virtual ~llm_graph_input_pos() = default;
    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * pos = nullptr; // I32 [n_batch]
    // NOTE(review): presumably the number of position values stored per token - confirm in set_input()
    const int64_t n_pos_per_token = 1;
 };
 // graph input holding the position-bucket tensor for the batch-vs-batch case ([n_batch, n_batch])
 class llm_graph_input_pos_bucket : public llm_graph_input_i {
 public:
    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
    virtual ~llm_graph_input_pos_bucket() = default;
    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]
    // held by reference: the hparams object must outlive this input
    const llama_hparams & hparams;
 };
 // graph input holding the position-bucket tensor for the KV-vs-batch case ([n_kv, n_batch])
 class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
 public:
    llm_graph_input_pos_bucket_kv(
            const llama_hparams & hparams,
            const llama_kv_cache_unified * kv_self) : hparams(hparams), kv_self(kv_self) {}
    virtual ~llm_graph_input_pos_bucket_kv() = default;
    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;
    ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]
    // held by reference: the hparams object must outlive this input
    const llama_hparams & hparams;
    // non-owning pointer to the KV cache; stored as passed to the constructor
    const llama_kv_cache_unified * kv_self;
 };
 // graph input holding the `out_ids` tensor (I32 [n_outputs])
 class llm_graph_input_out_ids : public llm_graph_input_i {
 public:
    // hparams and cparams are kept by reference and must outlive this input; n_outputs is copied
    llm_graph_input_out_ids(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            int32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
    virtual ~llm_graph_input_out_ids() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // starts out null (like the tensor members of the other graph inputs) so that an input
    // whose tensor was never assigned is detectable instead of holding an indeterminate pointer
    ggml_tensor * out_ids = nullptr; // I32 [n_outputs]

    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const int32_t n_outputs;
 };
 // graph input holding the `mean` tensor (F32 [n_batch, n_batch])
 // NOTE(review): presumably consumed by mean pooling - confirm in build_pooling()/set_input()
 class llm_graph_input_mean : public llm_graph_input_i {
 public:
    // cparams is kept by reference and must outlive this input
    llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
    virtual ~llm_graph_input_mean() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // starts out null (like the tensor members of the other graph inputs) so that an input
    // whose tensor was never assigned is detectable instead of holding an indeterminate pointer
    ggml_tensor * mean = nullptr; // F32 [n_batch, n_batch]

    const llama_cparams & cparams;
 };
 // graph input holding the `cls` tensor (I32 [n_batch])
 // NOTE(review): presumably consumed by CLS-style pooling - confirm in build_pooling()/set_input()
 class llm_graph_input_cls : public llm_graph_input_i {
 public:
    // cparams is kept by reference and must outlive this input
    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
    virtual ~llm_graph_input_cls() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // starts out null (like the tensor members of the other graph inputs) so that an input
    // whose tensor was never assigned is detectable instead of holding an indeterminate pointer
    ggml_tensor * cls = nullptr; // I32 [n_batch]

    const llama_cparams & cparams;
 };
 // graph input holding the `s_copy` tensor (I32 [kv_size])
 class llm_graph_input_s_copy : public llm_graph_input_i {
 public:
    // kv_self is a non-owning pointer, stored as passed
    llm_graph_input_s_copy(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_copy() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // starts out null (like the tensor members of the other graph inputs) so that an input
    // whose tensor was never assigned is detectable instead of holding an indeterminate pointer
    ggml_tensor * s_copy = nullptr; // I32 [kv_size]

    const llama_kv_cache_unified * kv_self;
 };
 // graph input holding the `s_mask` tensor (F32 [1, n_kv])
 class llm_graph_input_s_mask : public llm_graph_input_i {
 public:
    // kv_self is a non-owning pointer, stored as passed
    llm_graph_input_s_mask(const llama_kv_cache_unified * kv_self) : kv_self(kv_self) {}
    virtual ~llm_graph_input_s_mask() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // starts out null (like the tensor members of the other graph inputs) so that an input
    // whose tensor was never assigned is detectable instead of holding an indeterminate pointer
    ggml_tensor * s_mask = nullptr; // F32 [1, n_kv]

    const llama_kv_cache_unified * kv_self;
 };
 // graph input holding the `cross_embd` tensor (F32 [n_embd, n_outputs_enc])
 // NOTE(review): presumably filled from llama_cross::v_embd - confirm in set_input()
 class llm_graph_input_cross_embd : public llm_graph_input_i {
 public:
    // cross is a non-owning pointer, stored as passed
    llm_graph_input_cross_embd(
            const llama_cross * cross) : cross(cross) {}
    virtual ~llm_graph_input_cross_embd() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // starts out null (like the tensor members of the other graph inputs) so that an input
    // whose tensor was never assigned is detectable instead of holding an indeterminate pointer
    ggml_tensor * cross_embd = nullptr; // F32 [n_embd, n_outputs_enc]

    const llama_cross * cross;
 };
 // attention graph input holding the KQ mask tensors ([n_tokens, n_batch])
 // NOTE(review): per the name, this is the variant used when attention does not go through a KV cache - confirm at the call sites
 class llm_graph_input_attn_no_cache : public llm_graph_input_i {
 public:
    // both arguments are kept by reference and must outlive this input
    llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams)
        : hparams(hparams), cparams(cparams) {}

    virtual ~llm_graph_input_attn_no_cache() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // returns the `_cnv` tensor, not the raw F32 mask
    ggml_tensor * get_kq_mask() const {
        return kq_mask_cnv;
    }

    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch]
    // NOTE(review): presumably kq_mask converted to the type the attention op expects - confirm where it is assigned
    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch]

    const llama_hparams & hparams;
    const llama_cparams & cparams;
 };
 // attention graph input tied to a llama_kv_cache_unified: holds the self-attention KQ masks ([n_kv, n_batch]),
 // one regular pair and one "swa" pair
 class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
 public:
    // hparams and cparams are kept by reference and must outlive this input; kv_self is a non-owning pointer
    llm_graph_input_attn_kv_unified(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_unified * kv_self)
        : hparams(hparams), cparams(cparams), kv_self(kv_self) {}

    virtual ~llm_graph_input_attn_kv_unified() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // both getters return the `_cnv` tensors, not the raw F32 masks
    ggml_tensor * get_kq_mask() const {
        return self_kq_mask_cnv;
    }

    ggml_tensor * get_kq_mask_swa() const {
        return self_kq_mask_swa_cnv;
    }

    // NOTE(review): the `_cnv` tensors are presumably the masks converted to the type the attention op expects - confirm
    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch]
    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch]
    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch]
    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch]

    const llama_hparams & hparams;
    const llama_cparams & cparams;

    const llama_kv_cache_unified * kv_self;
 };
 // cross-attention graph input: holds the cross KQ mask tensors ([n_outputs_enc, n_batch])
 class llm_graph_input_attn_cross : public llm_graph_input_i {
 public:
    // cross is a non-owning pointer, stored as passed
    llm_graph_input_attn_cross(const llama_cross * cross)
        : cross(cross) {}

    virtual ~llm_graph_input_attn_cross() = default;

    // declared here, defined in the source file
    void set_input(const llama_ubatch * ubatch) override;

    // returns the `_cnv` tensor, not the raw mask
    ggml_tensor * get_kq_mask_cross() const {
        return cross_kq_mask_cnv;
    }

    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch]
    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch]

    const llama_cross * cross = nullptr;
 };
 //
 // llm_graph_result
 //
 // these objects deliver the result from the graph build process back to the llama_context
 // note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
 //   specific data, by calling the set_inputs() method
 // along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
 //   these are used by the llama_context to extract the relevant data, based on the compute parameters
 // interface of a graph build result: access to the main output tensors and a hook to populate the graph inputs
 class llm_graph_result_i {
 public:
    // virtual: results are owned and destroyed through llm_graph_result_ptr (unique_ptr to this interface)
    virtual ~llm_graph_result_i() = default;
    // accessors for the commonly used output tensors
    virtual ggml_tensor * get_logits()      = 0;
    virtual ggml_tensor * get_embd()        = 0;
    virtual ggml_tensor * get_embd_pooled() = 0;
    // populate the data of the graph's input tensors for the given ubatch
    virtual void set_inputs(const llama_ubatch * ubatch) = 0;
 };
 using llm_graph_result_ptr = std::unique_ptr<llm_graph_result_i>;
 class llm_graph_result : public llm_graph_result_i {
 public:
    virtual ~llm_graph_result() = default;
    ggml_tensor * get_logits()      override { return t_logits; }
    ggml_tensor * get_embd()        override { return t_embd; }
    ggml_tensor * get_embd_pooled() override { return t_embd_pooled; }
    void set_inputs(const llama_ubatch * ubatch) override {
        for (auto & input : inputs) {
            input->set_input(ubatch);
        }
    }
    llm_graph_input_i * add_input(llm_graph_input_ptr input) {
        inputs.emplace_back(std::move(input));
        return inputs.back().get();
    }
    // important graph nodes
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;
    std::vector<llm_graph_input_ptr> inputs;
 };
 //
 // llm_graph_context
 //
 // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
 using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
 // bundle of arguments used to construct an llm_graph_context (see its constructor)
 // also passed to llama_model::build_graph()
 struct llm_graph_params {
    ggml_context * ctx;
    const llm_arch arch;
    // held by reference: the referenced objects must outlive this struct
    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;
    ggml_backend_sched * sched;
    ggml_backend * backend_cpu;
    // non-owning pointers
    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
    const llama_memory_i      * memory;
    const llama_cross         * cross;
    int32_t n_outputs;
    // held by reference: the referenced std::function must outlive this struct
    const llm_graph_cb & cb;
 };
 // state and helper functions used while building a compute graph
 // constructed from llm_graph_params; all build_* helpers are declared here and defined in the source file
 struct llm_graph_context {
    const llm_arch arch;
    // held by reference: the referenced objects must outlive this context
    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;
    // NOTE(review): the scalar members below are presumably cached from hparams/cparams/ubatch by the constructor - confirm
    const int64_t n_embd;
    const int64_t n_layer;
    const int64_t n_rot;
    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
    const int64_t n_ctx_per_seq;
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head_k;
    const int64_t n_embd_k_gqa;
    const int64_t n_embd_head_v;
    const int64_t n_embd_v_gqa;
    const int64_t n_expert;
    const int64_t n_expert_used;
    const float freq_base;
    const float freq_scale;
    const float ext_factor;
    const float attn_factor;
    const float beta_fast;
    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;
    const int32_t n_tokens;
    const int32_t n_outputs;
    const int32_t n_ctx_orig; // yarn
    const enum llama_pooling_type pooling_type;
    const enum llama_rope_type    rope_type;
    ggml_context * ctx0 = nullptr;
    ggml_backend_sched * sched;
    ggml_backend * backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?
    // non-owning pointers
    const llama_adapter_cvec  * cvec;
    const llama_adapter_loras * loras;
    const llama_memory_i      * memory;
    const llama_cross         * cross;
    // held by reference: the referenced std::function must outlive this context
    const llm_graph_cb & cb_func;
    // NOTE(review): presumably the result object being assembled during the build (inputs + output tensors) - confirm
    std::unique_ptr<llm_graph_result> res;
    llm_graph_context(const llm_graph_params & params);
    int64_t n_pos_per_token() const;
    // NOTE(review): presumably forwards to cb_func together with the current ubatch - confirm in the source file
    void cb(ggml_tensor * cur, const char * name, int il) const;
    //
    // common
    //
    ggml_tensor * build_cvec(
             ggml_tensor * cur,
                     int   il) const;
    // do mat_mul, while optionally apply lora
    ggml_tensor * build_lora_mm(
              ggml_tensor * w,
              ggml_tensor * cur) const;
    // do mat_mul_id, while optionally apply lora
    ggml_tensor * build_lora_mm_id(
              ggml_tensor * w,   // ggml_tensor * as
              ggml_tensor * cur, // ggml_tensor * b
              ggml_tensor * ids) const;
    ggml_tensor * build_norm(
             ggml_tensor * cur,
             ggml_tensor * mw,
             ggml_tensor * mb,
           llm_norm_type   type,
                     int   il) const;
    ggml_tensor * build_ffn(
             ggml_tensor * cur,
             ggml_tensor * up,
             ggml_tensor * up_b,
             ggml_tensor * up_s,
             ggml_tensor * gate,
             ggml_tensor * gate_b,
             ggml_tensor * gate_s,
             ggml_tensor * down,
             ggml_tensor * down_b,
             ggml_tensor * down_s,
             ggml_tensor * act_scales,
         llm_ffn_op_type   type_op,
       llm_ffn_gate_type   type_gate,
                     int   il) const;
    ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,
             ggml_tensor * up_exps,
             ggml_tensor * gate_exps,
             ggml_tensor * down_exps,
             ggml_tensor * exp_probs_b,
                 int64_t   n_expert,
                 int64_t   n_expert_used,
         llm_ffn_op_type   type_op,
                    bool   norm_w,
                    bool   scale_w,
                   float   w_scale,
            llama_expert_gating_func_type gating_op,
                     int   il) const;
    //
    // inputs
    //
    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
    ggml_tensor * build_inp_pos() const;
    ggml_tensor * build_inp_out_ids() const;
    ggml_tensor * build_inp_mean() const;
    ggml_tensor * build_inp_cls() const;
    ggml_tensor * build_inp_s_copy() const;
    ggml_tensor * build_inp_s_mask() const;
    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
    ggml_tensor * build_inp_pos_bucket_dec() const;
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;
    //
    // attention
    //
    ggml_tensor * build_attn_mha(
             ggml_cgraph * gf,
             ggml_tensor * q,
             ggml_tensor * k,
             ggml_tensor * v,
             ggml_tensor * kq_b,
             ggml_tensor * kq_mask,
                    bool   v_trans,
                   float   kq_scale) const;
    // the three build_attn overloads below are selected by the type of the attention input object
    // returned by the matching build_attn_inp_* helper
    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;
    ggml_tensor * build_attn(
            llm_graph_input_attn_no_cache * inp,
            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
                  float   kq_scale,
                    int   il) const;
    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified(
            bool causal,
            bool swa) const;
    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified * inp,
            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
                  float   kq_scale,
                    int   il) const;
    llm_graph_input_attn_cross * build_attn_inp_cross() const;
    ggml_tensor * build_attn(
            llm_graph_input_attn_cross * inp,
            ggml_cgraph * gf,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur,
            ggml_tensor * k_cur,
            ggml_tensor * v_cur,
            ggml_tensor * kq_b,
                  float   kq_scale,
                    int   il) const;
    //
    // recurrent
    //
    ggml_tensor * build_copy_mask_state(
             ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
                 int32_t   n_state,
                 int32_t   n_seqs) const;
    ggml_tensor * build_rwkv_token_shift_load(
             ggml_cgraph * gf,
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
      const llama_ubatch & ubatch,
                     int   il) const;
    ggml_tensor * build_rwkv_token_shift_store(
             ggml_tensor * token_shift,
      const llama_ubatch & ubatch,
                     int   il) const;
    //
    // pooling
    //
    void build_pooling(
            ggml_cgraph * gf,
            ggml_tensor * cls,
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
 };
--- a/src/llama-io.cpp
+++ b/src/llama-io.cpp
@ -0,0 +1,15 @@
 #include "llama-io.h"
 // Write a string as a 32-bit length followed by the raw characters (no terminator is written).
 // The length is stored in a uint32_t, and exactly that many characters are emitted.
 void llama_io_write_i::write_string(const std::string & str) {
    const uint32_t n_chars = str.size();

    write(&n_chars,   sizeof(n_chars));
    write(str.data(), n_chars);
 }
 // Read a string stored as a 32-bit length followed by that many raw characters
 // (the layout produced by llama_io_write_i::write_string) and assign it to `str`.
 void llama_io_read_i::read_string(std::string & str) {
    // zero-initialized so that a read_to() implementation that returns without filling the
    // destination cannot leave an indeterminate length behind
    uint32_t str_size = 0;
    read_to(&str_size, sizeof(str_size));

    // read() hands back a pointer to the next str_size bytes; they are copied into `str` immediately
    const uint8_t * data = read(str_size);
    str.assign(reinterpret_cast<const char *>(data), str_size);
 }
--- a/src/llama-io.h
+++ b/src/llama-io.h
@ -0,0 +1,35 @@
 #pragma once
 #include <cstddef>
 #include <cstdint>
 #include <string>
 struct ggml_tensor;
 // abstract sink for serialized data
 class llama_io_write_i {
 public:
    llama_io_write_i() = default;
    virtual ~llama_io_write_i() = default;
    // write `size` bytes starting at `src`
    virtual void write(const void * src, size_t size) = 0;
    // NOTE(review): presumably writes `size` bytes of the tensor's data starting at byte `offset` - confirm in the implementations
    virtual void write_tensor(const ggml_tensor * tensor, size_t offset, size_t size) = 0;
    // bytes written so far
    virtual size_t n_bytes() = 0;
    // non-virtual helper built on write(); defined in the source file
    void write_string(const std::string & str);
 };
 // abstract source of serialized data
 class llama_io_read_i {
 public:
    llama_io_read_i() = default;
    virtual ~llama_io_read_i() = default;
    // return a pointer to the next `size` bytes
    // NOTE(review): how long the returned pointer stays valid is up to the implementation - confirm before holding on to it
    virtual const uint8_t * read(size_t size) = 0;
    // read the next `size` bytes into `dst`
    virtual void read_to(void * dst, size_t size) = 0;
    // bytes read so far
    virtual size_t n_bytes() = 0;
    // non-virtual helper built on read_to() and read(); defined in the source file
    void read_string(std::string & str);
 };
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@ -1,12 +1,29 @@
 #pragma once
 #include "llama.h"
 #include "llama-io.h"
 #include "llama-memory.h"
 #include "ggml-cpp.h"
 #include <functional>
 #include <set>
 #include <vector>
-#include <algorithm>
+
 struct llama_cparams;
 struct llama_hparams;
 struct llama_ubatch;
 // abstract KV cache: extends the generic llama_memory_i interface with KV-specific queries
 struct llama_kv_cache : public llama_memory_i {
    using llama_memory_i::llama_memory_i;
    virtual int32_t  get_n_tokens()   const = 0;
    virtual uint32_t get_used_cells() const = 0; // TODO: remove, this is too-specific to the unified cache
    virtual bool get_can_shift() const = 0;
    // a KV cache reports itself as editable exactly when it can be shifted
    bool get_can_edit() const override { return get_can_shift(); }
 };
 struct llama_kv_cell {
    llama_pos pos   = -1;
@ -29,11 +46,105 @@ struct llama_kv_cell {
    }
 };
 // result of a slot search in the KV cache (see find_slot): whether a slot was found and, if so, its cell range
 struct llama_kv_cache_slot_info {
    // half-open cell range [begin, end) of the slot; stays at the pair's default {0, 0} when built from a flag only
    std::pair<uint32_t, uint32_t> boundaries;
    // true when the search succeeded
    bool found = false;

    // build from a success flag only, leaving the boundaries at their default
    explicit llama_kv_cache_slot_info(bool found_)
        : found(found_) {
    }

    // build a successful result covering cells [begin, end)
    llama_kv_cache_slot_info(uint32_t begin, uint32_t end)
        : boundaries(begin, end),
          found(true) {
    }

    // converts to true exactly when a slot was found
    operator bool() const {
        return found;
    }
 };
 // ring-buffer of cached KV data
-struct llama_kv_cache {
+// TODO: pimpl
 // TODO: add notion of max sequences
 class llama_kv_cache_unified : public llama_kv_cache {
 public:
    // can be used to query data from the model if needed
    struct callbacks {
        std::function<ggml_tensor * (uint32_t n_ctx_per_seq, int il)> get_rope_factors;
    };
    llama_kv_cache_unified(
            const llama_hparams & hparams,
            callbacks             cbs);
    virtual ~llama_kv_cache_unified() = default;
    // TODO: become constructor
    bool init(
            const llama_model & model,   // TODO: do not reference the model
          const llama_cparams & cparams,
                    ggml_type   type_k,
                    ggml_type   type_v,
                     uint32_t   kv_size,
                         bool   offload);
    int32_t  get_n_tokens()   const override;
    uint32_t get_used_cells() const override;
    size_t total_size() const;
    // TODO: better data structures to reduce the cost of this operation
    llama_pos pos_max() const;
    void clear() override;
    void defrag() override;
    bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) override;
    void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) override;
    void seq_keep(llama_seq_id seq_id) override;
    void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) override;
    void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) override;
    llama_pos seq_pos_max(llama_seq_id seq_id) override;
    bool get_can_shift() const override;
    // find an empty slot of size "n_tokens" in the cache
    // updates the cache head
    // returns a structure holding information about the slot found
    // Note: On success, it's important that cache.head points
    // to the first cell of the slot.
    llama_kv_cache_slot_info find_slot(const llama_ubatch & batch);
    // TODO: maybe not needed
    uint32_t get_padding(const llama_cparams & cparams) const;
    // find how many cells are currently in use
    uint32_t cell_max() const;
    size_t size_k_bytes() const;
    size_t size_v_bytes() const;
    // defrag
    struct {
        std::vector<uint32_t> ids;
    } defrag_info;
    // return true if cells have been moved
    bool defrag_prepare(int32_t n_max_nodes);
    // state save/load
    void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
    void state_read (llama_io_read_i  & io, llama_seq_id seq_id = -1);
    // members
    const llama_hparams & hparams;
    callbacks cbs;
    bool has_shift = false;
    bool do_defrag = false;
    // TODO: remove this and implement llama_kv_cache_recurrent instead
    bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token
    bool v_trans   = true;  // the value tensor is transposed
    bool can_shift = false;
@ -47,124 +158,30 @@ struct llama_kv_cache {
    // computed before each graph build
    uint32_t n = 0;
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;
    std::vector<llama_kv_cell> cells;
-    std::vector<struct ggml_tensor *> k_l; // per layer
+    std::vector<ggml_tensor *> k_l; // per layer
-    std::vector<struct ggml_tensor *> v_l;
+    std::vector<ggml_tensor *> v_l;
 private:
    ggml_type type_k = GGML_TYPE_F16;
    ggml_type type_v = GGML_TYPE_F16;
    std::vector<ggml_context_ptr>        ctxs;
    std::vector<ggml_backend_buffer_ptr> bufs;
-    size_t total_size() const {
+    void state_write_meta(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges, llama_seq_id seq_id = -1) const;
-        size_t size = 0;
+    void state_write_data(llama_io_write_i & io, const std::vector<std::pair<uint32_t, uint32_t>> & cell_ranges) const;
        for (const auto & buf : bufs) {
            size += ggml_backend_buffer_get_size(buf.get());
        }
-        return size;
+    bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1);
-    }
+    bool state_read_data(llama_io_read_i & io, uint32_t cell_count);
    // TODO: better data structures to reduce the cost of this operation
    llama_pos max_pos() const {
        llama_pos max_pos = -1;
        for (const auto & cell : cells) {
            max_pos = std::max(max_pos, cell.pos);
        }
        return max_pos;
    }
 };
-// a structure holds information about the slot found in llama_kv_cache_find_slot
+// TODO: temporarily reusing llama_kv_cache_unified -- implement recurrent cache and simplify llama_kv_cache_unified
-struct llama_kv_cache_slot_info {
+//class llama_kv_cache_recurrent : public llama_kv_cache_unified {
-    std::pair<uint32_t, uint32_t> boundaries; // slot boundaries [begin, end)
+//public:
-    bool found = false;                       // the slot was found
+//    using llama_kv_cache_unified::llama_kv_cache_unified;
-
+//};
    explicit llama_kv_cache_slot_info(bool found_) : found{found_} {}
    llama_kv_cache_slot_info(uint32_t begin, uint32_t end) : boundaries{begin, end}, found{true} {}
    operator bool() const { return found; }
 };
 // TODO: maybe not needed
 uint32_t llama_kv_cache_get_padding(const struct llama_cparams & cparams);
 bool llama_kv_cache_init(
        struct llama_kv_cache & cache,
            const llama_model & model,
          const llama_cparams & cparams,
                    ggml_type   type_k,
                    ggml_type   type_v,
                     uint32_t   kv_size,
                         bool   offload);
 // find an empty slot of size "n_tokens" in the cache
 // updates the cache head
 // returns a structure holding information about the slot found
 // Note: On success, it's important that cache.head points
 // to the first cell of the slot.
 struct llama_kv_cache_slot_info llama_kv_cache_find_slot(
           struct llama_kv_cache & cache,
       const struct llama_ubatch & batch);
 // find how many cells are currently in use
 uint32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache);
 void llama_kv_cache_clear(struct llama_kv_cache & cache);
 bool llama_kv_cache_seq_rm(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1);
 void llama_kv_cache_seq_cp(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id_src,
                 llama_seq_id   seq_id_dst,
                    llama_pos   p0,
                    llama_pos   p1);
 void llama_kv_cache_seq_keep(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id);
 void llama_kv_cache_seq_add(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1,
                    llama_pos   delta);
 void llama_kv_cache_seq_div(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id,
                    llama_pos   p0,
                    llama_pos   p1,
                          int   d);
 llama_pos llama_kv_cache_seq_pos_max(
        struct llama_kv_cache & cache,
                 llama_seq_id   seq_id);
 void llama_kv_cache_defrag(struct llama_kv_cache & cache);
 int32_t llama_get_kv_cache_token_count(const struct llama_kv_cache & kv);
 int32_t llama_get_kv_cache_used_cells(const struct llama_kv_cache & kv);
 bool llama_kv_cache_can_shift(const struct llama_kv_cache & kv);
 //
 // kv cache view
 //
 struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_kv_cache & kv, int32_t n_seq_max);
 void llama_kv_cache_view_update(struct llama_kv_cache_view * view, const struct llama_kv_cache & kv);
 //
 // kv cache restore
@ -184,13 +201,15 @@ struct llama_kv_slot_restorer {
    bool do_restore = false;
-    explicit llama_kv_slot_restorer(const struct llama_kv_cache & cache) {
+    llama_kv_cache_unified & cache;
    explicit llama_kv_slot_restorer(llama_kv_cache_unified & cache) : cache(cache) {
        old_state.head = cache.head;
        old_state.n    = cache.n;
    }
    // saves a slot information for future restoration
-    void save(const struct llama_kv_cache_slot_info & slot) {
+    void save(const llama_kv_cache_slot_info & slot) {
        if (slot) {
            do_restore = true;
            if (slot.boundaries.first != slot.boundaries.second) {
@ -201,19 +220,68 @@ struct llama_kv_slot_restorer {
    // must be explicitly called to restore the kv_cache state
    // and rollback changes from all llama_kv_cache_find_slot calls
-    void restore(struct llama_kv_cache & cache) {
+    void restore() {
        if (do_restore) {
            cache.head = old_state.head;
            cache.n    = old_state.n;
            if (cache.recurrent) { // recurrent models like Mamba or RWKV can't have a state partially erased
-                llama_kv_cache_seq_rm(cache, -1, -1, -1);
+                cache.seq_rm(-1, -1, -1);
            } else {
                for (auto & slot : slot_boundaries) {
-                    llama_kv_cache_seq_rm(cache, -1, slot.first, slot.second);
+                    cache.seq_rm(-1, slot.first, slot.second);
                }
            }
        }
    }
 };
 // TODO: maybe become part of the public llama_kv_cache in the future
 int32_t llama_kv_cache_n_tokens(const llama_kv_cache * kv);
 int32_t llama_kv_cache_used_cells(const llama_kv_cache * kv);
 void llama_kv_cache_clear(llama_kv_cache * kv);
 bool llama_kv_cache_seq_rm(
        llama_kv_cache * kv,
          llama_seq_id   seq_id,
             llama_pos   p0,
             llama_pos   p1);
 void llama_kv_cache_seq_cp(
        llama_kv_cache * kv,
          llama_seq_id   seq_id_src,
          llama_seq_id   seq_id_dst,
             llama_pos   p0,
             llama_pos   p1);
 void llama_kv_cache_seq_keep(llama_kv_cache * kv, llama_seq_id seq_id);
 void llama_kv_cache_seq_add(
        llama_kv_cache * kv,
          llama_seq_id   seq_id,
             llama_pos   p0,
             llama_pos   p1,
             llama_pos   delta);
 void llama_kv_cache_seq_div(
        llama_kv_cache * kv,
          llama_seq_id   seq_id,
             llama_pos   p0,
             llama_pos   p1,
                   int   d);
 llama_pos llama_kv_cache_seq_pos_max(llama_kv_cache * kv, llama_seq_id seq_id);
 void llama_kv_cache_defrag(llama_kv_cache * kv);
 bool llama_kv_cache_can_shift(const llama_kv_cache * kv);
 //
 // kv cache view
 //
 llama_kv_cache_view llama_kv_cache_view_init(const llama_kv_cache & kv, int32_t n_seq_max);
 void llama_kv_cache_view_update(llama_kv_cache_view * view, const llama_kv_cache * kv);
--- a/src/llama-memory.cpp
+++ b/src/llama-memory.cpp
@ -0,0 +1 @@
 #include "llama-memory.h"
--- a/src/llama-memory.h
+++ b/src/llama-memory.h
@ -0,0 +1,21 @@
 #pragma once
 #include "llama.h"
 // general concept of LLM memory
 // the KV cache is a type of LLM memory, but there can be other types
 // abstract interface for LLM memory; every operation is implemented by the concrete memory type
 class llama_memory_i {
 public:
    // the interface is polymorphic and instances are handed out as llama_memory_i pointers,
    // so destruction through a base pointer must be well-defined
    virtual ~llama_memory_i() = default;

    virtual void clear() = 0;
    virtual void defrag() = 0;

    // per-sequence operations
    // NOTE(review): seq_id/p0/p1 presumably follow the conventions of the public llama_kv_self_seq_* API
    //               (position range, negative values as wildcards) - confirm in the implementations
    virtual bool seq_rm  (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1) = 0;
    virtual void seq_cp  (llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) = 0;
    virtual void seq_keep(llama_seq_id seq_id) = 0;
    virtual void seq_add (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, llama_pos delta) = 0;
    virtual void seq_div (llama_seq_id seq_id,                              llama_pos p0, llama_pos p1, int d) = 0;

    virtual llama_pos seq_pos_max(llama_seq_id seq_id) = 0;

    // whether the memory supports being edited; llama_kv_cache maps this to get_can_shift()
    virtual bool get_can_edit() const = 0;
 };
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
--- a/src/llama-model.h
+++ b/src/llama-model.h
@ -2,7 +2,9 @@
 #include "llama.h"
 #include "llama-arch.h"
 #include "llama-graph.h"
 #include "llama-hparams.h"
 #include "llama-memory.h"
 #include "llama-vocab.h"
 #include <memory>
@ -10,6 +12,8 @@
 #include <unordered_map>
 #include <vector>
 struct llama_cparams;
 struct llama_ubatch;
 struct llama_model_loader;
 // available models
@ -347,7 +351,7 @@ struct llama_model {
    std::string desc() const;
    size_t size() const;
-    size_t max_nodes() const;
+    size_t n_tensors() const;
    size_t n_devices() const;
    // total number of parameters in the model
@ -362,9 +366,22 @@ struct llama_model {
    const struct ggml_tensor * get_tensor(const char * name) const;
    // TODO: move this to new llm_arch_model_i interface
    llama_memory_i * create_memory() const; // TODO: params
    // TODO: move this to new llm_arch_model_i interface
    llm_graph_result_ptr build_graph(
            const llm_graph_params & params,
                       ggml_cgraph * gf,
                    llm_graph_type   type) const;
 private:
    struct impl;
    std::unique_ptr<impl> pimpl;
 };
 const char * llm_type_name(llm_type type);
 // For internal test use
 // TODO: remove
 const std::vector<std::pair<std::string, ggml_tensor *>> & llama_internal_get_tensor_map(const llama_model * model);
--- a/src/llama.cpp
+++ b/src/llama.cpp