diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 454e141c8..bec82b446 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -2533,7 +2533,7 @@ void llama_context_kv_self::kv_self_update() {
 
         auto * gf = graph_init();
 
-        kv_self.build_shift(ctx_compute.get(), gf, this);
+        build_kv_self_shift(ctx_compute.get(), gf);
 
         ggml_backend_sched_alloc_graph(sched.get(), gf);
 
@@ -2559,7 +2559,7 @@ void llama_context_kv_self::kv_self_update() {
 
         auto * gf = graph_init();
 
-        kv_self.build_defrag(ctx_compute.get(), gf, max_nodes(), !cparams.flash_attn);
+        build_kv_self_defrag(ctx_compute.get(), gf);
 
         ggml_backend_sched_alloc_graph(sched.get(), gf);
 
@@ -2817,6 +2817,309 @@ ggml_tensor * llama_context_kv_self::build_attn_soft_max(
     return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias);
 }
 
+void llama_context_kv_self::build_kv_self_shift(
+        ggml_context * ctx0,
+        ggml_cgraph * gf) {
+    const auto & hparams = model.hparams;
+
+    const auto & n_layer = hparams.n_layer;
+
+    const auto & n_embd_head_k = hparams.n_embd_head_k;
+    //const auto & n_embd_head_v = hparams.n_embd_head_v;
+
+    //GGML_ASSERT(kv_self.size == n_ctx);
+
+    ggml_tensor * inp_k_shift = build_inp_k_shift(ctx0);
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const int64_t n_head_kv    = hparams.n_head_kv(il);
+        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
+
+        struct ggml_tensor * rope_factors = build_rope_factors(il);
+
+        struct ggml_tensor * k =
+            ggml_view_3d(ctx0, kv_self.k_l[il],
+                n_embd_head_k, n_head_kv, kv_self.size,
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                0);
+
+        ggml_tensor * cur = build_rope_shift(ctx0, k, inp_k_shift, rope_factors, kv_self.k_l[il]->buffer);
+
+        ggml_build_forward_expand(gf, cur);
+    }
+}
+
+void llama_context_kv_self::build_kv_self_defrag(
+        ggml_context * ctx0,
+        ggml_cgraph * gf) {
+    const auto & hparams = model.hparams;
+
+    const uint32_t n_layer = hparams.n_layer;
+
+    const uint32_t n_kv   = kv_self.cell_max();
+    const uint32_t n_used = kv_self.used;
+
+    assert(n_used <= n_kv);
+
+    //const int64_t t_start = ggml_time_us();
+
+    // number of cells moved
+    uint32_t n_moves = 0;
+
+    // each move requires 6*n_layer tensors (see build_kv_self_defrag)
+    //   - source view, destination view, copy operation
+    //   - x2 for keys and values
+    //const uint32_t max_moves = max_nodes()/(6*n_layer);
+    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
+    const uint32_t max_moves = (max_nodes() - 2*n_layer)/(6*n_layer);
+
+    // determine which KV cells to move where
+    //
+    //  cell i moves to ids[i]
+    //
+    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+    //
+    std::vector<uint32_t> ids(n_kv, n_kv);
+
+    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
+        const auto & cell0 = kv_self.cells[i0];
+
+        if (!cell0.is_empty()) {
+            ids[i0] = i0;
+
+            continue;
+        }
+
+        // found a hole - fill it with data from the end of the cache
+
+        uint32_t nh = 1;
+
+        // determine the size of the hole
+        while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
+            nh++;
+        }
+
+        uint32_t nf = 0;
+        uint32_t is = n_kv - 1;
+
+        // starting from the end, find nh non-empty cells
+        for (; is > i0; --is) {
+            const auto & cell1 = kv_self.cells[is];
+
+            if (cell1.is_empty() || ids[is] != n_kv) {
+                continue;
+            }
+
+            // non-empty cell which is not yet moved
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        // this can only happen if `n_used` is not accurate, which would be a bug
+        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
"KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? + bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = kv_self.cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + kv_self.cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + kv_self.head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { // NOLINT + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), + ggml_row_size(kv_self.k_l[il]->type, 
+
+            ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+                    n_embd_k_gqa, nm,
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                    ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+
+            ggml_tensor * view_v_src;
+            ggml_tensor * view_v_dst;
+
+            if (cparams.flash_attn) {
+                // NOTE: the V cache is not transposed when using flash attention
+                view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        n_embd_v_gqa, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i));
+
+                view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        n_embd_v_gqa, nm,
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa),
+                        ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id));
+            } else {
+                view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, i));
+
+                view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, id));
+            }
+
+            ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+            ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+        }
+
+        i += nm - 1;
+    }
+
+    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+#endif
+}
+
 ggml_tensor * llama_context_kv_self::build_inp_embd_enc(
         ggml_context * ctx0,
         int32_t n_tokens,
diff --git a/src/llama-context.h b/src/llama-context.h
index 0311ad473..a256f3042 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -378,7 +378,7 @@ public:
 
     virtual void build_attn_kv_store(
             ggml_context * ctx0,
-            ggml_cgraph * graph,
+            ggml_cgraph * gf,
             ggml_tensor * k_cur,
             ggml_tensor * v_cur,
             int32_t n_tokens,
@@ -387,7 +387,7 @@ public:
 
     virtual ggml_tensor * build_attn_qkv(
             ggml_context * ctx0,
-            ggml_cgraph * graph,
+            ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur,
@@ -401,6 +401,15 @@ public:
             ggml_tensor * kq,
             float kq_scale) override;
 
+    virtual void build_kv_self_shift(
+            ggml_context * ctx0,
+            ggml_cgraph * gf) override;
+
+    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
+    virtual void build_kv_self_defrag(
+            ggml_context * ctx0,
+            ggml_cgraph * gf) override;
+
     // === encoder-decoder ===
 
     // whether we are computing encoder output or decoder output
@@ -443,7 +452,7 @@ public:
 
     virtual ggml_tensor * build_copy_mask_state(
             ggml_context * ctx0,
-            ggml_cgraph * graph,
+            ggml_cgraph * gf,
             ggml_tensor * s,
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
@@ -454,7 +463,7 @@ public:
 
     virtual ggml_tensor * build_mamba_layer(
             ggml_context * ctx0,
-            ggml_cgraph * graph,
+            ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
@@ -464,7 +473,7 @@ public:
 
     virtual ggml_tensor * build_rwkv_token_shift_load(
             ggml_context * ctx0,
-            ggml_cgraph * graph,
+            ggml_cgraph * gf,
             ggml_tensor * state_copy,
             ggml_tensor * state_mask,
             const llama_ubatch & ubatch,
@@ -480,7 +489,7 @@ public:
 
     virtual ggml_tensor * build_rwkv6_time_mix(
             ggml_context * ctx0,
-            ggml_cgraph * graph,
+            ggml_cgraph * gf,
             ggml_tensor * cur,
             ggml_tensor * x_prev,
             ggml_tensor * state_copy,
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 6098d2b92..bb51b9a91 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -113,6 +113,15 @@ public:
             ggml_tensor * kq,
             float kq_scale) = 0;
 
+    virtual void build_kv_self_shift(
+            ggml_context * ctx0,
+            ggml_cgraph * gf) = 0;
+
+    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
+    virtual void build_kv_self_defrag(
+            ggml_context * ctx0,
+            ggml_cgraph * gf) = 0;
+
     virtual ggml_tensor * build_inp_k_shift(
             ggml_context * ctx0) = 0;
 
@@ -182,18 +191,3 @@ public:
             int il,
             bool worst_case) = 0;
 };
-
-class llama_graph_kv_cache_i {
-public:
-    virtual void build_shift(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            llama_graph_i * lgf) = 0;
-
-    // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
-    virtual void build_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            int32_t max_nodes,
-            bool v_trans) = 0;
-};
diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp
index 5dde8b870..8a87f9129 100644
--- a/src/llama-kv-cache.cpp
+++ b/src/llama-kv-cache.cpp
@@ -699,308 +699,6 @@ size_t llama_kv_cache::size_v_bytes() const {
     return size_v_bytes;
 }
 
-void llama_kv_cache::build_shift(
-        ggml_context * ctx0,
-        ggml_cgraph * gf,
-        llama_graph_i * lgf) {
-    const auto & n_layer = hparams.n_layer;
-
-    const auto & n_embd_head_k = hparams.n_embd_head_k;
-    //const auto & n_embd_head_v = hparams.n_embd_head_v;
-
-    //GGML_ASSERT(kv_self.size == n_ctx);
-
-    ggml_tensor * inp_k_shift = lgf->build_inp_k_shift(ctx0);
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        const int64_t n_head_kv    = hparams.n_head_kv(il);
-        const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-
-        struct ggml_tensor * rope_factors = lgf->build_rope_factors(il);
-
-        struct ggml_tensor * k =
-            ggml_view_3d(ctx0, k_l[il],
-                n_embd_head_k, n_head_kv, size,
-                ggml_row_size(k_l[il]->type, n_embd_head_k),
-                ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                0);
-
-        ggml_tensor * cur = lgf->build_rope_shift(ctx0, k, inp_k_shift, rope_factors, k_l[il]->buffer);
-
-        ggml_build_forward_expand(gf, cur);
-    }
-}
-
-void llama_kv_cache::build_defrag(
-        ggml_context * ctx0,
-        ggml_cgraph * gf,
-        int32_t max_nodes,
-        bool v_trans) {
-    const uint32_t n_layer = hparams.n_layer;
-
-    const uint32_t n_kv   = cell_max();
-    const uint32_t n_used = used;
-
-    assert(n_used <= n_kv);
-
-    //const int64_t t_start = ggml_time_us();
-
-    // number of cells moved
-    uint32_t n_moves = 0;
-
-    // each move requires 6*n_layer tensors (see build_kv_self_defrag)
-    //   - source view, destination view, copy operation
-    //   - x2 for keys and values
-    //const uint32_t max_moves = max_nodes/(6*n_layer);
-    // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516
-    const uint32_t max_moves = (max_nodes - 2*n_layer)/(6*n_layer);
-
-    // determine which KV cells to move where
-    //
-    //  cell i moves to ids[i]
-    //
-    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
-    //
-    std::vector<uint32_t> ids(n_kv, n_kv);
-
-    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
-        const auto & cell0 = cells[i0];
-
-        if (!cell0.is_empty()) {
-            ids[i0] = i0;
-
-            continue;
-        }
-
-        // found a hole - fill it with data from the end of the cache
-
-        uint32_t nh = 1;
-
-        // determine the size of the hole
-        while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
-            nh++;
-        }
-
-        uint32_t nf = 0;
-        uint32_t is = n_kv - 1;
-
-        // starting from the end, find nh non-empty cells
-        for (; is > i0; --is) {
-            const auto & cell1 = cells[is];
-
-            if (cell1.is_empty() || ids[is] != n_kv) {
-                continue;
-            }
-
-            // non-empty cell which is not yet moved
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        // this can only happen if `n_used` is not accurate, which would be a bug
-        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
-
-        nf = 0;
-
-        uint32_t i1 = is;
-
-        // are we moving a continuous block of memory?
-        bool cont = false;
-
-        // should we stop searching for the next move?
-        bool stop = false;
-
-        // go back and move the nf cells to the hole
-        for (; i1 < n_kv; ++i1) {
-            auto & cell1 = cells[i1];
-
-            if (cell1.is_empty() || ids[i1] != n_kv) {
-                if (n_moves == max_moves) {
-                    stop = true;
-                    break;
-                }
-
-                cont = false;
-                continue;
-            }
-
-            // this cell goes to (i0 + nf)
-            ids[i1] = i0 + nf;
-
-            // move the cell meta data
-            cells[i0 + nf] = cell1;
-
-            // clear the old cell and move the head there
-            cell1 = llama_kv_cell();
-            head = n_used;
-
-            if (!cont) {
-                n_moves++;
-                cont = true;
-            }
-
-            nf++;
-
-            if (nf == nh) {
-                break;
-            }
-        }
-
-        if (stop || n_moves == max_moves) {
-            break;
-        }
-
-        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
-
-        i0 += nh - 1;
-    }
-
-    if (n_moves == 0) {
-        return;
-    }
-
-    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
-
-    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
-
-#if 0
-    // CPU defrag
-    //
-    // TODO: optimizations are possible:
-    //       - multiple threads
-    //       - avoid copying to the host memory when already there
-    //
-    // likely not worth the effort, as we have ggml_graph based defrag
-    //
-
-    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-
-    const uint32_t kv_size = size;
-
-    std::vector<uint8_t> buf_k;
-    std::vector<uint8_t> buf_v;
-
-    for (uint32_t il = 0; il < n_layer; ++il) {
-        const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa);
-        const size_t k_size     = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size);
-
-        const size_t v_size_el = ggml_type_size(v_l[il]->type);
-        const size_t v_size    = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size);
-
-        buf_k.resize(k_size);
-        buf_v.resize(v_size);
-
-        ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size());
-
-        // batch move [i, i+nm) to [id, id+nm)
-        // note: cells can move only to a lower index
-        for (uint32_t i = 0; i < n_kv; ++i) {
-            const uint32_t id = ids[i];
-
-            if (i == id || id == n_kv) {
-                continue;
-            }
-
-            uint32_t nm = 1;
-
-            while (i + nm < n_kv && ids[i + nm] == id + nm) {
-                nm++;
-            }
-
-            // move keys
-            {
-                const int64_t os =  i*k_size_row;
-                const int64_t od = id*k_size_row;
-
-                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
-            }
-
-            // move values (note: they are transposed)
-            {
-                const int64_t os =  i;
-                const int64_t od = id;
-
-                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
-                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
-                }
-            }
-
-            i += nm - 1;
-        }
-
-        ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size());
-        ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size());
-    }
-#else
-    for (uint32_t i = 0; i < ids.size(); ++i) {
-        const uint32_t id = ids[i];
-
-        if (i == id || id == ids.size()) {
-            continue;
-        }
-
-        uint32_t nm = 1;
-
-        while (i + nm < ids.size() && ids[i + nm] == id + nm) {
-            nm++;
-        }
-
-        for (uint32_t il = 0; il < n_layer; ++il) {
-            const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
-            const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il);
-
-            ggml_tensor * view_k_src = ggml_view_2d(ctx0, k_l[il],
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*i));
-
-            ggml_tensor * view_k_dst = ggml_view_2d(ctx0, k_l[il],
-                    n_embd_k_gqa, nm,
-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa),
-                    ggml_row_size(k_l[il]->type, n_embd_k_gqa*id));
-
-            ggml_tensor * view_v_src;
-            ggml_tensor * view_v_dst;
-
-            if (!v_trans) {
-                // NOTE: the V cache is not transposed when using flash attention
-                view_v_src = ggml_view_2d(ctx0, v_l[il],
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*i));
-
-                view_v_dst = ggml_view_2d(ctx0, v_l[il],
-                        n_embd_v_gqa, nm,
-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa),
-                        ggml_row_size(v_l[il]->type, n_embd_v_gqa*id));
-            } else {
-                view_v_src = ggml_view_2d(ctx0, v_l[il],
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(v_l[il]->type, size),
-                        ggml_row_size(v_l[il]->type, i));
-
-                view_v_dst = ggml_view_2d(ctx0, v_l[il],
-                        nm, n_embd_v_gqa,
-                        ggml_row_size(v_l[il]->type, size),
-                        ggml_row_size(v_l[il]->type, id));
-            }
-
-            ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
-            ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
-        }
-
-        i += nm - 1;
-    }
-
-    //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
-#endif
-}
-
 void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const {
     std::vector<std::pair<uint32_t, uint32_t>> cell_ranges; // ranges, from inclusive, to exclusive
     uint32_t cell_count = 0;
diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h
index 67e59bc09..049193fd0 100644
--- a/src/llama-kv-cache.h
+++ b/src/llama-kv-cache.h
@@ -49,7 +49,7 @@ struct llama_kv_cache_slot_info {
 // TODO: pimpl
 // TODO: add notion of max sequences
 // TODO: add llama_hparams &
-struct llama_kv_cache : public llama_graph_kv_cache_i {
+struct llama_kv_cache {
     llama_kv_cache(const llama_hparams & hparams);
     virtual ~llama_kv_cache() = default;
 
@@ -97,19 +97,6 @@ struct llama_kv_cache : public llama_graph_kv_cache_i {
     size_t size_k_bytes() const;
     size_t size_v_bytes() const;
 
-    // graph build API
-
-    virtual void build_shift(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            llama_graph_i * lgf) override;
-
-    virtual void build_defrag(
-            ggml_context * ctx0,
-            ggml_cgraph * gf,
-            int32_t max_nodes,
-            bool v_trans) override;
-
     // state save/load
 
     void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const;
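
Note on the defrag planning loop moved in this patch: the computation of `ids` is self-contained and easy to exercise outside of llama.cpp. Below is a minimal standalone sketch of that planning step, assuming a simplified `Cell` type in place of `llama_kv_cell`; the names `Cell` and `plan_defrag` are illustrative only, and the KV head update, sequence metadata, and graph construction (the `ggml_cpy` of the K/V views) are deliberately omitted.

// Standalone sketch (not part of the patch): the planning half of
// build_kv_self_defrag, modeled with plain types so it can be compiled
// and tested in isolation. `Cell` and `plan_defrag` are illustrative
// names, not symbols from llama.cpp.
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Cell {
    bool empty;
    bool is_empty() const { return empty; }
};

// cell i moves to ids[i]; ids[i] == i or ids[i] == n_kv means "not moved"
static uint32_t plan_defrag(std::vector<Cell> & cells, uint32_t n_used,
        uint32_t max_moves, std::vector<uint32_t> & ids) {
    const uint32_t n_kv = (uint32_t) cells.size();

    ids.assign(n_kv, n_kv);

    uint32_t n_moves = 0;

    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
        if (!cells[i0].is_empty()) {
            ids[i0] = i0;
            continue;
        }

        // measure the hole starting at i0
        uint32_t nh = 1;
        while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
            nh++;
        }

        // walk back from the end to find nh unmoved, non-empty cells
        uint32_t nf = 0;
        uint32_t is = n_kv - 1;
        for (; is > i0; --is) {
            if (cells[is].is_empty() || ids[is] != n_kv) {
                continue;
            }
            if (++nf == nh) {
                break;
            }
        }
        assert(nf == nh);

        // assign destinations [i0, i0 + nh) to those cells, front to back;
        // a run of adjacent sources counts as a single move
        nf = 0;
        bool cont = false;
        bool stop = false;
        for (uint32_t i1 = is; i1 < n_kv; ++i1) {
            if (cells[i1].is_empty() || ids[i1] != n_kv) {
                if (n_moves == max_moves) {
                    stop = true;
                    break;
                }
                cont = false;
                continue;
            }
            ids[i1] = i0 + nf;
            cells[i0 + nf] = cells[i1];
            cells[i1] = Cell{true};
            if (!cont) {
                n_moves++;
                cont = true;
            }
            if (++nf == nh) {
                break;
            }
        }
        if (stop || n_moves == max_moves) {
            break;
        }

        i0 += nh - 1;
    }

    return n_moves;
}

int main() {
    // occupancy: X _ X X _ X  (4 used cells, hole at index 1)
    std::vector<Cell> cells = {{false}, {true}, {false}, {false}, {true}, {false}};
    std::vector<uint32_t> ids;

    const uint32_t n_moves = plan_defrag(cells, /*n_used =*/ 4, /*max_moves =*/ 16, ids);

    printf("moves: %u\n", n_moves); // expected: 1 (cell 5 -> 1)
    for (uint32_t i = 0; i < ids.size(); ++i) {
        if (ids[i] != i && ids[i] != ids.size()) {
            printf("cell %u -> %u\n", i, ids[i]);
        }
    }
    return 0;
}

The `max_moves` cap mirrors the patch: each contiguous block moved costs 6 tensors per layer in the graph (source view, destination view, and copy, for both K and V), so bounding the number of moves keeps the resulting defrag graph under `max_nodes()`.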