diff --git a/src/llama-context.cpp b/src/llama-context.cpp index bfcdf6cdd..454e141c8 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -201,7 +201,7 @@ void llama_context::init() { backend_ptrs.push_back(backend.get()); } - const size_t max_nodes = model.max_nodes(); + const size_t max_nodes = this->max_nodes(); // buffer used to store the computation graph and the tensor meta data // TODO: move to base class @@ -255,39 +255,36 @@ void llama_context::init() { // reserve pp graph first so that buffers are only allocated once { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_pp = graph_build(ctx.get(), ubatch_pp, true); - auto & gf_pp = res_pp.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } n_splits_pp = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_pp = ggml_graph_n_nodes(gf_pp); + n_nodes_pp = ggml_graph_n_nodes(gf); } // reserve with tg graph to get the number of splits and nodes { llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_tg = graph_build(ctx.get(), ubatch_tg, true); - auto & gf_tg = res_tg.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_tg, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } n_splits_tg = ggml_backend_sched_get_n_splits(sched.get()); - n_nodes_tg = ggml_graph_n_nodes(gf_tg); + n_nodes_tg = ggml_graph_n_nodes(gf); } // reserve again with pp graph to avoid ggml-alloc reallocations during inference { llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res_pp = graph_build(ctx.get(), ubatch_pp, true); - auto & gf_pp = res_pp.gf; - if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) { + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch_pp, true); + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__); throw std::runtime_error("failed to allocate compute buffers"); } @@ -350,6 +347,10 @@ uint32_t llama_context::n_threads_batch() const { return cparams.n_threads_batch; } +int32_t llama_context::max_nodes() const { + return std::max(8192, 5*model.n_tensors()); +} + enum llama_pooling_type llama_context::pooling_type() const { return cparams.pooling_type; } @@ -555,7 +556,7 @@ void llama_context::synchronize() { t_compute_start_us = 0; } -ggml_context_ptr llama_context::graph_init() { +ggml_cgraph * llama_context::graph_init() { inp_tokens = nullptr; inp_embd = nullptr; inp_pos = nullptr; @@ -569,18 +570,21 @@ ggml_context_ptr llama_context::graph_init() { /*.no_alloc =*/ true, }; - return ggml_context_ptr { ggml_init(params) }; + ctx_compute.reset(ggml_init(params)); + + return ggml_new_graph_custom(ctx_compute.get(), max_nodes(), false); } llama_graph_result llama_context::graph_build( ggml_context * ctx, + ggml_cgraph * gf, const 
llama_ubatch & ubatch, bool worst_case) { - return model.build_graph(ctx, this, cparams, ubatch, worst_case); + return model.build_graph(ctx, gf, this, cparams, ubatch, worst_case); } enum ggml_status llama_context::graph_compute( - ggml_cgraph * graph, + ggml_cgraph * gf, bool batched) { int n_threads = batched ? cparams.n_threads_batch : cparams.n_threads; ggml_threadpool_t tp = batched ? threadpool_batch : threadpool; @@ -596,7 +600,7 @@ enum ggml_status llama_context::graph_compute( set_n_threads_fn.second(set_n_threads_fn.first, n_threads); } - auto status = ggml_backend_sched_graph_compute_async(sched.get(), graph); + auto status = ggml_backend_sched_graph_compute_async(sched.get(), gf); if (status != GGML_STATUS_SUCCESS) { LLAMA_LOG_ERROR("%s: ggml_backend_sched_graph_compute_async failed with error %d\n", __func__, status); } @@ -881,7 +885,6 @@ void llama_context::output_reorder() { } } - void llama_context::build_cb( ggml_tensor * cur, const char * name, @@ -1010,6 +1013,55 @@ ggml_tensor * llama_context::build_rope_factors(int il) { return model.layers[il].rope_short; } +ggml_tensor * llama_context::build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf) { + const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; + const auto & freq_base = cparams.rope_freq_base; + const auto & freq_scale = cparams.rope_freq_scale; + + const auto & yarn_ext_factor = cparams.yarn_ext_factor; + const auto & yarn_attn_factor = cparams.yarn_attn_factor; + const auto & yarn_beta_fast = cparams.yarn_beta_fast; + const auto & yarn_beta_slow = cparams.yarn_beta_slow; + + const auto & n_rot = model.hparams.n_rot; + const auto & rope_type = model.hparams.rope_type; + + struct ggml_tensor * tmp; + + if (ggml_is_quantized(cur->type)) { + // dequantize to f32 -> RoPE -> quantize back + tmp = ggml_cast(ctx0, cur, GGML_TYPE_F32); + + if (bbuf) { + for (auto & backend : backends) { + // Figure out which backend KV cache belongs to + if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(bbuf))) { + ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); + break; + } + } + } + + tmp = ggml_rope_ext_inplace(ctx0, tmp, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + + tmp = ggml_cpy(ctx0, tmp, cur); + } else { + // we rotate only the first n_rot dimensions + tmp = ggml_rope_ext_inplace(ctx0, cur, + shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); + } + + return tmp; +} + ggml_tensor * llama_context::build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, @@ -1579,7 +1631,8 @@ void llama_context::perf_reset() { llama_context_kv_self::llama_context_kv_self( const llama_model & model, const llama_context_params & params) : - llama_context(model, params) { + llama_context(model, params), + kv_self(model.hparams) { const auto & hparams = model.hparams; LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx); @@ -1640,13 +1693,13 @@ const llama_kv_cache * llama_context_kv_self::get_kv_self() const { return &kv_self; } -ggml_context_ptr llama_context_kv_self::graph_init() { +ggml_cgraph * llama_context_kv_self::graph_init() { inp_KQ_mask = nullptr; inp_KQ_mask_cnv = nullptr; inp_KQ_mask_swa = nullptr; inp_KQ_mask_swa_cnv = nullptr; inp_KQ_mask_cross = nullptr; - inp_K_shift = nullptr; + inp_k_shift = nullptr; 
inp_s_copy = nullptr; inp_s_mask = nullptr; inp_embd_enc = nullptr; @@ -1719,10 +1772,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, false); - - auto * gf = res.gf; + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -1999,12 +2050,12 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr}; - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, true); + auto * gf = graph_init(); + graph_build(ctx_compute.get(), gf, ubatch, true); // initialize scheduler with the worst-case graph ggml_backend_sched_reset(sched.get()); - if (!ggml_backend_sched_reserve(sched.get(), res.gf)) { + if (!ggml_backend_sched_reserve(sched.get(), gf)) { LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__); } @@ -2014,10 +2065,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) { ggml_backend_sched_reset(sched.get()); ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data); - auto ctx = graph_init(); - auto res = graph_build(ctx.get(), ubatch, false); - - auto * gf = res.gf; + auto * gf = graph_init(); + auto res = graph_build(ctx_compute.get(), gf, ubatch, false); // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); @@ -2195,10 +2244,10 @@ uint32_t llama_context_kv_self::get_ctx_padding(const llama_cparams & cparams) c void llama_context_kv_self::input_set(const llama_ubatch & ubatch) { const llama_hparams & hparams = model.hparams; - if (inp_K_shift) { - assert(ggml_backend_buffer_is_host(inp_K_shift->buffer)); + if (inp_k_shift) { + assert(ggml_backend_buffer_is_host(inp_k_shift->buffer)); - int32_t * data = (int32_t *) inp_K_shift->data; + int32_t * data = (int32_t *) inp_k_shift->data; for (uint32_t i = 0; i < kv_self.size; ++i) { data[i] = kv_self.cells[i].delta; @@ -2482,11 +2531,9 @@ void llama_context_kv_self::kv_self_update() { if (model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) { ggml_backend_sched_reset(sched.get()); - auto ctx = graph_init(); + auto * gf = graph_init(); - ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); - - build_kv_self_shift(ctx.get(), gf); + kv_self.build_shift(ctx_compute.get(), gf, this); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2510,11 +2557,9 @@ void llama_context_kv_self::kv_self_update() { if (kv.do_defrag) { ggml_backend_sched_reset(sched.get()); - auto ctx = graph_init(); + auto * gf = graph_init(); - ggml_cgraph * gf = ggml_new_graph_custom(ctx.get(), model.max_nodes(), false); - - build_kv_self_defrag(ctx.get(), gf); + kv_self.build_defrag(ctx_compute.get(), gf, max_nodes(), !cparams.flash_attn); ggml_backend_sched_alloc_graph(sched.get(), gf); @@ -2529,6 +2574,13 @@ void llama_context_kv_self::kv_self_update() { } } +ggml_tensor * llama_context_kv_self::build_inp_k_shift(ggml_context * ctx0) { + inp_k_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx()); + 
ggml_set_input(inp_k_shift); + + return inp_k_shift; +} + void llama_context_kv_self::build_attn_inp( ggml_context * ctx0, int32_t n_tokens, @@ -2765,348 +2817,6 @@ ggml_tensor * llama_context_kv_self::build_attn_soft_max( return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias); } -void llama_context_kv_self::build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) { - const auto & n_ctx = cparams.n_ctx; - const auto & n_ctx_orig = cparams.n_ctx_orig_yarn; - const auto & freq_base = cparams.rope_freq_base; - const auto & freq_scale = cparams.rope_freq_scale; - - const auto & yarn_ext_factor = cparams.yarn_ext_factor; - const auto & yarn_attn_factor = cparams.yarn_attn_factor; - const auto & yarn_beta_fast = cparams.yarn_beta_fast; - const auto & yarn_beta_slow = cparams.yarn_beta_slow; - - const auto & hparams = model.hparams; - - const auto & n_rot = hparams.n_rot; - const auto & n_layer = hparams.n_layer; - const auto & rope_type = hparams.rope_type; - - const auto & n_embd_head_k = hparams.n_embd_head_k; - //const auto & n_embd_head_v = hparams.n_embd_head_v; - - GGML_ASSERT(kv_self.size == n_ctx); - - inp_K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx); - //cb(inp_K_shift, "K_shift", -1); - ggml_set_input(inp_K_shift); - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_head_kv = hparams.n_head_kv(il); - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - - struct ggml_tensor * rope_factors = build_rope_factors(il); - - struct ggml_tensor * k = - ggml_view_3d(ctx0, kv_self.k_l[il], - n_embd_head_k, n_head_kv, n_ctx, - ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - 0); - - struct ggml_tensor * tmp; - if (ggml_is_quantized(k->type)) { - // dequantize to f32 -> RoPE -> quantize back - tmp = ggml_cast(ctx0, k, GGML_TYPE_F32); - //cb(tmp, "K_f32", il); - - for (auto & backend : backends) { - // Figure out which backend KV cache belongs to - if (ggml_backend_supports_buft(backend.get(), ggml_backend_buffer_get_type(kv_self.k_l[il]->buffer))) { - ggml_backend_sched_set_tensor_backend(sched.get(), tmp, backend.get()); - break; - } - } - tmp = ggml_rope_ext_inplace(ctx0, tmp, - inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - //cb(tmp, "K_shifted_f32", il); - - tmp = ggml_cpy(ctx0, tmp, k); - } else { - // we rotate only the first n_rot dimensions - tmp = ggml_rope_ext_inplace(ctx0, k, - inp_K_shift, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, - yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow); - } - //cb(tmp, "K_shifted", il); - - ggml_build_forward_expand(graph, tmp); - } -} - -void llama_context_kv_self::build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) { - const auto & hparams = model.hparams; - - const uint32_t n_layer = hparams.n_layer; - - const uint32_t n_kv = kv_self.cell_max(); - const uint32_t n_used = kv_self.used; - - assert(n_used <= n_kv); - - //const int64_t t_start = ggml_time_us(); - - // number of cells moved - uint32_t n_moves = 0; - - // each move requires 6*n_layer tensors (see build_kv_self_defrag) - // - source view, destination view, copy operation - // - x2 for keys and values - //const uint32_t max_moves = model.max_nodes()/(6*n_layer); - // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 - const uint32_t max_moves = (model.max_nodes() - 
2*n_layer)/(6*n_layer); - - // determine which KV cells to move where - // - // cell i moves to ids[i] - // - // if ids[i] == i || ids[i] == n_kv, then cell i is not moved - // - std::vector ids(n_kv, n_kv); - - for (uint32_t i0 = 0; i0 < n_used; ++i0) { - const auto & cell0 = kv_self.cells[i0]; - - if (!cell0.is_empty()) { - ids[i0] = i0; - - continue; - } - - // found a hole - fill it with data from the end of the cache - - uint32_t nh = 1; - - // determine the size of the hole - while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) { - nh++; - } - - uint32_t nf = 0; - uint32_t is = n_kv - 1; - - // starting from the end, find nh non-empty cells - for (; is > i0; --is) { - const auto & cell1 = kv_self.cells[is]; - - if (cell1.is_empty() || ids[is] != n_kv) { - continue; - } - - // non-empty cell which is not yet moved - nf++; - - if (nf == nh) { - break; - } - } - - // this can only happen if `n_used` is not accurate, which would be a bug - GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); - - nf = 0; - - uint32_t i1 = is; - - // are we moving a continuous block of memory? - bool cont = false; - - // should we stop searching for the next move? - bool stop = false; - - // go back and move the nf cells to the hole - for (; i1 < n_kv; ++i1) { - auto & cell1 = kv_self.cells[i1]; - - if (cell1.is_empty() || ids[i1] != n_kv) { - if (n_moves == max_moves) { - stop = true; - break; - } - - cont = false; - continue; - } - - // this cell goes to (i0 + nf) - ids[i1] = i0 + nf; - - // move the cell meta data - kv_self.cells[i0 + nf] = cell1; - - // clear the old cell and move the head there - cell1 = llama_kv_cell(); - kv_self.head = n_used; - - if (!cont) { - n_moves++; - cont = true; - } - - nf++; - - if (nf == nh) { - break; - } - } - - if (stop || n_moves == max_moves) { - break; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); - - i0 += nh - 1; - } - - if (n_moves == 0) { - return; - } - - //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); - - //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); - -#if 0 - // CPU defrag - // - // TODO: optimizations are possible: - // - multiple threads - // - avoid copying to the host memory when already there - // - // likely not worth the effort, as we have ggml_graph based defrag - // - - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - - const uint32_t kv_size = kv_self.size; - - std::vector buf_k; - std::vector buf_v; - - for (uint32_t il = 0; il < n_layer; ++il) { - const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa); - const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size); - - const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type); - const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size); - - buf_k.resize(k_size); - buf_v.resize(v_size); - - ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - - // batch move [i, i+nm) to [id, id+nm) - // note: cells can move only to a lower index - for (uint32_t i = 0; i < n_kv; ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == n_kv) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < n_kv && ids[i + nm] == id + nm) { - nm++; - } - - // move keys - { - const int64_t os = i*k_size_row; - const int64_t od = id*k_size_row; - - memcpy(buf_k.data() + od, buf_k.data() + os, 
nm*k_size_row); - } - - // move values (note: they are transposed) - { - const int64_t os = i; - const int64_t od = id; - - for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { - memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); - } - } - - i += nm - 1; - } - - ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size()); - ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size()); - } -#else - for (uint32_t i = 0; i < ids.size(); ++i) { - const uint32_t id = ids[i]; - - if (i == id || id == ids.size()) { - continue; - } - - uint32_t nm = 1; - - while (i + nm < ids.size() && ids[i + nm] == id + nm) { - nm++; - } - - for (uint32_t il = 0; il < n_layer; ++il) { - const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); - const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); - - ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i)); - - ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il], - n_embd_k_gqa, nm, - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa), - ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id)); - - ggml_tensor * view_v_src; - ggml_tensor * view_v_dst; - - if (cparams.flash_attn) { - // NOTE: the V cache is not transposed when using flash attention - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - n_embd_v_gqa, nm, - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa), - ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*id)); - } else { - view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, i)); - - view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il], - nm, n_embd_v_gqa, - ggml_row_size(kv_self.v_l[il]->type, kv_self.size), - ggml_row_size(kv_self.v_l[il]->type, id)); - } - - ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_k_src, view_k_dst)); - ggml_build_forward_expand(graph, ggml_cpy(ctx0, view_v_src, view_v_dst)); - } - - i += nm - 1; - } - - //LLAMA_LOG_INFO("graph->n_nodes = %d\n", graph->n_nodes); -#endif -} - ggml_tensor * llama_context_kv_self::build_inp_embd_enc( ggml_context * ctx0, int32_t n_tokens, @@ -3162,7 +2872,7 @@ ggml_tensor * llama_context_kv_self::build_inp_s_mask( ggml_tensor * llama_context_kv_self::build_copy_mask_state( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -3185,7 +2895,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state( states = ggml_mul(ctx0, states, state_mask); // copy states which won't be changed further (between n_seqs and n_kv) - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, states, n_state*(n_kv - n_seqs), (n_seqs )*n_state*ggml_element_size(states)), ggml_view_1d(ctx0, s, n_state*(n_kv - n_seqs), (kv_head + n_seqs)*n_state*ggml_element_size(s)))); @@ -3197,7 +2907,7 @@ ggml_tensor * llama_context_kv_self::build_copy_mask_state( // TODO: split ggml_tensor * llama_context_kv_self::build_mamba_layer( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -3231,11 +2941,11 @@ 
ggml_tensor * llama_context_kv_self::build_mamba_layer( // (ab)using the KV cache to store the states struct ggml_tensor * conv = build_copy_mask_state( - ctx0, graph, conv_states_all, state_copy, state_mask, + ctx0, gf, conv_states_all, state_copy, state_mask, n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); conv = ggml_reshape_3d(ctx0, conv, d_conv - 1, d_inner, n_seqs); struct ggml_tensor * ssm = build_copy_mask_state( - ctx0, graph, ssm_states_all, state_copy, state_mask, + ctx0, gf, ssm_states_all, state_copy, state_mask, n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); ssm = ggml_reshape_3d(ctx0, ssm, d_state, d_inner, n_seqs); @@ -3257,7 +2967,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( // copy last (d_conv - 1) columns back into the state cache struct ggml_tensor * last_conv = ggml_view_3d(ctx0, conv_x, d_conv - 1, d_inner, n_seqs, conv_x->nb[1], conv_x->nb[2], n_seq_tokens*(conv_x->nb[0])); - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv, ggml_view_1d(ctx0, conv_states_all, (d_conv - 1)*(d_inner)*(n_seqs), @@ -3306,7 +3016,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( struct ggml_tensor * y_ssm = ggml_ssm_scan(ctx0, ssm, x, dt, model.layers[il].ssm_a, B, C); // store last states - ggml_build_forward_expand(graph, + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm, d_state*d_inner*n_seqs, x->nb[3]), ggml_view_1d(ctx0, ssm_states_all, d_state*d_inner*n_seqs, kv_head*d_state*d_inner*ggml_element_size(ssm_states_all)))); @@ -3333,7 +3043,7 @@ ggml_tensor * llama_context_kv_self::build_mamba_layer( ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, @@ -3349,7 +3059,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_load( struct ggml_tensor * token_shift_all = kv_self.k_l[il]; struct ggml_tensor * token_shift = build_copy_mask_state( - ctx0, graph, token_shift_all, state_copy, state_mask, + ctx0, gf, token_shift_all, state_copy, state_mask, n_tokens, hparams.n_embd_k_s(), n_seqs, worst_case); token_shift = ggml_reshape_3d(ctx0, token_shift, hparams.n_embd, token_shift_count, n_seqs); @@ -3384,7 +3094,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv_token_shift_store( ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor * state_copy, @@ -3509,7 +3219,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( } struct ggml_tensor * wkv_state = build_copy_mask_state( - ctx0, graph, kv_self.v_l[il], state_copy, state_mask, + ctx0, gf, kv_self.v_l[il], state_copy, state_mask, n_tokens, hparams.n_embd_v_s(), n_seqs, worst_case); struct ggml_tensor * wkv_output; @@ -3522,7 +3232,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( wkv_state = ggml_view_1d(ctx0, wkv_output, n_embd * head_size * n_seqs, n_embd * n_tokens * sizeof(float)); ggml_build_forward_expand( - graph, + gf, ggml_cpy( ctx0, wkv_state, @@ -3558,7 +3268,7 @@ ggml_tensor * llama_context_kv_self::build_rwkv6_time_mix( size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { llama_context::state_get_data(io); - kv_self.state_write(io, model.hparams); + kv_self.state_write(io); return io.n_bytes(); } @@ -3566,7 +3276,7 @@ size_t llama_context_kv_self::state_get_data(llama_io_write_i & io) { size_t 
llama_context_kv_self::state_set_data(llama_io_read_i & io) { llama_context::state_set_data(io); - kv_self.state_read(io, model.hparams); + kv_self.state_read(io); return io.n_bytes(); } @@ -3574,7 +3284,7 @@ size_t llama_context_kv_self::state_set_data(llama_io_read_i & io) { size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_seq_id seq_id) { llama_context::state_seq_get_data(io, seq_id); - kv_self.state_write(io, model.hparams, seq_id); + kv_self.state_write(io, seq_id); return io.n_bytes(); } @@ -3582,7 +3292,7 @@ size_t llama_context_kv_self::state_seq_get_data(llama_io_write_i & io, llama_se size_t llama_context_kv_self::state_seq_set_data(llama_io_read_i & io, llama_seq_id seq_id) { llama_context::state_seq_set_data(io, seq_id); - kv_self.state_read(io, model.hparams, seq_id); + kv_self.state_read(io, seq_id); return io.n_bytes(); } diff --git a/src/llama-context.h b/src/llama-context.h index 4bf8244e6..0311ad473 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -43,6 +43,8 @@ struct llama_context : public llama_graph_i { virtual uint32_t n_threads() const; virtual uint32_t n_threads_batch() const; + virtual int32_t max_nodes() const; + virtual llama_kv_cache * get_kv_self() = 0; virtual const llama_kv_cache * get_kv_self() const = 0; @@ -93,18 +95,19 @@ struct llama_context : public llama_graph_i { virtual void synchronize(); // zero-out inputs and create ggml_context - virtual ggml_context_ptr graph_init(); + virtual ggml_cgraph * graph_init(); // TODO: add encode/decode graphs virtual llama_graph_result graph_build( - ggml_context * ctx, - const llama_ubatch & ubatch, - bool worst_case); + ggml_context * ctx, + ggml_cgraph * gf, + const llama_ubatch & ubatch, + bool worst_case); // returns the result of ggml_backend_sched_graph_compute_async execution virtual enum ggml_status graph_compute( - ggml_cgraph * graph, - bool batched); + ggml_cgraph * gf, + bool batched); virtual void input_set(const llama_ubatch & ubatch); @@ -172,6 +175,13 @@ struct llama_context : public llama_graph_i { virtual ggml_tensor * build_rope_factors(int il); + virtual ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuf); + virtual ggml_tensor * build_inp_embd( ggml_context * ctx0, ggml_tensor * tok_embd, @@ -274,6 +284,8 @@ protected: ggml_backend_sched_ptr sched; + ggml_context_ptr ctx_compute; + // memory buffers used to evaluate the model std::vector buf_compute_meta; @@ -332,7 +344,7 @@ public: virtual void kv_self_update() override; - virtual ggml_context_ptr graph_init() override; + virtual ggml_cgraph * graph_init() override; virtual void input_set(const llama_ubatch & ubatch) override; @@ -349,11 +361,13 @@ public: llama_kv_cache kv_self; - struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] - struct ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] - struct ggml_tensor * inp_K_shift; // I32 [kv_size] + ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_cnv; // [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch] + ggml_tensor * inp_KQ_mask_swa_cnv; // [kv_size, n_batch] + ggml_tensor * inp_k_shift; // I32 [kv_size] + + virtual ggml_tensor * build_inp_k_shift(ggml_context * ctx0) override; virtual void build_attn_inp( ggml_context * ctx0, @@ -387,15 +401,6 @@ public: 
ggml_tensor * kq, float kq_scale) override; - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) override; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) override; - // === encoder-decoder === // whether we are computing encoder output or decoder output diff --git a/src/llama-graph.h b/src/llama-graph.h index 14d0c5da0..6098d2b92 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -8,11 +8,10 @@ struct ggml_cgraph; struct ggml_context; struct ggml_tensor; +struct ggml_backend_buffer; struct llama_ubatch; struct llama_graph_result { - ggml_cgraph * gf = nullptr; - // important graph nodes ggml_tensor * t_logits = nullptr; ggml_tensor * t_embd = nullptr; @@ -50,6 +49,14 @@ public: virtual ggml_tensor * build_rope_factors(int il) = 0; + // note: optionally set the backend to be the same as the bbuf's backend + virtual ggml_tensor * build_rope_shift( + ggml_context * ctx0, + ggml_tensor * cur, + ggml_tensor * shift, + ggml_tensor * factors, + ggml_backend_buffer * bbuft) = 0; + // graph build API (context-specific) virtual ggml_tensor * build_inp_embd( @@ -83,7 +90,7 @@ public: virtual void build_attn_kv_store( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * k_cur, ggml_tensor * v_cur, int32_t n_tokens, @@ -92,7 +99,7 @@ public: virtual ggml_tensor * build_attn_qkv( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * wo, ggml_tensor * wo_b, ggml_tensor * q_cur, @@ -106,14 +113,8 @@ public: ggml_tensor * kq, float kq_scale) = 0; - virtual void build_kv_self_shift( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; - - // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache - virtual void build_kv_self_defrag( - ggml_context * ctx0, - ggml_cgraph * graph) = 0; + virtual ggml_tensor * build_inp_k_shift( + ggml_context * ctx0) = 0; virtual ggml_tensor * build_inp_embd_enc( ggml_context * ctx0, @@ -135,7 +136,7 @@ public: virtual ggml_tensor * build_copy_mask_state( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * s, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -146,7 +147,7 @@ public: virtual ggml_tensor * build_mamba_layer( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * state_copy, ggml_tensor * state_mask, @@ -156,7 +157,7 @@ public: virtual ggml_tensor * build_rwkv_token_shift_load( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * state_copy, ggml_tensor * state_mask, const llama_ubatch & ubatch, @@ -172,7 +173,7 @@ public: virtual ggml_tensor * build_rwkv6_time_mix( ggml_context * ctx0, - ggml_cgraph * graph, + ggml_cgraph * gf, ggml_tensor * cur, ggml_tensor * x_prev, ggml_tensor * state_copy, @@ -181,3 +182,18 @@ public: int il, bool worst_case) = 0; }; + +class llama_graph_kv_cache_i { +public: + virtual void build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) = 0; + + // find holes from the beginning of the KV cache and fill them by moving data from the end of the cache + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) = 0; +}; diff --git a/src/llama-kv-cache.cpp b/src/llama-kv-cache.cpp index c93410f0a..5dde8b870 100644 --- a/src/llama-kv-cache.cpp +++ b/src/llama-kv-cache.cpp @@ -13,6 +13,9 @@ static const 
llama_kv_cache_slot_info llama_kv_cache_slot_info_failed{false}; +llama_kv_cache::llama_kv_cache(const llama_hparams & hparams) : hparams(hparams) { +} + bool llama_kv_cache::init( const llama_model & model, const llama_cparams & cparams, @@ -20,8 +23,6 @@ bool llama_kv_cache::init( ggml_type type_v, uint32_t kv_size, bool offload) { - const struct llama_hparams & hparams = model.hparams; - const int32_t n_layer = hparams.n_layer; has_shift = false; @@ -698,7 +699,309 @@ size_t llama_kv_cache::size_v_bytes() const { return size_v_bytes; } -void llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id) const { +void llama_kv_cache::build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) { + const auto & n_layer = hparams.n_layer; + + const auto & n_embd_head_k = hparams.n_embd_head_k; + //const auto & n_embd_head_v = hparams.n_embd_head_v; + + //GGML_ASSERT(kv_self.size == n_ctx); + + ggml_tensor * inp_k_shift = lgf->build_inp_k_shift(ctx0); + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + + struct ggml_tensor * rope_factors = lgf->build_rope_factors(il); + + struct ggml_tensor * k = + ggml_view_3d(ctx0, k_l[il], + n_embd_head_k, n_head_kv, size, + ggml_row_size(k_l[il]->type, n_embd_head_k), + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + 0); + + ggml_tensor * cur = lgf->build_rope_shift(ctx0, k, inp_k_shift, rope_factors, k_l[il]->buffer); + + ggml_build_forward_expand(gf, cur); + } +} + +void llama_kv_cache::build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) { + const uint32_t n_layer = hparams.n_layer; + + const uint32_t n_kv = cell_max(); + const uint32_t n_used = used; + + assert(n_used <= n_kv); + + //const int64_t t_start = ggml_time_us(); + + // number of cells moved + uint32_t n_moves = 0; + + // each move requires 6*n_layer tensors (see build_kv_self_defrag) + // - source view, destination view, copy operation + // - x2 for keys and values + //const uint32_t max_moves = max_nodes/(6*n_layer); + // TODO: tmp fix https://github.com/ggerganov/llama.cpp/issues/6685#issuecomment-2057579516 + const uint32_t max_moves = (max_nodes - 2*n_layer)/(6*n_layer); + + // determine which KV cells to move where + // + // cell i moves to ids[i] + // + // if ids[i] == i || ids[i] == n_kv, then cell i is not moved + // + std::vector ids(n_kv, n_kv); + + for (uint32_t i0 = 0; i0 < n_used; ++i0) { + const auto & cell0 = cells[i0]; + + if (!cell0.is_empty()) { + ids[i0] = i0; + + continue; + } + + // found a hole - fill it with data from the end of the cache + + uint32_t nh = 1; + + // determine the size of the hole + while (i0 + nh < n_used && cells[i0 + nh].is_empty()) { + nh++; + } + + uint32_t nf = 0; + uint32_t is = n_kv - 1; + + // starting from the end, find nh non-empty cells + for (; is > i0; --is) { + const auto & cell1 = cells[is]; + + if (cell1.is_empty() || ids[is] != n_kv) { + continue; + } + + // non-empty cell which is not yet moved + nf++; + + if (nf == nh) { + break; + } + } + + // this can only happen if `n_used` is not accurate, which would be a bug + GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh"); + + nf = 0; + + uint32_t i1 = is; + + // are we moving a continuous block of memory? + bool cont = false; + + // should we stop searching for the next move? 
+ bool stop = false; + + // go back and move the nf cells to the hole + for (; i1 < n_kv; ++i1) { + auto & cell1 = cells[i1]; + + if (cell1.is_empty() || ids[i1] != n_kv) { + if (n_moves == max_moves) { + stop = true; + break; + } + + cont = false; + continue; + } + + // this cell goes to (i0 + nf) + ids[i1] = i0 + nf; + + // move the cell meta data + cells[i0 + nf] = cell1; + + // clear the old cell and move the head there + cell1 = llama_kv_cell(); + head = n_used; + + if (!cont) { + n_moves++; + cont = true; + } + + nf++; + + if (nf == nh) { + break; + } + } + + if (stop || n_moves == max_moves) { + break; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh); + + i0 += nh - 1; + } + + if (n_moves == 0) { + return; + } + + //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves); + + //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer); + +#if 0 + // CPU defrag + // + // TODO: optimizations are possible: + // - multiple threads + // - avoid copying to the host memory when already there + // + // likely not worth the effort, as we have ggml_graph based defrag + // + + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + + const uint32_t kv_size = size; + + std::vector buf_k; + std::vector buf_v; + + for (uint32_t il = 0; il < n_layer; ++il) { + const size_t k_size_row = ggml_row_size(k_l[il]->type, n_embd_k_gqa); + const size_t k_size = ggml_row_size(k_l[il]->type, n_embd_k_gqa*kv_size); + + const size_t v_size_el = ggml_type_size(v_l[il]->type); + const size_t v_size = ggml_row_size (v_l[il]->type, n_embd_v_gqa*kv_size); + + buf_k.resize(k_size); + buf_v.resize(v_size); + + ggml_backend_tensor_get(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_get(v_l[il], buf_v.data(), 0, buf_v.size()); + + // batch move [i, i+nm) to [id, id+nm) + // note: cells can move only to a lower index + for (uint32_t i = 0; i < n_kv; ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == n_kv) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < n_kv && ids[i + nm] == id + nm) { + nm++; + } + + // move keys + { + const int64_t os = i*k_size_row; + const int64_t od = id*k_size_row; + + memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row); + } + + // move values (note: they are transposed) + { + const int64_t os = i; + const int64_t od = id; + + for (uint32_t j = 0; j < n_embd_v_gqa; ++j) { + memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el); + } + } + + i += nm - 1; + } + + ggml_backend_tensor_set(k_l[il], buf_k.data(), 0, buf_k.size()); + ggml_backend_tensor_set(v_l[il], buf_v.data(), 0, buf_v.size()); + } +#else + for (uint32_t i = 0; i < ids.size(); ++i) { + const uint32_t id = ids[i]; + + if (i == id || id == ids.size()) { + continue; + } + + uint32_t nm = 1; + + while (i + nm < ids.size() && ids[i + nm] == id + nm) { + nm++; + } + + for (uint32_t il = 0; il < n_layer; ++il) { + const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il); + const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(il); + + ggml_tensor * view_k_src = ggml_view_2d(ctx0, k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*i)); + + ggml_tensor * view_k_dst = ggml_view_2d(ctx0, k_l[il], + n_embd_k_gqa, nm, + ggml_row_size(k_l[il]->type, n_embd_k_gqa), + ggml_row_size(k_l[il]->type, n_embd_k_gqa*id)); + + ggml_tensor * view_v_src; + ggml_tensor * view_v_dst; + + 
if (!v_trans) { + // NOTE: the V cache is not transposed when using flash attention + view_v_src = ggml_view_2d(ctx0, v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, n_embd_v_gqa*i)); + + view_v_dst = ggml_view_2d(ctx0, v_l[il], + n_embd_v_gqa, nm, + ggml_row_size(v_l[il]->type, n_embd_v_gqa), + ggml_row_size(v_l[il]->type, n_embd_v_gqa*id)); + } else { + view_v_src = ggml_view_2d(ctx0, v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, i)); + + view_v_dst = ggml_view_2d(ctx0, v_l[il], + nm, n_embd_v_gqa, + ggml_row_size(v_l[il]->type, size), + ggml_row_size(v_l[il]->type, id)); + } + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst)); + } + + i += nm - 1; + } + + //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes); +#endif +} + +void llama_kv_cache::state_write(llama_io_write_i & io, llama_seq_id seq_id) const { std::vector> cell_ranges; // ranges, from inclusive, to exclusive uint32_t cell_count = 0; @@ -733,16 +1036,16 @@ void llama_kv_cache::state_write(llama_io_write_i & io, const llama_hparams & hp io.write(&cell_count, sizeof(cell_count)); state_write_meta(io, cell_ranges, seq_id); - state_write_data(io, cell_ranges, hparams); + state_write_data(io, cell_ranges); } -void llama_kv_cache::state_read(llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id) { +void llama_kv_cache::state_read(llama_io_read_i & io, llama_seq_id seq_id) { uint32_t cell_count; io.read_to(&cell_count, sizeof(cell_count)); bool res = true; res = res && state_read_meta(io, cell_count, seq_id); - res = res && state_read_data(io, hparams, cell_count); + res = res && state_read_data(io, cell_count); if (!res) { if (seq_id == -1) { @@ -773,7 +1076,7 @@ void llama_kv_cache::state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const { +void llama_kv_cache::state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const { const uint32_t v_trans = this->v_trans ? 1 : 0; const uint32_t n_layer = hparams.n_layer; @@ -955,7 +1258,7 @@ bool llama_kv_cache::state_read_meta(llama_io_read_i & io, uint32_t cell_count, return true; } -bool llama_kv_cache::state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count) { +bool llama_kv_cache::state_read_data(llama_io_read_i & io, uint32_t cell_count) { uint32_t v_trans; uint32_t n_layer; io.read_to(&v_trans, sizeof(v_trans)); diff --git a/src/llama-kv-cache.h b/src/llama-kv-cache.h index 3ea9abfce..67e59bc09 100644 --- a/src/llama-kv-cache.h +++ b/src/llama-kv-cache.h @@ -2,12 +2,12 @@ #include "llama.h" #include "llama-io.h" +#include "llama-graph.h" #include "ggml-cpp.h" #include #include -#include struct llama_cparams; struct llama_hparams; @@ -49,31 +49,13 @@ struct llama_kv_cache_slot_info { // TODO: pimpl // TODO: add notion of max sequences // TODO: add llama_hparams & -struct llama_kv_cache { - bool has_shift = false; - bool do_defrag = false; - bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token - bool v_trans = true; // the value tensor is transposed - bool can_shift = false; - - // Note: The value of head isn't only used to optimize searching - // for a free KV slot. llama_decode_impl also uses it, so it - // cannot be freely changed after a slot has been allocated. 
- uint32_t head = 0; - uint32_t size = 0; - uint32_t used = 0; // used cells (i.e. at least one seq_id) - - // computed before each graph build - uint32_t n = 0; - - std::vector cells; - - std::vector k_l; // per layer - std::vector v_l; +struct llama_kv_cache : public llama_graph_kv_cache_i { + llama_kv_cache(const llama_hparams & hparams); + virtual ~llama_kv_cache() = default; // TODO: become constructor bool init( - const llama_model & model, + const llama_model & model, // TODO: do not reference the model const llama_cparams & cparams, ggml_type type_k, ggml_type type_v, @@ -115,8 +97,48 @@ struct llama_kv_cache { size_t size_k_bytes() const; size_t size_v_bytes() const; - void state_write(llama_io_write_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1) const; - void state_read (llama_io_read_i & io, const llama_hparams & hparams, llama_seq_id seq_id = -1); + // graph build API + + virtual void build_shift( + ggml_context * ctx0, + ggml_cgraph * gf, + llama_graph_i * lgf) override; + + virtual void build_defrag( + ggml_context * ctx0, + ggml_cgraph * gf, + int32_t max_nodes, + bool v_trans) override; + + // state save/load + + void state_write(llama_io_write_i & io, llama_seq_id seq_id = -1) const; + void state_read (llama_io_read_i & io, llama_seq_id seq_id = -1); + + // members + + const llama_hparams & hparams; + + bool has_shift = false; + bool do_defrag = false; + bool recurrent = false; // with recurrent state models, a cell can hold the state for more than one past token + bool v_trans = true; // the value tensor is transposed + bool can_shift = false; + + // Note: The value of head isn't only used to optimize searching + // for a free KV slot. llama_decode_impl also uses it, so it + // cannot be freely changed after a slot has been allocated. + uint32_t head = 0; + uint32_t size = 0; + uint32_t used = 0; // used cells (i.e. 
at least one seq_id) + + // computed before each graph build + uint32_t n = 0; + + std::vector cells; + + std::vector k_l; // per layer + std::vector v_l; private: ggml_type type_k = GGML_TYPE_F16; @@ -126,10 +148,10 @@ private: std::vector bufs; void state_write_meta(llama_io_write_i & io, const std::vector> & cell_ranges, llama_seq_id seq_id = -1) const; - void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges, const llama_hparams & hparams) const; + void state_write_data(llama_io_write_i & io, const std::vector> & cell_ranges) const; bool state_read_meta(llama_io_read_i & io, uint32_t cell_count, llama_seq_id dest_seq_id = -1); - bool state_read_data(llama_io_read_i & io, const llama_hparams & hparams, uint32_t cell_count); + bool state_read_data(llama_io_read_i & io, uint32_t cell_count); }; // diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 350dfd89c..09fd63f61 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -3579,8 +3579,8 @@ size_t llama_model::size() const { return pimpl->n_bytes; } -size_t llama_model::max_nodes() const { - return std::max(8192, tensors_by_name.size()*5); +size_t llama_model::n_tensors() const { + return tensors_by_name.size(); } size_t llama_model::n_devices() const { @@ -3900,6 +3900,38 @@ struct llm_build_context { return inpL; } + // TODO: tmp + struct ggml_tensor * build_inp_pos() { + ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); + cb(cur, "inp_pos", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_out_ids() { + ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); + cb(cur, "inp_out_ids", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_mean() { + ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); + cb(cur, "inp_mean", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_cls() { + ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); + cb(cur, "inp_cls", -1); + + return cur; + } + // TODO: tmp struct ggml_tensor * build_lora_mm( struct ggml_tensor * w, @@ -3915,6 +3947,22 @@ struct llm_build_context { return lgf->build_lora_mm_id(ctx0, w, cur, ids); } + // TODO: tmp + struct ggml_tensor * build_inp_embd_enc() { + ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); + cb(cur, "embd_enc", -1); + + return cur; + } + + // TODO: tmp + struct ggml_tensor * build_inp_KQ_mask_cross() { + ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); + cb(cur, "KQ_mask_cross", -1); + + return cur; + } + struct ggml_tensor * build_norm( struct ggml_tensor * cur, struct ggml_tensor * mw, @@ -4195,7 +4243,7 @@ struct llm_build_context { } struct ggml_tensor * build_attn( - struct ggml_cgraph * graph, + struct ggml_cgraph * gf, struct ggml_tensor * wo, struct ggml_tensor * wo_b, struct ggml_tensor * k_cur, @@ -4206,17 +4254,17 @@ struct llm_build_context { int il) { // these nodes are added to the graph together so that they are not reordered // by doing so, the number of splits in the graph is reduced - ggml_build_forward_expand(graph, q_cur); - ggml_build_forward_expand(graph, k_cur); - ggml_build_forward_expand(graph, v_cur); + ggml_build_forward_expand(gf, q_cur); + ggml_build_forward_expand(gf, k_cur); + ggml_build_forward_expand(gf, v_cur); - //build_kv_store(graph, k_cur, v_cur, il); - lgf->build_attn_kv_store(ctx0, graph, k_cur, v_cur, n_tokens, il, worst_case); + //build_kv_store(gf, k_cur, v_cur, il); + lgf->build_attn_kv_store(ctx0, gf, k_cur, v_cur, n_tokens, il, 
worst_case); struct ggml_tensor * cur; - //cur = build_kqv(graph, wo, wo_b, q_cur, kq_mask, kq_scale, il); - cur = lgf->build_attn_qkv(ctx0, graph, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); + //cur = build_kqv(gf, wo, wo_b, q_cur, kq_mask, kq_scale, il); + cur = lgf->build_attn_qkv(ctx0, gf, wo, wo_b, q_cur, n_tokens, kq_scale, il, worst_case); cb(cur, "kqv_out", il); return cur; @@ -4251,34 +4299,6 @@ struct llm_build_context { return cur; } - struct ggml_tensor * build_inp_pos() { - ggml_tensor * cur = lgf->build_inp_pos(ctx0, n_tokens); - cb(cur, "inp_pos", -1); - - return cur; - } - - struct ggml_tensor * build_inp_out_ids() { - ggml_tensor * cur = lgf->build_inp_out_ids(ctx0, n_tokens, worst_case); - cb(cur, "inp_out_ids", -1); - - return cur; - } - - struct ggml_tensor * build_inp_mean() { - ggml_tensor * cur = lgf->build_inp_mean(ctx0, n_tokens); - cb(cur, "inp_mean", -1); - - return cur; - } - - struct ggml_tensor * build_inp_cls() { - ggml_tensor * cur = lgf->build_inp_cls(ctx0, n_tokens); - cb(cur, "inp_cls", -1); - - return cur; - } - void append_pooling(struct ggml_cgraph * gf) { struct ggml_tensor * inp = res.t_embd; @@ -4377,20 +4397,6 @@ struct llm_build_context { // return pos_bias; //} - struct ggml_tensor * build_inp_embd_enc() { - ggml_tensor * cur = lgf->build_inp_embd_enc(ctx0, n_tokens, worst_case); - cb(cur, "embd_enc", -1); - - return cur; - } - - struct ggml_tensor * build_inp_KQ_mask_cross() { - ggml_tensor * cur = lgf->build_inp_KQ_mask_cross(ctx0, n_tokens, worst_case); - cb(cur, "KQ_mask_cross", -1); - - return cur; - } - void build_llama(ggml_cgraph * gf) { const int64_t n_embd_head = hparams.n_embd_head_v; @@ -10936,16 +10942,13 @@ struct llm_build_context { llama_graph_result llama_model::build_graph( ggml_context * ctx, + ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch, - bool worst_case) const { + bool worst_case) const { struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case); - auto & gf = llm.res.gf; - - gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false); - switch (arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_MINICPM: diff --git a/src/llama-model.h b/src/llama-model.h index 2a9fca7d4..94e762294 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -353,7 +353,7 @@ struct llama_model { std::string desc() const; size_t size() const; - size_t max_nodes() const; + size_t n_tensors() const; size_t n_devices() const; // total number of parameters in the model @@ -371,6 +371,7 @@ struct llama_model { // TODO: add encode/decode graphs llama_graph_result build_graph( ggml_context * ctx, + ggml_cgraph * gf, llama_graph_i * lgf, const llama_cparams & cparams, const llama_ubatch & ubatch,
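
// --- Illustration (not part of the patch) -------------------------------------------
// The cell-move planning inside llama_kv_cache::build_defrag (moved in this patch from
// llama_context_kv_self::build_kv_self_defrag) is hard to follow in diff form. Below is
// a standalone, simplified sketch of just that planning step. Assumptions: `kv_cell` and
// `plan_defrag` are illustrative stand-ins, not names from the patch; the real code also
// updates `head` and then builds the ggml view/copy nodes from the returned `ids`; and
// `max_moves` corresponds to (max_nodes - 2*n_layer)/(6*n_layer) as computed in build_defrag.
#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

struct kv_cell {
    std::set<int32_t> seq_id;                       // sequences stored in this cell
    bool is_empty() const { return seq_id.empty(); }
};

// cell i moves to ids[i]; ids[i] == i or ids[i] == n_kv means "cell i is not moved"
static std::vector<uint32_t> plan_defrag(std::vector<kv_cell> & cells, uint32_t n_used, uint32_t max_moves) {
    const uint32_t n_kv = (uint32_t) cells.size();

    std::vector<uint32_t> ids(n_kv, n_kv);

    uint32_t n_moves = 0; // contiguous blocks moved; each costs 6 tensors per layer (see build_defrag)

    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
        if (!cells[i0].is_empty()) {
            ids[i0] = i0; // already in place
            continue;
        }

        // found a hole - determine its size
        uint32_t nh = 1;
        while (i0 + nh < n_used && cells[i0 + nh].is_empty()) {
            nh++;
        }

        // starting from the end of the cache, find nh non-empty cells that are not yet moved
        uint32_t nf = 0;
        uint32_t is = n_kv - 1;
        for (; is > i0; --is) {
            if (cells[is].is_empty() || ids[is] != n_kv) {
                continue;
            }
            if (++nf == nh) {
                break;
            }
        }
        assert(nf == nh && "defrag planning bug: n_used is not accurate");

        // walk forward from `is` and assign those cells into the hole [i0, i0 + nh)
        nf = 0;
        bool cont = false; // are we extending a contiguous block of moves?
        bool stop = false;
        for (uint32_t i1 = is; i1 < n_kv; ++i1) {
            if (cells[i1].is_empty() || ids[i1] != n_kv) {
                if (n_moves == max_moves) {
                    stop = true;
                    break;
                }
                cont = false;
                continue;
            }

            ids[i1] = i0 + nf;          // this cell goes to (i0 + nf)
            cells[i0 + nf] = cells[i1]; // move the cell meta data
            cells[i1] = kv_cell();      // clear the old cell

            if (!cont) {
                n_moves++; // one source/destination view + copy pair per contiguous block, per layer
                cont = true;
            }

            if (++nf == nh) {
                break;
            }
        }

        if (stop || n_moves == max_moves) {
            break;
        }

        i0 += nh - 1; // skip past the hole we just filled
    }

    return ids;
}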