From 5eae8e5183f80a8b669757bde7b26cec05923081 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 12 Feb 2025 13:32:02 +0200
Subject: [PATCH] context : move build_rope_factors to base class

ggml-ci
---
 src/llama-context.cpp | 172 +++++++++++++++++++++---------------------
 src/llama-context.h   |  19 +++--
 src/llama.cpp         |  14 ++--
 3 files changed, 104 insertions(+), 101 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index f0d8bdaba..b29c98af6 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -57,6 +57,10 @@ uint32_t llama_context::n_ctx() const {
     return cparams.n_ctx;
 }
 
+uint32_t llama_context::n_ctx_per_seq() const {
+    return cparams.n_ctx / cparams.n_seq_max;
+}
+
 uint32_t llama_context::n_batch() const {
     return cparams.n_batch;
 }
@@ -122,8 +126,8 @@ void llama_context::synchronize() {
 }
 
 void llama_context::attach_threadpool(
-        ggml_threadpool_t   threadpool,
-        ggml_threadpool_t   threadpool_batch) {
+        ggml_threadpool_t threadpool,
+        ggml_threadpool_t threadpool_batch) {
     this->threadpool       = threadpool;
     this->threadpool_batch = threadpool_batch ? threadpool_batch : threadpool;
 }
@@ -202,6 +206,86 @@ llama_perf_context_data llama_context::perf_get_data() const {
     return data;
 }
 
+ggml_tensor * llama_context::build_cvec(
+        ggml_context * ctx0,
+        ggml_tensor * cur,
+        int il) {
+    return cvec.apply_to(ctx0, cur, il);
+}
+
+ggml_tensor * llama_context::build_lora_mm(
+        ggml_context * ctx0,
+        ggml_tensor * w,
+        ggml_tensor * cur) {
+    struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
+
+    for (const auto & lora : loras) {
+        struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
+        if (lw == nullptr) {
+            continue;
+        }
+
+        const float adapter_scale = lora.second;
+        const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
+
+        struct ggml_tensor * ab_cur = ggml_mul_mat(
+            ctx0, lw->b,
+            ggml_mul_mat(ctx0, lw->a, cur)
+        );
+
+        ab_cur = ggml_scale(ctx0, ab_cur, scale);
+        res = ggml_add(ctx0, res, ab_cur);
+    }
+
+    return res;
+}
+
+ggml_tensor * llama_context::build_lora_mm_id(
+        ggml_context * ctx0,
+        ggml_tensor * w,
+        ggml_tensor * cur,
+        ggml_tensor * ids) {
+    struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
+    for (const auto & lora : loras) {
+        struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
+        if (lw == nullptr) {
+            continue;
+        }
+
+        const float alpha = lora.first->alpha;
+        const float rank = (float) lw->b->ne[0];
+        const float scale = alpha ? lora.second * alpha / rank : lora.second;
+
+        struct ggml_tensor * ab_cur = ggml_mul_mat_id(
+            ctx0, lw->b,
+            ggml_mul_mat_id(ctx0, lw->a, cur, ids),
+            ids
+        );
+
+        ab_cur = ggml_scale(ctx0, ab_cur, scale);
+        res = ggml_add(ctx0, res, ab_cur);
+    }
+
+    return res;
+}
+
+ggml_tensor * llama_context::build_rope_factors(int il) {
+    const auto & hparams = model.hparams;
+
+    // choose long/short freq factors based on the context size
+    const auto n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
+
+    if (model.layers[il].rope_freqs != nullptr) {
+        return model.layers[il].rope_freqs;
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+        return model.layers[il].rope_long;
+    }
+
+    return model.layers[il].rope_short;
+}
+
 void llama_context::perf_reset() {
     t_start_us = ggml_time_us();
     t_eval_us = n_eval = 0;
@@ -217,7 +301,7 @@ llama_context_unified::llama_context_unified(
         const llama_context_params & params,
         build_graph_callback && cb_build_graph) :
     llama_context(model),
-    cb_build_graph(std::move(cb_build_graph)){
+    cb_build_graph(std::move(cb_build_graph)) {
 
     const auto & hparams = model.hparams;
 
@@ -1825,69 +1909,6 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) {
     return n_outputs_max;
 }
 
-ggml_tensor * llama_context::build_cvec(
-        ggml_context * ctx0,
-        ggml_tensor * cur,
-        int il) {
-    return cvec.apply_to(ctx0, cur, il);
-}
-
-ggml_tensor * llama_context::build_lora_mm(
-        ggml_context * ctx0,
-        ggml_tensor * w,
-        ggml_tensor * cur) {
-    struct ggml_tensor * res = ggml_mul_mat(ctx0, w, cur);
-
-    for (const auto & lora : loras) {
-        struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
-        if (lw == nullptr) {
-            continue;
-        }
-
-        const float adapter_scale = lora.second;
-        const float scale = lw->get_scale(lora.first->alpha, adapter_scale);
-
-        struct ggml_tensor * ab_cur = ggml_mul_mat(
-            ctx0, lw->b,
-            ggml_mul_mat(ctx0, lw->a, cur)
-        );
-
-        ab_cur = ggml_scale(ctx0, ab_cur, scale);
-        res = ggml_add(ctx0, res, ab_cur);
-    }
-
-    return res;
-}
-
-ggml_tensor * llama_context::build_lora_mm_id(
-        ggml_context * ctx0,
-        ggml_tensor * w,
-        ggml_tensor * cur,
-        ggml_tensor * ids) {
-    struct ggml_tensor * res = ggml_mul_mat_id(ctx0, w, cur, ids);
-    for (const auto & lora : loras) {
-        struct llama_adapter_lora_weight * lw = lora.first->get_weight(w);
-        if (lw == nullptr) {
-            continue;
-        }
-
-        const float alpha = lora.first->alpha;
-        const float rank = (float) lw->b->ne[0];
-        const float scale = alpha ? lora.second * alpha / rank : lora.second;
-
-        struct ggml_tensor * ab_cur = ggml_mul_mat_id(
-            ctx0, lw->b,
-            ggml_mul_mat_id(ctx0, lw->a, cur, ids),
-            ids
-        );
-
-        ab_cur = ggml_scale(ctx0, ab_cur, scale);
-        res = ggml_add(ctx0, res, ab_cur);
-    }
-
-    return res;
-}
-
 void llama_context_unified::kv_self_update() {
     auto & kv = kv_self;
 
@@ -2189,23 +2210,6 @@ ggml_tensor * llama_context_unified::build_soft_max_ext(
     return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias);
 }
 
-ggml_tensor * llama_context_unified::get_rope_factors(int il) {
-    const auto & hparams = model.hparams;
-
-    // choose long/short freq factors based on the context size
-    const auto n_ctx_pre_seq = cparams.n_ctx / cparams.n_seq_max;
-
-    if (model.layers[il].rope_freqs != nullptr) {
-        return model.layers[il].rope_freqs;
-    }
-
-    if (n_ctx_pre_seq > hparams.n_ctx_orig_yarn) {
-        return model.layers[il].rope_long;
-    }
-
-    return model.layers[il].rope_short;
-}
-
 ggml_tensor * llama_context_unified::build_inp_embd(
         ggml_context * ctx0,
         ggml_tensor * tok_embd,
@@ -2327,7 +2331,7 @@ void llama_context_unified::build_k_shift(
         const int64_t n_head_kv    = hparams.n_head_kv(il);
         const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(il);
 
-        struct ggml_tensor * rope_factors = get_rope_factors(il);
+        struct ggml_tensor * rope_factors = build_rope_factors(il);
 
         struct ggml_tensor * k =
             ggml_view_3d(ctx0, kv_self.k_l[il],
diff --git a/src/llama-context.h b/src/llama-context.h
index 8ec7d3e2b..dd1030388 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -23,10 +23,11 @@ struct llama_context {
     const llama_model & get_model() const;
     const llama_cparams & get_cparams() const;
 
-    virtual uint32_t n_ctx()     const;
-    virtual uint32_t n_batch()   const;
-    virtual uint32_t n_ubatch()  const;
-    virtual uint32_t n_seq_max() const = 0;
+    virtual uint32_t n_ctx()         const;
+    virtual uint32_t n_ctx_per_seq() const;
+    virtual uint32_t n_batch()       const;
+    virtual uint32_t n_ubatch()      const;
+    virtual uint32_t n_seq_max()     const = 0;
 
     virtual uint32_t n_threads() const;
     virtual uint32_t n_threads_batch() const;
@@ -126,6 +127,8 @@ struct llama_context {
             ggml_tensor * cur, // struct ggml_tensor * b
             ggml_tensor * ids);
 
+    virtual ggml_tensor * build_rope_factors(int il);
+
     // graph build API (context-specific)
 
     virtual ggml_tensor * build_inp_embd(
@@ -182,8 +185,6 @@ struct llama_context {
             ggml_tensor * kq,
             float kq_scale) = 0;
 
-    virtual ggml_tensor * get_rope_factors(int il) = 0;
-
     virtual void build_k_shift(
             ggml_context * ctx0,
             ggml_cgraph * graph) = 0;
@@ -342,7 +343,7 @@ class llama_context_unified : public llama_context {
 public:
     struct batch_manager;
 
-    // TODO: tmp until llama-model starts implementing the graph build function
+    // TODO: tmp until llama_model starts implementing the graph build function
     typedef std::function build_graph_callback;
 
     llama_context_unified(
@@ -496,8 +497,6 @@ public:
             ggml_tensor * kq,
             float kq_scale) override;
 
-    virtual ggml_tensor * get_rope_factors(int il) override;
-
     virtual void build_k_shift(
             ggml_context * ctx0,
             ggml_cgraph * graph) override;
@@ -601,7 +600,7 @@ public:
     virtual size_t state_get_data(      uint8_t * dst, size_t size) override;
     virtual size_t state_set_data(const uint8_t * src, size_t size) override;
 
-    virtual size_t state_seq_get_size(llama_seq_id seq_id) override;
+    virtual size_t state_seq_get_size(llama_seq_id seq_id)                                   override;
     virtual size_t state_seq_get_data(llama_seq_id seq_id,       uint8_t * dst, size_t size) override;
     virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) override;
 
diff --git a/src/llama.cpp b/src/llama.cpp
index c568f8d15..9e37b0cd4 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -685,7 +685,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -857,7 +857,7 @@ struct llm_build_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -2999,7 +2999,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 struct ggml_tensor* attn_norm_output = build_norm(inpL,
                         model.layers[il].attn_norm,
@@ -3706,7 +3706,7 @@ struct llm_build_context {
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;
 
-            struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+            struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
             // norm
             cur = build_norm(inpL,
                     model.layers[il].attn_norm, NULL,
@@ -4480,7 +4480,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -5373,7 +5373,7 @@ struct llm_build_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -6572,7 +6572,7 @@ struct llm_build_context {
             // self-attention
            {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                struct ggml_tensor * rope_factors = lctx.get_rope_factors(il);
+                struct ggml_tensor * rope_factors = lctx.build_rope_factors(il);
 
                 // compute Q and K and RoPE them
                 struct ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
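
Note (illustration only, not part of the patch): build_rope_factors(), now in the llama_context base class, prefers a layer's explicit rope_freqs tensor and otherwise picks the long or short frequency factors by comparing the per-sequence context (n_ctx / n_seq_max) against hparams.n_ctx_orig_yarn. A minimal standalone C++ sketch of that selection, using stand-in fields instead of the real llama.cpp types:

    #include <cstdint>
    #include <cstdio>

    // Stand-in for the per-layer rope tensors referenced in the patch
    // (model.layers[il].rope_freqs / rope_long / rope_short).
    struct layer_rope_factors {
        const char * rope_freqs; // explicit per-layer factors, may be null
        const char * rope_long;  // long-context factors
        const char * rope_short; // short-context factors
    };

    // Mirrors the branch structure of llama_context::build_rope_factors(il).
    static const char * pick_rope_factors(const layer_rope_factors & layer,
            uint32_t n_ctx, uint32_t n_seq_max, uint32_t n_ctx_orig_yarn) {
        const uint32_t n_ctx_per_seq = n_ctx / n_seq_max;

        if (layer.rope_freqs != nullptr) {
            return layer.rope_freqs;
        }
        return n_ctx_per_seq > n_ctx_orig_yarn ? layer.rope_long : layer.rope_short;
    }

    int main() {
        const layer_rope_factors layer = { nullptr, "rope_long", "rope_short" };
        // 131072-token context on one sequence vs. an 8192-token original YaRN context
        printf("%s\n", pick_rope_factors(layer, 131072, 1, 8192)); // prints "rope_long"
        printf("%s\n", pick_rope_factors(layer,   4096, 1, 8192)); // prints "rope_short"
    }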
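
A second illustration (also not part of the patch): in build_lora_mm_id() the LoRA delta B*(A*x) is scaled by adapter_scale * alpha / rank when the adapter stores a non-zero alpha, and by adapter_scale alone otherwise, where rank is taken from lw->b->ne[0]. The scale computation in isolation, with plain floats instead of ggml tensors:

    #include <cstdio>

    // 'adapter_scale' corresponds to lora.second and 'rank' to lw->b->ne[0] in the patch.
    static float lora_scale(float adapter_scale, float alpha, float rank) {
        // alpha == 0 means no alpha was stored; fall back to the raw adapter scale
        return alpha != 0.0f ? adapter_scale * alpha / rank : adapter_scale;
    }

    int main() {
        printf("%.2f\n", lora_scale(1.0f, 16.0f, 8.0f)); // 2.00 (alpha/rank = 2)
        printf("%.2f\n", lora_scale(0.5f,  0.0f, 8.0f)); // 0.50 (no alpha: raw scale)
    }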