diff --git a/src/llama-adapter.cpp b/src/llama-adapter.cpp index 8a0800463..3ce36886c 100644 --- a/src/llama-adapter.cpp +++ b/src/llama-adapter.cpp @@ -91,7 +91,7 @@ bool llama_adapter_cvec::init(const llama_model & model) { return true; } -int32_t llama_adapter_cvec::apply( +bool llama_adapter_cvec::apply( const llama_model & model, const float * data, size_t len, @@ -104,17 +104,17 @@ int32_t llama_adapter_cvec::apply( // disable the current control vector (but leave allocated for later) layer_start = -1; layer_end = -1; - return 0; + return true; } if (n_embd != (int) hparams.n_embd) { LLAMA_LOG_ERROR("%s: control vector n_embd does not match model\n", __func__); - return 1; + return false; } if (tensors.empty()) { if (!init(model)) { - return 1; + return false; } } @@ -130,7 +130,7 @@ int32_t llama_adapter_cvec::apply( } } - return 0; + return true; } // lora diff --git a/src/llama-adapter.h b/src/llama-adapter.h index 603fa08f6..4332ccd57 100644 --- a/src/llama-adapter.h +++ b/src/llama-adapter.h @@ -19,7 +19,7 @@ struct llama_adapter_cvec { struct ggml_tensor * apply_to(struct ggml_context * ctx, struct ggml_tensor * cur, int il) const; - int32_t apply( + bool apply( const llama_model & model, const float * data, size_t len, diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 4e02f155b..353fc7fea 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -33,7 +33,9 @@ static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t return relative_bucket; } +// // llama_context +// llama_context::llama_context(const llama_model & model) : model (model), @@ -43,6 +45,52 @@ llama_context::llama_context(const llama_model & model) : llama_context::~llama_context() = default; +const llama_model & llama_context::get_model() const { + return model; +} + +const llama_cparams & llama_context::get_cparams() const { + return cparams; +} + +uint32_t llama_context::n_ctx() const { + return cparams.n_ctx; +} + +uint32_t llama_context::n_batch() const { + return cparams.n_batch; +} + +uint32_t llama_context::n_ubatch() const { + return cparams.n_ubatch; +} + +uint32_t llama_context::n_threads() const { + return cparams.n_threads; +} + +uint32_t llama_context::n_threads_batch() const { + return cparams.n_threads_batch; +} + +enum llama_pooling_type llama_context::pooling_type() const { + return cparams.pooling_type; +} + +int64_t llama_context::n_pos_per_token() const { + return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; +} + +ggml_context_ptr llama_context::init() { + struct ggml_init_params params = { + /*.mem_size =*/ buf_compute_meta.size(), + /*.mem_buffer =*/ buf_compute_meta.data(), + /*.no_alloc =*/ true, + }; + + return ggml_context_ptr { ggml_init(params) }; +} + void llama_context::synchronize() { ggml_backend_sched_synchronize(sched.get()); @@ -73,21 +121,96 @@ void llama_context::synchronize() { t_compute_start_us = 0; } -int64_t llama_context::n_pos_per_token() const { - return model.arch == LLM_ARCH_QWEN2VL ? 4 : 1; +void llama_context::attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + this->threadpool = threadpool; + this->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; } -ggml_context_ptr llama_context::init() { - struct ggml_init_params params = { - /*.mem_size =*/ buf_compute_meta.size(), - /*.mem_buffer =*/ buf_compute_meta.data(), - /*.no_alloc =*/ true, - }; - - return ggml_context_ptr { ggml_init(params) }; +void llama_context::detach_threadpool() { + this->threadpool = nullptr; + this->threadpool_batch = nullptr; } +void llama_context::set_n_threads(int32_t n_threads, int32_t n_threads_batch) { + cparams.n_threads = n_threads; + cparams.n_threads_batch = n_threads_batch; +} + +void llama_context::set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data) { + this->abort_callback = abort_callback; + this->abort_callback_data = abort_callback_data; + + for (auto & backend : backends) { + auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); + auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); + if (set_abort_callback_fn) { + set_abort_callback_fn(backend.get(), this->abort_callback, this->abort_callback_data); + } + } +} + +void llama_context::set_embeddings(bool value) { + cparams.embeddings = value; +} + +void llama_context::set_causal_attn(bool value) { + cparams.causal_attn = value; +} + +void llama_context::set_adapter_lora( + struct llama_adapter_lora * adapter, + float scale) { + loras[adapter] = scale; +} + +bool llama_context::rm_adapter_lora( + struct llama_adapter_lora * adapter) { + auto pos = loras.find(adapter); + if (pos != loras.end()) { + loras.erase(pos); + return true; + } + + return false; +} + +void llama_context::clear_adapter_lora() { + loras.clear(); +} + +bool llama_context::apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end) { + return cvec.apply(model, data, len, n_embd, il_start, il_end); +} + +llama_perf_context_data llama_context::get_perf() const { + llama_perf_context_data data = {}; + + data.t_start_ms = 1e-3 * t_start_us; + data.t_load_ms = 1e-3 * t_load_us; + data.t_p_eval_ms = 1e-3 * t_p_eval_us; + data.t_eval_ms = 1e-3 * t_eval_us; + data.n_p_eval = std::max(1, n_p_eval); + data.n_eval = std::max(1, n_eval); + + return data; +} + +void llama_context::perf_reset() { + t_start_us = ggml_time_us(); + t_eval_us = n_eval = 0; + t_p_eval_us = n_p_eval = 0; +} + +// // llama_context_unified +// llama_context_unified::llama_context_unified( const llama_model & model, @@ -396,18 +519,6 @@ llama_context_unified::llama_context_unified( llama_context_unified::~llama_context_unified() = default; -uint32_t llama_context_unified::n_ctx() const { - return cparams.n_ctx; -} - -uint32_t llama_context_unified::n_batch() const { - return cparams.n_batch; -} - -uint32_t llama_context_unified::n_ubatch() const { - return cparams.n_ubatch; -} - uint32_t llama_context_unified::n_seq_max() const { // TODO: add notion of n_seq_max to llama_kv_cache and use it here return kv_self.size; @@ -421,10 +532,6 @@ const llama_kv_cache * llama_context_unified::get_kv_self() const { return &kv_self; } -enum llama_pooling_type llama_context_unified::pooling_type() const { - return cparams.pooling_type; -} - float * llama_context_unified::get_logits() { // reorder logits for backward compatibility reorder_outputs(); @@ -1718,7 +1825,13 @@ size_t llama_context_unified::reserve_outputs(size_t n_outputs) { return n_outputs_max; } -// do mat_mul, while optionally apply lora +ggml_tensor * llama_context::build_cvec( + 
ggml_context * ctx0, + ggml_tensor * cur, + int il) { + return cvec.apply_to(ctx0, cur, il); +} + ggml_tensor * llama_context::build_lora_mm( ggml_context * ctx0, ggml_tensor * w, @@ -1746,7 +1859,6 @@ ggml_tensor * llama_context::build_lora_mm( return res; } -// do mat_mul_id, while optionally apply lora ggml_tensor * llama_context::build_lora_mm_id( ggml_context * ctx0, ggml_tensor * w, @@ -2994,7 +3106,8 @@ struct llama_data_write { } void write_model_info() { - const std::string arch_str = llm_arch_name(ctx->model.arch); + const auto & model = ctx->get_model(); + const std::string arch_str = llm_arch_name(model.arch); write_string(arch_str); // TODO: add more model-specific info which should prevent loading the session file if not identical } @@ -3015,7 +3128,7 @@ struct llama_data_write { std::vector output_pos; - const size_t n_batch = ctx->cparams.n_batch; + const size_t n_batch = ctx->n_batch(); const auto & output_ids = ctx->output_ids; GGML_ASSERT(n_outputs <= ctx->output_size); @@ -3040,7 +3153,9 @@ struct llama_data_write { } void write_logits() { - const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * ctx->model.vocab.n_tokens()); + const auto & model = ctx->get_model(); + + const uint64_t logits_size = std::min((uint64_t) ctx->logits_size, (uint64_t) ctx->n_outputs * model.vocab.n_tokens()); write(&logits_size, sizeof(logits_size)); @@ -3050,7 +3165,9 @@ struct llama_data_write { } void write_embeddings() { - const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * ctx->model.hparams.n_embd); + const auto & model = ctx->get_model(); + + const uint64_t embeddings_size = std::min((uint64_t) ctx->embd_size, (uint64_t) ctx->n_outputs * model.hparams.n_embd); write(&embeddings_size, sizeof(embeddings_size)); @@ -3079,7 +3196,9 @@ struct llama_data_read { // validate model information void read_model_info() { - const std::string cur_arch_str = llm_arch_name(ctx->model.arch); + const auto & model = ctx->get_model(); + + const std::string cur_arch_str = llm_arch_name(model.arch); std::string arch_str; read_string(arch_str); @@ -3117,8 +3236,8 @@ struct llama_data_read { for (int32_t i = 0; i < (int32_t) output_pos.size(); ++i) { int32_t id = output_pos[i]; - if ((uint32_t) id >= ctx->cparams.n_batch) { - throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->cparams.n_batch)); + if ((uint32_t) id >= ctx->n_batch()) { + throw std::runtime_error(format("invalid output id, %d does not fit in batch size of %u", id, ctx->n_batch())); } ctx->output_ids[id] = i; } @@ -3598,7 +3717,7 @@ uint32_t llama_n_seq_max(const struct llama_context * ctx) { } const llama_model * llama_get_model(const llama_context * ctx) { - return &ctx->model; + return &ctx->get_model(); } llama_kv_cache * llama_get_kv_self(llama_context * ctx) { @@ -3614,50 +3733,38 @@ enum llama_pooling_type llama_pooling_type(const llama_context * ctx) { } void llama_attach_threadpool( - struct llama_context * ctx, - ggml_threadpool_t threadpool, - ggml_threadpool_t threadpool_batch) { - ctx->threadpool = threadpool; - ctx->threadpool_batch = threadpool_batch ? 
threadpool_batch : threadpool; + struct llama_context * ctx, + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch) { + ctx->attach_threadpool(threadpool, threadpool_batch); } void llama_detach_threadpool(struct llama_context * ctx) { - ctx->threadpool = nullptr; - ctx->threadpool_batch = nullptr; + ctx->detach_threadpool(); } void llama_set_n_threads(struct llama_context * ctx, int32_t n_threads, int32_t n_threads_batch) { - ctx->cparams.n_threads = n_threads; - ctx->cparams.n_threads_batch = n_threads_batch; + ctx->set_n_threads(n_threads, n_threads_batch); } int32_t llama_n_threads(struct llama_context * ctx) { - return ctx->cparams.n_threads; + return ctx->n_threads(); } int32_t llama_n_threads_batch(struct llama_context * ctx) { - return ctx->cparams.n_threads_batch; + return ctx->n_threads_batch(); } void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { - ctx->abort_callback = abort_callback; - ctx->abort_callback_data = abort_callback_data; - - for (auto & backend : ctx->backends) { - auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get())); - auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback"); - if (set_abort_callback_fn) { - set_abort_callback_fn(backend.get(), ctx->abort_callback, ctx->abort_callback_data); - } - } + ctx->set_abort_callback(abort_callback, abort_callback_data); } void llama_set_embeddings(struct llama_context * ctx, bool embeddings) { - ctx->cparams.embeddings = embeddings; + ctx->set_embeddings(embeddings); } void llama_set_causal_attn(struct llama_context * ctx, bool causal_attn) { - ctx->cparams.causal_attn = causal_attn; + ctx->set_causal_attn(causal_attn); } void llama_synchronize(struct llama_context * ctx) { @@ -3700,24 +3807,21 @@ int32_t llama_set_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter, float scale) { - ctx->loras[adapter] = scale; + ctx->set_adapter_lora(adapter, scale); + return 0; } int32_t llama_rm_adapter_lora( struct llama_context * ctx, struct llama_adapter_lora * adapter) { - auto pos = ctx->loras.find(adapter); - if (pos != ctx->loras.end()) { - ctx->loras.erase(pos); - return 0; - } + bool res = ctx->rm_adapter_lora(adapter); - return -1; + return res ? 0 : -1; } void llama_clear_adapter_lora(struct llama_context * ctx) { - ctx->loras.clear(); + ctx->clear_adapter_lora(); } int32_t llama_apply_adapter_cvec( @@ -3727,7 +3831,9 @@ int32_t llama_apply_adapter_cvec( int32_t n_embd, int32_t il_start, int32_t il_end) { - return ctx->cvec.apply(ctx->model, data, len, n_embd, il_start, il_end); + bool res = ctx->apply_adapter_cvec(data, len, n_embd, il_start, il_end); + + return res ? 
0 : -1; } // @@ -4008,5 +4114,5 @@ int32_t llama_decode( const std::vector> & llama_internal_get_tensor_map( struct llama_context * ctx ) { - return ctx->model.tensors_by_name; + return ctx->get_model().tensors_by_name; } diff --git a/src/llama-context.h b/src/llama-context.h index ac842dc8b..7b7699952 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -20,19 +20,23 @@ struct llama_context { llama_context(const llama_model & model); virtual ~llama_context(); - virtual void synchronize(); + const llama_model & get_model() const; + const llama_cparams & get_cparams() const; - virtual uint32_t n_ctx() const = 0; - virtual uint32_t n_batch() const = 0; - virtual uint32_t n_ubatch() const = 0; + virtual uint32_t n_ctx() const; + virtual uint32_t n_batch() const; + virtual uint32_t n_ubatch() const; virtual uint32_t n_seq_max() const = 0; + virtual uint32_t n_threads() const; + virtual uint32_t n_threads_batch() const; + virtual llama_kv_cache * get_kv_self() = 0; virtual const llama_kv_cache * get_kv_self() const = 0; virtual void kv_self_update() = 0; - virtual enum llama_pooling_type pooling_type() const = 0; + virtual enum llama_pooling_type pooling_type() const; virtual float * get_logits() = 0; virtual float * get_logits_ith(int32_t i) = 0; @@ -41,10 +45,41 @@ struct llama_context { virtual float * get_embeddings_ith(int32_t i) = 0; virtual float * get_embeddings_seq(llama_seq_id seq_id) = 0; - int64_t n_pos_per_token() const; // vision + virtual int64_t n_pos_per_token() const; // vision virtual ggml_context_ptr init(); + virtual void synchronize(); + + virtual void attach_threadpool( + ggml_threadpool_t threadpool, + ggml_threadpool_t threadpool_batch); + + virtual void detach_threadpool(); + + virtual void set_n_threads(int32_t n_threads, int32_t n_threads_batch); + + virtual void set_abort_callback(bool (*abort_callback)(void * data), void * abort_callback_data); + + virtual void set_embeddings (bool value); + virtual void set_causal_attn(bool value); + + virtual void set_adapter_lora( + struct llama_adapter_lora * adapter, + float scale); + + virtual bool rm_adapter_lora( + struct llama_adapter_lora * adapter); + + virtual void clear_adapter_lora(); + + virtual bool apply_adapter_cvec( + const float * data, + size_t len, + int32_t n_embd, + int32_t il_start, + int32_t il_end); + // decode a batch of tokens by evaluating the transformer // in case of unsuccessful decoding (error or warning), // the kv_cache state will be returned to its original state @@ -73,6 +108,12 @@ struct llama_context { // graph build API (generic) + // apply control vector for layer il + virtual ggml_tensor * build_cvec( + ggml_context * ctx0, + ggml_tensor * cur, + int il); + // do mat_mul, while optionally apply lora virtual ggml_tensor * build_lora_mm( ggml_context * ctx0, @@ -221,11 +262,11 @@ struct llama_context { // state save/load - virtual size_t state_get_size() = 0; + virtual size_t state_get_size() = 0; virtual size_t state_get_data( uint8_t * dst, size_t size) = 0; virtual size_t state_set_data(const uint8_t * src, size_t size) = 0; - virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; + virtual size_t state_seq_get_size(llama_seq_id seq_id) = 0; virtual size_t state_seq_get_data(llama_seq_id seq_id, uint8_t * dst, size_t size) = 0; virtual size_t state_seq_set_data(llama_seq_id seq_id, const uint8_t * src, size_t size) = 0; @@ -253,8 +294,19 @@ struct llama_context { const llama_token * tokens, size_t n_token_count) = 0; + // perf + + virtual llama_perf_context_data get_perf() 
const; + virtual void perf_reset(); + // members + // TODO: temporary public until llama_context implements the graph build function + std::vector backends; + ggml_backend_t backend_cpu = nullptr; + ggml_backend_sched_ptr sched; + +protected: const llama_model & model; llama_cparams cparams; @@ -267,17 +319,11 @@ struct llama_context { ggml_abort_callback abort_callback = nullptr; void * abort_callback_data = nullptr; - std::vector backends; std::vector> set_n_threads_fns; - ggml_backend_t backend_cpu = nullptr; - - ggml_backend_sched_ptr sched; - // memory buffers used to evaluate the model std::vector buf_compute_meta; - // perf bool has_evaluated_once = false; mutable int64_t t_start_us; @@ -306,9 +352,6 @@ struct llama_context_unified : public llama_context { virtual ~llama_context_unified(); - virtual uint32_t n_ctx() const override; - virtual uint32_t n_batch() const override; - virtual uint32_t n_ubatch() const override; virtual uint32_t n_seq_max() const override; virtual llama_kv_cache * get_kv_self() override; @@ -316,8 +359,6 @@ struct llama_context_unified : public llama_context { virtual void kv_self_update() override; - virtual enum llama_pooling_type pooling_type() const override; - virtual float * get_logits() override; virtual float * get_logits_ith(int32_t i) override; diff --git a/src/llama.cpp b/src/llama.cpp index f623dd385..ab6b7f5d3 100644 --- a/src/llama.cpp +++ b/src/llama.cpp @@ -59,8 +59,6 @@ struct llm_build_context { const llama_hparams & hparams; const llama_cparams & cparams; const llama_ubatch & ubatch; - const llama_adapter_cvec & cvec; - const llama_loras & loras; const int64_t n_embd; const int64_t n_layer; @@ -105,12 +103,10 @@ struct llm_build_context { const llm_build_cb & cb, bool worst_case) : lctx (lctx), - model (lctx.model), + model (lctx.get_model()), hparams (model.hparams), - cparams (lctx.cparams), + cparams (lctx.get_cparams()), ubatch (ubatch), - cvec (lctx.cvec), - loras (lctx.loras), n_embd (hparams.n_embd), n_layer (hparams.n_layer), n_rot (hparams.n_rot), @@ -791,7 +787,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -947,7 +943,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1067,7 +1063,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1171,7 +1168,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1287,7 +1285,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1436,7 +1435,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1564,7 +1563,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); 
+ cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1670,7 +1669,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -1761,7 +1761,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2057,7 +2058,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2194,7 +2196,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2342,7 +2345,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2454,7 +2458,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2565,7 +2570,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2680,7 +2686,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2823,7 +2830,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -2944,7 +2952,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_output); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3083,7 +3092,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, residual, cur); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3190,7 +3200,8 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, sa_out); cur = ggml_add(ctx0, cur, inpL); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3296,7 +3307,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3406,7 +3418,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3521,7 +3534,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3638,7 +3652,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, 
il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3842,7 +3857,8 @@ struct llm_build_context { cb(cur, "hidden_scaled_ffn", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -3954,7 +3970,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, sa_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4077,7 +4094,8 @@ struct llm_build_context { cb(cur, "ffn_post_norm", -1); cur = ggml_add(ctx0, cur, sa_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4202,7 +4220,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4256,7 +4275,8 @@ struct llm_build_context { // residual cur = ggml_add(ctx0, cur, inpL); - cur = lctx.cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4397,7 +4417,8 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4527,7 +4548,8 @@ struct llm_build_context { // add together residual + FFN + self-attention cur = ggml_add(ctx0, cur, inpL); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4655,7 +4677,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4774,7 +4796,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -4899,7 +4921,8 @@ struct llm_build_context { cb(cur, "ffn_moe_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5024,7 +5047,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); inpL = cur; @@ -5137,7 +5161,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, attn_out); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5165,7 +5190,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5293,7 +5319,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_out); cb(cur, "ffn_out", il); - cur = cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5446,7 +5472,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = 
cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -5673,7 +5700,8 @@ struct llm_build_context { } cur = ggml_add(ctx0, cur, ffn_inp); - cur = cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6492,7 +6520,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6614,7 +6642,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6704,7 +6732,7 @@ struct llm_build_context { cur = ggml_scale(ctx0, cur, 0.5F); } - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6787,7 +6815,8 @@ struct llm_build_context { cb(cur, "ffn_out", il); cur = ggml_add(ctx0, cur, ffn_inp); - cur = lctx.cvec.apply_to(ctx0, cur, il); + + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -6947,7 +6976,7 @@ struct llm_build_context { cur = ggml_add(ctx0, cur, ffn_inp); cb(cur, "ffn_out", il); - cur = lctx.cvec.apply_to(ctx0, cur, il); + cur = lctx.build_cvec(ctx0, cur, il); cb(cur, "l_out", il); // input for next layer @@ -7140,7 +7169,8 @@ static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_ubatch & ubatch, bool worst_case) { - const auto & model = lctx.model; + const auto & model = lctx.get_model(); + const auto & cparams = lctx.get_cparams(); // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) 
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { @@ -7150,7 +7180,7 @@ static struct ggml_cgraph * llama_build_graph( ggml_set_name(cur, name); } - if (!lctx.cparams.offload_kqv) { + if (!cparams.offload_kqv) { if (strcmp(name, "kqv_merged_cont") == 0) { // all nodes between the KV store and the attention output are run on the CPU ggml_backend_sched_set_tensor_backend(lctx.sched.get(), cur, lctx.backend_cpu); @@ -7159,10 +7189,10 @@ static struct ggml_cgraph * llama_build_graph( // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends // FIXME: fix in ggml_backend_sched - const bool full_offload = lctx.model.params.n_gpu_layers > (int) lctx.model.hparams.n_layer; + const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; if (ubatch.n_tokens < 32 || full_offload) { if (il != -1 && strcmp(name, "norm") == 0) { - const auto & dev_layer = lctx.model.dev_layer(il); + const auto & dev_layer = model.dev_layer(il); for (auto & backend : lctx.backends) { if (ggml_backend_get_device(backend.get()) == dev_layer) { if (ggml_backend_supports_op(backend.get(), cur)) { @@ -7394,7 +7424,7 @@ static struct ggml_cgraph * llama_build_graph( } // add on pooling layer - if (lctx.cparams.embeddings) { + if (cparams.embeddings) { result = llm.append_pooling(result); } @@ -7824,12 +7854,7 @@ struct llama_perf_context_data llama_perf_context(const struct llama_context * c return data; } - data.t_start_ms = 1e-3 * ctx->t_start_us; - data.t_load_ms = 1e-3 * ctx->t_load_us; - data.t_p_eval_ms = 1e-3 * ctx->t_p_eval_us; - data.t_eval_ms = 1e-3 * ctx->t_eval_us; - data.n_p_eval = std::max(1, ctx->n_p_eval); - data.n_eval = std::max(1, ctx->n_eval); + data = ctx->get_perf(); return data; } @@ -7848,7 +7873,5 @@ void llama_perf_context_print(const struct llama_context * ctx) { } void llama_perf_context_reset(struct llama_context * ctx) { - ctx->t_start_us = ggml_time_us(); - ctx->t_eval_us = ctx->n_eval = 0; - ctx->t_p_eval_us = ctx->n_p_eval = 0; + ctx->perf_reset(); }
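
Caller-side note (not part of the diff): below is a minimal, illustrative sketch of how code that consumes the public C API touched by this refactor might look after the change. It is an assumption-laden example, not part of this patch: `demo_refactored_entry_points`, its parameters, and the chosen thread counts are hypothetical, it presumes an already-created `llama_context *`, and it uses only entry points that appear in the diff above (`llama_set_n_threads`, `llama_n_threads`, `llama_apply_adapter_cvec`, `llama_perf_context`, `llama_perf_context_reset`, `llama_set_embeddings`).

// Hedged usage sketch (hypothetical helper, not part of the patch).
// Assumes `ctx` was created elsewhere; model/context setup is out of scope here.
#include <cstdio>
#include <vector>

#include "llama.h"

static void demo_refactored_entry_points(
        struct llama_context      * ctx,
        const std::vector<float>  & cvec_data, // hypothetical control-vector data (n_embd floats per targeted layer)
        int32_t                     n_embd,
        int32_t                     il_start,
        int32_t                     il_end) {
    // Thread counts now route through llama_context::set_n_threads()/n_threads();
    // the C API signature and semantics are unchanged for callers.
    llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/8);
    std::printf("n_threads = %d\n", (int) llama_n_threads(ctx));

    // Embedding mode is now forwarded via llama_context::set_embeddings().
    llama_set_embeddings(ctx, false);

    // llama_adapter_cvec::apply() now returns bool internally; the C wrapper maps
    // success/failure to 0 / -1, so callers should test for a non-zero result
    // rather than a specific positive error code.
    const int32_t res = llama_apply_adapter_cvec(
            ctx, cvec_data.data(), cvec_data.size(), n_embd, il_start, il_end);
    if (res != 0) {
        std::fprintf(stderr, "control vector rejected (e.g. n_embd mismatch)\n");
    }

    // Perf counters are now read/reset through llama_context::get_perf()/perf_reset(),
    // behind the same llama_perf_context()/llama_perf_context_reset() C API.
    const llama_perf_context_data perf = llama_perf_context(ctx);
    std::printf("prompt eval: %d tokens, %.2f ms\n", (int) perf.n_p_eval, perf.t_p_eval_ms);
    llama_perf_context_reset(ctx);
}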