llama : models now build their graphs using llama_graph_i

ggml-ci
2025-07-17 16:19:46 +00:00 · 2025-02-12 15:08:40 +02:00
parent 0ab50f1bbb
commit f63aeecce6
6 changed files with 7457 additions and 7441 deletions
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@ -193,6 +193,47 @@ bool llama_context::apply_adapter_cvec(
    return cvec.apply(model, data, len, n_embd, il_start, il_end);
 }

+void llama_context::build_cb( // per-tensor graph callback: names `cur`, then optionally pins it to a specific backend
+         ggml_tensor * cur, // tensor to name / assign
+          const char * name, // base name; also compared against "kqv_merged_cont" and "norm" below
+                 int   il) { // layer index; negative values are treated as "no layer suffix" when naming
+    if (il >= 0) {
+        ggml_format_name(cur, "%s-%d", name, il); // per-layer tensor: "<name>-<il>"
+    } else {
+        ggml_set_name(cur, name); // no layer index: plain name
+    }
+
+    if (!cparams.offload_kqv) { // KQV offload disabled in the context params
+        if (strcmp(name, "kqv_merged_cont") == 0) {
+            // all nodes between the KV store and the attention output are run on the CPU
+            ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend_cpu);
+        }
+    }
+
+    // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
+    // FIXME: fix in ggml_backend_sched
+    const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer; // strictly more GPU layers requested than the model has layers
+    // TODO: during #11213, the requirement for ubatch.n_tokens < 32 was removed to simplify
+    //       not sure if this is still needed, but it can be brought back if needed
+    //if (ubatch.n_tokens < 32 || full_offload) {
+    if (full_offload) {
+        if (il != -1 && strcmp(name, "norm") == 0) { // NOTE(review): naming above tests il >= 0 but this tests il != -1 - confirm no other negative il reaches here
+            const auto & dev_layer = model.dev_layer(il); // device the model associates with layer il
+            for (auto & backend : backends) { // scans every backend; no early exit after a match
+                if (ggml_backend_get_device(backend.get()) == dev_layer) {
+                    if (ggml_backend_supports_op(backend.get(), cur)) { // only pin when that backend can run this op
+                        ggml_backend_sched_set_tensor_backend(sched.get(), cur, backend.get());
+                    }
+                }
+            }
+        }
+    }
+}
+
+ggml_cgraph * llama_context::build_graph(const llama_ubatch & ubatch, bool worst_case) { // graph construction is delegated to the model
+    return model.build_graph(*this, cparams, ubatch, init(), worst_case); // *this is passed as the llama_graph_i; the result of init() fills the model's `ctx` parameter
+}
+
 llama_perf_context_data llama_context::perf_get_data() const {
    llama_perf_context_data data = {};

@ -298,11 +339,7 @@ void llama_context::perf_reset() {

 llama_context_unified::llama_context_unified(
        const llama_model & model,
-        const llama_context_params & params,
-        build_graph_callback && cb_build_graph) :
-    llama_context(model),
-    cb_build_graph(std::move(cb_build_graph)) {
-
+        const llama_context_params & params) : llama_context(model) {
    const auto & hparams = model.hparams;

    cparams.n_seq_max        = std::max(1u, params.n_seq_max);
@ -555,7 +592,7 @@ llama_context_unified::llama_context_unified(
            llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph

            llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-            ggml_cgraph * gf_pp = this->cb_build_graph(*this, ubatch_pp, true);
+            ggml_cgraph * gf_pp = build_graph(ubatch_pp, true);

            // reserve pp graph first so that buffers are only allocated once
            ggml_backend_sched_reserve(sched.get(), gf_pp);
@ -564,13 +601,13 @@ llama_context_unified::llama_context_unified(

            // reserve with tg graph to get the number of splits and nodes
            llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-            ggml_cgraph * gf_tg = this->cb_build_graph(*this, ubatch_tg, true);
+            ggml_cgraph * gf_tg = build_graph(ubatch_tg, true);
            ggml_backend_sched_reserve(sched.get(), gf_tg);
            int n_splits_tg = ggml_backend_sched_get_n_splits(sched.get());
            int n_nodes_tg = ggml_graph_n_nodes(gf_tg);

            // reserve again with pp graph to avoid ggml-alloc reallocations during inference
-            gf_pp = this->cb_build_graph(*this, ubatch_pp, true);
+            gf_pp = build_graph(ubatch_pp, true);
            if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
                LLAMA_LOG_ERROR("%s: failed to allocate compute buffers\n", __func__);
                throw std::runtime_error("failed to allocate compute buffers");
@ -893,7 +930,7 @@ struct llama_context_unified::batch_manager {
            llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
            llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};

-            ggml_cgraph * gf = lctx.cb_build_graph(lctx, ubatch, true);
+            ggml_cgraph * gf = lctx.build_graph(ubatch, true);

            // initialize scheduler with the worst-case graph
            ggml_backend_sched_reset(lctx.sched.get());
@ -1004,7 +1041,7 @@ int llama_context_unified::decode(llama_batch & inp_batch) {
        ggml_backend_sched_reset(sched.get());
        ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

-        ggml_cgraph * gf = cb_build_graph(*this, ubatch, false);
+        ggml_cgraph * gf = build_graph(ubatch, false);

        // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);

@ -1227,7 +1264,7 @@ int llama_context_unified::encode(llama_batch & inp_batch) {
    ggml_backend_sched_reset(sched.get());
    ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);

-    ggml_cgraph * gf = cb_build_graph(*this, ubatch, false);
+    ggml_cgraph * gf = build_graph(ubatch, false);

    ggml_backend_sched_alloc_graph(sched.get(), gf);

--- a/src/llama-context.h
+++ b/src/llama-context.h
@ -82,6 +82,14 @@ struct llama_context : public llama_graph_i {
                int32_t   il_start,
                int32_t   il_end);

+    virtual void build_cb( // implements the pure-virtual llama_graph_i::build_cb for this context
+             ggml_tensor * cur, // tensor being named / assigned to a backend
+              const char * name, // base tensor name
+                     int   il); // layer index; the definition appends it to the name only when il >= 0
+
+    // TODO: add encode/decode graphs
+    virtual ggml_cgraph * build_graph(const llama_ubatch & ubatch, bool worst_case); // base definition forwards to llama_model::build_graph with this context as the graph interface
+
    // decode a batch of tokens by evaluating the transformer
    // in case of unsuccessful decoding (error or warning),
    // the kv_cache state will be returned to its original state
@ -171,11 +179,6 @@ struct llama_context : public llama_graph_i {

    // members

-    // TODO: temporary public until llama_context implements the graph build function
-    std::vector<ggml_backend_ptr> backends;
-    ggml_backend_t backend_cpu = nullptr;
-    ggml_backend_sched_ptr sched;
-
 protected:
    const llama_model & model;

@ -189,8 +192,13 @@ protected:
    ggml_abort_callback abort_callback      = nullptr;
    void *              abort_callback_data = nullptr;

+    ggml_backend_t backend_cpu = nullptr;
+    std::vector<ggml_backend_ptr> backends;
+
    std::vector<std::pair<ggml_backend_t, ggml_backend_set_n_threads_t>> set_n_threads_fns;

+    ggml_backend_sched_ptr sched;
+
    // memory buffers used to evaluate the model
    std::vector<uint8_t> buf_compute_meta;

@ -213,13 +221,9 @@ class llama_context_unified : public llama_context {
 public:
    struct batch_manager;

-    // TODO: tmp until llama_model starts implementing the graph build function
-    typedef std::function<ggml_cgraph *(llama_context &, const llama_ubatch &, bool worst_case)> build_graph_callback;
-
    llama_context_unified(
            const llama_model & model,
-            const llama_context_params & params,
-            build_graph_callback && cb_build_graph);
+            const llama_context_params & params);

    virtual ~llama_context_unified();

@ -244,8 +248,6 @@ public:

    llama_sbatch sbatch;

-    build_graph_callback cb_build_graph;
-
    // host buffer for the model output (logits and embeddings)
    ggml_backend_buffer_ptr buf_output;

--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@ -7,9 +7,15 @@ struct ggml_context;
 struct ggml_tensor;
 struct llama_ubatch;

-// TODO: pass to llama_model graph build
+// TODO: can become more granular in the future
 class llama_graph_i {
 public:
+    // callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
+    virtual void build_cb( // pure virtual: each implementation of this interface must define it
+             ggml_tensor * cur, // tensor the callback is invoked for
+              const char * name, // name associated with the tensor
+                     int   il) = 0; // layer index - NOTE(review): llama_context treats negative values as "no layer"; confirm this is the interface-wide contract
+
    // apply control vector for layer il
    virtual ggml_tensor * build_cvec(
            ggml_context * ctx0,
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
--- a/src/llama-model.h
+++ b/src/llama-model.h
@ -5,11 +5,16 @@
 #include "llama-hparams.h"
 #include "llama-vocab.h"

+#include "ggml-cpp.h"
+
 #include <memory>
 #include <string>
 #include <unordered_map>
 #include <vector>

+class  llama_graph_i;
+struct llama_cparams;
+struct llama_ubatch;
 struct llama_model_loader;

 // available models
@ -362,6 +367,14 @@ struct llama_model {

    const struct ggml_tensor * get_tensor(const char * name) const;

+    // TODO: add encode/decode graphs
+    ggml_cgraph * build_graph( // const member: building a graph does not modify the model object
+             llama_graph_i &  lgf, // graph interface supplying callbacks such as build_cb / build_cvec
+       const llama_cparams &  cparams, // context parameters, read-only
+       const llama_ubatch  &  ubatch, // micro-batch the graph is built for, read-only
+          ggml_context_ptr && ctx, // taken by rvalue reference - presumably ownership of the ggml context moves in; confirm against the definition
+                      bool    worst_case) const; // NOTE(review): callers pass true when reserving buffers and false in decode/encode - exact semantics live in the definition
+
 private:
    struct impl;
    std::unique_ptr<impl> pimpl;
--- a/src/llama.cpp
+++ b/src/llama.cpp