diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 55f1c0382..d39263d28 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -255,7 +255,8 @@ void llama_context::init() {
     // reserve pp graph first so that buffers are only allocated once
     {
         llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-        auto res_pp = graph_build(ubatch_pp, true);
+        auto ctx = graph_init();
+        auto res_pp = graph_build(ctx, ubatch_pp, true);
         auto & gf_pp = res_pp.gf;
         if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
             LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
@@ -269,7 +270,8 @@ void llama_context::init() {
     // reserve with tg graph to get the number of splits and nodes
     {
         llama_ubatch ubatch_tg = { true, 1, 1, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-        auto res_tg = graph_build(ubatch_tg, true);
+        auto ctx = graph_init();
+        auto res_tg = graph_build(ctx, ubatch_tg, true);
         auto & gf_tg = res_tg.gf;
         if (!ggml_backend_sched_reserve(sched.get(), gf_tg)) {
             LLAMA_LOG_ERROR("%s: failed to allocate compute tg buffers\n", __func__);
@@ -282,7 +284,8 @@ void llama_context::init() {
     // reserve again with pp graph to avoid ggml-alloc reallocations during inference
    {
         llama_ubatch ubatch_pp = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
-        auto res_pp = graph_build(ubatch_pp, true);
+        auto ctx = graph_init();
+        auto res_pp = graph_build(ctx, ubatch_pp, true);
         auto & gf_pp = res_pp.gf;
         if (!ggml_backend_sched_reserve(sched.get(), gf_pp)) {
             LLAMA_LOG_ERROR("%s: failed to allocate compute pp buffers\n", __func__);
@@ -569,6 +572,13 @@ ggml_context_ptr llama_context::graph_init() {
     return ggml_context_ptr { ggml_init(params) };
 }
 
+llama_graph_result llama_context::graph_build(
+        ggml_context_ptr & ctx,
+        const llama_ubatch & ubatch,
+        bool worst_case) {
+    return model.build_graph(ctx, *this, cparams, ubatch, worst_case);
+}
+
 enum ggml_status llama_context::graph_compute(
         ggml_cgraph * graph,
         bool batched) {
@@ -907,10 +917,6 @@ void llama_context::build_cb(
     }
 }
 
-llama_graph_result llama_context::graph_build(const llama_ubatch & ubatch, bool worst_case) {
-    return model.build_graph(*this, cparams, ubatch, graph_init(), worst_case);
-}
-
 llama_perf_context_data llama_context::perf_get_data() const {
     llama_perf_context_data data = {};
 
@@ -1831,7 +1837,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
             llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token, nullptr, nullptr, nullptr, nullptr, nullptr};
 
-            auto res = graph_build(ubatch, true);
+            auto ctx = graph_init();
+            auto res = graph_build(ctx, ubatch, true);
 
             // initialize scheduler with the worst-case graph
             ggml_backend_sched_reset(sched.get());
@@ -1845,7 +1852,8 @@ int llama_context_kv_self::decode(llama_batch & inp_batch) {
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-        auto res = graph_build(ubatch, false);
+        auto ctx = graph_init();
+        auto res = graph_build(ctx, ubatch, false);
 
         auto & gf = res.gf;
 
@@ -2092,7 +2100,8 @@ int llama_context_kv_self::encode(llama_batch & inp_batch) {
         ggml_backend_sched_reset(sched.get());
         ggml_backend_sched_set_eval_callback(sched.get(), cparams.cb_eval, cparams.cb_eval_user_data);
 
-        auto res = graph_build(ubatch, false);
+        auto ctx = graph_init();
+        auto res = graph_build(ctx, ubatch, false);
 
         auto & gf = res.gf;
 
diff --git a/src/llama-context.h b/src/llama-context.h
index 981afcc00..e3ab12e59 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -96,7 +96,10 @@ struct llama_context : public llama_graph_i {
     virtual ggml_context_ptr graph_init();
 
     // TODO: add encode/decode graphs
-    virtual llama_graph_result graph_build(const llama_ubatch & ubatch, bool worst_case);
+    virtual llama_graph_result graph_build(
+            ggml_context_ptr & ctx,
+            const llama_ubatch & ubatch,
+            bool worst_case);
 
     // returns the result of ggml_backend_sched_graph_compute_async execution
     virtual enum ggml_status graph_compute(
diff --git a/src/llama-graph.h b/src/llama-graph.h
index de3cd2f04..14d0c5da0 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -13,8 +13,10 @@ struct llama_ubatch;
 struct llama_graph_result {
     ggml_cgraph * gf = nullptr;
 
-    ggml_tensor * t_logits = nullptr;
-    ggml_tensor * t_embd = nullptr;
+    // important graph nodes
+    ggml_tensor * t_logits = nullptr;
+    ggml_tensor * t_embd = nullptr;
+    ggml_tensor * t_embd_pooled = nullptr;
 };
 
 // TODO: can become more granular in the future
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 4950af59b..ecfd6f185 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3841,17 +3841,19 @@ struct llm_build_context {
     const enum llama_pooling_type pooling_type;
     const enum llama_rope_type rope_type;
 
-    const ggml_context_ptr ctx = nullptr;
-    ggml_context * ctx0 = nullptr;
+    ggml_context_ptr & ctx;
+    ggml_context * ctx0 = nullptr;
+
+    llama_graph_result res;
 
     // TODO: consider making the entire interface noexcept
     llm_build_context(
-            llama_graph_i & lgf,
-            const llama_model & model,
-            const llama_cparams & cparams,
-            const llama_ubatch & ubatch,
-            ggml_context_ptr && ctx,
-            bool worst_case) :
+            ggml_context_ptr & ctx,
+            llama_graph_i & lgf,
+            const llama_model & model,
+            const llama_cparams & cparams,
+            const llama_ubatch & ubatch,
+            bool worst_case) :
         lgf (lgf),
         model (model),
         hparams (model.hparams),
@@ -3883,7 +3885,7 @@ struct llm_build_context {
         flash_attn (cparams.flash_attn),
         pooling_type (cparams.pooling_type),
         rope_type (hparams.rope_type),
-        ctx (std::move(ctx)),
+        ctx (ctx),
         ctx0 (this->ctx.get()) {
         }
 
@@ -4280,16 +4282,18 @@ struct llm_build_context {
     }
 
     void append_pooling(struct ggml_cgraph * gf) {
-        // find result_norm tensor for input
-        struct ggml_tensor * inp = nullptr;
-        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
-            inp = ggml_graph_node(gf, i);
-            if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
-                break;
-            }
+        struct ggml_tensor * inp = res.t_embd;
+
+        //// find result_norm tensor for input
+        //for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+        //    inp = ggml_graph_node(gf, i);
+        //    if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
+        //        break;
+        //    }
+
+        //    inp = nullptr;
+        //}
 
-            inp = nullptr;
-        }
         GGML_ASSERT(inp != nullptr && "missing result_norm/result_embd tensor");
 
         struct ggml_tensor * cur;
@@ -4338,6 +4342,7 @@ struct llm_build_context {
         }
 
         cb(cur, "result_embd_pooled", -1);
+        res.t_embd_pooled = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -4390,6 +4395,7 @@ struct llm_build_context {
 
     void build_llama(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -4530,7 +4536,9 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
@@ -4541,12 +4549,14 @@ struct llm_build_context {
         }
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_deci(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -4682,7 +4692,9 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
@@ -4693,12 +4705,14 @@ struct llm_build_context {
         }
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_baichuan(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -4799,17 +4813,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_xverse(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -4898,11 +4917,15 @@ struct llm_build_context {
         cur = inpL;
 
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -4910,6 +4933,7 @@ struct llm_build_context {
     void build_falcon(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -5015,16 +5039,21 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_grok(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -5158,7 +5187,9 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
@@ -5169,6 +5200,7 @@ struct llm_build_context {
         cur = ggml_scale(ctx0, cur, 0.5773502691896257f);
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -5176,6 +5208,7 @@ struct llm_build_context {
     void build_dbrx(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -5282,12 +5315,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -5295,6 +5331,7 @@ struct llm_build_context {
     void build_starcoder(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -5384,16 +5421,21 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_refact(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -5473,11 +5515,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -5668,6 +5714,7 @@ struct llm_build_context {
         cur = inpL;
 
         cb(cur, "result_embd", -1);
+        res.t_embd = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -5675,6 +5722,7 @@ struct llm_build_context {
     void build_bloom(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -5761,10 +5809,14 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -5772,6 +5824,7 @@ struct llm_build_context {
     void build_mpt(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -5897,16 +5950,21 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_stablelm(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -6042,17 +6100,22 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_qwen(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -6150,17 +6213,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_qwen2(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -6258,17 +6326,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_qwen2vl(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -6371,17 +6444,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_qwen2moe(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -6511,11 +6589,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -6523,6 +6605,7 @@ struct llm_build_context {
     void build_phi2(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -6628,13 +6711,17 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
         cb(cur, "result_output_no_bias", -1);
 
         cur = ggml_add(ctx0, cur, model.output_b);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -6642,6 +6729,7 @@ struct llm_build_context {
     void build_phi3(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -6656,7 +6744,7 @@ struct llm_build_context {
         lgf.build_attn_inp(ctx0, n_tokens, true, true, worst_case);
 
         for (int il = 0; il < n_layer; ++il) {
-            auto residual = inpL;
+            auto * residual = inpL;
 
             // self-attention
             {
@@ -6766,7 +6854,9 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
 
@@ -6774,13 +6864,16 @@ struct llm_build_context {
             cb(cur, "result_output_no_bias", -1);
             cur = ggml_add(ctx0, cur, model.output_b);
         }
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_plamo(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -6870,11 +6963,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -6882,6 +6979,7 @@ struct llm_build_context {
     void build_gpt2(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -6972,10 +7070,14 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -6983,6 +7085,7 @@ struct llm_build_context {
     void build_codeshell(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -7079,16 +7182,21 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_orion(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -7192,17 +7300,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_internlm2(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -7306,11 +7419,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -7507,7 +7624,9 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head scaling
         const float scale_lmhead = float(n_embd_base)/float(n_embd);
@@ -7516,7 +7635,9 @@ struct llm_build_context {
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -7616,11 +7737,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -7736,7 +7861,9 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
@@ -7747,6 +7874,7 @@ struct llm_build_context {
         cur = ggml_scale(ctx0, cur, hparams.f_final_logit_softcapping);
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -7754,6 +7882,7 @@ struct llm_build_context {
     // TODO: move up next to build_starcoder
     void build_starcoder2(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -7858,11 +7987,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
@@ -7908,18 +8041,24 @@ struct llm_build_context {
         cur = build_norm(inpL, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_command_r(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         const float f_logit_scale = hparams.f_logit_scale;
 
         struct ggml_tensor * cur;
@@ -8046,7 +8185,9 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
@@ -8056,13 +8197,16 @@ struct llm_build_context {
         }
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_cohere2(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+
         const float f_logit_scale = hparams.f_logit_scale;
 
         struct ggml_tensor * cur;
@@ -8170,7 +8314,9 @@ struct llm_build_context {
         cur = inpL;
 
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
@@ -8180,6 +8326,7 @@ struct llm_build_context {
         }
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -8192,6 +8339,7 @@ struct llm_build_context {
     // * removed MoE
     void build_olmo(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -8296,17 +8444,22 @@ struct llm_build_context {
         cur = build_norm(cur, NULL, NULL, LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_olmo2(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -8411,11 +8564,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -8426,6 +8583,7 @@ struct llm_build_context {
     // * added q, k norm
     void build_olmoe(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -8533,17 +8691,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_openelm(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -8655,10 +8818,14 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -8666,6 +8833,7 @@ struct llm_build_context {
     void build_gptneox(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -8794,16 +8962,21 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_arctic(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -8918,17 +9091,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_deepseek(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -9068,12 +9246,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -9292,17 +9473,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_bitnet(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -9438,12 +9624,16 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         // FIXME: do not use model.tok_embd directly, duplicate as model.output
         cur = build_lora_mm(model.tok_embd, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -9451,6 +9641,7 @@ struct llm_build_context {
     //void build_t5_enc(ggml_cgraph * gf) {
     //    const int64_t n_embd_head = hparams.n_embd_head_v;
     //    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    //
     //    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
     //    struct ggml_tensor * cur;
@@ -9567,7 +9758,9 @@ struct llm_build_context {
     //    cur = build_norm(cur,
     //            model.output_norm_enc, NULL,
     //            LLM_NORM_RMS, -1);
+    //
     //    cb(cur, "result_norm", -1);
+    //    res.t_embd = cur;
 
     //    ggml_build_forward_expand(gf, cur);
     //}
@@ -9575,6 +9768,7 @@ struct llm_build_context {
     //void build_t5_dec(ggml_cgraph * gf) {
     //    const int64_t n_embd_head = hparams.n_embd_head_v;
     //    const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+    //
     //    GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
     //    struct ggml_tensor * cur;
@@ -9760,11 +9954,15 @@ struct llm_build_context {
     //    cur = build_norm(cur,
     //            model.output_norm, NULL,
     //            LLM_NORM_RMS, -1);
+    //    cb(cur, "result_norm", -1);
+    //    res.t_embd = cur;
 
     //    // lm_head
     //    cur = build_lora_mm(model.output, cur);
+    //    cb(cur, "result_output", -1);
+    //    res.t_logits = cur;
 
     //    ggml_build_forward_expand(gf, cur);
 
@@ -9774,6 +9972,7 @@ struct llm_build_context {
     void build_jais(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -9849,11 +10048,14 @@ struct llm_build_context {
                 model.output_norm, model.output_norm_b,
                 LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
 
         cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -9861,6 +10063,7 @@ struct llm_build_context {
     void build_chatglm(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
 
         struct ggml_tensor * cur;
@@ -9975,16 +10178,21 @@ struct llm_build_context {
                 model.output_norm, NULL,
                 LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_nemotron(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         //GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -10090,17 +10298,22 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 
     void build_exaone(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -10208,11 +10421,15 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -10290,15 +10507,21 @@ struct llm_build_context {
         }
 
         cur = inpL;
+        struct ggml_tensor * inp_out_ids = build_inp_out_ids();
+
         cur = ggml_reshape_2d(ctx0, cur, n_embd, n_tokens);
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
 
         cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -10375,10 +10598,14 @@ struct llm_build_context {
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
 
         cur = build_norm(cur, model.output_norm, model.output_norm_b, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         cur = build_lora_mm(model.output, cur);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -10391,6 +10618,7 @@ struct llm_build_context {
     // * removed MoE
     void build_chameleon(ggml_cgraph * gf) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
+
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
         GGML_ASSERT(n_embd_head == hparams.n_rot);
 
@@ -10530,7 +10758,9 @@ struct llm_build_context {
         cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+        cb(cur, "result_norm", -1);
+        res.t_embd = cur;
 
         // lm_head
         cur = build_lora_mm(model.output, cur);
@@ -10546,8 +10776,11 @@ struct llm_build_context {
         struct ggml_tensor * img_logits = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, num_img_tokens);
         img_logits = ggml_clamp(ctx0, img_logits, -FLT_MAX, -FLT_MAX);
         cb(img_logits, "img_logits", -1);
+
         cur = ggml_set_1d(ctx0, cur, img_logits, ggml_element_size(cur) * img_token_start_idx);
+        cb(cur, "result_output", -1);
+        res.t_logits = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
@@ -10695,23 +10928,23 @@ struct llm_build_context {
         cur = build_lora_mm(model.output, cur);
 
         cur = ggml_add(ctx0, cur, model.output_b);
+        cb(cur, "result_embd", -1);
+        res.t_embd = cur;
 
         ggml_build_forward_expand(gf, cur);
     }
 };
 
 llama_graph_result llama_model::build_graph(
-        llama_graph_i & lgf,
-        const llama_cparams & cparams,
-        const llama_ubatch & ubatch,
-        ggml_context_ptr && ctx,
+        ggml_context_ptr & ctx,
+        llama_graph_i & lgf,
+        const llama_cparams & cparams,
+        const llama_ubatch & ubatch,
         bool worst_case) const {
-    llama_graph_result result = {};
+    struct llm_build_context llm(ctx, lgf, *this, cparams, ubatch, worst_case);
 
-    struct llm_build_context llm(lgf, *this, cparams, ubatch, std::move(ctx), worst_case);
-
-    auto & gf = result.gf;
+    auto & gf = llm.res.gf;
 
     gf = ggml_new_graph_custom(llm.ctx0, max_nodes(), false);
 
@@ -10935,7 +11168,7 @@ llama_graph_result llama_model::build_graph(
         llm.append_pooling(gf);
     }
 
-    return result;
+    return llm.res;
 }
 
 //
diff --git a/src/llama-model.h b/src/llama-model.h
index a3267bbbb..f5d1f7b79 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -370,11 +370,11 @@ struct llama_model {
 
     // TODO: add encode/decode graphs
     llama_graph_result build_graph(
-            llama_graph_i & lgf,
-            const llama_cparams & cparams,
-            const llama_ubatch & ubatch,
-            ggml_context_ptr && ctx,
-            bool worst_case) const;
+            ggml_context_ptr & ctx,
+            llama_graph_i & lgf,
+            const llama_cparams & cparams,
+            const llama_ubatch & ubatch,
+            bool worst_case) const;
 
 private:
     struct impl;
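
Note (not part of the patch): a rough sketch of how a caller is expected to drive the reworked API after this change. The ubatch initializer, the `batched` flag, and the tensor names are taken from the hunks above; the surrounding scaffolding is illustrative only, not a drop-in piece of `llama_context`.

```cpp
// Sketch only: the ggml context is now created by the caller and passed by
// reference into graph_build(), and the interesting tensors are read back
// from llama_graph_result instead of being located by node name.
llama_token  token  = model.vocab.token_bos();
llama_ubatch ubatch = { true, n_tokens, n_tokens / n_seqs, n_seqs, &token,
                        nullptr, nullptr, nullptr, nullptr, nullptr};

auto ctx = graph_init();                    // ggml_context_ptr owned by this scope
auto res = graph_build(ctx, ubatch, false); // false = regular (non worst-case) build

graph_compute(res.gf, /*batched=*/n_tokens > 1);

ggml_tensor * t_logits = res.t_logits;      // "result_output"
ggml_tensor * t_embd   = res.t_embd;        // "result_norm" / "result_embd"
ggml_tensor * t_pooled = res.t_embd_pooled; // "result_embd_pooled", set by append_pooling();
                                            // presumably stays nullptr when pooling is disabled
```

The practical effect is that `append_pooling()` no longer has to scan the graph for a node named `result_norm`/`result_embd`: each `build_*` function records the tensors it cares about directly in `res`, and `llama_model::build_graph` returns `llm.res` as the result.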