context : always use non-causal attention for encoder graphs

ggml-ci
Author: Georgi Gerganov
Date:   2025-03-18 11:14:48 +02:00
Parent: d9a14523bb
Commit: a0554c3cdc

@@ -1627,7 +1627,16 @@ llm_graph_result_ptr llama_context::graph_build(
              ggml_cgraph * gf,
        const llama_ubatch & ubatch,
             llm_graph_type   gtype) {
-    return model.build_graph(
+    const auto causal_attn_org = cparams.causal_attn;
+
+    // always use non-causal attention for encoder graphs
+    // TODO: this is a tmp solution until we have a proper way to support enc-dec models
+    // ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
+    if (gtype == LLM_GRAPH_TYPE_ENCODER) {
+        cparams.causal_attn = false;
+    }
+
+    auto res = model.build_graph(
         {
             /*.ctx  =*/ ctx,
             /*.arch =*/ model.arch,
@@ -1643,6 +1652,12 @@ llm_graph_result_ptr llama_context::graph_build(
             /*.n_outputs =*/ n_outputs,
             /*.cb        =*/ graph_get_cb(),
         }, gf, gtype);
+
+    if (gtype == LLM_GRAPH_TYPE_ENCODER) {
+        cparams.causal_attn = causal_attn_org;
+    }
+
+    return res;
 }
 
 ggml_status llama_context::graph_compute(
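
The patch applies a save/override/restore pattern: cparams.causal_attn is stashed, forced to false while an encoder graph is built, and then put back. Below is a minimal standalone sketch of the same pattern using an RAII guard, which restores the flag on every exit path (including early returns or exceptions); flag_override, cparams_t, and build_graph here are hypothetical stand-ins for illustration, not llama.cpp code.

#include <cstdio>

// Illustrative stand-ins mirroring the names in the diff; not llama.cpp types.
enum llm_graph_type { LLM_GRAPH_TYPE_DEFAULT, LLM_GRAPH_TYPE_ENCODER };

struct cparams_t {
    bool causal_attn = true;
};

// RAII guard: save a flag on construction, restore it on destruction.
// Unlike a manual save/restore, the original value comes back even if
// the guarded call exits early or throws.
class flag_override {
    bool & flag;
    bool   saved;
public:
    flag_override(bool & f, bool value) : flag(f), saved(f) { flag = value; }
    ~flag_override() { flag = saved; }
    flag_override(const flag_override &) = delete;
    flag_override & operator=(const flag_override &) = delete;
};

static void build_graph(const cparams_t & cparams, llm_graph_type gtype) {
    std::printf("build %s graph: causal_attn = %s\n",
                gtype == LLM_GRAPH_TYPE_ENCODER ? "encoder" : "decoder",
                cparams.causal_attn ? "true" : "false");
}

int main() {
    cparams_t cparams;

    {
        // encoder graphs attend to the whole input, so force non-causal
        // attention only for the duration of this build
        flag_override guard(cparams.causal_attn, false);
        build_graph(cparams, LLM_GRAPH_TYPE_ENCODER); // causal_attn = false
    } // causal_attn restored here

    build_graph(cparams, LLM_GRAPH_TYPE_DEFAULT);     // causal_attn = true
}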