llama_encode : only force non-causal attention for enc-dec models

2025-08-21 15:13:07 -04:00 · 2025-05-19 13:38:36 -04:00
parent 8960efd0a6
commit b06a954bbc
1 changed file with 6 additions and 4 deletions
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -732,10 +732,12 @@ int llama_context::encode(llama_batch & inp_batch) {

    const auto causal_attn_org = cparams.causal_attn;

-    // always use non-causal attention for encoder graphs
-    // TODO: this is a tmp solution until we have a proper way to support enc-dec models
-    //       ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
-    cparams.causal_attn = false;
+    if (model.arch == LLM_ARCH_T5) {
+        // always use non-causal attention for encoder graphs
+        // TODO: this is a tmp solution until we have a proper way to support enc-dec models
+        //       ref: https://github.com/ggml-org/llama.cpp/pull/12181#issuecomment-2730451223
+        cparams.causal_attn = false;
+    }

    auto * gf = graph_init();
    auto res = graph_build(ctx_compute.get(), gf, ubatch, LLM_GRAPH_TYPE_ENCODER);