From 4227c9be4268ac844921b90f31595f81236bd317 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Thu, 14 Aug 2025 23:21:24 +0200 Subject: [PATCH] CUDA: fix negative KV_max values in FA (#15321) --- ggml/src/ggml-cuda/fattn-common.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh index e46f0e208..d4ed93839 100644 --- a/ggml/src/ggml-cuda/fattn-common.cuh +++ b/ggml/src/ggml-cuda/fattn-common.cuh @@ -539,11 +539,15 @@ static __global__ void flash_attn_mask_to_KV_max( all_inf = warp_reduce_all(all_inf); if (!all_inf) { - KV_max_sj += FATTN_KQ_STRIDE; break; } } + // If the break in the loop was not triggered, KV_max_sj is now -FATTN_KQ_STRIDE. + // If the break was triggered it's the lower edge of the tile with the first non-masked values. + // In either case, walk back the decrementation by FATTN_KQ_STRIDE. + KV_max_sj += FATTN_KQ_STRIDE; + if (threadIdx.x != 0) { return; }