CUDA: fix Pascal FA, deq. KV to FP16 for batch > 8 (#7681)

2025-07-26 03:03:25 -04:00 · 2024-06-01 15:47:04 +02:00
parent 9b596417af
commit 750f60c03e
7 changed files with 73 additions and 29 deletions
--- a/ggml-cuda/fattn-tile-f32.cu
+++ b/ggml-cuda/fattn-tile-f32.cu
@ -275,13 +275,13 @@ void launch_fattn_tile_f32_64_128(ggml_backend_cuda_context & ctx, ggml_tensor *
            constexpr int      D = 64;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        case 128: {
            constexpr int      D = 128;
            constexpr int nwarps = 8;
            fattn_kernel_t fattn_kernel = flash_attn_tile_ext_f32<D, cols_per_block, nwarps, parallel_blocks>;
-            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block);
+            launch_fattn<D, parallel_blocks>(ctx, dst, fattn_kernel, nwarps, cols_per_block, true, true);
        } break;
        default: {
            GGML_ASSERT(false && "FlashAttention without tensor cores only supports head sizes 64 and 128.");