CUDA: generalize FP16 fattn vec kernel (#7061)

* CUDA: generalize FP16 fattn vec kernel

* disable unsupported head sizes for AMD in test

* try AMD fix

* fix batch size 2-8

* partially revert changes
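The squash-commit list above is terse, so as background: "generalizing" a FP16 flash-attention vec kernel in this way typically means templating the kernel on the head size so the inner loop unrolls at compile time, then selecting the matching instantiation with a runtime switch. The following is a minimal sketch of that dispatch pattern only, not the actual llama.cpp kernel; the names (qk_scores_f16, launch_qk_scores) and the list of instantiated head sizes are illustrative assumptions.

// Minimal sketch of the head-size dispatch pattern (hypothetical names,
// NOT the llama.cpp kernel): the kernel is templated on the head size D
// so the reduction loop fully unrolls, and a host-side switch picks the
// matching instantiation at runtime.
#include <cuda_fp16.h>
#include <cstdio>

template <int D> // head size, fixed at compile time
__global__ void qk_scores_f16(const half  * __restrict__ Q,      // [D]
                              const half  * __restrict__ K,      // [n_kv, D]
                              float       * __restrict__ scores, // [n_kv]
                              const int n_kv) {
    const int j = blockIdx.x*blockDim.x + threadIdx.x; // one KV row per thread
    if (j >= n_kv) {
        return;
    }
    float sum = 0.0f;
#pragma unroll
    for (int d = 0; d < D; ++d) {
        sum += __half2float(Q[d]) * __half2float(K[j*D + d]);
    }
    scores[j] = sum;
}

// Runtime dispatch over the head sizes the template was instantiated for.
// The size list (64/128/256) is an assumption for illustration.
static void launch_qk_scores(const half * Q, const half * K, float * scores,
                             const int n_kv, const int head_size, cudaStream_t stream) {
    const dim3 block(128);
    const dim3 grid((n_kv + block.x - 1)/block.x);
    switch (head_size) {
        case  64: qk_scores_f16< 64><<<grid, block, 0, stream>>>(Q, K, scores, n_kv); break;
        case 128: qk_scores_f16<128><<<grid, block, 0, stream>>>(Q, K, scores, n_kv); break;
        case 256: qk_scores_f16<256><<<grid, block, 0, stream>>>(Q, K, scores, n_kv); break;
        default:  fprintf(stderr, "unsupported head size: %d\n", head_size); break;
    }
}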
Author: Johannes Gäßler
Date: 2024-05-09 14:32:02 +02:00
Committed by: GitHub
Parent: f31ec120bc
Commit: a743d76a01
4 changed files with 374 additions and 220 deletions

@@ -15519,13 +15519,6 @@ struct llama_context * llama_new_context_with_model(
         cparams.flash_attn = false;
     }
 
-#ifdef GGML_USE_HIPBLAS
-    if (cparams.flash_attn) {
-        LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
-        cparams.flash_attn = false;
-    }
-#endif
-
     if (params.seed == LLAMA_DEFAULT_SEED) {
         params.seed = time(NULL);
     }
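The hunk above deletes the build-time guard that forced flash_attn off for every HIPBLAS build: with the vec kernel generalized, AMD builds can now run flash attention, and per the commit message only the still-unsupported head sizes are skipped in the test. A hedged sketch of what such per-case gating could look like follows; the helper name and the size list are assumptions, not taken from this diff.

// Hypothetical test-side gate (name and sizes are assumptions): rather than
// disabling flash_attn wholesale on HIP builds, individual head sizes the
// generalized vec kernel does not yet cover on AMD are skipped per test case.
static bool fattn_case_supported(const int head_size, const bool is_hip_build) {
    if (is_hip_build) {
        return head_size == 64 || head_size == 128; // assumed AMD subset
    }
    return true; // NVIDIA path: all tested head sizes assumed supported
}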