vulkan: enable coopmat2 FA gqa and split_k optimizations more often (#12931)
The grouped query attention optimization doesn't require a power-of-two ratio; the only thing relying on it was the modulo operation written as a bitwise &.

split_k need not depend on gqa_ratio: enable it any time there's only one workgroup in the X dimension. The shader gets the split index from the X coordinate, and multiple workgroups in the X dimension (pre-split) indicate a larger FA operation that wouldn't need splitting.
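For reference, the power-of-two constraint came from the classic bit-trick equivalence: x & (n - 1) matches x % n only when n is a power of two. A minimal standalone C++ sketch (illustrative only, not the shader source) of why the bitwise form imposed the restriction:

    // x & (n - 1) == x % n holds only for power-of-two n, which is what
    // forced the old power-of-two gqa_ratio restriction while the shader
    // used the bitwise form.
    #include <cassert>
    #include <cstdint>

    int main() {
        for (uint32_t ratio : { 2u, 4u, 8u }) {   // powers of two: both forms agree
            for (uint32_t i = 0; i < 64; ++i) {
                assert((i & (ratio - 1)) == (i % ratio));
            }
        }
        // Non-power-of-two ratio: the bitwise form gives a wrong index,
        // e.g. 7 & (6 - 1) == 5 while 7 % 6 == 1.
        assert((7u & (6u - 1u)) != (7u % 6u));
        return 0;
    }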
@@ -4532,7 +4532,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
 
     for (int kv : { 4096, 8192, 16384, }) {
         for (int hs : { 64, 128, }) {
-            test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, 4, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
+            for (int nr : { 1, 4, }) {
+                test_cases.emplace_back(new test_flash_attn_ext(hs, hs, 8, nr, kv, 1, true, 0, 0, GGML_PREC_F32, GGML_TYPE_F16));
+            }
         }
     }
 
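As a rough illustration of the dispatch rule described in the commit message, here is a hedged host-side sketch; the names (choose_split_k, workgroups_x, kv_len, core_count) and the 512 tile size are assumptions for illustration, not the actual ggml-vulkan identifiers or tuning values:

    #include <algorithm>
    #include <cstdint>

    // Pick a split_k factor for the FA dispatch. If the pre-split grid
    // already has multiple workgroups in X, the op is big enough to fill
    // the GPU, so don't split; with a single workgroup in X, split the KV
    // dimension across cores instead, independent of gqa_ratio.
    static uint32_t choose_split_k(uint32_t workgroups_x, uint32_t kv_len, uint32_t core_count) {
        if (workgroups_x > 1 || core_count == 0) {
            return 1;   // already parallel enough (or no device info)
        }
        // Cap the split so each piece still covers a full (assumed) 512-wide KV tile.
        return std::max(1u, std::min(core_count, kv_len / 512u));
    }

Per the commit message, each post-split workgroup then recovers its split index from the X coordinate of its workgroup ID, so gqa_ratio no longer needs to participate in the decision.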