mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-29 12:35:16 +00:00
metal : optimize multi-sequence FA vec kernel (#13493)
* batched-bench : fix pp batch contents
* metal : optimize multi-sequence FA vec kernel

ggml-ci
This commit is contained in:
@@ -3887,6 +3887,11 @@ kernel void kernel_flash_attn_ext_vec(
         sm[tiisg] = pm[ic + tiisg];
     }
 
+    // skip -INF blocks
+    if (simd_max(sm[tiisg]) == -INFINITY) {
+        continue;
+    }
+
     // Q*K^T
     {
         // each simdgroup processes 1 query and NE (NW/NL) head elements
Reference in New Issue
Block a user