metal : optimize FA vec for large sequences and BS <= 8 (#15566)

* metal : optimize FA vec for large heads and sequences

* metal : adjust small-batch mul mv kernels

ggml-ci

* batched-bench : fix total speed computation

ggml-ci

* cont : add comments

ggml-ci
This commit is contained in:
Georgi Gerganov
2025-08-26 14:22:14 +03:00
committed by GitHub
parent 79a546220c
commit b3964c1e89
4 changed files with 183 additions and 25 deletions

View File

@@ -191,7 +191,7 @@ int main(int argc, char ** argv) {
const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
const float speed_tg = pl*tg / t_tg;
-const float speed = n_kv / t;
+const float speed = ((is_pp_shared ? pp : pl*pp) + pl*tg) / t;
if(params.batched_bench_output_jsonl) {
LOG(