metal : optimize FA vec for large sequences and BS <= 8 (#15566)

* metal : optimize FA vec for large heads and sequences
* metal : adjust small-batch mul mv kernels ggml-ci
* batched-bench : fix total speed computation ggml-ci
* cont : add comments ggml-ci
2025-08-29 11:39:14 -04:00 · 2025-08-26 14:22:14 +03:00
parent 79a546220c
commit b3964c1e89
4 changed files with 183 additions and 25 deletions
--- a/tools/batched-bench/batched-bench.cpp
+++ b/tools/batched-bench/batched-bench.cpp
@@ -191,7 +191,7 @@ int main(int argc, char ** argv) {

                const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
                const float speed_tg = pl*tg / t_tg;
-                const float speed    = n_kv / t;
+                const float speed    = ((is_pp_shared ? pp : pl*pp) + pl*tg) / t;

                if(params.batched_bench_output_jsonl) {
                    LOG(