mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-29 11:39:14 -04:00
metal : optimize FA vec for large sequences and BS <= 8 (#15566)
* metal : optimize FA vec for large heads and sequences * metal : adjust small-batch mul mv kernels ggml-ci * batched-bench : fix total speed computation ggml-ci * cont : add comments ggml-ci
This commit is contained in:
@@ -191,7 +191,7 @@ int main(int argc, char ** argv) {
|
||||
|
||||
const float speed_pp = is_pp_shared ? pp / t_pp : pl*pp / t_pp;
|
||||
const float speed_tg = pl*tg / t_tg;
|
||||
const float speed = n_kv / t;
|
||||
const float speed = ((is_pp_shared ? pp : pl*pp) + pl*tg) / t;
|
||||
|
||||
if(params.batched_bench_output_jsonl) {
|
||||
LOG(
|
||||
|
Reference in New Issue
Block a user