graph : make FA compatible with MLA + add initial Metal kernels (#12953)

* graph : make MLA compatible with FA

* metal : add experimental FA kernels for DeepSeek models

ggml-ci

* llama : minor naming updates

ggml-ci

* ggml : disable FA for DeepSeek head sizes

* tests : add FA tests for MLA shapes

ggml-ci
Author:    Georgi Gerganov
Date:      2025-04-17 18:16:36 +03:00
Committer: GitHub
Parent:    207c22ec2d
Commit:    2f74c354c0
8 changed files with 117 additions and 26 deletions
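
Below is a rough sketch (not part of this commit) of what an FA graph with MLA-style head sizes looks like through the public ggml API, in the spirit of the new test-backend-ops cases: with MLA the K head carries the latent plus the RoPE part (e.g. 512 + 64 = 576 for DeepSeek2) while the V head is only the latent (512), so the FA kernels must accept differing K/V head sizes. All concrete sizes below are illustrative assumptions.

#include "ggml.h"
#include <cmath>

int main() {
    const int64_t hs_k     = 576;  // assumed MLA K head size (kv_lora_rank + n_rot)
    const int64_t hs_v     = 512;  // assumed MLA V head size (kv_lora_rank)
    const int64_t n_head   = 16;   // illustrative
    const int64_t n_kv     = 1024; // illustrative KV cache length
    const int64_t n_tokens = 8;    // illustrative batch

    ggml_init_params params = { /*mem_size*/ 16*1024*1024, /*mem_buffer*/ nullptr, /*no_alloc*/ true };
    ggml_context * ctx = ggml_init(params);

    // MLA behaves like MQA: a single KV head broadcast over all Q heads
    ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs_k, n_tokens, n_head, 1);
    ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs_k, n_kv,     1,      1);
    ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs_v, n_kv,     1,      1);

    // no mask for brevity; in the real graph the scale is the pre-scaled
    // kq_scale from the diff below, not plain 1/sqrt(head size)
    ggml_tensor * cur = ggml_flash_attn_ext(ctx, q, k, v, /*mask*/ nullptr,
            1.0f/sqrtf(float(hs_k)), /*max_bias*/ 0.0f, /*logit_softcap*/ 0.0f);
    ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);

    ggml_cgraph * gf = ggml_new_graph(ctx);
    ggml_build_forward_expand(gf, cur);

    ggml_free(ctx);
    return 0;
}

Backends that cannot handle these head sizes fall back to the non-FA path, which is what the "disable FA for DeepSeek head sizes" change above does for the unsupported cases.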

@@ -10050,7 +10050,7 @@ struct llm_build_deepseek2 : public llm_graph_context {
         // See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
         const float mscale = attn_factor * (1.0f + hparams.rope_yarn_log_mul * logf(1.0f / freq_scale));
         const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));
-        const float attn_factor_scaled = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
+        const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));
 
         ggml_tensor * cur;
         ggml_tensor * inpL;
@@ -10127,13 +10127,13 @@ struct llm_build_deepseek2 : public llm_graph_context {
                 q_pe = ggml_rope_ext(ctx0, q_pe, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor_scaled, beta_fast, beta_slow
+                        ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(q_pe, "q_pe", il);
 
                 k_pe = ggml_rope_ext(ctx0, k_pe, inp_pos, nullptr,
                         n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-                        ext_factor, attn_factor_scaled, beta_fast, beta_slow
+                        ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(k_pe, "k_pe", il);
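
A hedged sketch of the arithmetic behind the first hunk, not code from the commit. When ext_factor != 0, ggml's YaRN RoPE multiplies the caller's attn_factor by (1 + 0.1*ln(1/freq_scale)) internally, so passing the pre-scaled attn_factor cancels that factor and the model's full mscale^2 is applied exactly once, via kq_scale (see the linked discussion #7416). The sample hparams below are made-up illustrative values.

#include <cmath>
#include <cstdio>

int main() {
    // assumed sample values, in the spirit of DeepSeek2 + YaRN
    const float attn_factor_hparam = 1.0f;      // hparams attn_factor
    const float rope_yarn_log_mul  = 0.1f;      // hparams.rope_yarn_log_mul
    const float freq_scale         = 1.0f/40;   // YaRN context scaling s = 40
    const int   n_embd_head_k      = 576;       // MLA K head size

    // as in the diff: fold the full magnitude scale into kq_scale ...
    const float mscale   = attn_factor_hparam * (1.0f + rope_yarn_log_mul * logf(1.0f / freq_scale));
    const float kq_scale = 1.0f*mscale*mscale/sqrtf(float(n_embd_head_k));

    // ... and hand ggml_rope_ext a factor that cancels its internal scaling
    const float attn_factor = 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale));

    // what rope_yarn() effectively applies to cos/sin: should come out as 1.0
    const float effective = attn_factor * (1.0f + 0.1f * logf(1.0f / freq_scale));

    printf("mscale = %.4f, kq_scale = %.6f, effective rope scale = %.4f\n",
           mscale, kq_scale, effective);
    return 0;
}

With s = 40 this prints mscale ~= 1.3689 and an effective rope scale of 1.0, i.e. the rotation magnitudes stay unscaled and the attention scores carry the whole mscale^2 through kq_scale.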