llama : DeepSeek V2/V3 MLA implementation (#12801)

* Merged using squash to remove all noise commit messages

* Force flash attention off for `LLM_ARCH_DEEPSEEK2` - embedding too large

* Removed 3 conts (2x RoPE and 1x RMS-norm)

* Changed to use `<cmath>` instead of `<math.h>`

* Reverted removal of the 3 conts

* Used `reshape` in `llm_graph_context::build_attn_mha()`

* Use `k_pe = ggml_reshape` (see the sketch below)

* Removed the 3 conts again

* Removed the 3D views of `wk_b` and `wv_b`, and just save them as 3D in GGUF

* Removed MQA optimisation from `build_attn_mha()` as it no longer provides any gains

* Simplified `is_mla` branch in `llm_build_deepseek2()`

* Removed `build_attn_mla` and added `nullptr` to all `build_attn` calls

* Fixed call to `build_attn` in `llm_build_t5_enc`
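
Illustrative sketch (not code from this commit): the `reshape`/cont bullets above boil down to replacing `ggml_cont()` copies with zero-copy `ggml_reshape_*()` calls where the source tensor is already contiguous. Tensor names and dimensions below are hypothetical:

```cpp
#include "ggml.h"

// Minimal sketch (hypothetical names; not the actual DeepSeek2 graph code).
// ggml_reshape_3d() reinterprets an already-contiguous tensor without copying,
// while ggml_cont() adds an extra copy node to the graph.
static ggml_tensor * reshape_k_pe(ggml_context * ctx0, ggml_tensor * k_pe_src,
                                  int64_t n_embd_head_qk_rope, int64_t n_tokens) {
    // old pattern: force a contiguous copy before using the tensor, e.g.
    //   ggml_tensor * k_pe = ggml_cont(ctx0, ggml_view_3d(ctx0, k_pe_src, /* ... */));

    // new pattern: reinterpret in place (valid because k_pe_src is contiguous)
    return ggml_reshape_3d(ctx0, k_pe_src, n_embd_head_qk_rope, 1, n_tokens);
}
```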
Juk Armstrong
2025-04-15 07:49:57 +01:00
committed by GitHub
parent eccc7a1602
commit daa422881a
13 changed files with 289 additions and 165 deletions


@@ -10,6 +10,7 @@
 #include <cstring>
 #include <stdexcept>
 #include <cinttypes>
+#include <cmath>
 //
 // llama_context
@@ -473,7 +474,6 @@ ggml_tensor * llama_context::build_rope_shift(
 const auto & n_ctx_orig = cparams.n_ctx_orig_yarn;
 const auto & yarn_ext_factor = cparams.yarn_ext_factor;
-const auto & yarn_attn_factor = cparams.yarn_attn_factor;
 const auto & yarn_beta_fast = cparams.yarn_beta_fast;
 const auto & yarn_beta_slow = cparams.yarn_beta_slow;
@@ -482,6 +482,10 @@ ggml_tensor * llama_context::build_rope_shift(
 const auto & n_rot = hparams.n_rot;
 const auto & rope_type = hparams.rope_type;
+// See llm_build_deepseek2() for why attn_factor has to be scaled for YaRN RoPE to work correctly.
+// See https://github.com/ggerganov/llama.cpp/discussions/7416 for detailed explanation.
+const float yarn_attn_factor_scaled = model.arch == LLM_ARCH_DEEPSEEK2 ? 1.0f / (1.0f + 0.1f * logf(1.0f / freq_scale)) : cparams.yarn_attn_factor;
 ggml_tensor * tmp;
 if (ggml_is_quantized(cur->type)) {
@@ -500,14 +504,14 @@ ggml_tensor * llama_context::build_rope_shift(
 tmp = ggml_rope_ext_inplace(ctx0, tmp,
 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
+yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow);
 tmp = ggml_cpy(ctx0, tmp, cur);
 } else {
 // we rotate only the first n_rot dimensions
 tmp = ggml_rope_ext_inplace(ctx0, cur,
 shift, factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
-yarn_ext_factor, yarn_attn_factor, yarn_beta_fast, yarn_beta_slow);
+yarn_ext_factor, yarn_attn_factor_scaled, yarn_beta_fast, yarn_beta_slow);
 }
 return tmp;
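
The `yarn_attn_factor_scaled` introduced above cancels the magnitude scale ("mscale") that ggml's YaRN RoPE path multiplies in internally, because DeepSeek2 applies its own attention-score scaling in `llm_build_deepseek2()`. A rough sketch of the arithmetic, illustrative only and based on the linked discussion (the `freq_scale` value is hypothetical):

```cpp
#include <cmath>
#include <cassert>

// Illustrative sketch: with YaRN active, ggml's RoPE effectively scales the rotated
// values by  attn_factor * (1 + 0.1 * ln(1/freq_scale)).  DeepSeek2 already folds its
// own mscale into the attention scale, so the RoPE-side factor must come out to 1.
// Passing the reciprocal as attn_factor achieves exactly that:
int main() {
    const float freq_scale = 0.025f; // hypothetical: 40x context extension

    const float yarn_mscale             = 1.0f + 0.1f * logf(1.0f / freq_scale);
    const float yarn_attn_factor_scaled = 1.0f / yarn_mscale;

    // net factor applied by the RoPE call:
    const float applied = yarn_attn_factor_scaled * yarn_mscale;
    assert(fabsf(applied - 1.0f) < 1e-6f); // cancels out, up to float rounding
    return 0;
}
```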
@@ -2274,6 +2278,11 @@ llama_context * llama_init_from_model(
 params.flash_attn = false;
 }
+if (params.flash_attn && model->arch == LLM_ARCH_DEEPSEEK2) {
+LLAMA_LOG_WARN("%s: flash_attn is not compatible with Deepseek2 - forcing off\n", __func__);
+params.flash_attn = false;
+}
 if (ggml_is_quantized(params.type_v) && !params.flash_attn) {
 LLAMA_LOG_ERROR("%s: V cache quantization requires flash_attn\n", __func__);
 return nullptr;
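
For API users, the effect of the new check is that requesting flash attention for a DeepSeek2 model is downgraded with a warning rather than rejected. A minimal usage sketch, assuming `model` is an already-loaded DeepSeek2 model:

```cpp
#include "llama.h"

// Usage sketch: `model` is assumed to be a loaded DeepSeek2 llama_model *.
static llama_context * create_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true; // caller requests flash attention

    // For LLM_ARCH_DEEPSEEK2 this logs
    //   "flash_attn is not compatible with Deepseek2 - forcing off"
    // and continues with flash_attn disabled instead of failing.
    llama_context * ctx = llama_init_from_model(model, cparams);
    return ctx; // caller releases it later with llama_free(ctx)
}
```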