llama : Support llama 4 text-only (#12791)

* llama4 conversion

* initial support, no chat template

* clean up a bit

* fix tokenizer conversion

* correct hparams

* try this

* fix shexp

* ffn_inp_normed

* chat template

* clean up model conversion

* add_bos

* add scale_before_ffn

* fix order

* weight_before_ffn

* llm_graph_input_attn_temp

* add chunk attn mask

* build_inp_attn_scale()

* add comment about ggml_repeat

* clarify comments

* fix build
Author: Xuan-Son Nguyen
Date: 2025-04-07 23:06:44 +02:00 (committed by GitHub)
Parent: 82974011f3
Commit: 1466621e73
17 changed files with 532 additions and 22 deletions

src/llama-graph.cpp

@@ -59,6 +59,22 @@ void llm_graph_input_pos::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
+    if (ubatch->pos && attn_scale) {
+        const int64_t n_tokens = ubatch->n_tokens;
+
+        std::vector<float> attn_scale_data(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const float pos = ubatch->pos[i];
+            attn_scale_data[i] = std::log(
+                std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0
+            ) * f_attn_temp_scale + 1.0;
+        }
+
+        ggml_backend_tensor_set(attn_scale, attn_scale_data.data(), 0, n_tokens*n_pos_per_token*ggml_element_size(attn_scale));
+    }
+}
+
 void llm_graph_input_pos_bucket::set_input(const llama_ubatch * ubatch) {
     if (pos_bucket) {
         const int64_t n_tokens = ubatch->n_tokens;
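
For reference, the per-token scale filled in above is log(floor((pos + 1) / n_attn_temp_floor_scale) + 1) * f_attn_temp_scale + 1. A minimal standalone sketch of that formula; the 8192 and 0.1 values below are placeholders, the real ones come from the converted model's hparams (n_attn_temp_floor_scale, f_attn_temp_scale):

// Standalone illustration of the scale computed in llm_graph_input_attn_temp::set_input.
// 8192.0f and 0.1f are placeholder hparam values (n_attn_temp_floor_scale, f_attn_temp_scale).
#include <cmath>
#include <cstdio>

int main() {
    const float n_attn_temp_floor_scale = 8192.0f;
    const float f_attn_temp_scale       = 0.1f;

    const int positions[] = {0, 100, 8192, 100000};
    for (int pos : positions) {
        const float scale = std::log(
            std::floor((pos + 1.0f) / n_attn_temp_floor_scale) + 1.0f
        ) * f_attn_temp_scale + 1.0f;
        std::printf("pos = %6d  attn_scale = %.4f\n", pos, scale);
    }
    return 0;
}
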
@@ -458,9 +474,17 @@ void llm_graph_input_attn_kv_unified::set_input(const llama_ubatch * ubatch) {
                         }
 
                         // may need to cut off old tokens for sliding window
+                        // TODO @ngxson : we are currently re-using the swa logic to store the chunked mask, we should rename SWA to something more generic like "aux mask"
                         if (data_swa) {
-                            if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
-                                f = -INFINITY;
+                            if (hparams.n_attn_chunk) {
+                                llama_pos pos_chunk_start = (pos / hparams.n_attn_chunk) * hparams.n_attn_chunk;
+                                if (kv_self->cells[i].pos < pos_chunk_start || pos < pos_chunk_start) {
+                                    f = -INFINITY;
+                                }
+                            } else {
+                                if (pos - kv_self->cells[i].pos >= (int32_t)hparams.n_swa) {
+                                    f = -INFINITY;
+                                }
                             }
                             data_swa[h*(n_kv*n_tokens) + s*(n_kv*n_seq_tokens) + j*n_kv + i] = f;
                         }
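
The chunked branch above masks any key that falls before the start of the query's attention chunk, so each query at position q only sees keys in [floor(q / chunk) * chunk, q]; ordinary causal masking is handled by the regular (non-SWA) mask. A small standalone sketch of the visibility rule, with the chunk size as a plain parameter standing in for hparams.n_attn_chunk:

// Standalone sketch of the chunked-attention visibility rule used above.
#include <cstdint>
#include <cstdio>

// A key at key_pos stays visible to a query at query_pos only if it is not
// older than the start of the query's chunk; causality (key_pos <= query_pos)
// is enforced by the regular mask and is assumed here.
static bool in_chunk(int32_t key_pos, int32_t query_pos, int32_t chunk) {
    const int32_t chunk_start = (query_pos / chunk) * chunk;
    return key_pos >= chunk_start;
}

int main() {
    const int32_t chunk = 4; // illustrative; the real value is hparams.n_attn_chunk
    for (int32_t q = 0; q < 8; ++q) {
        for (int32_t k = 0; k <= q; ++k) {
            std::printf("%c", in_chunk(k, q, chunk) ? '1' : '.');
        }
        std::printf("\n");
    }
    return 0;
}
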
@@ -812,8 +836,9 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         float w_scale,
         llama_expert_gating_func_type gating_op,
         int il) const {
-    int64_t n_embd = cur->ne[0];
-    int64_t n_tokens = cur->ne[1];
+    const int64_t n_embd = cur->ne[0];
+    const int64_t n_tokens = cur->ne[1];
+    const bool weight_before_ffn = arch == LLM_ARCH_LLAMA4; // for llama4, we apply the sigmoid-ed weights before the FFN
 
     ggml_tensor * logits = build_lora_mm(gate_inp, cur); // [n_expert, n_tokens]
     cb(logits, "ffn_moe_logits", il);
@@ -841,6 +866,12 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         cb(selection_probs, "ffn_moe_probs_biased", il);
     }
 
+    // llama4 doesn't have exp_probs_b, and sigmoid is only used after top_k
+    // see: https://github.com/meta-llama/llama-models/blob/699a02993512fb36936b1b0741e13c06790bcf98/models/llama4/moe.py#L183-L198
+    if (arch == LLM_ARCH_LLAMA4) {
+        selection_probs = logits;
+    }
+
     // select experts
     ggml_tensor * selected_experts = ggml_top_k(ctx0, selection_probs, n_expert_used); // [n_expert_used, n_tokens]
     cb(selected_experts->src[0], "ffn_moe_argsort", il);
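
In other words, for LLM_ARCH_LLAMA4 the top-k selection runs on the raw router logits, and the sigmoid gate is evaluated only for the experts that survive the selection. A toy sketch of that ordering, with made-up logits and n_expert_used = 2:

// Toy sketch: select experts on raw logits, apply sigmoid only to the winners.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
    const std::vector<float> logits = {1.2f, -0.3f, 0.7f, 2.1f}; // router logits for one token (made up)
    const int n_expert_used = 2;

    // top-k on the raw logits (no sigmoid/softmax beforehand)
    std::vector<int> idx(logits.size());
    std::iota(idx.begin(), idx.end(), 0);
    std::partial_sort(idx.begin(), idx.begin() + n_expert_used, idx.end(),
                      [&](int a, int b) { return logits[a] > logits[b]; });

    // gating weight of each selected expert = sigmoid(logit), applied after top-k
    for (int i = 0; i < n_expert_used; ++i) {
        const int e = idx[i];
        const float w = 1.0f / (1.0f + std::exp(-logits[e]));
        std::printf("expert %d -> weight %.4f\n", e, w);
    }
    return 0;
}
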
@@ -867,6 +898,15 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     }
 
     cur = ggml_reshape_3d(ctx0, cur, n_embd, 1, n_tokens);
+
+    if (weight_before_ffn) {
+        // TODO: this is a workaround as we don't yet have a repeat op that takes custom dim (ggml_repeat_4d)
+        ggml_tensor * repeated = ggml_new_tensor_3d(ctx0, cur->type, n_embd, n_expert_used, n_tokens);
+        repeated = ggml_repeat(ctx0, cur, repeated); // [n_embd, n_expert_used, n_tokens]
+        cur = ggml_mul(ctx0, repeated, weights);
+        cb(cur, "ffn_moe_weighted", il);
+    }
+
     ggml_tensor * up = build_lora_mm_id(up_exps, cur, selected_experts); // [n_ff, n_expert_used, n_tokens]
     cb(up, "ffn_moe_up", il);
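
The repeat is needed because ggml_mul only broadcasts its second operand into the first: cur is [n_embd, 1, n_tokens] while weights is [1, n_expert_used, n_tokens], so cur has to be expanded along the expert dimension before the element-wise multiply. A plain-loop sketch of the intended result, using toy sizes and ggml-style dims-first shapes:

// Plain-loop sketch of repeat-then-multiply: out[d,e,t] = cur[d,0,t] * weights[0,e,t].
#include <cstdio>
#include <vector>

int main() {
    const int n_embd = 3, n_expert_used = 2, n_tokens = 2; // toy sizes

    // cur: [n_embd, 1, n_tokens] -> one hidden vector per token
    const std::vector<float> cur     = {0.5f, 1.0f, 2.0f,   1.5f, 0.25f, 4.0f};
    // weights: [1, n_expert_used, n_tokens] -> one gate value per (expert, token)
    const std::vector<float> weights = {0.7f, 0.3f,   0.9f, 0.1f};

    for (int t = 0; t < n_tokens; ++t) {
        for (int e = 0; e < n_expert_used; ++e) {
            for (int d = 0; d < n_embd; ++d) {
                const float v = cur[t*n_embd + d] * weights[t*n_expert_used + e];
                std::printf("%6.2f ", v); // row (e,t) of the weighted input fed to the experts
            }
            std::printf("\n");
        }
    }
    return 0;
}
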
@@ -894,7 +934,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
     ggml_tensor * experts = build_lora_mm_id(down_exps, par, selected_experts); // [n_embd, n_expert_used, n_tokens]
     cb(experts, "ffn_moe_down", il);
 
-    experts = ggml_mul(ctx0, experts, weights);
+    if (!weight_before_ffn) {
+        experts = ggml_mul(ctx0, experts, weights);
+        cb(cur, "ffn_moe_weighted", il);
+    }
 
     // aggregate experts
     ggml_tensor * moe_out = nullptr;
@@ -914,6 +957,8 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
         moe_out = ggml_cont(ctx0, moe_out);
     }
 
+    cb(moe_out, "ffn_moe_out", il);
+
     return moe_out;
 }
@@ -981,6 +1026,19 @@ ggml_tensor * llm_graph_context::build_inp_pos() const {
     return cur;
 }
 
+ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
+    auto inp = std::make_unique<llm_graph_input_attn_temp>(n_pos_per_token(), hparams.n_attn_temp_floor_scale, hparams.f_attn_temp_scale);
+
+    auto & cur = inp->attn_scale;
+
+    cur = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 1, 1, n_tokens*n_pos_per_token());
+    ggml_set_input(cur);
+
+    res->add_input(std::move(inp));
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
     auto inp = std::make_unique<llm_graph_input_out_ids>(hparams, cparams, n_outputs);
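
build_inp_attn_scale() registers a per-token F32 input of ggml shape [1, 1, n_tokens]; where the model graph consumes it (not shown in these hunks), a single broadcast multiply is enough to scale every head and channel of a token's query by that token's value. A plain-loop sketch of that broadcast, assuming a [n_embd_head, n_head, n_tokens] query layout:

// Plain-loop sketch: a [1, 1, n_tokens] scale broadcast over a [n_embd_head, n_head, n_tokens] query.
#include <cstdio>
#include <vector>

int main() {
    const int n_embd_head = 2, n_head = 2, n_tokens = 3; // toy sizes

    std::vector<float> q(n_embd_head * n_head * n_tokens, 1.0f); // dummy query values
    const std::vector<float> attn_scale = {1.00f, 1.05f, 1.12f}; // one scale per token

    for (int t = 0; t < n_tokens; ++t) {
        for (int h = 0; h < n_head; ++h) {
            for (int d = 0; d < n_embd_head; ++d) {
                q[(t*n_head + h)*n_embd_head + d] *= attn_scale[t]; // same factor for every head/channel of token t
            }
        }
    }

    std::printf("q(token 2, head 0, dim 0) = %.2f\n", q[(2*n_head + 0)*n_embd_head + 0]);
    return 0;
}
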