mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-07-20 09:37:40 +00:00
ggml : implement GEGLU_ERF and GEGLU_QUICK ops (#14445)
This commit is contained in:
@ -1405,8 +1405,7 @@ struct clip_graph {
|
||||
ggml_tensor * x = embeddings;
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, embeddings);
|
||||
x = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w,x);
|
||||
embeddings = ggml_silu_inplace(ctx0, embeddings);
|
||||
embeddings = ggml_mul(ctx0, embeddings,x);
|
||||
embeddings = ggml_swiglu_split(ctx0, embeddings, x);
|
||||
embeddings = ggml_mul_mat(ctx0, model.mm_model_mlp_3_w, embeddings);
|
||||
}
|
||||
// arrangement of BOI/EOI token embeddings
|
||||
@ -1502,15 +1501,8 @@ struct clip_graph {
|
||||
cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
|
||||
|
||||
// swiglu
|
||||
{
|
||||
int64_t split_point = cur->ne[0] / 2;
|
||||
ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
|
||||
ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
|
||||
|
||||
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
|
||||
x1 = ggml_silu(ctx0, x1);
|
||||
cur = ggml_mul(ctx0, x0, x1);
|
||||
}
|
||||
// see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
|
||||
cur = ggml_swiglu_swapped(ctx0, cur);
|
||||
|
||||
// mid-norm
|
||||
cur = ggml_rms_norm(ctx0, cur, 1e-6);
|
||||
@ -1769,35 +1761,42 @@ private:
|
||||
cur = tmp;
|
||||
}
|
||||
|
||||
// we only support parallel ffn for now
|
||||
switch (type_op) {
|
||||
case FFN_SILU:
|
||||
{
|
||||
if (gate) {
|
||||
cur = ggml_swiglu_split(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_swiglu", il);
|
||||
} else {
|
||||
cur = ggml_silu(ctx0, cur);
|
||||
cb(cur, "ffn_silu", il);
|
||||
} break;
|
||||
case FFN_GELU:
|
||||
{
|
||||
if (gate) {
|
||||
cur = ggml_geglu_split(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_geglu", il);
|
||||
} else {
|
||||
cur = ggml_gelu(ctx0, cur);
|
||||
cb(cur, "ffn_gelu", il);
|
||||
} break;
|
||||
case FFN_GELU_ERF:
|
||||
{
|
||||
if (gate) {
|
||||
cur = ggml_geglu_erf_split(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_geglu_erf", il);
|
||||
} else {
|
||||
cur = ggml_gelu_erf(ctx0, cur);
|
||||
cb(cur, "ggml_gelu_erf", il);
|
||||
cb(cur, "ffn_gelu_erf", il);
|
||||
} break;
|
||||
case FFN_GELU_QUICK:
|
||||
{
|
||||
if (gate) {
|
||||
cur = ggml_geglu_quick_split(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_geglu_quick", il);
|
||||
} else {
|
||||
cur = ggml_gelu_quick(ctx0, cur);
|
||||
cb(cur, "ffn_relu", il);
|
||||
cb(cur, "ffn_gelu_quick", il);
|
||||
} break;
|
||||
}
|
||||
|
||||
// we only support parallel ffn for now
|
||||
if (gate) {
|
||||
cur = ggml_mul(ctx0, cur, tmp);
|
||||
cb(cur, "ffn_gate_par", il);
|
||||
}
|
||||
|
||||
if (down) {
|
||||
cur = ggml_mul_mat(ctx0, down, cur);
|
||||
}
|
||||
|
Reference in New Issue
Block a user