mtmd : add support for Qwen2-Audio and SeaLLM-Audio (#13760)

* mtmd : add Qwen2-Audio support

* small clean up

* update discussion link

* clarify mtmd_get_output_embd

* clarification in multimodal.md

* fix ultravox bug

* ggml_cont
This commit is contained in:
Xuan-Son Nguyen
2025-05-25 14:06:32 +02:00
committed by GitHub
parent a08c1d2845
commit 40aaa8a403
9 changed files with 144 additions and 52 deletions

View File

@ -546,6 +546,7 @@ class MODEL_TENSOR(IntEnum):
A_ENC_FFN_GATE = auto()
A_ENC_FFN_DOWN = auto()
A_MMPROJ = auto()
A_MMPROJ_FC = auto()
A_MM_NORM_PRE = auto()
A_MM_NORM_MID = auto()
@ -825,6 +826,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
}
@ -885,6 +887,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
MODEL_TENSOR.A_ENC_FFN_GATE,
MODEL_TENSOR.A_ENC_FFN_DOWN,
MODEL_TENSOR.A_MMPROJ,
MODEL_TENSOR.A_MMPROJ_FC,
MODEL_TENSOR.A_MM_NORM_PRE,
MODEL_TENSOR.A_MM_NORM_MID,
],
@ -2256,6 +2259,7 @@ class VisionProjectorType:
QWEN25VL = "qwen2.5vl_merger"
ULTRAVOX = "ultravox"
INTERNVL = "internvl"
QWEN2A = "qwen2a" # audio
# Items here are (block size, type size)

View File

@ -1165,6 +1165,10 @@ class TensorNameMap:
"audio.multi_modal_projector.linear_{bid}", # ultravox
),
MODEL_TENSOR.A_MMPROJ_FC: (
"audio.multi_modal_projector.linear", # qwen2audio
),
MODEL_TENSOR.A_MM_NORM_PRE: (
"audio.multi_modal_projector.ln_pre", # ultravox
),