mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-26 19:55:04 +00:00
mtmd : support Qwen 2.5 Omni (input audio+vision, no audio output) (#13784)
* mtmd : allow multiple modalities at the same time * refactor mtmd tokenizer * fix compile * ok, missing SinusoidsPositionEmbedding * first working version * fix style * more strict validate of n_embd * refactor if..else to switch * fix regression * add test for 3B * update docs * fix tokenizing with add_special * add more tests * fix test case "huge" * rm redundant code * set_position_mrope_1d rm n_tokens
This commit is contained in:
@ -2260,6 +2260,7 @@ class VisionProjectorType:
|
||||
ULTRAVOX = "ultravox"
|
||||
INTERNVL = "internvl"
|
||||
QWEN2A = "qwen2a" # audio
|
||||
QWEN25O = "qwen2.5o" # omni
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
|
@ -1125,6 +1125,7 @@ class TensorNameMap:
|
||||
|
||||
MODEL_TENSOR.A_POST_NORM: (
|
||||
"audio_tower.layer_norm", # ultravox
|
||||
"audio_tower.ln_post", # qwen2omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_ENC_ATTN_Q: (
|
||||
@ -1161,12 +1162,16 @@ class TensorNameMap:
|
||||
"audio_tower.layers.{bid}.fc2", # ultravox
|
||||
),
|
||||
|
||||
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
|
||||
# this prefix is added in the conversion code in modify_tensors()
|
||||
|
||||
MODEL_TENSOR.A_MMPROJ: (
|
||||
"audio.multi_modal_projector.linear_{bid}", # ultravox
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MMPROJ_FC: (
|
||||
"audio.multi_modal_projector.linear", # qwen2audio
|
||||
"audio_tower.proj", # qwen2omni
|
||||
),
|
||||
|
||||
MODEL_TENSOR.A_MM_NORM_PRE: (
|
||||
|
Reference in New Issue
Block a user