convert : converting mmproj for Qwen2/2.5VL from convert_hf_to_gguf (#13209)

* wip

* qwen2.5vl ok

* vision: fix models missing "text_config"

* add test

* fix test repo name

* fix 32B model

* Revert "fix 32B model"

This reverts commit 651752f1ae.

* clarify about 32B

* rm qwen surgery script

* update llava/readme

* move V_ENC_EMBD_PATCH handling to Qwen2VLVisionModel
This commit is contained in:
Xuan-Son Nguyen
2025-05-02 17:17:15 +02:00
committed by GitHub
parent c642bc014c
commit 074e42ab31
7 changed files with 132 additions and 233 deletions

View File

@ -234,6 +234,7 @@ class Keys:
SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size"
USE_GELU = "clip.use_gelu"
USE_SILU = "clip.use_silu"
N_WA_PATTERN = "clip.vision.n_wa_pattern" # used by qwen2.5vl
class Attention:
HEAD_COUNT = "clip.vision.attention.head_count"
@ -2162,6 +2163,8 @@ class VisionProjectorType:
GEMMA3 = "gemma3"
IDEFICS3 = "idefics3"
PIXTRAL = "pixtral"
QWEN2VL = "qwen2vl_merger"
QWEN25VL = "qwen2.5vl_merger"
# Items here are (block size, type size)

View File

@ -984,6 +984,9 @@ class GGUFWriter:
def add_vision_projector_scale_factor(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
def add_vision_n_wa_pattern(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.N_WA_PATTERN, value)
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
pack_prefix = ''
if not skip_pack_prefix:

View File

@ -896,6 +896,7 @@ class TensorNameMap:
MODEL_TENSOR.V_MMPROJ: (
"multi_modal_projector.linear_{bid}",
"visual.merger.mlp.{bid}", # qwen2vl
),
MODEL_TENSOR.V_MMPROJ_FC: (
@ -919,6 +920,7 @@ class TensorNameMap:
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
"vision_tower.patch_conv", # pixtral
"visual.patch_embed.proj", # qwen2vl
),
MODEL_TENSOR.V_ENC_EMBD_POS: (
@ -932,6 +934,7 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral
"visual.blocks.{bid}.attn.q", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_ATTN_K: (
@ -939,6 +942,7 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral
"visual.blocks.{bid}.attn.k", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_ATTN_V: (
@ -946,6 +950,7 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral
"visual.blocks.{bid}.attn.v", # qwen2vl, generated
),
MODEL_TENSOR.V_ENC_INPUT_NORM: (
@ -953,6 +958,7 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral
"visual.blocks.{bid}.norm1", # qwen2vl
),
MODEL_TENSOR.V_ENC_OUTPUT: (
@ -960,6 +966,7 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral
"visual.blocks.{bid}.attn.proj", # qwen2vl
),
MODEL_TENSOR.V_ENC_OUTPUT_NORM: (
@ -967,17 +974,24 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral
"visual.blocks.{bid}.norm2", # qwen2vl
),
# some namings are messed up because the original llava code swapped fc1 and fc2
# we have no better way to fix it, just be careful
# new models like pixtral use the correct naming
MODEL_TENSOR.V_ENC_FFN_UP: (
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped)
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral
"visual.blocks.{bid}.mlp.fc2", # qwen2vl
"visual.blocks.{bid}.mlp.up_proj", # qwen2.5vl
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.transformer.layers.{bid}.feed_forward.gate_proj", # pixtral
"visual.blocks.{bid}.mlp.gate_proj", # qwen2.5vl
),
MODEL_TENSOR.V_ENC_FFN_DOWN: (
@ -985,6 +999,8 @@ class TensorNameMap:
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped)
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral
"visual.blocks.{bid}.mlp.fc1", # qwen2vl
"visual.blocks.{bid}.mlp.down_proj", # qwen2.5vl
),
MODEL_TENSOR.V_PRE_NORM: (
@ -995,6 +1011,7 @@ class TensorNameMap:
MODEL_TENSOR.V_POST_NORM: (
"vision_tower.vision_model.post_layernorm",
"model.vision_model.post_layernorm", # SmolVLM
"visual.merger.ln_q", # qwen2vl
),
MODEL_TENSOR.V_MM_INP_PROJ: (