Xuan Son Nguyen
2025-04-30 12:51:40 +02:00
parent 44cd8d91ff
commit 792387b164
4 changed files with 105 additions and 0 deletions


@@ -2555,6 +2555,81 @@ class Qwen2VLModel(TextModel):
        return [(self.map_tensor_name(name), data_torch)]


@ModelBase.register("Qwen2VLForConditionalGeneration", "Qwen2_5_VLForConditionalGeneration")
class Qwen2VLVisionModel(VisionModel):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.hparams["image_size"] = self.hparams.get("image_size", 560)
        # rename config.json values
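        # (the vision config uses num_heads/depth/embed_dim, while the base
        # VisionModel expects the generic text-model key names)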
self.hparams["num_attention_heads"] = self.hparams.get("num_heads")
self.hparams["num_hidden_layers"] = self.hparams.get("depth")
self.hparams["intermediate_size"] = self.hparams.get("hidden_size")
self.hparams["hidden_size"] = self.hparams.get("embed_dim")

    def set_gguf_parameters(self):
        super().set_gguf_parameters()
        hparams = self.hparams
        if self.global_config['model_type'] == 'qwen2_vl':
            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN2VL)
        elif self.global_config['model_type'] == 'qwen2_5_vl':
            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.QWEN25VL)
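            # the Qwen2.5-VL vision MLP uses SiLU activation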
            self.gguf_writer.add_vision_use_silu(True)
            # find n_wa_pattern (window attention pattern)
            fullatt_block_indexes = hparams.get("fullatt_block_indexes")
            assert fullatt_block_indexes is not None, "fullatt_block_indexes is required for qwen2_5_vl"
            n_wa_pattern = fullatt_block_indexes[0] + 1
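            # e.g. fullatt_block_indexes == [7, 15, 23, 31] means every 8th
            # block uses full attention, giving n_wa_pattern == 8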
            # validate n_wa_pattern
            for i in range(1, len(fullatt_block_indexes)):
                if fullatt_block_indexes[i] - fullatt_block_indexes[i - 1] != n_wa_pattern:
                    raise ValueError(f"Invalid fullatt_block_indexes: {fullatt_block_indexes}")
            self.gguf_writer.add_vision_n_wa_pattern(n_wa_pattern)
        else:
            raise ValueError(f"Unknown QwenVL model type: {self.global_config['model_type']}")
        # default values below are taken from HF transformers code
        self.gguf_writer.add_vision_attention_layernorm_eps(self.global_config.get("rms_norm_eps", 1e-6))

    def tensor_force_quant(self, name, new_name, bid, n_dims):
        del bid, name, n_dims  # unused
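        # keep the embedding tensors at high precision regardless of the
        # requested quantization type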
if ".patch_embd." in new_name:
return gguf.GGMLQuantizationType.F16
if ".position_embd." in new_name:
return gguf.GGMLQuantizationType.F32
return False

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        del bid  # unused
        if name.startswith("visual."):
            # process visual tensors
            # split QKV tensors if needed
            if ".qkv." in name:
                if data_torch.ndim == 2:  # weight
                    c3, _ = data_torch.shape
                else:  # bias
                    c3 = data_torch.shape[0]
                assert c3 % 3 == 0
                c = c3 // 3
                wq = data_torch[:c]
                wk = data_torch[c: c * 2]
                wv = data_torch[c * 2:]
                return [
                    (self.map_tensor_name(name.replace("qkv", "q")), wq),
                    (self.map_tensor_name(name.replace("qkv", "k")), wk),
                    (self.map_tensor_name(name.replace("qkv", "v")), wv),
                ]
            elif 'patch_embed.proj.weight' in name:
                # split Conv3D into Conv2Ds
                c1, c2, kt, kh, kw = data_torch.shape
                del c1, c2, kh, kw  # unused
                assert kt == 2, "Current implementation only supports temporal_patch_size of 2"
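                # store each temporal slice as its own Conv2D weight; the second
                # slice gets a '.1' suffix so it maps to a distinct GGUF tensor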
                return [
                    (self.map_tensor_name(name), data_torch[:, :, 0, ...]),
                    (self.map_tensor_name(name + '.1'), data_torch[:, :, 1, ...]),
                ]
            else:
                return [(self.map_tensor_name(name), data_torch)]
        return []  # skip other tensors
@ModelBase.register("WavTokenizerDec")
class WavTokenizerDecModel(TextModel):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC