mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-26 19:55:04 +00:00
mtmd : support SmolVLM (version 1 and 2) (#13050)
* mtmd : support SmolVLM (version 1 and 2) * correct chat template * fix n_patches * scale_factor is an int * add more models to test
This commit is contained in:
@ -231,11 +231,15 @@ class Keys:
|
||||
IMAGE_MEAN = "clip.vision.image_mean"
|
||||
IMAGE_STD = "clip.vision.image_std"
|
||||
USE_GELU = "clip.use_gelu"
|
||||
USE_SILU = "clip.use_silu"
|
||||
|
||||
class Attention:
|
||||
HEAD_COUNT = "clip.vision.attention.head_count"
|
||||
LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon"
|
||||
|
||||
class Projector:
|
||||
SCALE_FACTOR = "clip.vision.projector.scale_factor"
|
||||
|
||||
#
|
||||
# recommended mapping of model tensor names for storage in gguf
|
||||
#
|
||||
@ -2122,6 +2126,11 @@ class GGUFValueType(IntEnum):
|
||||
raise ValueError(f"Unknown type: {type(val)}")
|
||||
|
||||
|
||||
class VisionProjectorType:
|
||||
GEMMA3 = "gemma3"
|
||||
IDEFICS3 = "idefics3"
|
||||
|
||||
|
||||
# Items here are (block size, type size)
|
||||
QK_K = 256
|
||||
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||
|
@ -931,6 +931,53 @@ class GGUFWriter:
|
||||
def add_eom_token_id(self, id: int) -> None:
|
||||
self.add_uint32(Keys.Tokenizer.EOM_ID, id)
|
||||
|
||||
# for vision models
|
||||
|
||||
def add_vision_projection_dim(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
|
||||
|
||||
def add_vision_has_vision_encoder(self, value: bool) -> None:
|
||||
self.add_bool(Keys.ClipVision.HAS_VISION_ENCODER, value)
|
||||
|
||||
def add_vision_patch_size(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.PATCH_SIZE, value)
|
||||
|
||||
def add_vision_embedding_length(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.EMBEDDING_LENGTH, value)
|
||||
|
||||
def add_vision_feed_forward_length(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.FEED_FORWARD_LENGTH, value)
|
||||
|
||||
def add_vision_block_count(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.BLOCK_COUNT, value)
|
||||
|
||||
def add_vision_head_count(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.Attention.HEAD_COUNT, value)
|
||||
|
||||
def add_vision_projector_type(self, value: str) -> None:
|
||||
self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
|
||||
|
||||
def add_vision_attention_layernorm_eps(self, value: float) -> None:
|
||||
self.add_float32(Keys.ClipVision.Attention.LAYERNORM_EPS, value)
|
||||
|
||||
def add_vision_image_size(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
|
||||
|
||||
def add_vision_image_mean(self, values: Sequence[float]) -> None:
|
||||
self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
|
||||
|
||||
def add_vision_image_std(self, values: Sequence[float]) -> None:
|
||||
self.add_array(Keys.ClipVision.IMAGE_STD, values)
|
||||
|
||||
def add_vision_use_gelu(self, value: bool) -> None:
|
||||
self.add_bool(Keys.ClipVision.USE_GELU, value)
|
||||
|
||||
def add_vision_use_silu(self, value: bool) -> None:
|
||||
self.add_bool(Keys.ClipVision.USE_SILU, value)
|
||||
|
||||
def add_vision_projector_scale_factor(self, value: int) -> None:
|
||||
self.add_uint32(Keys.ClipVision.Projector.SCALE_FACTOR, value)
|
||||
|
||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||
pack_prefix = ''
|
||||
if not skip_pack_prefix:
|
||||
|
@ -961,13 +961,13 @@ class TensorNameMap:
|
||||
MODEL_TENSOR.V_ENC_FFN_UP: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc1",
|
||||
"vpm.encoder.layers.{bid}.mlp.fc1",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3 (note: name is swapped)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_ENC_FFN_DOWN: (
|
||||
"vision_tower.vision_model.encoder.layers.{bid}.mlp.fc2",
|
||||
"vpm.encoder.layers.{bid}.mlp.fc2",
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM
|
||||
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3 (note: name is swapped)
|
||||
),
|
||||
|
||||
MODEL_TENSOR.V_PRE_NORM: (
|
||||
|
Reference in New Issue
Block a user