mirror of https://github.com/ggml-org/llama.cpp.git
mtmd : add vision support for Mistral Small 3.1 (#13231)
* convert ok
* load ok, missing patch merger
* ah sheet it works
* update llava/readme
* add test
* fix test
@@ -231,6 +231,7 @@ class Keys:
         BLOCK_COUNT = "clip.vision.block_count"
         IMAGE_MEAN = "clip.vision.image_mean"
         IMAGE_STD = "clip.vision.image_std"
+        SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size"
         USE_GELU = "clip.use_gelu"
         USE_SILU = "clip.use_silu"
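The new clip.vision.spatial_merge_size key records how many patch embeddings are merged along each spatial axis before the multimodal projector. Below is a rough numpy sketch of the idea; the function name, shapes, and the 2x2 merge value are illustrative assumptions, not code from this commit.

# Illustrative only: merge each (merge x merge) block of patch embeddings into one token.
import numpy as np

def spatial_merge(patches: np.ndarray, merge: int) -> np.ndarray:
    # patches: (h, w, d) grid of patch embeddings from the vision tower
    h, w, d = patches.shape
    x = patches.reshape(h // merge, merge, w // merge, merge, d)
    x = x.transpose(0, 2, 1, 3, 4)            # (h', w', merge, merge, d)
    return x.reshape(-1, d * merge * merge)   # one token per block, merge**2 fewer tokens

grid = np.random.rand(24, 24, 1024).astype(np.float32)
merged = spatial_merge(grid, merge=2)         # (144, 4096); a linear "patch merger" then projects back to d
print(merged.shape)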
@@ -491,6 +492,7 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_FFN_DOWN = auto()
     V_PRE_NORM = auto()
     V_POST_NORM = auto()
+    V_MM_INP_NORM = auto()
     V_MM_INP_PROJ = auto() # gemma3
     V_MM_SOFT_EMB_NORM = auto() # gemma3
     V_RESMPL_POS_EMBD_K = auto() # minicpmv
@@ -505,6 +507,7 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_PROJ = auto() # minicpmv
     V_RESMPL_QUERY = auto() # minicpmv
     V_TOK_EMBD_IMG_BREAK = auto() # pixtral
+    V_MM_PATCH_MERGER = auto() # mistral small 3.1


 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -747,6 +750,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
     MODEL_TENSOR.V_POST_NORM: "v.post_ln",
     MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
+    MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
     MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
     MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k",
     MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q",
@@ -760,6 +764,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj",
     MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
+    MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
 }

 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
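The TENSOR_NAMES entry above fixes the name the merging-layer weight gets inside the GGUF file. A quick check against the gguf-py package, using only names that appear in this diff:

from gguf.constants import MODEL_TENSOR, TENSOR_NAMES

# prints "mm.patch_merger"; the ".weight" suffix of the source tensor is carried over when it is written out
print(TENSOR_NAMES[MODEL_TENSOR.V_MM_PATCH_MERGER])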
@@ -783,6 +788,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_PRE_NORM,
         MODEL_TENSOR.V_POST_NORM,
         MODEL_TENSOR.V_MM_INP_PROJ,
+        MODEL_TENSOR.V_MM_INP_NORM,
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
         MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
         MODEL_TENSOR.V_RESMPL_ATTN_Q,
@@ -796,6 +802,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_RESMPL_PROJ,
         MODEL_TENSOR.V_RESMPL_QUERY,
         MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
+        MODEL_TENSOR.V_MM_PATCH_MERGER,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,
@@ -972,6 +972,9 @@ class GGUFWriter:
     def add_vision_image_std(self, values: Sequence[float]) -> None:
         self.add_array(Keys.ClipVision.IMAGE_STD, values)

+    def add_vision_spatial_merge_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.SPATIAL_MERGE_SIZE, value)
+
     def add_vision_use_gelu(self, value: bool) -> None:
         self.add_bool(Keys.ClipVision.USE_GELU, value)

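With the writer helper in place, a conversion script can record the merge factor as ordinary GGUF metadata. A minimal sketch, assuming an output file name, an arch string, and a merge size of 2 (none of which are taken from this commit):

import gguf

# Hypothetical values for illustration; a real conversion script derives them from the HF config.
writer = gguf.GGUFWriter("mmproj-model.gguf", arch="clip")
writer.add_vision_spatial_merge_size(2)      # stored as clip.vision.spatial_merge_size (uint32)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()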
@@ -1001,6 +1001,10 @@ class TensorNameMap:
             "multi_modal_projector.mm_input_projection",
         ),

+        MODEL_TENSOR.V_MM_INP_NORM: (
+            "multi_modal_projector.norm",
+        ),
+
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
             "multi_modal_projector.mm_soft_emb_norm",
         ),
@@ -1052,6 +1056,10 @@ class TensorNameMap:
         MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
             "v.token_embd.img_break", # for pixtral, this is a generated vector
         ),
+
+        MODEL_TENSOR.V_MM_PATCH_MERGER: (
+            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
+        ),
     }

     # architecture-specific block mappings
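The two mapping entries above are what let the converter translate the Hugging Face tensor names of the Mistral Small 3.1 projector into the GGUF names defined in TENSOR_NAMES. The sketch below is a simplified re-implementation of that lookup, not the actual TensorNameMap code, which also handles per-block tensors and other suffixes.

# Simplified sketch of the HF-name -> GGUF-name lookup done by TensorNameMap.
MAPPING = {
    "multi_modal_projector.norm": "mm.input_norm",                          # V_MM_INP_NORM
    "multi_modal_projector.patch_merger.merging_layer": "mm.patch_merger",  # V_MM_PATCH_MERGER
}

def map_name(hf_name: str, suffixes: tuple[str, ...] = (".weight", ".bias")) -> str | None:
    # Strip a known suffix, look up the base name, then re-attach the suffix.
    for suffix in suffixes:
        if hf_name.endswith(suffix):
            base = MAPPING.get(hf_name[: -len(suffix)])
            return base + suffix if base is not None else None
    return MAPPING.get(hf_name)

print(map_name("multi_modal_projector.patch_merger.merging_layer.weight"))  # -> mm.patch_merger.weight
print(map_name("multi_modal_projector.norm.weight"))                        # -> mm.input_norm.weight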