mtmd : add vision support for Mistral Small 3.1 (#13231)

* convert ok

* load ok, missing patch merger

* ah sheet it works

* update llava/readme

* add test

* fix test
Author: Xuan-Son Nguyen
Date: 2025-05-01 17:05:42 +02:00
Committed by: GitHub
Parent: 13c9a3319b
Commit: 8936784f7a
9 changed files with 112 additions and 15 deletions
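For context: the new mm.input_norm and mm.patch_merger tensors implement the Pixtral-style projector used by Mistral Small 3.1, which RMS-norms the vision encoder output and then merges each spatial_merge_size x spatial_merge_size block of patch embeddings through a single linear layer. Below is a minimal NumPy sketch of the merge step, not llama.cpp code; the within-block concatenation order is an assumption, and patch_merger, k, and merge_w are illustrative names.

import numpy as np

def patch_merger(x: np.ndarray, h: int, w: int, merge_w: np.ndarray, k: int = 2) -> np.ndarray:
    # x: [h*w, d] patch embeddings laid out row-major on an h x w grid
    # merge_w: [k*k*d, d_out] weight of the merging linear layer
    d = x.shape[-1]
    grid = x.reshape(h, w, d)
    # split the grid into non-overlapping k x k blocks
    blocks = grid.reshape(h // k, k, w // k, k, d).transpose(0, 2, 1, 3, 4)
    # concatenate the k*k patches of each block along the feature axis
    merged = blocks.reshape((h // k) * (w // k), k * k * d)
    return merged @ merge_w  # token count shrinks by a factor of k*k

With k = 2 (Mistral Small 3.1's spatial_merge_size), a 1024-patch image is reduced to 256 projector outputs.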

gguf-py/gguf/constants.py

@@ -231,6 +231,7 @@ class Keys:
         BLOCK_COUNT        = "clip.vision.block_count"
         IMAGE_MEAN         = "clip.vision.image_mean"
         IMAGE_STD          = "clip.vision.image_std"
+        SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size"
         USE_GELU           = "clip.use_gelu"
         USE_SILU           = "clip.use_silu"
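A converted mmproj file can be spot-checked for the new metadata key with gguf-py's GGUFReader. This is a sketch: the file path is hypothetical, and the scalar extraction via .parts is a simplification of ReaderField's layout.

from gguf import GGUFReader

reader = GGUFReader("mmproj-mistral-small-3.1.gguf")  # hypothetical path
field = reader.get_field("clip.vision.spatial_merge_size")
if field is not None:
    # field.data indexes the part(s) holding the value of a scalar KV
    print(int(field.parts[field.data[0]][0]))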
@@ -491,6 +492,7 @@ class MODEL_TENSOR(IntEnum):
     V_ENC_FFN_DOWN       = auto()
     V_PRE_NORM           = auto()
     V_POST_NORM          = auto()
+    V_MM_INP_NORM        = auto()
     V_MM_INP_PROJ        = auto() # gemma3
     V_MM_SOFT_EMB_NORM   = auto() # gemma3
     V_RESMPL_POS_EMBD_K  = auto() # minicpmv
@@ -505,6 +507,7 @@ class MODEL_TENSOR(IntEnum):
     V_RESMPL_PROJ        = auto() # minicpmv
     V_RESMPL_QUERY       = auto() # minicpmv
     V_TOK_EMBD_IMG_BREAK = auto() # pixtral
+    V_MM_PATCH_MERGER    = auto() # mistral small 3.1
 
 MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
@@ -747,6 +750,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_PRE_NORM:           "v.pre_ln",
     MODEL_TENSOR.V_POST_NORM:          "v.post_ln",
     MODEL_TENSOR.V_MM_INP_PROJ:        "mm.input_projection",
+    MODEL_TENSOR.V_MM_INP_NORM:        "mm.input_norm",
     MODEL_TENSOR.V_MM_SOFT_EMB_NORM:   "mm.soft_emb_norm",
     MODEL_TENSOR.V_RESMPL_POS_EMBD_K:  "resampler.pos_embd_k",
     MODEL_TENSOR.V_RESMPL_ATTN_Q:      "resampler.attn.q",
@@ -760,6 +764,7 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_RESMPL_PROJ:        "resampler.proj",
     MODEL_TENSOR.V_RESMPL_QUERY:       "resampler.query",
     MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
+    MODEL_TENSOR.V_MM_PATCH_MERGER:    "mm.patch_merger", # mistral small 3.1
 }
 
 MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
@@ -783,6 +788,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_PRE_NORM,
         MODEL_TENSOR.V_POST_NORM,
         MODEL_TENSOR.V_MM_INP_PROJ,
+        MODEL_TENSOR.V_MM_INP_NORM,
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
         MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
         MODEL_TENSOR.V_RESMPL_ATTN_Q,
@@ -796,6 +802,7 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_RESMPL_PROJ,
         MODEL_TENSOR.V_RESMPL_QUERY,
         MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
+        MODEL_TENSOR.V_MM_PATCH_MERGER,
     ],
     MODEL_ARCH.LLAMA: [
         MODEL_TENSOR.TOKEN_EMBD,

gguf-py/gguf/gguf_writer.py

@@ -972,6 +972,9 @@ class GGUFWriter:
     def add_vision_image_std(self, values: Sequence[float]) -> None:
         self.add_array(Keys.ClipVision.IMAGE_STD, values)
 
+    def add_vision_spatial_merge_size(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.SPATIAL_MERGE_SIZE, value)
+
     def add_vision_use_gelu(self, value: bool) -> None:
         self.add_bool(Keys.ClipVision.USE_GELU, value)
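On the conversion side, the new writer method would be called with the value taken from the Hugging Face vision config. A sketch under assumptions: the writer construction is illustrative and the surrounding conversion logic is elided.

import gguf

writer = gguf.GGUFWriter("mmproj-out.gguf", arch="clip")  # illustrative setup
writer.add_vision_spatial_merge_size(2)  # 2 for Mistral Small 3.1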

gguf-py/gguf/tensor_mapping.py

@@ -1001,6 +1001,10 @@ class TensorNameMap:
             "multi_modal_projector.mm_input_projection",
         ),
 
+        MODEL_TENSOR.V_MM_INP_NORM: (
+            "multi_modal_projector.norm",
+        ),
+
         MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
             "multi_modal_projector.mm_soft_emb_norm",
         ),
@@ -1052,6 +1056,10 @@ class TensorNameMap:
         MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: (
             "v.token_embd.img_break", # for pixtral, this is a generated vector
         ),
+
+        MODEL_TENSOR.V_MM_PATCH_MERGER: (
+            "multi_modal_projector.patch_merger.merging_layer", # mistral small 3.1
+        ),
     }
 
     # architecture-specific block mappings
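During conversion, these entries let TensorNameMap translate Hugging Face tensor names into the GGUF names registered in constants.py. A sketch: the MODEL_ARCH value used for the vision tower here is an assumption.

import gguf

tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.CLIP_VISION, n_blocks=0)
name = tmap.get_name("multi_modal_projector.patch_merger.merging_layer.weight",
                     try_suffixes=(".weight", ".bias"))
print(name)  # -> "mm.patch_merger.weight"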