mtmd : Support Pixtral 12B (#13065)
* add pixtral text model (vision is wip)
* cgraph ok, just missing 2D RoPE
* fix bad rebase
* first working version
* fix problem with img_break token
* support dynamic image size
* update docs
* update test script
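The "2D RoPE" item above refers to the rotary embedding used by Pixtral's vision encoder: each image patch is addressed by its (row, column) in the patch grid rather than by a single 1D index, with the rotary dimensions split between the row and column positions. A small illustrative sketch of those 2D position ids (not code from this commit; the exact frequency split per axis is an implementation detail):

import torch

def build_2d_positions(n_rows: int, n_cols: int) -> torch.Tensor:
    # One (row, col) pair per image patch, in row-major patch order.
    rows = torch.arange(n_rows).repeat_interleave(n_cols)
    cols = torch.arange(n_cols).repeat(n_rows)
    return torch.stack([rows, cols], dim=-1)  # shape: (n_rows * n_cols, 2)

print(build_2d_positions(2, 3))
# tensor([[0, 0], [0, 1], [0, 2],
#         [1, 0], [1, 1], [1, 2]])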
@@ -776,6 +776,9 @@ class TextModel(ModelBase):
         if chkhsh == "a1336059768a55c99a734006ffb02203cd450fed003e9a71886c88acf24fdbc2":
             # ref: https://huggingface.co/THUDM/glm-4-9b-hf
             res = "glm4"
+        if chkhsh == "0e9433cbbb161f89e264eb32e8e64bfe69e834973ffca5d41d3948a604a3e2a3":
+            # ref: https://huggingface.co/mistral-community/pixtral-12b
+            res = "pixtral"
 
         if res is None:
             logger.warning("\n")
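For context, the chkhsh values above fingerprint a tokenizer so the converter can pick the matching pre-tokenizer; the new entry maps the Pixtral 12B tokenizer to res = "pixtral". A minimal sketch of how such a fingerprint can be reproduced, assuming (as I understand the convert script) it is a SHA-256 over the stringified token ids of a fixed probe string and that the Hugging Face tokenizer loads via AutoTokenizer; probe_text below is illustrative, not the script's real probe:

# Sketch: reproducing a pre-tokenizer fingerprint like the one registered above.
from hashlib import sha256
from transformers import AutoTokenizer

probe_text = "Hello wORLD!! 123 3.14 éè ....."  # illustrative probe string
tok = AutoTokenizer.from_pretrained("mistral-community/pixtral-12b")
chkhsh = sha256(str(tok.encode(probe_text)).encode()).hexdigest()
print(chkhsh)  # this value is what gets matched against the table of known hashes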
@@ -1724,7 +1727,8 @@ class StableLMModel(TextModel):
     "MistralForCausalLM",
     "MixtralForCausalLM",
     "Idefics3ForConditionalGeneration",
-    "SmolVLMForConditionalGeneration")
+    "SmolVLMForConditionalGeneration",
+    "LlavaForConditionalGeneration")
 class LlamaModel(TextModel):
     model_arch = gguf.MODEL_ARCH.LLAMA
     undo_permute = True
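The decorator above is how converter classes declare which Hugging Face architecture strings (config.json's "architectures" field) they handle; LlamaModel now also claims "LlavaForConditionalGeneration" so Pixtral's text weights go through the regular Llama path. A rough sketch of the pattern with illustrative names (not the real ModelBase internals, which also keep text-model and vision-model registrations apart):

# Sketch of the registration pattern, not the converter's actual implementation.
_text_models: dict[str, type] = {}  # the real script also tracks vision/mmproj classes separately

def register(*arch_names: str):
    def wrap(cls):
        for name in arch_names:
            _text_models[name] = cls
        return cls
    return wrap

@register("LlamaForCausalLM", "LlavaForConditionalGeneration")
class FakeLlamaModel:  # stand-in for the real LlamaModel converter
    pass

# later, the converter looks up config.json["architectures"][0]
print(_text_models["LlavaForConditionalGeneration"].__name__)  # FakeLlamaModel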
@@ -1734,6 +1738,10 @@ class LlamaModel(TextModel):
         # fix for SmolVLM2, missing `num_attention_heads` in config.json
         if self.hparams["architectures"][0] == "SmolVLMForConditionalGeneration":
             self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
+        # fix for Pixtral, missing `num_attention_heads` in config.json
+        if self.hparams["architectures"][0] == "LlavaForConditionalGeneration" \
+                and self.hparams.get("model_type") == "mistral":
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 32)
 
     def set_vocab(self):
         try:
@@ -1797,12 +1805,17 @@ class LlamaModel(TextModel):
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         n_head = self.hparams["num_attention_heads"]
         n_kv_head = self.hparams.get("num_key_value_heads")
-        is_vision_tensor = "vision_tower" in name or "vision_model" in name or "model.connector" in name
+        is_vision_tensor = "vision_tower" in name \
+            or "vision_model" in name \
+            or "model.connector" in name \
+            or "multi_modal_projector" in name
 
         if is_vision_tensor:
             return [] # skip vision tensors
         elif name.startswith("model.text_model"):
             name = name.replace("text_model.", "") # for SmolVLM
+        elif name.startswith("language_model."):
+            name = name.replace("language_model.", "") # for the rest
 
         if self.undo_permute:
             if name.endswith(("q_proj.weight", "q_proj.bias")):
@@ -1885,6 +1898,55 @@ class LlamaModel(TextModel):
             raise ValueError(f"Unprocessed experts: {experts}")
 
 
+@ModelBase.register("LlavaForConditionalGeneration")
+class LlavaVisionModel(VisionModel):
+    img_break_tok_id = -1
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        if self.hparams["model_type"] == "pixtral":
+            # fix missing config.json values
+            self.hparams["num_attention_heads"] = self.hparams.get("num_attention_heads", 16)
+            self.hparams["num_hidden_layers"] = self.hparams.get("num_hidden_layers", 24)
+            self.hparams["intermediate_size"] = self.hparams.get("intermediate_size", 4096)
+            self.hparams["hidden_size"] = self.hparams.get("hidden_size", 1024)
+            self.hparams["layer_norm_eps"] = self.hparams.get("layer_norm_eps", 1e-5)
+            self.img_break_tok_id = 12 # see tokenizer_config.json
+        else:
+            raise ValueError(f"Unsupported model type: {self.hparams['model_type']}")
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        hparams = self.hparams
+        if hparams["model_type"] == "pixtral":
+            self.gguf_writer.add_vision_projector_type(gguf.VisionProjectorType.PIXTRAL)
+            # default values below are taken from HF transformers code
+            self.gguf_writer.add_vision_attention_layernorm_eps(hparams["layer_norm_eps"])
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid # unused
+        n_head = self.hparams["num_attention_heads"]
+        n_kv_head = n_head
+
+        if name.startswith("multi_modal_projector.") or name.startswith("vision_tower."):
+            # process vision tensors
+            if name.endswith(("q_proj.weight", "q_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_head)
+            if name.endswith(("k_proj.weight", "k_proj.bias")):
+                data_torch = LlamaModel.permute(data_torch, n_head, n_kv_head)
+            return [(self.map_tensor_name(name), data_torch)]
+
+        if self.img_break_tok_id > 0 and "embed_tokens.weight" in name:
+            logger.info(f"Extracting [IMG_BREAK] token embedding from {name}")
+            # for pixtral model, we need to extract the [IMG_BREAK] token embedding
+            img_break_embd = data_torch[self.img_break_tok_id]
+            name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK]
+            return [(self.map_tensor_name(name), img_break_embd)]
+
+        return [] # skip other tensors
+
+
 @ModelBase.register("Idefics3ForConditionalGeneration", "SmolVLMForConditionalGeneration")
 class SmolVLMModel(VisionModel):
     def __init__(self, *args, **kwargs):
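One note on the LlamaModel.permute calls in LlavaVisionModel.modify_tensors above: as I read the helper, they reorder the rows of each attention head's q_proj/k_proj so the rotary dimensions move from the two-halves layout stored in Hugging Face checkpoints to the interleaved pair layout llama.cpp's RoPE expects. A minimal sketch of that reshuffle (function name and shapes are illustrative, not a verbatim copy of the converter's helper):

import torch
from torch import Tensor

def permute_rope_rows(weights: Tensor, n_head: int) -> Tensor:
    # Per head, rows come in as [first rotary half | second rotary half];
    # regroup them as interleaved pairs (pair 0, pair 1, ...) without changing values.
    head_rows = weights.shape[0] // n_head
    return (weights.reshape(n_head, 2, head_rows // 2, *weights.shape[1:])
                   .swapaxes(1, 2)
                   .reshape(weights.shape))

# toy usage, mirroring the q_proj/k_proj handling above
w = torch.arange(8 * 4, dtype=torch.float32).reshape(8, 4)  # 2 heads of 4 rows each
print(permute_rope_rows(w, n_head=2))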