From 5d5c066de8a3d2cb32f04c4d5ad1560945f30bf3 Mon Sep 17 00:00:00 2001 From: yuiseki Date: Sun, 22 Jun 2025 21:44:57 +0900 Subject: [PATCH] mtmd : fix Pixtral OOM with large images by capping image_size to 1024 (#14326) Mistral Small 2506 models using Pixtral vision encoder were running out of GPU memory when processing images larger than 1024x1024 pixels due to exponential memory growth from unlimited image size. This fix applies the same 1024x1024 limit used by Qwen2VL models to prevent OOM issues while maintaining compatibility with existing models. --- tools/mtmd/clip.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 30283d6f1..a990520ed 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2211,6 +2211,9 @@ struct clip_model_loader { { hparams.rope_theta = 10000.0f; hparams.warmup_image_size = hparams.patch_size * 8; + // Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM + // ref: https://github.com/ggml-org/llama.cpp/issues/14310 + hparams.image_size = 1024; get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false); } break; case PROJECTOR_TYPE_GEMMA3: