diff --git a/docs/multimodal/minicpmo2.6.md b/docs/multimodal/minicpmo2.6.md
index 8c6db8efe..d4edc9718 100644
--- a/docs/multimodal/minicpmo2.6.md
+++ b/docs/multimodal/minicpmo2.6.md
@@ -29,8 +29,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-2_6-gguf) by us)
 
 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-o-2_6
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 4
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-2_6
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-2_6 --minicpmv-projector ../MiniCPM-o-2_6/minicpmv.projector --output-dir ../MiniCPM-o-2_6/ --minicpmv_version 4
 python ./convert_hf_to_gguf.py ../MiniCPM-o-2_6/model
 
 # quantize int4 version
diff --git a/docs/multimodal/minicpmo4.0.md b/docs/multimodal/minicpmo4.0.md
new file mode 100644
index 000000000..49125ea05
--- /dev/null
+++ b/docs/multimodal/minicpmo4.0.md
@@ -0,0 +1,54 @@
+## MiniCPM-o 4
+
+### Prepare models and code
+
+Download the [MiniCPM-o-4](https://huggingface.co/openbmb/MiniCPM-o-4) PyTorch model from Hugging Face into a "MiniCPM-o-4" folder.
+
+
+### Build llama.cpp
+Readme modification time: 20250206
+
+If your usage differs from what is described here, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md).
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-o 4
+
+Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-o-4-gguf) files from us):
+
+```bash
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-o-4
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-o-4 --minicpmv-projector ../MiniCPM-o-4/minicpmv.projector --output-dir ../MiniCPM-o-4/ --minicpmv_version 6
+python ./convert_hf_to_gguf.py ../MiniCPM-o-4/model
+
+# quantize int4 version
+./build/bin/llama-quantize ../MiniCPM-o-4/model/ggml-model-f16.gguf ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Run inference on Linux or macOS:
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
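+
+# Flag meanings (a quick reference; the sampling values above are suggested
+# defaults, not requirements):
+#   -c 4096            context size in tokens
+#   --temp / --top-p / --top-k / --repeat-penalty   sampling parameters
+#   --image xx.jpg     path to the input image (replace with your own file)
+#   -p "..."           the text prompt used in single-turn mode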
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-o-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-o-4/mmproj-model-f16.gguf
+```
diff --git a/docs/multimodal/minicpmv2.5.md b/docs/multimodal/minicpmv2.5.md
index 19b439607..5eb87bc96 100644
--- a/docs/multimodal/minicpmv2.5.md
+++ b/docs/multimodal/minicpmv2.5.md
@@ -28,8 +28,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-Llama3-V-2_5-gguf) by us)
 
 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 2
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-Llama3-V-2_5
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-Llama3-V-2_5 --minicpmv-projector ../MiniCPM-Llama3-V-2_5/minicpmv.projector --output-dir ../MiniCPM-Llama3-V-2_5/ --minicpmv_version 2
 python ./convert_hf_to_gguf.py ../MiniCPM-Llama3-V-2_5/model
 
 # quantize int4 version
diff --git a/docs/multimodal/minicpmv2.6.md b/docs/multimodal/minicpmv2.6.md
index 15c1bbd12..1b0ff5969 100644
--- a/docs/multimodal/minicpmv2.6.md
+++ b/docs/multimodal/minicpmv2.6.md
@@ -28,8 +28,8 @@ cmake --build build --config Release
 Convert PyTorch model to gguf files (You can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-2_6-gguf) by us)
 
 ```bash
-python ./tools/mtmd/minicpmv-surgery.py -m ../MiniCPM-V-2_6
-python ./tools/mtmd/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --image-mean 0.5 0.5 0.5 --image-std 0.5 0.5 0.5 --minicpmv_version 3
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-2_6
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-2_6 --minicpmv-projector ../MiniCPM-V-2_6/minicpmv.projector --output-dir ../MiniCPM-V-2_6/ --minicpmv_version 3
 python ./convert_hf_to_gguf.py ../MiniCPM-V-2_6/model
 
 # quantize int4 version
diff --git a/docs/multimodal/minicpmv4.0.md b/docs/multimodal/minicpmv4.0.md
new file mode 100644
index 000000000..65887d960
--- /dev/null
+++ b/docs/multimodal/minicpmv4.0.md
@@ -0,0 +1,54 @@
+## MiniCPM-V 4
+
+### Prepare models and code
+
+Download the [MiniCPM-V-4](https://huggingface.co/openbmb/MiniCPM-V-4) PyTorch model from Hugging Face into a "MiniCPM-V-4" folder.
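+
+For example, one way to fetch the weights (this assumes the `huggingface_hub` CLI is installed; any download method works):
+
+```bash
+# download the PyTorch checkpoint into ../MiniCPM-V-4
+huggingface-cli download openbmb/MiniCPM-V-4 --local-dir ../MiniCPM-V-4
+```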
+
+
+### Build llama.cpp
+Readme modification time: 20250206
+
+If your usage differs from what is described here, please refer to the official build [documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md).
+
+Clone llama.cpp:
+```bash
+git clone https://github.com/ggerganov/llama.cpp
+cd llama.cpp
+```
+
+Build llama.cpp using `CMake`:
+```bash
+cmake -B build
+cmake --build build --config Release
+```
+
+
+### Usage of MiniCPM-V 4
+
+Convert the PyTorch model to gguf files (you can also download the converted [gguf](https://huggingface.co/openbmb/MiniCPM-V-4-gguf) files from us):
+
+```bash
+python ./tools/mtmd/legacy-models/minicpmv-surgery.py -m ../MiniCPM-V-4
+python ./tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py -m ../MiniCPM-V-4 --minicpmv-projector ../MiniCPM-V-4/minicpmv.projector --output-dir ../MiniCPM-V-4/ --minicpmv_version 5
+python ./convert_hf_to_gguf.py ../MiniCPM-V-4/model
+
+# quantize int4 version
+./build/bin/llama-quantize ../MiniCPM-V-4/model/ggml-model-f16.gguf ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf Q4_K_M
+```
+
+
+Run inference on Linux or macOS:
+```bash
+# run in single-turn mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-f16.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf -c 4096 --temp 0.7 --top-p 0.8 --top-k 100 --repeat-penalty 1.05 --image xx.jpg -p "What is in the image?"
+
+# run in conversation mode
+./build/bin/llama-mtmd-cli -m ../MiniCPM-V-4/model/ggml-model-Q4_K_M.gguf --mmproj ../MiniCPM-V-4/mmproj-model-f16.gguf
+```
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index a4b62f9af..20c217331 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -868,10 +868,16 @@ struct clip_graph {
         int n_head = n_embd/d_head;
         int num_query = 96;
         if (ctx->model.hparams.minicpmv_version == 2) {
+            // MiniCPM-V 2.5
             num_query = 96;
         } else if (ctx->model.hparams.minicpmv_version == 3) {
+            // MiniCPM-V 2.6
             num_query = 64;
         } else if (ctx->model.hparams.minicpmv_version == 4) {
+            // MiniCPM-o 2.6
+            num_query = 64;
+        } else if (ctx->model.hparams.minicpmv_version == 5) {
+            // MiniCPM-V 4.0
             num_query = 64;
         }
 
@@ -3551,10 +3557,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         case PROJECTOR_TYPE_MINICPMV:
            {
                if (params.minicpmv_version == 2) {
+                    // MiniCPM-V 2.5
                    n_patches_sq = 96;
                } else if (params.minicpmv_version == 3) {
+                    // MiniCPM-V 2.6
                    n_patches_sq = 64;
                } else if (params.minicpmv_version == 4) {
+                    // MiniCPM-o 2.6
+                    n_patches_sq = 64;
+                } else if (params.minicpmv_version == 5) {
+                    // MiniCPM-V 4.0
                    n_patches_sq = 64;
                } else {
                    GGML_ABORT("Unknown minicpmv version");
@@ -4103,11 +4115,17 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
            return ctx->model.mm_3_b->ne[0];
        case PROJECTOR_TYPE_MINICPMV:
            if (hparams.minicpmv_version == 2) {
+                // MiniCPM-V 2.5
                return 4096;
            } else if (hparams.minicpmv_version == 3) {
+                // MiniCPM-V 2.6
                return 3584;
            } else if (hparams.minicpmv_version == 4) {
+                // MiniCPM-o 2.6
                return 3584;
+            } else if (hparams.minicpmv_version == 5) {
+                // MiniCPM-V 4.0
+                return 2560;
            }
            GGML_ABORT("Unknown minicpmv version");
        case PROJECTOR_TYPE_GLM_EDGE:
diff --git a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
index cfe0961f9..3c6020954 100644
--- a/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
+++ b/tools/mtmd/legacy-models/minicpmv-convert-image-encoder-to-gguf.py
@@ -497,11 +497,11 @@ ap.add_argument("--projector-type", help="Type of projector. Possible values: ml
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
-default_image_mean = [0.48145466, 0.4578275, 0.40821073]
-default_image_std = [0.26862954, 0.26130258, 0.27577711]
+default_image_mean = [0.5, 0.5, 0.5]
+default_image_std = [0.5, 0.5, 0.5]
 ap.add_argument('--image-mean', type=float, nargs='+', help='Mean of the images for normalization (overrides processor) ', default=None)
 ap.add_argument('--image-std', type=float, nargs='+', help='Standard deviation of the images for normalization (overrides processor)', default=None)
-ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4', default=2)
+ap.add_argument('--minicpmv_version', type=int, help='minicpmv_version: MiniCPM-V-2 use 1; MiniCPM-V-2.5 use 2; MiniCPM-V-2.6 use 3; MiniCPM-o-2.6 use 4; MiniCPM-V 4.0 use 5; MiniCPM-o-4.0 use 6', default=2)
 # with proper
 args = ap.parse_args()
 
@@ -517,6 +517,17 @@ if args.use_f32:
 # output in the same directory as the model if output_dir is None
 dir_model = args.model_dir
 
+# If minicpmv_projector is not specified but the default path exists, use the default path
+if args.minicpmv_projector is None:
+    default_projector_path = os.path.join(dir_model, "minicpmv.projector")
+    if os.path.isfile(default_projector_path):
+        args.minicpmv_projector = default_projector_path
+        print(f"Found default projector file: {default_projector_path}")
+
+# If output_dir is not specified, use model_dir as the default value
+if args.output_dir is None:
+    args.output_dir = dir_model
+
 if args.clip_model_is_vision or not os.path.exists(dir_model + "/vocab.json") or args.clip_model_is_openclip:
     vocab = None
     tokens = None
@@ -546,18 +557,21 @@ if args.use_f32:
 minicpmv_version = args.minicpmv_version
 emb_dim = 4096
 block_count = 26
-if minicpmv_version == 1:
+if minicpmv_version == 1: # MiniCPM-V 2.0
     emb_dim = 2304
     block_count = 26
-elif minicpmv_version == 2:
+elif minicpmv_version == 2: # MiniCPM-V 2.5
     emb_dim = 4096
     block_count = 27
-elif minicpmv_version == 3:
+elif minicpmv_version == 3: # MiniCPM-V 2.6
     emb_dim = 3584
     block_count = 27
-elif minicpmv_version == 4:
+elif minicpmv_version == 4: # MiniCPM-o 2.6
     emb_dim = 3584
     block_count = 27
+elif minicpmv_version == 5: # MiniCPM-V 4.0
+    emb_dim = 2560
+    block_count = 27
 
 default_vision_config = {
     "hidden_size": 1152,
@@ -577,6 +591,10 @@ if minicpmv_version == 3:
 elif minicpmv_version == 4:
     vision_config = SiglipVisionConfig(**default_vision_config)
     model = SiglipVisionTransformer(vision_config)
+elif minicpmv_version == 5:
+    default_vision_config["model_type"] = "siglip_vision_model"
+    vision_config = SiglipVisionConfig(**default_vision_config)
+    model = SiglipVisionTransformer(vision_config)
 
 processor = None
 # if model.attn_pool is not None:
@@ -603,7 +621,7 @@ elif args.vision_only:
 else:
     fname_middle = ""
 
-output_dir = args.output_dir if args.output_dir is not None else dir_model
+output_dir = args.output_dir
 os.makedirs(output_dir, exist_ok=True)
 output_prefix = os.path.basename(output_dir).replace("ggml_", "")
 fname_out = os.path.join(output_dir, f"{fname_middle}model-{ftype_str[ftype]}.gguf")
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp
index 45b2f1f25..a05373d5b 100644
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -207,7 +207,7 @@ struct mtmd_context {
             tok_row_end_trail = false; // no trailing end-of-row token
             ov_img_first      = true;
 
-        } else if (minicpmv_version == 3 || minicpmv_version == 4) {
+        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5) {
             // minicpmv 2.6 format:
             // (overview) (slice) (slice) \n ...
             slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;