diff --git a/common/arg.cpp b/common/arg.cpp index 85ba41114..9cbf98571 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2140,6 +2140,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.no_mmproj = true; } ).set_examples(mmproj_examples)); + add_opt(common_arg( + {"--no-mmproj-offload"}, + "do not offload multimodal projector to GPU", + [](common_params & params) { + params.mmproj_use_gpu = false; + } + ).set_examples(mmproj_examples)); add_opt(common_arg( {"--image"}, "FILE", "path to an image file. use with multimodal models. Specify multiple times for batching", diff --git a/common/common.h b/common/common.h index 70d3ef8f2..0a9dc0599 100644 --- a/common/common.h +++ b/common/common.h @@ -342,6 +342,7 @@ struct common_params { // multimodal models (see examples/llava) struct common_params_model mmproj; + bool mmproj_use_gpu = true; // use GPU for multimodal model bool no_mmproj = false; // explicitly disable multimodal model std::vector<std::string> image; // path to image file(s) diff --git a/examples/llava/mtmd-cli.cpp b/examples/llava/mtmd-cli.cpp index 193737605..250e8c9a9 100644 --- a/examples/llava/mtmd-cli.cpp +++ b/examples/llava/mtmd-cli.cpp @@ -40,7 +40,8 @@ static void show_additional_info(int /*argc*/, char ** argv) { "Usage: %s [options] -m <model> --mmproj <mmproj> --image <image> -p <prompt>\n\n" " -m and --mmproj are required\n" " -hf user/repo can replace both -m and --mmproj in most cases\n" - " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n", + " --image and -p are optional, if NOT provided, the CLI will run in chat mode\n" + " to disable using GPU for mmproj model, add --no-mmproj-offload\n", argv[0] ); } @@ -112,10 +113,10 @@ struct mtmd_cli_context { void init_vision_context(common_params & params) { const char * clip_path = params.mmproj.path.c_str(); ctx_vision.reset(mtmd_init_from_file(clip_path, model, mtmd_context_params{ - /* use_gpu */ true, + /* use_gpu */ params.mmproj_use_gpu, /* timings */ true, /* 
n_threads */ params.cpuparams.n_threads, - /* verbosity */ GGML_LOG_LEVEL_INFO, + /* verbosity */ params.verbosity > 0 ? GGML_LOG_LEVEL_DEBUG : GGML_LOG_LEVEL_INFO, })); if (!ctx_vision.get()) { LOG_ERR("Failed to load vision model from %s\n", clip_path);