llava : introduce libmtmd (#12849)

* wip llava2

* migrated gemma3 to llava2

* add timings

* correct pre/postfix

* fix missing include

* fix compilation unused var warn

* update llava2_tokenize

* change name llava2 --> mtmd

* improve api

* refine helpers

* Update examples/llava/mtmd.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Xuan-Son Nguyen
2025-04-10 22:57:16 +02:00
committed by GitHub
parent 64eda5deb9
commit 8b9cc7cdd8
7 changed files with 687 additions and 128 deletions

View File

@ -32,23 +32,6 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac
//#define CLIP_DEBUG_FUNCTIONS
// RGB uint8 image
struct clip_image_u8 {
int nx;
int ny;
std::vector<uint8_t> buf;
};
// RGB float32 image (NHWC)
// Memory layout: RGBRGBRGB...
struct clip_image_f32 {
int nx;
int ny;
std::vector<float> buf;
};
#ifdef CLIP_DEBUG_FUNCTIONS
static void clip_image_write_image_to_ppm(const clip_image_u8& img, const std::string& filename) {
std::ofstream file(filename, std::ios::binary);
@ -1614,6 +1597,12 @@ struct clip_image_f32 * clip_image_f32_init() {
return new clip_image_f32();
}
unsigned char * clip_image_u8_get_data(struct clip_image_u8 * img, uint32_t * nx, uint32_t * ny) {
if (nx) *nx = img->nx;
if (ny) *ny = img->ny;
return img->buf.data();
}
void clip_image_size_free(struct clip_image_size * load_image_size) {
if (load_image_size == nullptr) {
return;
@ -2346,6 +2335,8 @@ int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * i
int x_patch = img->nx / patch_size + (int)(img->nx % patch_size > 0);
int y_patch = img->ny / patch_size + (int)(img->ny % patch_size > 0);
n_patches = x_patch * y_patch;
} else if (ctx->proj_type == PROJECTOR_TYPE_GEMMA3) {
n_patches = 256;
}
return n_patches;
@ -2893,3 +2884,11 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
clip_image_encode(ctx, n_threads, &clip_img, vec);
return true;
}
//
// API used internally with mtmd
//
projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
return ctx->proj_type;
}