mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-26 19:55:04 +00:00
mtmd : support Qwen 2.5 Omni (input audio+vision, no audio output) (#13784)
* mtmd : allow multiple modalities at the same time * refactor mtmd tokenizer * fix compile * ok, missing SinusoidsPositionEmbedding * first working version * fix style * more strict validate of n_embd * refactor if..else to switch * fix regression * add test for 3B * update docs * fix tokenizing with add_special * add more tests * fix test case "huge" * rm redundant code * set_position_mrope_1d rm n_tokens
This commit is contained in:
@ -130,6 +130,7 @@ enum projector_type {
|
||||
PROJECTOR_TYPE_INTERNVL,
|
||||
PROJECTOR_TYPE_LLAMA4,
|
||||
PROJECTOR_TYPE_QWEN2A,
|
||||
PROJECTOR_TYPE_QWEN25O, // will be replaced by QWEN2A or QWEN25VL depending on clip_ctx
|
||||
PROJECTOR_TYPE_UNKNOWN,
|
||||
};
|
||||
|
||||
@ -148,6 +149,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
|
||||
{ PROJECTOR_TYPE_INTERNVL, "internvl"},
|
||||
{ PROJECTOR_TYPE_LLAMA4, "llama4"},
|
||||
{ PROJECTOR_TYPE_QWEN2A, "qwen2a"},
|
||||
{ PROJECTOR_TYPE_QWEN25O, "qwen2.5o"},
|
||||
};
|
||||
|
||||
static projector_type clip_projector_type_from_string(const std::string & str) {
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -17,12 +17,22 @@ struct clip_image_f32;
|
||||
struct clip_image_u8_batch;
|
||||
struct clip_image_f32_batch;
|
||||
|
||||
// Modality tag with two values: vision and audio.
// NOTE(review): no use of this enum is visible in this chunk; presumably it selects
// which encoder a clip context serves - confirm against clip.cpp.
enum clip_modality {
|
||||
CLIP_MODALITY_VISION,
|
||||
CLIP_MODALITY_AUDIO,
|
||||
};
|
||||
|
||||
// Parameters handed to clip_init() together with the model file name.
struct clip_context_params {
|
||||
// presumably enables GPU offload for the encoder - TODO confirm in clip.cpp
bool use_gpu;
|
||||
// log level for the clip context (a ggml_log_level value)
enum ggml_log_level verbosity;
|
||||
};
|
||||
|
||||
struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
// Pair of contexts returned by clip_init(): one per modality.
// NOTE(review): the visible caller (mtmd_context constructor) only treats the load as
// failed when BOTH pointers are null, so either one alone may be null - confirm.
struct clip_init_result {
|
||||
struct clip_ctx * ctx_v; // vision context
|
||||
struct clip_ctx * ctx_a; // audio context
|
||||
};
|
||||
|
||||
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params);
|
||||
|
||||
void clip_free(struct clip_ctx * ctx);
|
||||
|
||||
|
@ -284,7 +284,9 @@ int main(int argc, char ** argv) {
|
||||
if (is_single_turn) {
|
||||
g_is_generating = true;
|
||||
if (params.prompt.find(mtmd_default_marker()) == std::string::npos) {
|
||||
params.prompt += mtmd_default_marker();
|
||||
for (size_t i = 0; i < params.image.size(); i++) {
|
||||
params.prompt += mtmd_default_marker();
|
||||
}
|
||||
}
|
||||
common_chat_msg msg;
|
||||
msg.role = "user";
|
||||
|
@ -66,7 +66,8 @@ struct decode_embd_batch {
|
||||
}
|
||||
}
|
||||
|
||||
void set_position_mrope(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
// M-RoPE for image
|
||||
void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int y = 0; y < ny; y++) {
|
||||
@ -85,6 +86,23 @@ struct decode_embd_batch {
|
||||
}
|
||||
}
|
||||
|
||||
// M-RoPE for audio
|
||||
void set_position_mrope_1d(llama_pos pos_0, llama_seq_id seq_id) {
|
||||
GGML_ASSERT(n_pos_per_embd == 4);
|
||||
seq_id_0[0] = seq_id;
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
pos[i ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens ] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 2] = pos_0 + i;
|
||||
pos[i + batch.n_tokens * 3] = 0; // last pos dim is unused
|
||||
}
|
||||
for (int i = 0; i < batch.n_tokens; i++) {
|
||||
batch.n_seq_id[i] = 1;
|
||||
batch.seq_id [i] = seq_id_0.data();
|
||||
batch.logits [i] = false;
|
||||
}
|
||||
}
|
||||
|
||||
llama_batch get_view(int offset, int n_tokens) {
|
||||
llama_pos * pos_ptr;
|
||||
pos_view.clear();
|
||||
@ -146,18 +164,20 @@ int32_t mtmd_helper_decode_image_chunk(
|
||||
decode_embd_batch batch_embd(encoded_embd, n_tokens, n_pos_per_embd, n_mmproj_embd);
|
||||
|
||||
if (mtmd_decode_use_mrope(ctx)) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (chunk_type != MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
LOG_ERR("failed to decode chunk: M-RoPE only accepts image chunk\n");
|
||||
return -1;
|
||||
if (chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
const auto image_tokens = mtmd_input_chunk_get_tokens_image(chunk);
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope_2d(n_past, nx, ny, seq_id);
|
||||
} else if (chunk_type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
batch_embd.set_position_mrope_1d(n_past, seq_id);
|
||||
} else {
|
||||
GGML_ABORT("invalid chunk type for M-RoPE");
|
||||
}
|
||||
if (!image_tokens) {
|
||||
LOG_ERR("failed to decode chunk: image tokens are null\n");
|
||||
return -1;
|
||||
}
|
||||
const int nx = mtmd_image_tokens_get_nx(image_tokens);
|
||||
const int ny = mtmd_image_tokens_get_ny(image_tokens);
|
||||
batch_embd.set_position_mrope(n_past, nx, ny, seq_id);
|
||||
} else {
|
||||
batch_embd.set_position_normal(n_past, seq_id);
|
||||
}
|
||||
|
@ -95,15 +95,21 @@ mtmd_context_params mtmd_context_params_default() {
|
||||
}
|
||||
|
||||
struct mtmd_context {
|
||||
struct clip_ctx * ctx_clip;
|
||||
struct clip_ctx * ctx_v; // vision
|
||||
struct clip_ctx * ctx_a; // audio
|
||||
const struct llama_model * text_model;
|
||||
std::vector<float> image_embd_v; // image embedding vector
|
||||
|
||||
bool print_timings;
|
||||
int n_threads;
|
||||
std::string media_marker;
|
||||
bool has_vision;
|
||||
bool has_audio;
|
||||
const int n_embd_text;
|
||||
|
||||
// these are not tokens, but strings used to mark the beginning and end of image/audio embeddings
|
||||
std::string img_beg;
|
||||
std::string img_end;
|
||||
std::string aud_beg;
|
||||
std::string aud_end;
|
||||
|
||||
// for llava-uhd style models, we need special tokens in-between slices
|
||||
// minicpmv calls them "slices", llama 4 calls them "tiles"
|
||||
@ -132,33 +138,61 @@ struct mtmd_context {
|
||||
text_model (text_model),
|
||||
print_timings(ctx_params.print_timings),
|
||||
n_threads (ctx_params.n_threads),
|
||||
media_marker (ctx_params.media_marker)
|
||||
media_marker (ctx_params.media_marker),
|
||||
n_embd_text (llama_model_n_embd(text_model))
|
||||
{
|
||||
if (std::string(ctx_params.image_marker) != MTMD_DEFAULT_IMAGE_MARKER) {
|
||||
throw std::runtime_error("custom image_marker is not supported anymore, use media_marker instead");
|
||||
}
|
||||
|
||||
if (media_marker.empty()) {
|
||||
throw std::runtime_error("media_marker must not be empty");
|
||||
}
|
||||
|
||||
clip_context_params ctx_clip_params;
|
||||
ctx_clip_params.use_gpu = ctx_params.use_gpu;
|
||||
ctx_clip_params.verbosity = ctx_params.verbosity;
|
||||
ctx_clip = clip_init(mmproj_fname, ctx_clip_params);
|
||||
if (!ctx_clip) {
|
||||
auto res = clip_init(mmproj_fname, ctx_clip_params);
|
||||
ctx_v = res.ctx_v;
|
||||
ctx_a = res.ctx_a;
|
||||
if (!ctx_v && !ctx_a) {
|
||||
throw std::runtime_error(string_format("Failed to load CLIP model from %s\n", mmproj_fname));
|
||||
}
|
||||
|
||||
if (llama_model_n_embd(text_model) != clip_n_mmproj_embd(ctx_clip)) {
|
||||
// if both vision and audio mmproj are present, we need to validate their n_embd
|
||||
if (ctx_v && ctx_a) {
|
||||
int n_embd_v = clip_n_mmproj_embd(ctx_v);
|
||||
int n_embd_a = clip_n_mmproj_embd(ctx_a);
|
||||
if (n_embd_v != n_embd_a) {
|
||||
throw std::runtime_error(string_format(
|
||||
"mismatch between vision and audio mmproj (n_embd_v = %d, n_embd_a = %d)\n",
|
||||
n_embd_v, n_embd_a));
|
||||
}
|
||||
}
|
||||
|
||||
// since we already validate n_embd of vision and audio mmproj,
|
||||
// we can safely assume that they are the same
|
||||
int n_embd_clip = clip_n_mmproj_embd(ctx_v ? ctx_v : ctx_a);
|
||||
if (n_embd_text != n_embd_clip) {
|
||||
throw std::runtime_error(string_format(
|
||||
"mismatch between text model (n_embd = %d) and mmproj (n_embd = %d)\n"
|
||||
"hint: you may be using wrong mmproj\n",
|
||||
llama_model_n_embd(text_model), clip_n_mmproj_embd(ctx_clip)));
|
||||
n_embd_text, n_embd_clip));
|
||||
}
|
||||
if (ctx_v) {
|
||||
init_vision();
|
||||
}
|
||||
if (ctx_a) {
|
||||
init_audio();
|
||||
}
|
||||
}
|
||||
|
||||
has_vision = clip_has_vision_encoder(ctx_clip);
|
||||
has_audio = clip_has_audio_encoder(ctx_clip);
|
||||
use_mrope = clip_is_qwen2vl(ctx_clip);
|
||||
void init_vision() {
|
||||
GGML_ASSERT(ctx_v != nullptr);
|
||||
use_mrope = clip_is_qwen2vl(ctx_v);
|
||||
|
||||
projector_type proj = clip_get_projector_type(ctx_clip);
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_clip);
|
||||
projector_type proj = clip_get_projector_type(ctx_v);
|
||||
int minicpmv_version = clip_is_minicpmv(ctx_v);
|
||||
if (minicpmv_version == 2) {
|
||||
// minicpmv 2.5 format:
|
||||
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
|
||||
@ -203,24 +237,82 @@ struct mtmd_context {
|
||||
ov_img_first = false; // overview image is last
|
||||
}
|
||||
|
||||
if (clip_has_whisper_encoder(ctx_clip)) {
|
||||
// set boi/eoi
|
||||
if (proj == PROJECTOR_TYPE_GEMMA3) {
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
img_beg = "<start_of_image>";
|
||||
img_end = "<end_of_image>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
img_beg = "<fake_token_around_image><global-img>";
|
||||
img_end = "<fake_token_around_image>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
img_end = "[IMG_END]";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL) {
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
img_beg = "<|vision_start|>";
|
||||
img_end = "<|vision_end|>";
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_LLAMA4) {
|
||||
// (more details in mtmd_context constructor)
|
||||
img_beg = "<|image_start|>";
|
||||
img_end = "<|image_end|>";
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
|
||||
} else if (proj == PROJECTOR_TYPE_INTERNVL) {
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
img_beg = "<img>";
|
||||
img_end = "</img>";
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
void init_audio() {
|
||||
GGML_ASSERT(ctx_a != nullptr);
|
||||
projector_type proj = clip_get_projector_type(ctx_a);
|
||||
|
||||
if (clip_has_whisper_encoder(ctx_a)) {
|
||||
// TODO @ngxson : check if model n_mel is 128 or 80
|
||||
w_filters = whisper_precalc_filters::get_128_bins();
|
||||
}
|
||||
|
||||
// warning messages
|
||||
if (proj == PROJECTOR_TYPE_LLAMA4) {
|
||||
LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
|
||||
}
|
||||
if (has_audio) {
|
||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
|
||||
LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
|
||||
" https://github.com/ggml-org/llama.cpp/discussions/13759\n", __func__);
|
||||
|
||||
if (proj == PROJECTOR_TYPE_QWEN2A) {
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
aud_beg = "<|audio_bos|>";
|
||||
aud_end = "<|audio_eos|>";
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
// get clip ctx based on chunk type
|
||||
clip_ctx * get_clip_ctx(const mtmd_input_chunk * chunk) const {
|
||||
if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
return ctx_v;
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
return ctx_a;
|
||||
}
|
||||
GGML_ABORT("unknown chunk type");
|
||||
}
|
||||
|
||||
// Projector type of the vision context, or PROJECTOR_TYPE_UNKNOWN when no vision context is set.
projector_type proj_type_v() const {
    if (!ctx_v) {
        return PROJECTOR_TYPE_UNKNOWN;
    }
    return clip_get_projector_type(ctx_v);
}
|
||||
|
||||
// Projector type of the audio context, or PROJECTOR_TYPE_UNKNOWN when no audio context is set.
projector_type proj_type_a() const {
    if (!ctx_a) {
        return PROJECTOR_TYPE_UNKNOWN;
    }
    return clip_get_projector_type(ctx_a);
}
|
||||
|
||||
~mtmd_context() {
|
||||
clip_free(ctx_clip);
|
||||
clip_free(ctx_a);
|
||||
clip_free(ctx_v);
|
||||
}
|
||||
|
||||
private:
|
||||
@ -267,107 +359,315 @@ void mtmd_free(mtmd_context * ctx) {
|
||||
}
|
||||
}
|
||||
|
||||
// copied from common_tokenize
|
||||
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
struct mtmd_tokenizer {
|
||||
mtmd_context * ctx;
|
||||
std::vector<const mtmd_bitmap *> bitmaps;
|
||||
|
||||
int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
mtmd_input_chunks * output,
|
||||
std::string input_text;
|
||||
bool add_special;
|
||||
bool parse_special;
|
||||
const llama_vocab * vocab;
|
||||
|
||||
mtmd_input_chunks cur;
|
||||
|
||||
mtmd_tokenizer(mtmd_context * ctx,
|
||||
const mtmd_input_text * text,
|
||||
const mtmd_bitmap ** bitmaps,
|
||||
size_t n_bitmaps) {
|
||||
auto vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
std::string prompt_modified(text->text);
|
||||
std::string marker_modified(ctx->media_marker);
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
|
||||
// for compatibility, we convert image marker to media marker
|
||||
string_replace_all(prompt_modified, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
|
||||
|
||||
// a bit hacky here, but works for now
|
||||
// for some models, we need to add prefix and suffix to the image embeddings
|
||||
if (clip_is_gemma3(ctx->ctx_clip)) {
|
||||
// gemma 3
|
||||
// <start_of_image> ... (image embeddings) ... <end_of_image>
|
||||
marker_modified = "<start_of_image>" + ctx->media_marker + "<end_of_image>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
|
||||
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
|
||||
marker_modified = "<fake_token_around_image><global-img>" + ctx->media_marker + "<fake_token_around_image>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_PIXTRAL) {
|
||||
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
|
||||
marker_modified = ctx->media_marker + "[IMG_END]";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_QWEN2VL || proj_type == PROJECTOR_TYPE_QWEN25VL) {
|
||||
// <|vision_start|> ... (image embeddings) ... <|vision_end|>
|
||||
marker_modified = "<|vision_start|>" + ctx->media_marker + "<|vision_end|>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_LLAMA4) {
|
||||
// (more details in mtmd_context constructor)
|
||||
marker_modified = "<|image_start|>" + ctx->media_marker + "<|image_end|>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_INTERNVL) {
|
||||
// <img> ... (image embeddings) ... </img>
|
||||
marker_modified = "<img>" + ctx->media_marker + "</img>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
|
||||
} else if (proj_type == PROJECTOR_TYPE_QWEN2A) {
|
||||
// <|audio_bos|> ... (embeddings) ... <|audio_eos|>
|
||||
marker_modified = "<|audio_bos|>" + ctx->media_marker + "<|audio_eos|>";
|
||||
string_replace_all(prompt_modified, ctx->media_marker, marker_modified);
|
||||
size_t n_bitmaps) : ctx(ctx), bitmaps(bitmaps, bitmaps + n_bitmaps) {
|
||||
add_special = text->add_special;
|
||||
parse_special = text->parse_special;
|
||||
input_text = text->text;
|
||||
vocab = llama_model_get_vocab(ctx->text_model);
|
||||
|
||||
// for compatibility, we convert image marker to media marker
|
||||
string_replace_all(input_text, MTMD_DEFAULT_IMAGE_MARKER, ctx->media_marker);
|
||||
}
|
||||
|
||||
// llava-1.5, llava-1.6, Yi-VL, Yi-34B, granite: don't need to add prefix and suffix
|
||||
// for glm-edge, BOI and EOI token's embeddings are not present in the text model
|
||||
int32_t tokenize(mtmd_input_chunks * output) {
|
||||
cur.entries.clear();
|
||||
std::vector<std::string> parts = split_text(input_text, ctx->media_marker);
|
||||
size_t i_bm = 0; // index of the current bitmap
|
||||
for (auto & part : parts) {
|
||||
if (part == ctx->media_marker) {
|
||||
// this is a marker, we should add the next bitmap
|
||||
if (i_bm >= bitmaps.size()) {
|
||||
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
|
||||
__func__, bitmaps.size(), parts.size() - 1);
|
||||
return 1;
|
||||
}
|
||||
const mtmd_bitmap * bitmap = bitmaps[i_bm++];
|
||||
int32_t res = add_media(bitmap);
|
||||
if (res != 0) {
|
||||
return res;
|
||||
}
|
||||
} else {
|
||||
// this is a text part, we should add it as text
|
||||
add_text(part, parse_special);
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<std::string> parts = string_split_str(prompt_modified, ctx->media_marker);
|
||||
output->entries.clear();
|
||||
output->entries.reserve(parts.size());
|
||||
if (add_special && llama_vocab_get_add_bos(vocab)) {
|
||||
// if first chunk is text, we add BOS token to first text chunk
|
||||
// otherwise, create a new text chunk with BOS token
|
||||
if (!cur.entries.empty() && cur.entries[0].type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
// add BOS token to the beginning of first text chunk
|
||||
cur.entries[0].tokens_text.insert(cur.entries[0].tokens_text.begin(), llama_vocab_bos(vocab));
|
||||
} else {
|
||||
// create a new text chunk with BOS token at the beginning
|
||||
mtmd_input_chunk bos_chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
{llama_vocab_bos(vocab)},
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
cur.entries.insert(cur.entries.begin(), std::move(bos_chunk));
|
||||
}
|
||||
}
|
||||
|
||||
size_t i_bm = 0;
|
||||
if (add_special && llama_vocab_get_add_eos(vocab)) {
|
||||
// if last chunk is text, we add EOS token to it
|
||||
add_text({llama_vocab_eos(vocab)});
|
||||
}
|
||||
|
||||
// utility for adding raw tokens
|
||||
auto add_text_chunk = [&output](std::vector<llama_token> && tokens) {
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
std::move(tokens),
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
output->entries.emplace_back(std::move(chunk));
|
||||
};
|
||||
if (i_bm != bitmaps.size()) {
|
||||
LOG_ERR("%s: error: number of bitmaps (%zu) does not match number of markers (%zu)\n",
|
||||
__func__, bitmaps.size(), parts.size() - 1);
|
||||
return 1;
|
||||
}
|
||||
|
||||
// utility for splitting batch of multiple images into chunks of batch having single images
|
||||
auto split_batch_to_chunk = [&ctx](clip_image_f32_batch && batch_f32, const std::string & id) {
|
||||
*output = std::move(cur);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void add_text(const std::string & txt, bool parse_special) {
|
||||
LOG_DBG("%s: %s\n", __func__, txt.c_str());
|
||||
auto tokens = mtmd_tokenize_text_internal(vocab, txt, /* add_special */ false, parse_special);
|
||||
add_text(tokens);
|
||||
}
|
||||
|
||||
void add_text(const std::vector<llama_token> & tokens) {
|
||||
if (tokens.empty()) {
|
||||
return;
|
||||
}
|
||||
// if last entry is also a text chunk, add tokens to it instead of creating new chunk
|
||||
if (!cur.entries.empty() && cur.entries.back().type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
|
||||
cur.entries.back().tokens_text.insert(
|
||||
cur.entries.back().tokens_text.end(),
|
||||
tokens.begin(),
|
||||
tokens.end());
|
||||
} else {
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
tokens,
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
cur.entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
}
|
||||
|
||||
int32_t add_media(const mtmd_bitmap * bitmap) {
|
||||
if (!bitmap->is_audio) {
|
||||
// handle image
|
||||
|
||||
if (!ctx->ctx_v) {
|
||||
LOG_ERR("%s: error: model does not support vision input\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (!ctx->img_beg.empty()) {
|
||||
add_text(ctx->img_beg, true); // add image begin token
|
||||
}
|
||||
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmap->nx;
|
||||
img_u8->ny = bitmap->ny;
|
||||
img_u8->buf.resize(bitmap->data.size());
|
||||
std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
// handle llava-uhd style preprocessing
|
||||
if (
|
||||
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||
) {
|
||||
// split batch into chunks of single images
|
||||
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
|
||||
GGML_ASSERT(chunks.size() > 0);
|
||||
|
||||
auto ov_chunk = std::move(chunks.front());
|
||||
chunks.erase(chunks.begin());
|
||||
|
||||
// add overview image (first)
|
||||
if (ctx->ov_img_first) {
|
||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_ov_img_start});
|
||||
}
|
||||
cur.entries.emplace_back(std::move(ov_chunk));
|
||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_ov_img_end});
|
||||
}
|
||||
}
|
||||
|
||||
// add slices (or tiles)
|
||||
if (!chunks.empty()) {
|
||||
const int n_col = batch_f32.grid_x;
|
||||
const int n_row = batch_f32.grid_y;
|
||||
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_slices_start});
|
||||
}
|
||||
for (int y = 0; y < n_row; y++) {
|
||||
for (int x = 0; x < n_col; x++) {
|
||||
const bool is_last_in_row = (x == n_col - 1);
|
||||
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_sli_img_start});
|
||||
}
|
||||
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
|
||||
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_sli_img_end});
|
||||
}
|
||||
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_sli_img_mid});
|
||||
}
|
||||
}
|
||||
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_row_end});
|
||||
}
|
||||
}
|
||||
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_slices_end});
|
||||
}
|
||||
}
|
||||
|
||||
// add overview image (last)
|
||||
if (!ctx->ov_img_first) {
|
||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_ov_img_start});
|
||||
}
|
||||
cur.entries.emplace_back(std::move(ov_chunk));
|
||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text({ctx->tok_ov_img_end});
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
size_t n_tokens = 0;
|
||||
for (const auto & entry : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_v, entry.get());
|
||||
}
|
||||
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
if (ctx->use_mrope) {
|
||||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
|
||||
image_tokens->use_mrope_pos = true;
|
||||
} else {
|
||||
// other models, we only need the total number of tokens
|
||||
image_tokens->nx = n_tokens;
|
||||
image_tokens->ny = 1;
|
||||
}
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmap->id; // optional
|
||||
|
||||
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
||||
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
||||
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{}, // text tokens
|
||||
std::move(image_tokens),
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
cur.entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
if (!ctx->img_end.empty()) {
|
||||
add_text(ctx->img_end, true); // add image end token
|
||||
}
|
||||
|
||||
} else {
|
||||
// handle audio
|
||||
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: error: model does not support audio input\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (bitmap->data.size() == 0) {
|
||||
LOG_ERR("%s: error: empty audio data\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (!ctx->aud_beg.empty()) {
|
||||
add_text(ctx->aud_beg, true); // add audio begin token
|
||||
}
|
||||
|
||||
// preprocess audio
|
||||
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
|
||||
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
||||
const float * samples = (const float *)bitmap->data.data();
|
||||
size_t n_samples = bitmap->data.size() / sizeof(float);
|
||||
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess audio\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
// consider each mel_spec as a separate audio chunk
|
||||
// TODO: maybe support batching, but this may come with memory cost
|
||||
for (auto & mel_spec : mel_spec_chunks) {
|
||||
clip_image_f32_ptr mel_f32(clip_image_f32_init());
|
||||
mel_f32->nx = mel_spec.n_len;
|
||||
mel_f32->ny = mel_spec.n_mel;
|
||||
mel_f32->buf = std::move(mel_spec.data);
|
||||
size_t n_tokens = clip_n_output_tokens(ctx->ctx_a, mel_f32.get());
|
||||
|
||||
clip_image_f32_batch batch_f32;
|
||||
batch_f32.is_audio = true;
|
||||
batch_f32.entries.push_back(std::move(mel_f32));
|
||||
|
||||
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
|
||||
audio_tokens->n_tokens = n_tokens;
|
||||
audio_tokens->batch_f32 = std::move(batch_f32);
|
||||
audio_tokens->id = bitmap->id; // optional
|
||||
|
||||
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_AUDIO,
|
||||
{}, // text tokens
|
||||
nullptr, // image tokens
|
||||
std::move(audio_tokens),
|
||||
};
|
||||
cur.entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
if (!ctx->aud_end.empty()) {
|
||||
add_text(ctx->aud_end, true); // add audio end token
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
std::vector<mtmd_input_chunk> split_batch_to_chunk(clip_image_f32_batch && batch_f32, const std::string & id) {
|
||||
std::vector<mtmd_input_chunk> chunks;
|
||||
|
||||
for (auto & entry : batch_f32.entries) {
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
image_tokens->nx = clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
||||
image_tokens->nx = clip_n_output_tokens(ctx->ctx_v, entry.get());
|
||||
image_tokens->ny = 1;
|
||||
image_tokens->batch_f32.entries.push_back(std::move(entry));
|
||||
image_tokens->id = id;
|
||||
@ -382,222 +682,57 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
|
||||
}
|
||||
|
||||
return chunks;
|
||||
};
|
||||
|
||||
for (const auto & part : parts) {
|
||||
// printf("tokenizing part: %s\n", part.c_str());
|
||||
bool add_bos = &parts.front() == ∂
|
||||
auto tokens = mtmd_tokenize_text_internal(vocab, part, text->add_special && add_bos, text->parse_special);
|
||||
if (tokens.empty()) {
|
||||
continue;
|
||||
}
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_TEXT,
|
||||
std::move(tokens),
|
||||
nullptr, // image tokens
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
output->entries.emplace_back(std::move(chunk));
|
||||
|
||||
// only add image/audio tokens to middle of 2 parts
|
||||
// therefore, we skip handling image/audio if this is the last part
|
||||
if (&parts.back() == &part) {
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!bitmaps[i_bm]->is_audio) {
|
||||
// handle image
|
||||
|
||||
if (i_bm >= n_bitmaps) {
|
||||
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!ctx->has_vision) {
|
||||
LOG_ERR("%s: error: model does not support vision input\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
// convert mtmd_bitmap to clip_image_u8
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
img_u8->nx = bitmaps[i_bm]->nx;
|
||||
img_u8->ny = bitmaps[i_bm]->ny;
|
||||
img_u8->buf.resize(bitmaps[i_bm]->data.size());
|
||||
std::memcpy(img_u8->buf.data(), bitmaps[i_bm]->data.data(), img_u8->nx * img_u8->ny * 3);
|
||||
|
||||
// preprocess image
|
||||
clip_image_f32_batch batch_f32;
|
||||
bool ok = clip_image_preprocess(ctx->ctx_clip, img_u8.get(), &batch_f32);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess image\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
// handle llava-uhd style preprocessing
|
||||
if (
|
||||
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|
||||
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
|
||||
) {
|
||||
// split batch into chunks of single images
|
||||
auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmaps[i_bm]->id);
|
||||
GGML_ASSERT(chunks.size() > 0);
|
||||
|
||||
auto ov_chunk = std::move(chunks.front());
|
||||
chunks.erase(chunks.begin());
|
||||
|
||||
// add overview image (first)
|
||||
if (ctx->ov_img_first) {
|
||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_ov_img_start});
|
||||
}
|
||||
output->entries.emplace_back(std::move(ov_chunk));
|
||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_ov_img_end});
|
||||
}
|
||||
}
|
||||
|
||||
// add slices (or tiles)
|
||||
if (!chunks.empty()) {
|
||||
const int n_col = batch_f32.grid_x;
|
||||
const int n_row = batch_f32.grid_y;
|
||||
if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_slices_start});
|
||||
}
|
||||
for (int y = 0; y < n_row; y++) {
|
||||
for (int x = 0; x < n_col; x++) {
|
||||
const bool is_last_in_row = (x == n_col - 1);
|
||||
if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_start});
|
||||
}
|
||||
output->entries.emplace_back(std::move(chunks[y * n_col + x]));
|
||||
if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_end});
|
||||
}
|
||||
if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_sli_img_mid});
|
||||
}
|
||||
}
|
||||
if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_row_end});
|
||||
}
|
||||
}
|
||||
if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_slices_end});
|
||||
}
|
||||
}
|
||||
|
||||
// add overview image (last)
|
||||
if (!ctx->ov_img_first) {
|
||||
if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_ov_img_start});
|
||||
}
|
||||
output->entries.emplace_back(std::move(ov_chunk));
|
||||
if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
|
||||
add_text_chunk({ctx->tok_ov_img_end});
|
||||
}
|
||||
}
|
||||
|
||||
} else {
|
||||
size_t n_tokens = 0;
|
||||
for (const auto & entry : batch_f32.entries) {
|
||||
n_tokens += clip_n_output_tokens(ctx->ctx_clip, entry.get());
|
||||
}
|
||||
|
||||
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
|
||||
if (ctx->use_mrope) {
|
||||
// for Qwen2VL, we need this information for M-RoPE decoding positions
|
||||
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_clip, batch_f32.entries[0].get());
|
||||
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_clip, batch_f32.entries[0].get());
|
||||
image_tokens->use_mrope_pos = true;
|
||||
} else {
|
||||
// other models, we only need the total number of tokens
|
||||
image_tokens->nx = n_tokens;
|
||||
image_tokens->ny = 1;
|
||||
}
|
||||
image_tokens->batch_f32 = std::move(batch_f32);
|
||||
image_tokens->id = bitmaps[i_bm]->id; // optional
|
||||
|
||||
LOG_DBG("image_tokens->nx = %d\n", image_tokens->nx);
|
||||
LOG_DBG("image_tokens->ny = %d\n", image_tokens->ny);
|
||||
LOG_DBG("batch_f32 size = %d\n", (int)image_tokens->batch_f32.entries.size());
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_IMAGE,
|
||||
{}, // text tokens
|
||||
std::move(image_tokens),
|
||||
nullptr, // audio tokens
|
||||
};
|
||||
output->entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
i_bm++; // move to next image
|
||||
continue;
|
||||
|
||||
} else {
|
||||
// handle audio
|
||||
|
||||
if (i_bm >= n_bitmaps) {
|
||||
LOG_ERR("%s: error: not enough images for %d parts\n", __func__, (int)parts.size());
|
||||
return 1;
|
||||
}
|
||||
|
||||
if (!ctx->has_audio) {
|
||||
LOG_ERR("%s: error: model does not support audio input\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
if (bitmaps[i_bm]->data.size() == 0) {
|
||||
LOG_ERR("%s: error: empty audio data\n", __func__);
|
||||
return 2;
|
||||
}
|
||||
|
||||
// preprocess audio
|
||||
GGML_ASSERT(ctx->w_filters.n_mel); // make sure we have filter preloaded
|
||||
std::vector<whisper_preprocessor::whisper_mel> mel_spec_chunks;
|
||||
const float * samples = (const float *)bitmaps[i_bm]->data.data();
|
||||
size_t n_samples = bitmaps[i_bm]->data.size() / sizeof(float);
|
||||
bool ok = whisper_preprocessor::preprocess_audio(samples, n_samples, ctx->w_filters, mel_spec_chunks);
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to preprocess audio\n");
|
||||
return 2;
|
||||
}
|
||||
|
||||
// consider each mel_spec as a separate audio chunk
|
||||
// TODO: maybe support batching, but this may come with memory cost
|
||||
for (auto & mel_spec : mel_spec_chunks) {
|
||||
clip_image_f32_ptr mel_f32(clip_image_f32_init());
|
||||
mel_f32->nx = mel_spec.n_len;
|
||||
mel_f32->ny = mel_spec.n_mel;
|
||||
mel_f32->buf = std::move(mel_spec.data);
|
||||
size_t n_tokens = clip_n_output_tokens(ctx->ctx_clip, mel_f32.get());
|
||||
|
||||
clip_image_f32_batch batch_f32;
|
||||
batch_f32.is_audio = true;
|
||||
batch_f32.entries.push_back(std::move(mel_f32));
|
||||
|
||||
mtmd_audio_tokens_ptr audio_tokens(new mtmd_audio_tokens);
|
||||
audio_tokens->n_tokens = n_tokens;
|
||||
audio_tokens->batch_f32 = std::move(batch_f32);
|
||||
audio_tokens->id = bitmaps[i_bm]->id; // optional
|
||||
|
||||
LOG_DBG("audio_tokens->n_tokens = %d\n", audio_tokens->n_tokens);
|
||||
|
||||
mtmd_input_chunk chunk{
|
||||
MTMD_INPUT_CHUNK_TYPE_AUDIO,
|
||||
{}, // text tokens
|
||||
nullptr, // image tokens
|
||||
std::move(audio_tokens),
|
||||
};
|
||||
output->entries.emplace_back(std::move(chunk));
|
||||
}
|
||||
|
||||
i_bm++;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
// Split `input` around every occurrence of `delimiter`. Each delimiter is kept as its own
// element and empty text pieces (adjacent delimiters, leading/trailing delimiter) are dropped.
// for example: "a <__media__> b <__media__> c" --> "a ", "<__media__>", " b ", "<__media__>", " c"
// (text pieces are returned verbatim, surrounding whitespace included)
//
// Returns an empty vector for an empty `input`. An empty `delimiter` can never separate
// anything, so the whole input is returned as a single text piece.
static std::vector<std::string> split_text(const std::string & input, const std::string & delimiter) {
    std::vector<std::string> result;
    if (input.empty()) {
        return result;
    }
    if (delimiter.empty()) {
        // find("") matches at `start` without ever advancing it, so the loop below
        // would spin forever while growing `result`
        result.push_back(input);
        return result;
    }
    size_t start = 0;
    size_t pos   = 0;
    while ((pos = input.find(delimiter, start)) != std::string::npos) {
        if (pos > start) {
            // non-empty text between the previous delimiter (or the beginning) and this one
            result.push_back(input.substr(start, pos - start));
        }
        result.push_back(delimiter);
        start = pos + delimiter.length();
    }
    if (start < input.length()) {
        // trailing text after the last delimiter
        result.push_back(input.substr(start));
    }
    return result;
}
|
||||
|
||||
// copied from common_tokenize
|
||||
static std::vector<llama_token> mtmd_tokenize_text_internal(
|
||||
const struct llama_vocab * vocab,
|
||||
const std::string & text,
|
||||
bool add_special,
|
||||
bool parse_special) {
|
||||
// upper limit for the number of tokens
|
||||
int n_tokens = text.length() + 2 * add_special;
|
||||
std::vector<llama_token> result(n_tokens);
|
||||
n_tokens = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
if (n_tokens < 0) {
|
||||
result.resize(-n_tokens);
|
||||
int check = llama_tokenize(vocab, text.data(), text.length(), result.data(), result.size(), add_special, parse_special);
|
||||
GGML_ASSERT(check == -n_tokens);
|
||||
} else {
|
||||
result.resize(n_tokens);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
// Public entry point: tokenize `text` together with the `n_bitmaps` entries of `bitmaps`
// into `output`. The work is delegated to a short-lived mtmd_tokenizer and the status code
// from its tokenize() call is returned unchanged.
int32_t mtmd_tokenize(mtmd_context * ctx,
                      mtmd_input_chunks * output,
                      const mtmd_input_text * text,
                      const mtmd_bitmap ** bitmaps,
                      size_t n_bitmaps) {
    return mtmd_tokenizer(ctx, text, bitmaps, n_bitmaps).tokenize(output);
}
|
||||
|
||||
int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
@ -605,41 +740,54 @@ int32_t mtmd_encode_chunk(mtmd_context * ctx, const mtmd_input_chunk * chunk) {
|
||||
LOG_WRN("mtmd_encode_chunk has no effect for text chunks\n");
|
||||
return 0;
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_IMAGE) {
|
||||
if (!ctx->ctx_v) {
|
||||
LOG_ERR("%s: model does not support vision input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
return mtmd_encode(ctx, chunk->tokens_image.get());
|
||||
} else if (chunk->type == MTMD_INPUT_CHUNK_TYPE_AUDIO) {
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
if (!ctx->ctx_a) {
|
||||
LOG_ERR("%s: model does not support audio input\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
int n_mmproj_embd = ctx->n_embd_text;
|
||||
ctx->image_embd_v.resize(chunk->tokens_audio->n_tokens * n_mmproj_embd);
|
||||
bool ok = clip_image_batch_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx->ctx_a,
|
||||
ctx->n_threads,
|
||||
&chunk->tokens_audio->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
return ok ? 0 : 1;
|
||||
}
|
||||
|
||||
LOG_ERR("mtmd_encode_chunk: unknown chunk type %d\n", (int)chunk->type);
|
||||
LOG_ERR("%s: unknown chunk type %d\n", __func__, (int)chunk->type);
|
||||
return 1;
|
||||
}
|
||||
|
||||
int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens) {
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx->ctx_clip);
|
||||
clip_ctx * ctx_clip = ctx->ctx_v;
|
||||
if (!ctx_clip) {
|
||||
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
|
||||
return 1;
|
||||
}
|
||||
int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
|
||||
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
|
||||
bool ok = false;
|
||||
|
||||
if (clip_is_llava(ctx->ctx_clip) || clip_is_minicpmv(ctx->ctx_clip) || clip_is_glm(ctx->ctx_clip)) {
|
||||
if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
|
||||
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
|
||||
const auto & entries = image_tokens->batch_f32.entries;
|
||||
for (size_t i = 0; i < entries.size(); i++) {
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx->ctx_clip, entries[i].get());
|
||||
int n_tokens_per_image = clip_n_output_tokens(ctx_clip, entries[i].get());
|
||||
ok = clip_image_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
entries[i].get(),
|
||||
ctx->image_embd_v.data() + i*n_mmproj_embd*n_tokens_per_image);
|
||||
}
|
||||
} else {
|
||||
ok = clip_image_batch_encode(
|
||||
ctx->ctx_clip,
|
||||
ctx_clip,
|
||||
ctx->n_threads,
|
||||
&image_tokens->batch_f32,
|
||||
ctx->image_embd_v.data());
|
||||
@ -653,8 +801,7 @@ float * mtmd_get_output_embd(mtmd_context * ctx) {
|
||||
}
|
||||
|
||||
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
|
||||
projector_type proj_type = clip_get_projector_type(ctx->ctx_clip);
|
||||
if (proj_type == PROJECTOR_TYPE_GEMMA3) {
|
||||
if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
@ -665,11 +812,11 @@ bool mtmd_decode_use_mrope(mtmd_context * ctx) {
|
||||
}
|
||||
|
||||
bool mtmd_support_vision(mtmd_context * ctx) {
|
||||
return ctx->has_vision;
|
||||
return ctx->ctx_v != nullptr;
|
||||
}
|
||||
|
||||
bool mtmd_support_audio(mtmd_context * ctx) {
|
||||
return ctx->has_audio;
|
||||
return ctx->ctx_a != nullptr;
|
||||
}
|
||||
|
||||
// these 2 helpers below use internal clip_image_u8_ptr,
|
||||
|
BIN
tools/mtmd/test-2.mp3
Normal file
BIN
tools/mtmd/test-2.mp3
Normal file
Binary file not shown.
@ -25,80 +25,99 @@ RUN_HUGE_TESTS=false
|
||||
if [ "${1:-}" = "huge" ]; then
|
||||
RUN_HUGE_TESTS=true
|
||||
RUN_BIG_TESTS=true
|
||||
echo "Include BIG models..."
|
||||
echo "Include BIG and HUGE models..."
|
||||
fi
|
||||
|
||||
###############
|
||||
|
||||
arr_bin=()
|
||||
arr_prefix=()
|
||||
arr_hf=()
|
||||
arr_tmpl=() # chat template
|
||||
arr_file=()
|
||||
|
||||
add_test() {
|
||||
local bin=$1
|
||||
local hf=$2
|
||||
local tmpl=${3:-""} # default to empty string if not provided
|
||||
arr_bin+=("$bin")
|
||||
add_test_vision() {
|
||||
local hf=$1
|
||||
local tmpl=${2:-""} # default to empty string if not provided
|
||||
arr_prefix+=("[vision]")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("$tmpl")
|
||||
arr_file+=("test-1.jpeg")
|
||||
}
|
||||
|
||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test "llama-mtmd-cli" "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
|
||||
add_test "llama-mtmd-cli" "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test "llama-mtmd-cli" "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test "llama-mtmd-cli" "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
|
||||
add_test_audio() {
|
||||
local hf=$1
|
||||
arr_prefix+=("[audio] ")
|
||||
arr_hf+=("$hf")
|
||||
arr_tmpl+=("") # no need for chat tmpl
|
||||
arr_file+=("test-2.mp3")
|
||||
}
|
||||
|
||||
add_test_vision "ggml-org/SmolVLM-500M-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/SmolVLM2-2.2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/SmolVLM2-500M-Video-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/gemma-3-4b-it-GGUF:Q4_K_M"
|
||||
add_test_vision "THUDM/glm-edge-v-5b-gguf:Q4_K_M"
|
||||
add_test_vision "second-state/Llava-v1.5-7B-GGUF:Q2_K" "vicuna"
|
||||
add_test_vision "cjpais/llava-1.6-mistral-7b-gguf:Q3_K_M" "vicuna"
|
||||
add_test_vision "ibm-research/granite-vision-3.2-2b-GGUF:Q4_K_M"
|
||||
add_test_vision "second-state/MiniCPM-Llama3-V-2_5-GGUF:Q2_K" # model from openbmb is corrupted
|
||||
add_test_vision "openbmb/MiniCPM-V-2_6-gguf:Q2_K"
|
||||
add_test_vision "openbmb/MiniCPM-o-2_6-gguf:Q4_0"
|
||||
add_test_vision "bartowski/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/InternVL2_5-1B-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
|
||||
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
|
||||
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
|
||||
|
||||
# to test the big models, run: ./tests.sh big
|
||||
if [ "$RUN_BIG_TESTS" = true ]; then
|
||||
add_test "llama-mtmd-cli" "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
|
||||
# add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
|
||||
add_test_vision "ggml-org/pixtral-12b-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Mistral-Small-3.1-24B-Instruct-2503-GGUF" "mistral-v7"
|
||||
add_test_vision "ggml-org/Qwen2-VL-2B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-3B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-7B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/InternVL3-8B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/InternVL3-14B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
|
||||
# add_test_vision "ggml-org/Qwen2.5-VL-32B-Instruct-GGUF:Q4_K_M" # does not work on my mac M3 Ultra
|
||||
|
||||
add_test_audio "ggml-org/ultravox-v0_5-llama-3_1-8b-GGUF:Q4_K_M"
|
||||
add_test_audio "ggml-org/Qwen2.5-Omni-7B-GGUF:Q4_K_M"
|
||||
fi
|
||||
|
||||
# to test the huge models, run: ./tests.sh huge
|
||||
# this will run both the big and huge models
|
||||
# huge models are > 32B parameters
|
||||
if [ "$RUN_HUGE_TESTS" = true ]; then
|
||||
add_test "llama-mtmd-cli" "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
|
||||
add_test "llama-mtmd-cli" "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
|
||||
add_test_vision "ggml-org/Qwen2.5-VL-72B-Instruct-GGUF:Q4_K_M"
|
||||
add_test_vision "ggml-org/Llama-4-Scout-17B-16E-Instruct-GGUF:IQ1_S"
|
||||
fi
|
||||
|
||||
# these models always give the wrong answer, not sure why
|
||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
|
||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
|
||||
# add_test "llama-mtmd-cli" "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"
|
||||
# add_test_vision "ggml-org/SmolVLM-Instruct-GGUF:Q4_K_M"
|
||||
# add_test_vision "ggml-org/SmolVLM-256M-Instruct-GGUF:Q8_0"
|
||||
# add_test_vision "ggml-org/SmolVLM2-256M-Video-Instruct-GGUF:Q8_0"
|
||||
|
||||
# this model has broken chat template, not usable
|
||||
# add_test "llama-mtmd-cli" "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
# add_test "llama-mtmd-cli" "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
||||
# add_test_vision "cmp-nct/Yi-VL-6B-GGUF:Q5_K"
|
||||
# add_test_vision "guinmoon/MobileVLM-3B-GGUF:Q4_K_M" "deepseek"
|
||||
|
||||
###############
|
||||
|
||||
cmake --build build -j --target "${arr_bin[@]}"
|
||||
cmake --build build -j --target llama-mtmd-cli
|
||||
|
||||
arr_res=()
|
||||
|
||||
for i in "${!arr_bin[@]}"; do
|
||||
bin="${arr_bin[$i]}"
|
||||
for i in "${!arr_hf[@]}"; do
|
||||
bin="llama-mtmd-cli"
|
||||
prefix="${arr_prefix[$i]}"
|
||||
hf="${arr_hf[$i]}"
|
||||
tmpl="${arr_tmpl[$i]}"
|
||||
inp_file="${arr_file[$i]}"
|
||||
|
||||
echo "Running test with binary: $bin and HF model: $hf"
|
||||
echo ""
|
||||
@ -107,7 +126,7 @@ for i in "${!arr_bin[@]}"; do
|
||||
output=$(\
|
||||
"$PROJ_ROOT/build/bin/$bin" \
|
||||
-hf "$hf" \
|
||||
--image $SCRIPT_DIR/test-1.jpeg \
|
||||
--image $SCRIPT_DIR/$inp_file \
|
||||
-p "what is the publisher name of the newspaper?" \
|
||||
--temp 0 -n 128 \
|
||||
${tmpl:+--chat-template "$tmpl"} \
|
||||
@ -116,9 +135,9 @@ for i in "${!arr_bin[@]}"; do
|
||||
echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
|
||||
|
||||
if echo "$output" | grep -iq "new york"; then
|
||||
result="\033[32mOK\033[0m: $bin $hf"
|
||||
result="$prefix \033[32mOK\033[0m: $bin $hf"
|
||||
else
|
||||
result="\033[31mFAIL\033[0m: $bin $hf"
|
||||
result="$prefix \033[31mFAIL\033[0m: $bin $hf"
|
||||
fi
|
||||
echo -e "$result"
|
||||
arr_res+=("$result")
|
||||
|
Reference in New Issue
Block a user