From 10961339b26bd2eff01d5479e8879f435da261b7 Mon Sep 17 00:00:00 2001 From: Xuan-Son Nguyen Date: Wed, 28 May 2025 22:35:22 +0200 Subject: [PATCH] =?UTF-8?q?mtmd=20:=20move=20helpers=20to=20dedicated=20li?= =?UTF-8?q?brary=20(=E2=9A=A0=EF=B8=8F=20breaking=20change)=20(#13866)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * mtmd : move helpers to dedicated library * fix server build * rm leftover cmakelist code --- .editorconfig | 2 +- tools/mtmd/CMakeLists.txt | 46 ++++--- tools/mtmd/clip.cpp | 27 ---- tools/mtmd/mtmd-audio.cpp | 86 ------------- tools/mtmd/mtmd-audio.h | 19 +-- tools/mtmd/mtmd-cli.cpp | 3 +- tools/mtmd/mtmd-helper.cpp | 142 ++++++++++++++++++++++ tools/mtmd/mtmd-helper.h | 91 ++++++++++++++ tools/mtmd/mtmd.cpp | 51 +------- tools/mtmd/mtmd.h | 73 +---------- tools/mtmd/{ => vendor}/miniaudio.h | 0 {common => tools/mtmd/vendor}/stb_image.h | 0 tools/server/CMakeLists.txt | 2 +- tools/server/server.cpp | 3 +- tools/server/utils.hpp | 1 + 15 files changed, 277 insertions(+), 269 deletions(-) create mode 100644 tools/mtmd/mtmd-helper.h rename tools/mtmd/{ => vendor}/miniaudio.h (100%) rename {common => tools/mtmd/vendor}/stb_image.h (100%) diff --git a/.editorconfig b/.editorconfig index 316448c7e..47111c72d 100644 --- a/.editorconfig +++ b/.editorconfig @@ -49,6 +49,6 @@ charset = unset trim_trailing_whitespace = unset insert_final_newline = unset -[tools/mtmd/miniaudio.h] +[tools/mtmd/vendor/miniaudio.h] trim_trailing_whitespace = unset insert_final_newline = unset diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index c3024cec1..33e251d3b 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -1,48 +1,54 @@ # mtmd -# compile mtmd-audio separately to avoid long compile times with miniaudio.h -# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library -add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h) -if (BUILD_SHARED_LIBS) - set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON) -endif() -target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT}) -target_compile_features(mtmd_audio PRIVATE cxx_std_17) -target_include_directories(mtmd_audio PRIVATE .) - add_library(mtmd OBJECT mtmd.cpp - mtmd-helper.cpp + mtmd-audio.cpp mtmd.h clip.cpp clip.h clip-impl.h ) -target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT}) - +target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) target_include_directories(mtmd PUBLIC .) target_include_directories(mtmd PRIVATE ../..) -target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h - target_compile_features(mtmd PRIVATE cxx_std_17) -add_library(mtmd_static STATIC $) +# compile the helper separately, to avoid long compile times with miniaudio.h and stb_image.h + +add_library(mtmd_helper OBJECT + mtmd-helper.cpp + mtmd-helper.h + ) + +target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT}) +target_include_directories(mtmd_helper PUBLIC .) +target_include_directories(mtmd_helper PRIVATE ./vendor) +target_include_directories(mtmd_helper PRIVATE ../..) +target_compile_features(mtmd_helper PRIVATE cxx_std_17) + if (BUILD_SHARED_LIBS) set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON) target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD) add_library(mtmd_shared SHARED $) - target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT}) + target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT}) install(TARGETS mtmd_shared LIBRARY) + + set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON) + target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD) + add_library(mtmd_helper_shared SHARED $) + target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT}) + install(TARGETS mtmd_helper_shared LIBRARY) endif() if (NOT MSVC) - target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h - target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h + # for stb_image.h and miniaudio.h + target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual) endif() if(TARGET BUILD_INFO) add_dependencies(mtmd BUILD_INFO) + add_dependencies(mtmd_helper BUILD_INFO) endif() add_executable(llama-llava-cli deprecation-warning.cpp) @@ -54,5 +60,5 @@ set(TARGET llama-mtmd-cli) add_executable(${TARGET} mtmd-cli.cpp) set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli) install(TARGETS ${TARGET} RUNTIME) -target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_17) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 6ae2c2ce4..c25bacc17 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -11,9 +11,6 @@ #include "ggml-backend.h" #include "gguf.h" -#define STB_IMAGE_IMPLEMENTATION -#include "stb_image.h" - #include #include #include @@ -2786,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny memcpy(img->buf.data(), rgb_pixels, img->buf.size()); } -bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) { - int nx, ny, nc; - auto * data = stbi_load(fname, &nx, &ny, &nc, 3); - if (!data) { - LOG_ERR("%s: failed to load image '%s'\n", __func__, fname); - return false; - } - clip_build_img_from_pixels(data, nx, ny, img); - stbi_image_free(data); - return true; -} - -bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) { - int nx, ny, nc; - auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3); - if (!data) { - LOG_ERR("%s: failed to decode image bytes\n", __func__); - return false; - } - clip_build_img_from_pixels(data, nx, ny, img); - stbi_image_free(data); - return true; -} - // Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) { dst.nx = src.nx; diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index ae06a695d..4d053895c 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -1,28 +1,5 @@ -// fix problem with std::min and std::max -#if defined(_WIN32) -#define WIN32_LEAN_AND_MEAN -#ifndef NOMINMAX -# define NOMINMAX -#endif -#include -#endif - #include "mtmd-audio.h" -//#define MTMD_AUDIO_DEBUG - -#define MINIAUDIO_IMPLEMENTATION -#ifndef MTMD_AUDIO_DEBUG -# define MA_NO_ENCODING -#endif -#define MA_NO_DEVICE_IO -#define MA_NO_RESOURCE_MANAGER -#define MA_NO_NODE_GRAPH -#define MA_NO_ENGINE -#define MA_NO_GENERATION -#define MA_API static -#include "miniaudio.h" - #define _USE_MATH_DEFINES // for M_PI #include #include @@ -359,69 +336,6 @@ bool preprocess_audio( } // namespace whisper_preprocessor -namespace audio_helpers { - -bool is_audio_file(const char * buf, size_t len) { - if (len < 12) { - return false; - } - - // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format - // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html - bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0; - bool is_mp3 = len >= 3 && ( - memcmp(buf, "ID3", 3) == 0 || - // Check for MPEG sync word (simplified check) - ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0) - ); - bool is_flac = memcmp(buf, "fLaC", 4) == 0; - - return is_wav || is_mp3 || is_flac; -} - -// returns true if the buffer is a valid audio file -bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector & pcmf32_mono) { - ma_result result; - const int channels = 1; - ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate); - ma_decoder decoder; - - result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder); - if (result != MA_SUCCESS) { - return false; - } - - ma_uint64 frame_count; - ma_uint64 frames_read; - result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count); - if (result != MA_SUCCESS) { - ma_decoder_uninit(&decoder); - return false; - } - - pcmf32_mono.resize(frame_count); - result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read); - if (result != MA_SUCCESS) { - ma_decoder_uninit(&decoder); - return false; - } - -#ifdef MTMD_AUDIO_DEBUG - // save audio to wav file - ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate); - ma_encoder encoder; - ma_encoder_init_file("output.wav", &config, &encoder); - ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read); - ma_encoder_uninit(&encoder); -#endif - - ma_decoder_uninit(&decoder); - return true; -} - -} // namespace wav_utils - - // precalculated mel filter banks // values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function // diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 348d11dca..b7b940aff 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -32,7 +32,7 @@ struct whisper_filters { std::vector data; }; -extern bool preprocess_audio( +bool preprocess_audio( const float * samples, size_t n_samples, const whisper_filters & filters, @@ -40,23 +40,8 @@ extern bool preprocess_audio( } // namespace whisper_preprocessor - -// TODO @ngxson : move this helper to mtmd-helpers.cpp -namespace audio_helpers { - -extern bool is_audio_file(const char * buf, size_t len); - -extern bool decode_audio_from_buf( - const unsigned char * buf_in, - size_t len, - int target_sampler_rate, - std::vector & pcmf32_mono); - -} // namespace audio_helpers - - namespace whisper_precalc_filters { -extern whisper_preprocessor::whisper_filters get_128_bins(); +whisper_preprocessor::whisper_filters get_128_bins(); } // namespace whisper_precalc_filters diff --git a/tools/mtmd/mtmd-cli.cpp b/tools/mtmd/mtmd-cli.cpp index a70f11ca9..508a64c58 100644 --- a/tools/mtmd/mtmd-cli.cpp +++ b/tools/mtmd/mtmd-cli.cpp @@ -7,6 +7,7 @@ #include "console.h" #include "chat.h" #include "mtmd.h" +#include "mtmd-helper.h" #include #include @@ -143,7 +144,7 @@ struct mtmd_cli_context { } bool load_media(const std::string & fname) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str())); if (!bmp.ptr) { return false; } diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp index e6c926080..058323818 100644 --- a/tools/mtmd/mtmd-helper.cpp +++ b/tools/mtmd/mtmd-helper.cpp @@ -1,10 +1,37 @@ +// fix problem with std::min and std::max +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +# define NOMINMAX +#endif +#include +#endif + #include "mtmd.h" +#include "mtmd-helper.h" #include "llama.h" #include #include #include +//#define MTMD_AUDIO_DEBUG + +#define MINIAUDIO_IMPLEMENTATION +#ifndef MTMD_AUDIO_DEBUG +# define MA_NO_ENCODING +#endif +#define MA_NO_DEVICE_IO +#define MA_NO_RESOURCE_MANAGER +#define MA_NO_NODE_GRAPH +#define MA_NO_ENGINE +#define MA_NO_GENERATION +#define MA_API static +#include "vendor/miniaudio.h" + +#define STB_IMAGE_IMPLEMENTATION +#include "vendor/stb_image.h" + #define LOG_INF(...) fprintf(stdout, __VA_ARGS__) #define LOG_ERR(...) fprintf(stderr, __VA_ARGS__) @@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, return 0; } + +namespace audio_helpers { + +static bool is_audio_file(const char * buf, size_t len) { + if (len < 12) { + return false; + } + + // RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format + // WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html + bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0; + bool is_mp3 = len >= 3 && ( + memcmp(buf, "ID3", 3) == 0 || + // Check for MPEG sync word (simplified check) + ((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0) + ); + bool is_flac = memcmp(buf, "fLaC", 4) == 0; + + return is_wav || is_mp3 || is_flac; +} + +// returns true if the buffer is a valid audio file +static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector & pcmf32_mono) { + ma_result result; + const int channels = 1; + ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate); + ma_decoder decoder; + + result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder); + if (result != MA_SUCCESS) { + return false; + } + + ma_uint64 frame_count; + ma_uint64 frames_read; + result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count); + if (result != MA_SUCCESS) { + ma_decoder_uninit(&decoder); + return false; + } + + pcmf32_mono.resize(frame_count); + result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read); + if (result != MA_SUCCESS) { + ma_decoder_uninit(&decoder); + return false; + } + +#ifdef MTMD_AUDIO_DEBUG + // save audio to wav file + ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate); + ma_encoder encoder; + ma_encoder_init_file("output.wav", &config, &encoder); + ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read); + ma_encoder_uninit(&encoder); +#endif + + ma_decoder_uninit(&decoder); + return true; +} + +} // namespace audio_helpers + +mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) { + if (audio_helpers::is_audio_file((const char *)buf, len)) { + std::vector pcmf32; + int bitrate = mtmd_get_audio_bitrate(ctx); + if (bitrate < 0) { + LOG_ERR("This model does not support audio input\n"); + return nullptr; + } + if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) { + LOG_ERR("Unable to read WAV audio file from buffer\n"); + return nullptr; + } + return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); + } + + // otherwise, we assume it's an image + mtmd_bitmap * result = nullptr; + { + int nx, ny, nc; + auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3); + if (!data) { + LOG_ERR("%s: failed to decode image bytes\n", __func__); + return nullptr; + } + result = mtmd_bitmap_init(nx, ny, data); + stbi_image_free(data); + } + return result; +} + +mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) { + std::vector buf; + FILE * f = fopen(fname, "rb"); + if (!f) { + LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); + return nullptr; + } + + fseek(f, 0, SEEK_END); + long file_size = ftell(f); + fseek(f, 0, SEEK_SET); + buf.resize(file_size); + + size_t n_read = fread(buf.data(), 1, file_size, f); + fclose(f); + if (n_read != (size_t)file_size) { + LOG_ERR("Failed to read entire file %s", fname); + return nullptr; + } + + return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size()); +} diff --git a/tools/mtmd/mtmd-helper.h b/tools/mtmd/mtmd-helper.h new file mode 100644 index 000000000..5c0edc693 --- /dev/null +++ b/tools/mtmd/mtmd-helper.h @@ -0,0 +1,91 @@ +#ifndef MTMD_HELPER_H +#define MTMD_HELPER_H + +#include "ggml.h" +#include "llama.h" +#include "mtmd.h" + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// +// libmtmd helper functions +// +// Please note that these helpers are not guaranteed to be stable. +// BREAKING CHANGES are expected. +// + +// helper function to construct a mtmd_bitmap from a file +// it calls mtmd_helper_bitmap_init_from_buf() internally +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname); + +// helper function to construct a mtmd_bitmap from a buffer containing a file +// supported formats: +// image: formats supported by stb_image: jpg, png, bmp, gif, etc. +// audio: formats supported by miniaudio: wav, mp3, flac +// note: audio files will be auto-detected based on magic bytes +// returns nullptr on failure +// this function is thread-safe +MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len); + +// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache +MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); + +// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past +// normally, n_pos is equal to n_tokens, but for M-RoPE it is different +MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); + +// helper function that automatically: +// 1. run llama_decode() on text chunks +// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() +// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error +// otherwise, returns 0 on success +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunks * chunks, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// works like mtmd_helper_eval_chunks(), but only for a single chunk +// this function is NOT thread-safe +MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + bool logits_last, + llama_pos * new_n_past); + +// helper function to decode an image whose embeddings have already been calculated +// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention) +// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure +MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, + struct llama_context * lctx, + const mtmd_input_chunk * chunk, + float * encoded_embd, + llama_pos n_past, + llama_seq_id seq_id, + int32_t n_batch, + llama_pos * new_n_past); + +#ifdef __cplusplus +} // extern "C" +#endif + +// +// C++ wrappers +// + +#endif diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 52bf71e2c..8573f1143 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) { return ctx->ctx_a != nullptr; } -// these 2 helpers below use internal clip_image_u8_ptr, -// so unfortunately they cannot moved to mtmd-helper.h -// however, in theory, user can decode image file to bitmap using -// whichever library they want, and then use mtmd_bitmap_init() to create bitmap - -mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) { - if (audio_helpers::is_audio_file((const char *)buf, len)) { - std::vector pcmf32; - if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) { - LOG_ERR("Unable to read WAV audio file from buffer\n"); - return nullptr; - } - return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data()); +int mtmd_get_audio_bitrate(mtmd_context * ctx) { + if (!ctx->ctx_a) { + return -1; } - - clip_image_u8_ptr img_u8(clip_image_u8_init()); - bool ok = clip_image_load_from_bytes(buf, len, img_u8.get()); - if (!ok) { - LOG_ERR("Unable to load image from buffer\n"); - return nullptr; - } - uint32_t nx, ny; - unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny); - return mtmd_bitmap_init(nx, ny, data); -} - -mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) { - std::vector buf; - FILE * f = fopen(fname, "rb"); - if (!f) { - LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno)); - return nullptr; - } - - fseek(f, 0, SEEK_END); - long file_size = ftell(f); - fseek(f, 0, SEEK_SET); - buf.resize(file_size); - - size_t n_read = fread(buf.data(), 1, file_size, f); - fclose(f); - if (n_read != (size_t)file_size) { - LOG_ERR("Failed to read entire file %s", fname); - return nullptr; - } - - return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size()); + // for now, we assume that all audio models have the same bitrate + return 16000; // 16kHz } // diff --git a/tools/mtmd/mtmd.h b/tools/mtmd/mtmd.h index b53f215a2..541918e09 100644 --- a/tools/mtmd/mtmd.h +++ b/tools/mtmd/mtmd.h @@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx); // whether the current model supports audio input MTMD_API bool mtmd_support_audio(mtmd_context * ctx); +// get audio bitrate in Hz, for example 16000 for Whisper +// return -1 if audio is not supported +MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx); + // mtmd_bitmap // // if bitmap is image: @@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx); ///////////////////////////////////////// -// -// Helper functions (can be implemented based on other functions) -// -// Please note that these helpers are not guaranteed to be stable. -// BREAKING CHANGES are expected. -// - -// helper function to construct a mtmd_bitmap from a file -// it calls mtmd_helper_bitmap_init_from_buf() internally -// returns nullptr on failure -// this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname); - -// helper function to construct a mtmd_bitmap from a buffer containing a file -// supported formats: -// image: formats supported by stb_image: jpg, png, bmp, gif, etc. -// audio: formats supported by miniaudio: wav, mp3, flac -// note: audio files will be auto-detected based on magic bytes -// returns nullptr on failure -// this function is thread-safe -MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len); - -// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache -MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); - -// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past -// normally, n_pos is equal to n_tokens, but for M-RoPE it is different -MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); - -// helper function that automatically: -// 1. run llama_decode() on text chunks -// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() -// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error -// otherwise, returns 0 on success -// this function is NOT thread-safe -MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx, - struct llama_context * lctx, - const mtmd_input_chunks * chunks, - llama_pos n_past, - llama_seq_id seq_id, - int32_t n_batch, - bool logits_last, - llama_pos * new_n_past); - -// works like mtmd_helper_eval_chunks(), but only for a single chunk -// this function is NOT thread-safe -MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx, - struct llama_context * lctx, - const mtmd_input_chunk * chunk, - llama_pos n_past, - llama_seq_id seq_id, - int32_t n_batch, - bool logits_last, - llama_pos * new_n_past); - -// helper function to decode an image whose embeddings have already been calculated -// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention) -// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure -MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx, - struct llama_context * lctx, - const mtmd_input_chunk * chunk, - float * encoded_embd, - llama_pos n_past, - llama_seq_id seq_id, - int32_t n_batch, - llama_pos * new_n_past); - -///////////////////////////////////////// - // test function, to be used in test-mtmd-c-api.c MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void); diff --git a/tools/mtmd/miniaudio.h b/tools/mtmd/vendor/miniaudio.h similarity index 100% rename from tools/mtmd/miniaudio.h rename to tools/mtmd/vendor/miniaudio.h diff --git a/common/stb_image.h b/tools/mtmd/vendor/stb_image.h similarity index 100% rename from common/stb_image.h rename to tools/mtmd/vendor/stb_image.h diff --git a/tools/server/CMakeLists.txt b/tools/server/CMakeLists.txt index 17109fddb..08597145c 100644 --- a/tools/server/CMakeLists.txt +++ b/tools/server/CMakeLists.txt @@ -36,7 +36,7 @@ install(TARGETS ${TARGET} RUNTIME) target_include_directories(${TARGET} PRIVATE ../llava) target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR}) -target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT}) if (LLAMA_SERVER_SSL) find_package(OpenSSL REQUIRED) diff --git a/tools/server/server.cpp b/tools/server/server.cpp index fe6c685ec..96d1e4adf 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -9,6 +9,7 @@ #include "sampling.h" #include "speculative.h" #include "mtmd.h" +#include "mtmd-helper.h" // Change JSON_ASSERT from assert() to GGML_ASSERT: #define JSON_ASSERT GGML_ASSERT @@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) { throw std::runtime_error("This server does not support multimodal"); } for (auto & file : files) { - mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size())); + mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size())); if (!bmp.ptr) { throw std::runtime_error("Failed to load image or audio file"); } diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp index b64620582..58b679d75 100644 --- a/tools/server/utils.hpp +++ b/tools/server/utils.hpp @@ -6,6 +6,7 @@ #include "arg.h" // common_remote_get_content #include "base64.hpp" #include "mtmd.h" +#include "mtmd-helper.h" // increase max payload length to allow use of larger context size #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576