mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-26 19:55:04 +00:00
mtmd : move helpers to dedicated library (⚠️ breaking change) (#13866)
* mtmd : move helpers to dedicated library * fix server build * rm leftover cmakelist code
This commit is contained in:
@ -49,6 +49,6 @@ charset = unset
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
||||
[tools/mtmd/miniaudio.h]
|
||||
[tools/mtmd/vendor/miniaudio.h]
|
||||
trim_trailing_whitespace = unset
|
||||
insert_final_newline = unset
|
||||
|
@ -1,48 +1,54 @@
|
||||
# mtmd
|
||||
|
||||
# compile mtmd-audio separately to avoid long compile times with miniaudio.h
|
||||
# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
|
||||
add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
endif()
|
||||
target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(mtmd_audio PRIVATE cxx_std_17)
|
||||
target_include_directories(mtmd_audio PRIVATE .)
|
||||
|
||||
add_library(mtmd OBJECT
|
||||
mtmd.cpp
|
||||
mtmd-helper.cpp
|
||||
mtmd-audio.cpp
|
||||
mtmd.h
|
||||
clip.cpp
|
||||
clip.h
|
||||
clip-impl.h
|
||||
)
|
||||
|
||||
target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(mtmd PUBLIC .)
|
||||
target_include_directories(mtmd PRIVATE ../..)
|
||||
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
|
||||
|
||||
target_compile_features(mtmd PRIVATE cxx_std_17)
|
||||
|
||||
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
|
||||
# compile the helper separately, to avoid long compile times with miniaudio.h and stb_image.h
|
||||
|
||||
add_library(mtmd_helper OBJECT
|
||||
mtmd-helper.cpp
|
||||
mtmd-helper.h
|
||||
)
|
||||
|
||||
target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_include_directories(mtmd_helper PUBLIC .)
|
||||
target_include_directories(mtmd_helper PRIVATE ./vendor)
|
||||
target_include_directories(mtmd_helper PRIVATE ../..)
|
||||
target_compile_features(mtmd_helper PRIVATE cxx_std_17)
|
||||
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
|
||||
target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||
install(TARGETS mtmd_shared LIBRARY)
|
||||
|
||||
set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||
add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd>)
|
||||
target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||
install(TARGETS mtmd_helper_shared LIBRARY)
|
||||
endif()
|
||||
|
||||
if (NOT MSVC)
|
||||
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
|
||||
target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
|
||||
# for stb_image.h and miniaudio.h
|
||||
target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
|
||||
endif()
|
||||
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(mtmd BUILD_INFO)
|
||||
add_dependencies(mtmd_helper BUILD_INFO)
|
||||
endif()
|
||||
|
||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||
@ -54,5 +60,5 @@ set(TARGET llama-mtmd-cli)
|
||||
add_executable(${TARGET} mtmd-cli.cpp)
|
||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||
install(TARGETS ${TARGET} RUNTIME)
|
||||
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||
|
@ -11,9 +11,6 @@
|
||||
#include "ggml-backend.h"
|
||||
#include "gguf.h"
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "stb_image.h"
|
||||
|
||||
#include <cassert>
|
||||
#include <cmath>
|
||||
#include <cstdlib>
|
||||
@ -2786,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
|
||||
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
|
||||
}
|
||||
|
||||
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
||||
return false;
|
||||
}
|
||||
clip_build_img_from_pixels(data, nx, ny, img);
|
||||
stbi_image_free(data);
|
||||
return true;
|
||||
}
|
||||
|
||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
||||
int nx, ny, nc;
|
||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
||||
return false;
|
||||
}
|
||||
clip_build_img_from_pixels(data, nx, ny, img);
|
||||
stbi_image_free(data);
|
||||
return true;
|
||||
}
|
||||
|
||||
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
|
||||
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
|
||||
dst.nx = src.nx;
|
||||
|
@ -1,28 +1,5 @@
|
||||
// fix problem with std::min and std::max
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include "mtmd-audio.h"
|
||||
|
||||
//#define MTMD_AUDIO_DEBUG
|
||||
|
||||
#define MINIAUDIO_IMPLEMENTATION
|
||||
#ifndef MTMD_AUDIO_DEBUG
|
||||
# define MA_NO_ENCODING
|
||||
#endif
|
||||
#define MA_NO_DEVICE_IO
|
||||
#define MA_NO_RESOURCE_MANAGER
|
||||
#define MA_NO_NODE_GRAPH
|
||||
#define MA_NO_ENGINE
|
||||
#define MA_NO_GENERATION
|
||||
#define MA_API static
|
||||
#include "miniaudio.h"
|
||||
|
||||
#define _USE_MATH_DEFINES // for M_PI
|
||||
#include <cmath>
|
||||
#include <cstdint>
|
||||
@ -359,69 +336,6 @@ bool preprocess_audio(
|
||||
} // namespace whisper_preprocessor
|
||||
|
||||
|
||||
namespace audio_helpers {
|
||||
|
||||
bool is_audio_file(const char * buf, size_t len) {
|
||||
if (len < 12) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
|
||||
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
|
||||
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
|
||||
bool is_mp3 = len >= 3 && (
|
||||
memcmp(buf, "ID3", 3) == 0 ||
|
||||
// Check for MPEG sync word (simplified check)
|
||||
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
|
||||
);
|
||||
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
|
||||
|
||||
return is_wav || is_mp3 || is_flac;
|
||||
}
|
||||
|
||||
// returns true if the buffer is a valid audio file
|
||||
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
|
||||
ma_result result;
|
||||
const int channels = 1;
|
||||
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
|
||||
ma_decoder decoder;
|
||||
|
||||
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
|
||||
if (result != MA_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ma_uint64 frame_count;
|
||||
ma_uint64 frames_read;
|
||||
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
|
||||
if (result != MA_SUCCESS) {
|
||||
ma_decoder_uninit(&decoder);
|
||||
return false;
|
||||
}
|
||||
|
||||
pcmf32_mono.resize(frame_count);
|
||||
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
|
||||
if (result != MA_SUCCESS) {
|
||||
ma_decoder_uninit(&decoder);
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef MTMD_AUDIO_DEBUG
|
||||
// save audio to wav file
|
||||
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
|
||||
ma_encoder encoder;
|
||||
ma_encoder_init_file("output.wav", &config, &encoder);
|
||||
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
|
||||
ma_encoder_uninit(&encoder);
|
||||
#endif
|
||||
|
||||
ma_decoder_uninit(&decoder);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace wav_utils
|
||||
|
||||
|
||||
// precalculated mel filter banks
|
||||
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
|
||||
//
|
||||
|
@ -32,7 +32,7 @@ struct whisper_filters {
|
||||
std::vector<float> data;
|
||||
};
|
||||
|
||||
extern bool preprocess_audio(
|
||||
bool preprocess_audio(
|
||||
const float * samples,
|
||||
size_t n_samples,
|
||||
const whisper_filters & filters,
|
||||
@ -40,23 +40,8 @@ extern bool preprocess_audio(
|
||||
|
||||
} // namespace whisper_preprocessor
|
||||
|
||||
|
||||
// TODO @ngxson : move this helper to mtmd-helpers.cpp
|
||||
namespace audio_helpers {
|
||||
|
||||
extern bool is_audio_file(const char * buf, size_t len);
|
||||
|
||||
extern bool decode_audio_from_buf(
|
||||
const unsigned char * buf_in,
|
||||
size_t len,
|
||||
int target_sampler_rate,
|
||||
std::vector<float> & pcmf32_mono);
|
||||
|
||||
} // namespace audio_helpers
|
||||
|
||||
|
||||
namespace whisper_precalc_filters {
|
||||
|
||||
extern whisper_preprocessor::whisper_filters get_128_bins();
|
||||
whisper_preprocessor::whisper_filters get_128_bins();
|
||||
|
||||
} // namespace whisper_precalc_filters
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include "console.h"
|
||||
#include "chat.h"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
|
||||
#include <vector>
|
||||
#include <limits.h>
|
||||
@ -143,7 +144,7 @@ struct mtmd_cli_context {
|
||||
}
|
||||
|
||||
bool load_media(const std::string & fname) {
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
|
||||
if (!bmp.ptr) {
|
||||
return false;
|
||||
}
|
||||
|
@ -1,10 +1,37 @@
|
||||
// fix problem with std::min and std::max
|
||||
#if defined(_WIN32)
|
||||
#define WIN32_LEAN_AND_MEAN
|
||||
#ifndef NOMINMAX
|
||||
# define NOMINMAX
|
||||
#endif
|
||||
#include <windows.h>
|
||||
#endif
|
||||
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
#include "llama.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cinttypes>
|
||||
#include <vector>
|
||||
|
||||
//#define MTMD_AUDIO_DEBUG
|
||||
|
||||
#define MINIAUDIO_IMPLEMENTATION
|
||||
#ifndef MTMD_AUDIO_DEBUG
|
||||
# define MA_NO_ENCODING
|
||||
#endif
|
||||
#define MA_NO_DEVICE_IO
|
||||
#define MA_NO_RESOURCE_MANAGER
|
||||
#define MA_NO_NODE_GRAPH
|
||||
#define MA_NO_ENGINE
|
||||
#define MA_NO_GENERATION
|
||||
#define MA_API static
|
||||
#include "vendor/miniaudio.h"
|
||||
|
||||
#define STB_IMAGE_IMPLEMENTATION
|
||||
#include "vendor/stb_image.h"
|
||||
|
||||
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
|
||||
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
|
||||
|
||||
@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
namespace audio_helpers {
|
||||
|
||||
static bool is_audio_file(const char * buf, size_t len) {
|
||||
if (len < 12) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
|
||||
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
|
||||
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
|
||||
bool is_mp3 = len >= 3 && (
|
||||
memcmp(buf, "ID3", 3) == 0 ||
|
||||
// Check for MPEG sync word (simplified check)
|
||||
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
|
||||
);
|
||||
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
|
||||
|
||||
return is_wav || is_mp3 || is_flac;
|
||||
}
|
||||
|
||||
// returns true if the buffer is a valid audio file
|
||||
static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
|
||||
ma_result result;
|
||||
const int channels = 1;
|
||||
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
|
||||
ma_decoder decoder;
|
||||
|
||||
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
|
||||
if (result != MA_SUCCESS) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ma_uint64 frame_count;
|
||||
ma_uint64 frames_read;
|
||||
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
|
||||
if (result != MA_SUCCESS) {
|
||||
ma_decoder_uninit(&decoder);
|
||||
return false;
|
||||
}
|
||||
|
||||
pcmf32_mono.resize(frame_count);
|
||||
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
|
||||
if (result != MA_SUCCESS) {
|
||||
ma_decoder_uninit(&decoder);
|
||||
return false;
|
||||
}
|
||||
|
||||
#ifdef MTMD_AUDIO_DEBUG
|
||||
// save audio to wav file
|
||||
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
|
||||
ma_encoder encoder;
|
||||
ma_encoder_init_file("output.wav", &config, &encoder);
|
||||
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
|
||||
ma_encoder_uninit(&encoder);
|
||||
#endif
|
||||
|
||||
ma_decoder_uninit(&decoder);
|
||||
return true;
|
||||
}
|
||||
|
||||
} // namespace audio_helpers
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
|
||||
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
||||
std::vector<float> pcmf32;
|
||||
int bitrate = mtmd_get_audio_bitrate(ctx);
|
||||
if (bitrate < 0) {
|
||||
LOG_ERR("This model does not support audio input\n");
|
||||
return nullptr;
|
||||
}
|
||||
if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
|
||||
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
||||
return nullptr;
|
||||
}
|
||||
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
|
||||
}
|
||||
|
||||
// otherwise, we assume it's an image
|
||||
mtmd_bitmap * result = nullptr;
|
||||
{
|
||||
int nx, ny, nc;
|
||||
auto * data = stbi_load_from_memory(buf, len, &nx, &ny, &nc, 3);
|
||||
if (!data) {
|
||||
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
||||
return nullptr;
|
||||
}
|
||||
result = mtmd_bitmap_init(nx, ny, data);
|
||||
stbi_image_free(data);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
|
||||
std::vector<unsigned char> buf;
|
||||
FILE * f = fopen(fname, "rb");
|
||||
if (!f) {
|
||||
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fseek(f, 0, SEEK_END);
|
||||
long file_size = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
buf.resize(file_size);
|
||||
|
||||
size_t n_read = fread(buf.data(), 1, file_size, f);
|
||||
fclose(f);
|
||||
if (n_read != (size_t)file_size) {
|
||||
LOG_ERR("Failed to read entire file %s", fname);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
|
||||
}
|
||||
|
91
tools/mtmd/mtmd-helper.h
Normal file
91
tools/mtmd/mtmd-helper.h
Normal file
@ -0,0 +1,91 @@
|
||||
#ifndef MTMD_HELPER_H
|
||||
#define MTMD_HELPER_H
|
||||
|
||||
#include "ggml.h"
|
||||
#include "llama.h"
|
||||
#include "mtmd.h"
|
||||
|
||||
#include <stddef.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//
|
||||
// libmtmd helper functions
|
||||
//
|
||||
// Please note that these helpers are not guaranteed to be stable.
|
||||
// BREAKING CHANGES are expected.
|
||||
//
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a file
|
||||
// it calls mtmd_helper_bitmap_init_from_buf() internally
|
||||
// returns nullptr on failure
|
||||
// this function is thread-safe
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a buffer containing a file
|
||||
// supported formats:
|
||||
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
|
||||
// audio: formats supported by miniaudio: wav, mp3, flac
|
||||
// note: audio files will be auto-detected based on magic bytes
|
||||
// returns nullptr on failure
|
||||
// this function is thread-safe
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
|
||||
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
||||
|
||||
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
|
||||
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
|
||||
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
||||
// otherwise, returns 0 on success
|
||||
// this function is NOT thread-safe
|
||||
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunks * chunks,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
// works like mtmd_helper_eval_chunks(), but only for a single chunk
|
||||
// this function is NOT thread-safe
|
||||
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
// helper function to decode an image whose embeddings have already been calculated
|
||||
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
||||
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
|
||||
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
float * encoded_embd,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
#endif
|
||||
|
||||
//
|
||||
// C++ wrappers
|
||||
//
|
||||
|
||||
#endif
|
@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) {
|
||||
return ctx->ctx_a != nullptr;
|
||||
}
|
||||
|
||||
// these 2 helpers below use internal clip_image_u8_ptr,
|
||||
// so unfortunately they cannot moved to mtmd-helper.h
|
||||
// however, in theory, user can decode image file to bitmap using
|
||||
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
|
||||
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
||||
std::vector<float> pcmf32;
|
||||
if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
|
||||
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
||||
return nullptr;
|
||||
}
|
||||
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
|
||||
int mtmd_get_audio_bitrate(mtmd_context * ctx) {
|
||||
if (!ctx->ctx_a) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
||||
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
|
||||
if (!ok) {
|
||||
LOG_ERR("Unable to load image from buffer\n");
|
||||
return nullptr;
|
||||
}
|
||||
uint32_t nx, ny;
|
||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
|
||||
return mtmd_bitmap_init(nx, ny, data);
|
||||
}
|
||||
|
||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
|
||||
std::vector<unsigned char> buf;
|
||||
FILE * f = fopen(fname, "rb");
|
||||
if (!f) {
|
||||
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
fseek(f, 0, SEEK_END);
|
||||
long file_size = ftell(f);
|
||||
fseek(f, 0, SEEK_SET);
|
||||
buf.resize(file_size);
|
||||
|
||||
size_t n_read = fread(buf.data(), 1, file_size, f);
|
||||
fclose(f);
|
||||
if (n_read != (size_t)file_size) {
|
||||
LOG_ERR("Failed to read entire file %s", fname);
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
|
||||
// for now, we assume that all audio models have the same bitrate
|
||||
return 16000; // 16kHz
|
||||
}
|
||||
|
||||
//
|
||||
|
@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
|
||||
// whether the current model supports audio input
|
||||
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
||||
|
||||
// get audio bitrate in Hz, for example 16000 for Whisper
|
||||
// return -1 if audio is not supported
|
||||
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
|
||||
|
||||
// mtmd_bitmap
|
||||
//
|
||||
// if bitmap is image:
|
||||
@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
||||
|
||||
/////////////////////////////////////////
|
||||
|
||||
//
|
||||
// Helper functions (can be implemented based on other functions)
|
||||
//
|
||||
// Please note that these helpers are not guaranteed to be stable.
|
||||
// BREAKING CHANGES are expected.
|
||||
//
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a file
|
||||
// it calls mtmd_helper_bitmap_init_from_buf() internally
|
||||
// returns nullptr on failure
|
||||
// this function is thread-safe
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
|
||||
|
||||
// helper function to construct a mtmd_bitmap from a buffer containing a file
|
||||
// supported formats:
|
||||
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
|
||||
// audio: formats supported by miniaudio: wav, mp3, flac
|
||||
// note: audio files will be auto-detected based on magic bytes
|
||||
// returns nullptr on failure
|
||||
// this function is thread-safe
|
||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
|
||||
|
||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
||||
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
||||
|
||||
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
|
||||
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
|
||||
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
||||
|
||||
// helper function that automatically:
|
||||
// 1. run llama_decode() on text chunks
|
||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
||||
// otherwise, returns 0 on success
|
||||
// this function is NOT thread-safe
|
||||
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunks * chunks,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
// works like mtmd_helper_eval_chunks(), but only for a single chunk
|
||||
// this function is NOT thread-safe
|
||||
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
bool logits_last,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
// helper function to decode an image whose embeddings have already been calculated
|
||||
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
||||
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
|
||||
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
||||
struct llama_context * lctx,
|
||||
const mtmd_input_chunk * chunk,
|
||||
float * encoded_embd,
|
||||
llama_pos n_past,
|
||||
llama_seq_id seq_id,
|
||||
int32_t n_batch,
|
||||
llama_pos * new_n_past);
|
||||
|
||||
/////////////////////////////////////////
|
||||
|
||||
// test function, to be used in test-mtmd-c-api.c
|
||||
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
|
||||
|
||||
|
@ -36,7 +36,7 @@ install(TARGETS ${TARGET} RUNTIME)
|
||||
|
||||
target_include_directories(${TARGET} PRIVATE ../llava)
|
||||
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
|
||||
|
||||
if (LLAMA_SERVER_SSL)
|
||||
find_package(OpenSSL REQUIRED)
|
||||
|
@ -9,6 +9,7 @@
|
||||
#include "sampling.h"
|
||||
#include "speculative.h"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
|
||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||
#define JSON_ASSERT GGML_ASSERT
|
||||
@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) {
|
||||
throw std::runtime_error("This server does not support multimodal");
|
||||
}
|
||||
for (auto & file : files) {
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
|
||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
|
||||
if (!bmp.ptr) {
|
||||
throw std::runtime_error("Failed to load image or audio file");
|
||||
}
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include "arg.h" // common_remote_get_content
|
||||
#include "base64.hpp"
|
||||
#include "mtmd.h"
|
||||
#include "mtmd-helper.h"
|
||||
|
||||
// increase max payload length to allow use of larger context size
|
||||
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
||||
|
Reference in New Issue
Block a user