mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-27 12:05:03 +00:00
mtmd : move helpers to dedicated library (⚠️ breaking change) (#13866)
* mtmd : move helpers to dedicated library * fix server build * rm leftover cmakelist code
This commit is contained in:
@ -49,6 +49,6 @@ charset = unset
|
|||||||
trim_trailing_whitespace = unset
|
trim_trailing_whitespace = unset
|
||||||
insert_final_newline = unset
|
insert_final_newline = unset
|
||||||
|
|
||||||
[tools/mtmd/miniaudio.h]
|
[tools/mtmd/vendor/miniaudio.h]
|
||||||
trim_trailing_whitespace = unset
|
trim_trailing_whitespace = unset
|
||||||
insert_final_newline = unset
|
insert_final_newline = unset
|
||||||
|
@ -1,48 +1,54 @@
|
|||||||
# mtmd
|
# mtmd
|
||||||
|
|
||||||
# compile mtmd-audio separately to avoid long compile times with miniaudio.h
|
|
||||||
# TODO @ngxson : move miniaudio.h and stb_image.h to mtmd-helper.cpp, then compile the helper as a separate library
|
|
||||||
add_library(mtmd_audio STATIC mtmd-audio.cpp mtmd-audio.h)
|
|
||||||
if (BUILD_SHARED_LIBS)
|
|
||||||
set_target_properties(mtmd_audio PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
|
||||||
endif()
|
|
||||||
target_link_libraries(mtmd_audio PRIVATE ggml ${CMAKE_THREAD_LIBS_INIT})
|
|
||||||
target_compile_features(mtmd_audio PRIVATE cxx_std_17)
|
|
||||||
target_include_directories(mtmd_audio PRIVATE .)
|
|
||||||
|
|
||||||
add_library(mtmd OBJECT
|
add_library(mtmd OBJECT
|
||||||
mtmd.cpp
|
mtmd.cpp
|
||||||
mtmd-helper.cpp
|
mtmd-audio.cpp
|
||||||
mtmd.h
|
mtmd.h
|
||||||
clip.cpp
|
clip.cpp
|
||||||
clip.h
|
clip.h
|
||||||
clip-impl.h
|
clip-impl.h
|
||||||
)
|
)
|
||||||
|
|
||||||
target_link_libraries(mtmd PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(mtmd PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
||||||
target_include_directories(mtmd PUBLIC .)
|
target_include_directories(mtmd PUBLIC .)
|
||||||
target_include_directories(mtmd PRIVATE ../..)
|
target_include_directories(mtmd PRIVATE ../..)
|
||||||
target_include_directories(mtmd PRIVATE ../../common) # for stb_image.h
|
|
||||||
|
|
||||||
target_compile_features(mtmd PRIVATE cxx_std_17)
|
target_compile_features(mtmd PRIVATE cxx_std_17)
|
||||||
|
|
||||||
add_library(mtmd_static STATIC $<TARGET_OBJECTS:mtmd>)
|
# compile the helper separately, to avoid long compile times with miniaudio.h and stb_image.h
|
||||||
|
|
||||||
|
add_library(mtmd_helper OBJECT
|
||||||
|
mtmd-helper.cpp
|
||||||
|
mtmd-helper.h
|
||||||
|
)
|
||||||
|
|
||||||
|
target_link_libraries(mtmd_helper PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
target_include_directories(mtmd_helper PUBLIC .)
|
||||||
|
target_include_directories(mtmd_helper PRIVATE ./vendor)
|
||||||
|
target_include_directories(mtmd_helper PRIVATE ../..)
|
||||||
|
target_compile_features(mtmd_helper PRIVATE cxx_std_17)
|
||||||
|
|
||||||
if (BUILD_SHARED_LIBS)
|
if (BUILD_SHARED_LIBS)
|
||||||
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
set_target_properties(mtmd PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
target_compile_definitions(mtmd PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||||
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
|
add_library(mtmd_shared SHARED $<TARGET_OBJECTS:mtmd>)
|
||||||
target_link_libraries(mtmd_shared PRIVATE ggml llama mtmd_audio ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(mtmd_shared PRIVATE ggml llama ${CMAKE_THREAD_LIBS_INIT})
|
||||||
install(TARGETS mtmd_shared LIBRARY)
|
install(TARGETS mtmd_shared LIBRARY)
|
||||||
|
|
||||||
|
set_target_properties(mtmd_helper PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||||
|
target_compile_definitions(mtmd_helper PRIVATE LLAMA_SHARED LLAMA_BUILD)
|
||||||
|
# the helper's shared library must be assembled from the helper's own object files
add_library(mtmd_helper_shared SHARED $<TARGET_OBJECTS:mtmd_helper>)
|
||||||
|
target_link_libraries(mtmd_helper_shared PRIVATE ggml llama mtmd ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
install(TARGETS mtmd_helper_shared LIBRARY)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if (NOT MSVC)
|
if (NOT MSVC)
|
||||||
target_compile_options(mtmd PRIVATE -Wno-cast-qual) # stb_image.h
|
# for stb_image.h and miniaudio.h
|
||||||
target_compile_options(mtmd_audio PRIVATE -Wno-cast-qual) # miniaudio.h
|
target_compile_options(mtmd_helper PRIVATE -Wno-cast-qual)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
if(TARGET BUILD_INFO)
|
if(TARGET BUILD_INFO)
|
||||||
add_dependencies(mtmd BUILD_INFO)
|
add_dependencies(mtmd BUILD_INFO)
|
||||||
|
add_dependencies(mtmd_helper BUILD_INFO)
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
add_executable(llama-llava-cli deprecation-warning.cpp)
|
add_executable(llama-llava-cli deprecation-warning.cpp)
|
||||||
@ -54,5 +60,5 @@ set(TARGET llama-mtmd-cli)
|
|||||||
add_executable(${TARGET} mtmd-cli.cpp)
|
add_executable(${TARGET} mtmd-cli.cpp)
|
||||||
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-mtmd-cli)
|
||||||
install(TARGETS ${TARGET} RUNTIME)
|
install(TARGETS ${TARGET} RUNTIME)
|
||||||
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
|
||||||
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
target_compile_features(${TARGET} PRIVATE cxx_std_17)
|
||||||
|
@ -11,9 +11,6 @@
|
|||||||
#include "ggml-backend.h"
|
#include "ggml-backend.h"
|
||||||
#include "gguf.h"
|
#include "gguf.h"
|
||||||
|
|
||||||
#define STB_IMAGE_IMPLEMENTATION
|
|
||||||
#include "stb_image.h"
|
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdlib>
|
#include <cstdlib>
|
||||||
@ -2786,30 +2783,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
|
|||||||
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
|
memcpy(img->buf.data(), rgb_pixels, img->buf.size());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool clip_image_load_from_file(const char * fname, clip_image_u8 * img) {
|
|
||||||
int nx, ny, nc;
|
|
||||||
auto * data = stbi_load(fname, &nx, &ny, &nc, 3);
|
|
||||||
if (!data) {
|
|
||||||
LOG_ERR("%s: failed to load image '%s'\n", __func__, fname);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
clip_build_img_from_pixels(data, nx, ny, img);
|
|
||||||
stbi_image_free(data);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length, struct clip_image_u8 * img) {
|
|
||||||
int nx, ny, nc;
|
|
||||||
auto * data = stbi_load_from_memory(bytes, bytes_length, &nx, &ny, &nc, 3);
|
|
||||||
if (!data) {
|
|
||||||
LOG_ERR("%s: failed to decode image bytes\n", __func__);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
clip_build_img_from_pixels(data, nx, ny, img);
|
|
||||||
stbi_image_free(data);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
|
// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
|
||||||
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
|
static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
|
||||||
dst.nx = src.nx;
|
dst.nx = src.nx;
|
||||||
|
@ -1,28 +1,5 @@
|
|||||||
// fix problem with std::min and std::max
|
|
||||||
#if defined(_WIN32)
|
|
||||||
#define WIN32_LEAN_AND_MEAN
|
|
||||||
#ifndef NOMINMAX
|
|
||||||
# define NOMINMAX
|
|
||||||
#endif
|
|
||||||
#include <windows.h>
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#include "mtmd-audio.h"
|
#include "mtmd-audio.h"
|
||||||
|
|
||||||
//#define MTMD_AUDIO_DEBUG
|
|
||||||
|
|
||||||
#define MINIAUDIO_IMPLEMENTATION
|
|
||||||
#ifndef MTMD_AUDIO_DEBUG
|
|
||||||
# define MA_NO_ENCODING
|
|
||||||
#endif
|
|
||||||
#define MA_NO_DEVICE_IO
|
|
||||||
#define MA_NO_RESOURCE_MANAGER
|
|
||||||
#define MA_NO_NODE_GRAPH
|
|
||||||
#define MA_NO_ENGINE
|
|
||||||
#define MA_NO_GENERATION
|
|
||||||
#define MA_API static
|
|
||||||
#include "miniaudio.h"
|
|
||||||
|
|
||||||
#define _USE_MATH_DEFINES // for M_PI
|
#define _USE_MATH_DEFINES // for M_PI
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
@ -359,69 +336,6 @@ bool preprocess_audio(
|
|||||||
} // namespace whisper_preprocessor
|
} // namespace whisper_preprocessor
|
||||||
|
|
||||||
|
|
||||||
namespace audio_helpers {
|
|
||||||
|
|
||||||
bool is_audio_file(const char * buf, size_t len) {
|
|
||||||
if (len < 12) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
|
|
||||||
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
|
|
||||||
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
|
|
||||||
bool is_mp3 = len >= 3 && (
|
|
||||||
memcmp(buf, "ID3", 3) == 0 ||
|
|
||||||
// Check for MPEG sync word (simplified check)
|
|
||||||
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
|
|
||||||
);
|
|
||||||
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
|
|
||||||
|
|
||||||
return is_wav || is_mp3 || is_flac;
|
|
||||||
}
|
|
||||||
|
|
||||||
// returns true if the buffer is a valid audio file
|
|
||||||
bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
|
|
||||||
ma_result result;
|
|
||||||
const int channels = 1;
|
|
||||||
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
|
|
||||||
ma_decoder decoder;
|
|
||||||
|
|
||||||
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
|
|
||||||
if (result != MA_SUCCESS) {
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
ma_uint64 frame_count;
|
|
||||||
ma_uint64 frames_read;
|
|
||||||
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
|
|
||||||
if (result != MA_SUCCESS) {
|
|
||||||
ma_decoder_uninit(&decoder);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
pcmf32_mono.resize(frame_count);
|
|
||||||
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
|
|
||||||
if (result != MA_SUCCESS) {
|
|
||||||
ma_decoder_uninit(&decoder);
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef MTMD_AUDIO_DEBUG
|
|
||||||
// save audio to wav file
|
|
||||||
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
|
|
||||||
ma_encoder encoder;
|
|
||||||
ma_encoder_init_file("output.wav", &config, &encoder);
|
|
||||||
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
|
|
||||||
ma_encoder_uninit(&encoder);
|
|
||||||
#endif
|
|
||||||
|
|
||||||
ma_decoder_uninit(&decoder);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace wav_utils
|
|
||||||
|
|
||||||
|
|
||||||
// precalculated mel filter banks
|
// precalculated mel filter banks
|
||||||
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
|
// values are multiplied by 1000.0 to save space, and will be divided by 1000.0 in the end of the function
|
||||||
//
|
//
|
||||||
|
@ -32,7 +32,7 @@ struct whisper_filters {
|
|||||||
std::vector<float> data;
|
std::vector<float> data;
|
||||||
};
|
};
|
||||||
|
|
||||||
extern bool preprocess_audio(
|
bool preprocess_audio(
|
||||||
const float * samples,
|
const float * samples,
|
||||||
size_t n_samples,
|
size_t n_samples,
|
||||||
const whisper_filters & filters,
|
const whisper_filters & filters,
|
||||||
@ -40,23 +40,8 @@ extern bool preprocess_audio(
|
|||||||
|
|
||||||
} // namespace whisper_preprocessor
|
} // namespace whisper_preprocessor
|
||||||
|
|
||||||
|
|
||||||
// TODO @ngxson : move this helper to mtmd-helpers.cpp
|
|
||||||
namespace audio_helpers {
|
|
||||||
|
|
||||||
extern bool is_audio_file(const char * buf, size_t len);
|
|
||||||
|
|
||||||
extern bool decode_audio_from_buf(
|
|
||||||
const unsigned char * buf_in,
|
|
||||||
size_t len,
|
|
||||||
int target_sampler_rate,
|
|
||||||
std::vector<float> & pcmf32_mono);
|
|
||||||
|
|
||||||
} // namespace audio_helpers
|
|
||||||
|
|
||||||
|
|
||||||
namespace whisper_precalc_filters {
|
namespace whisper_precalc_filters {
|
||||||
|
|
||||||
extern whisper_preprocessor::whisper_filters get_128_bins();
|
whisper_preprocessor::whisper_filters get_128_bins();
|
||||||
|
|
||||||
} // namespace whisper_precalc_filters
|
} // namespace whisper_precalc_filters
|
||||||
|
@ -7,6 +7,7 @@
|
|||||||
#include "console.h"
|
#include "console.h"
|
||||||
#include "chat.h"
|
#include "chat.h"
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
|
#include "mtmd-helper.h"
|
||||||
|
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <limits.h>
|
#include <limits.h>
|
||||||
@ -143,7 +144,7 @@ struct mtmd_cli_context {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool load_media(const std::string & fname) {
|
bool load_media(const std::string & fname) {
|
||||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(fname.c_str()));
|
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_file(ctx_vision.get(), fname.c_str()));
|
||||||
if (!bmp.ptr) {
|
if (!bmp.ptr) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1,10 +1,37 @@
|
|||||||
|
// fix problem with std::min and std::max
|
||||||
|
#if defined(_WIN32)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
# define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
|
#include "mtmd-helper.h"
|
||||||
#include "llama.h"
|
#include "llama.h"
|
||||||
|
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <cinttypes>
|
#include <cinttypes>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
//#define MTMD_AUDIO_DEBUG
|
||||||
|
|
||||||
|
#define MINIAUDIO_IMPLEMENTATION
|
||||||
|
#ifndef MTMD_AUDIO_DEBUG
|
||||||
|
# define MA_NO_ENCODING
|
||||||
|
#endif
|
||||||
|
#define MA_NO_DEVICE_IO
|
||||||
|
#define MA_NO_RESOURCE_MANAGER
|
||||||
|
#define MA_NO_NODE_GRAPH
|
||||||
|
#define MA_NO_ENGINE
|
||||||
|
#define MA_NO_GENERATION
|
||||||
|
#define MA_API static
|
||||||
|
#include "vendor/miniaudio.h"
|
||||||
|
|
||||||
|
#define STB_IMAGE_IMPLEMENTATION
|
||||||
|
#include "vendor/stb_image.h"
|
||||||
|
|
||||||
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
|
#define LOG_INF(...) fprintf(stdout, __VA_ARGS__)
|
||||||
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
|
#define LOG_ERR(...) fprintf(stderr, __VA_ARGS__)
|
||||||
|
|
||||||
@ -315,3 +342,118 @@ int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
|||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace audio_helpers {
|
||||||
|
|
||||||
|
static bool is_audio_file(const char * buf, size_t len) {
|
||||||
|
if (len < 12) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// RIFF ref: https://en.wikipedia.org/wiki/Resource_Interchange_File_Format
|
||||||
|
// WAV ref: https://www.mmsp.ece.mcgill.ca/Documents/AudioFormats/WAVE/WAVE.html
|
||||||
|
bool is_wav = memcmp(buf, "RIFF", 4) == 0 && memcmp(buf + 8, "WAVE", 4) == 0;
|
||||||
|
bool is_mp3 = len >= 3 && (
|
||||||
|
memcmp(buf, "ID3", 3) == 0 ||
|
||||||
|
// Check for MPEG sync word (simplified check)
|
||||||
|
((unsigned char)buf[0] == 0xFF && ((unsigned char)buf[1] & 0xE0) == 0xE0)
|
||||||
|
);
|
||||||
|
bool is_flac = memcmp(buf, "fLaC", 4) == 0;
|
||||||
|
|
||||||
|
return is_wav || is_mp3 || is_flac;
|
||||||
|
}
|
||||||
|
|
||||||
|
// decodes an in-memory audio file to mono float32 PCM at target_sampler_rate; returns false if decoding fails
|
||||||
|
static bool decode_audio_from_buf(const unsigned char * buf_in, size_t len, int target_sampler_rate, std::vector<float> & pcmf32_mono) {
|
||||||
|
ma_result result;
|
||||||
|
const int channels = 1;
|
||||||
|
ma_decoder_config decoder_config = ma_decoder_config_init(ma_format_f32, channels, target_sampler_rate);
|
||||||
|
ma_decoder decoder;
|
||||||
|
|
||||||
|
result = ma_decoder_init_memory(buf_in, len, &decoder_config, &decoder);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
ma_uint64 frame_count;
|
||||||
|
ma_uint64 frames_read;
|
||||||
|
result = ma_decoder_get_length_in_pcm_frames(&decoder, &frame_count);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
pcmf32_mono.resize(frame_count);
|
||||||
|
result = ma_decoder_read_pcm_frames(&decoder, pcmf32_mono.data(), frame_count, &frames_read);
|
||||||
|
if (result != MA_SUCCESS) {
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef MTMD_AUDIO_DEBUG
|
||||||
|
// save audio to wav file
|
||||||
|
ma_encoder_config config = ma_encoder_config_init(ma_encoding_format_wav, ma_format_f32, 1, target_sampler_rate);
|
||||||
|
ma_encoder encoder;
|
||||||
|
ma_encoder_init_file("output.wav", &config, &encoder);
|
||||||
|
ma_encoder_write_pcm_frames(&encoder, pcmf32_mono.data(), pcmf32_mono.size(), &frames_read);
|
||||||
|
ma_encoder_uninit(&encoder);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
ma_decoder_uninit(&decoder);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace audio_helpers
|
||||||
|
|
||||||
|
// Build a bitmap from a buffer holding an encoded media file.
//
// The buffer is first checked against the audio magic bytes; if it matches, it
// is decoded to PCM at the rate reported by mtmd_get_audio_bitrate() and turned
// into an audio bitmap. Anything else is treated as an image and decoded with
// stb_image into 3 channels.
//
// Returns nullptr (after logging to stderr) when the context reports no audio
// support for an audio buffer, when decoding fails, or when an image buffer is
// too large to be described by the int length that stb_image accepts.
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len) {
    if (audio_helpers::is_audio_file((const char *)buf, len)) {
        std::vector<float> pcmf32;
        // a negative value means the context does not support audio input
        int bitrate = mtmd_get_audio_bitrate(ctx);
        if (bitrate < 0) {
            LOG_ERR("This model does not support audio input\n");
            return nullptr;
        }
        if (!audio_helpers::decode_audio_from_buf(buf, len, bitrate, pcmf32)) {
            LOG_ERR("Unable to read WAV audio file from buffer\n");
            return nullptr;
        }
        return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
    }

    // otherwise, we assume it's an image

    // stbi_load_from_memory() takes the length as int; reject buffers whose
    // size does not survive the conversion instead of silently truncating it
    const int len_i = (int) len;
    if (len_i < 0 || (size_t) len_i != len) {
        LOG_ERR("%s: image buffer is too large (%zu bytes)\n", __func__, len);
        return nullptr;
    }

    int nx, ny, nc;
    auto * data = stbi_load_from_memory(buf, len_i, &nx, &ny, &nc, 3);
    if (!data) {
        LOG_ERR("%s: failed to decode image bytes\n", __func__);
        return nullptr;
    }

    // the decoded pixels are released right after mtmd_bitmap_init() returns
    mtmd_bitmap * result = mtmd_bitmap_init(nx, ny, data);
    stbi_image_free(data);
    return result;
}
|
||||||
|
|
||||||
|
// Load a media file from disk and build a bitmap from its contents.
//
// The whole file is read into memory and handed to
// mtmd_helper_bitmap_init_from_buf(), which decides between audio and image.
// Returns nullptr (after logging to stderr) if the file cannot be opened,
// sized or read completely, or if decoding the buffer fails.
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname) {
    std::vector<unsigned char> buf;
    FILE * f = fopen(fname, "rb");
    if (!f) {
        LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
        return nullptr;
    }

    // determine the file size; ftell() reports -1 on failure (e.g. for a
    // non-seekable stream), which must never reach buf.resize() as a huge
    // unsigned value
    long file_size = -1;
    if (fseek(f, 0, SEEK_END) == 0) {
        file_size = ftell(f);
    }
    if (file_size < 0 || fseek(f, 0, SEEK_SET) != 0) {
        LOG_ERR("Unable to determine size of file %s: %s\n", fname, strerror(errno));
        fclose(f);
        return nullptr;
    }
    buf.resize(file_size);

    size_t n_read = fread(buf.data(), 1, file_size, f);
    fclose(f);
    if (n_read != (size_t)file_size) {
        LOG_ERR("Failed to read entire file %s\n", fname);
        return nullptr;
    }

    return mtmd_helper_bitmap_init_from_buf(ctx, buf.data(), buf.size());
}
|
||||||
|
91
tools/mtmd/mtmd-helper.h
Normal file
91
tools/mtmd/mtmd-helper.h
Normal file
@ -0,0 +1,91 @@
|
|||||||
|
#ifndef MTMD_HELPER_H
|
||||||
|
#define MTMD_HELPER_H
|
||||||
|
|
||||||
|
#include "ggml.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "mtmd.h"
|
||||||
|
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
extern "C" {
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// libmtmd helper functions
|
||||||
|
//
|
||||||
|
// Please note that these helpers are not guaranteed to be stable.
|
||||||
|
// BREAKING CHANGES are expected.
|
||||||
|
//
|
||||||
|
|
||||||
|
// helper function to construct a mtmd_bitmap from a file
|
||||||
|
// it calls mtmd_helper_bitmap_init_from_buf() internally
|
||||||
|
// returns nullptr on failure
|
||||||
|
// this function is thread-safe
|
||||||
|
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char * fname);
|
||||||
|
|
||||||
|
// helper function to construct a mtmd_bitmap from a buffer containing a file
|
||||||
|
// supported formats:
|
||||||
|
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
|
||||||
|
// audio: formats supported by miniaudio: wav, mp3, flac
|
||||||
|
// note: audio files will be auto-detected based on magic bytes
|
||||||
|
// returns nullptr on failure
|
||||||
|
// this function is thread-safe
|
||||||
|
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
|
||||||
|
|
||||||
|
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
||||||
|
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
||||||
|
|
||||||
|
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
|
||||||
|
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
|
||||||
|
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
||||||
|
|
||||||
|
// helper function that automatically:
|
||||||
|
// 1. run llama_decode() on text chunks
|
||||||
|
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
||||||
|
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
||||||
|
// otherwise, returns 0 on success
|
||||||
|
// this function is NOT thread-safe
|
||||||
|
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
||||||
|
struct llama_context * lctx,
|
||||||
|
const mtmd_input_chunks * chunks,
|
||||||
|
llama_pos n_past,
|
||||||
|
llama_seq_id seq_id,
|
||||||
|
int32_t n_batch,
|
||||||
|
bool logits_last,
|
||||||
|
llama_pos * new_n_past);
|
||||||
|
|
||||||
|
// works like mtmd_helper_eval_chunks(), but only for a single chunk
|
||||||
|
// this function is NOT thread-safe
|
||||||
|
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
||||||
|
struct llama_context * lctx,
|
||||||
|
const mtmd_input_chunk * chunk,
|
||||||
|
llama_pos n_past,
|
||||||
|
llama_seq_id seq_id,
|
||||||
|
int32_t n_batch,
|
||||||
|
bool logits_last,
|
||||||
|
llama_pos * new_n_past);
|
||||||
|
|
||||||
|
// helper function to decode an image whose embeddings have already been calculated
|
||||||
|
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
||||||
|
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
|
||||||
|
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
||||||
|
struct llama_context * lctx,
|
||||||
|
const mtmd_input_chunk * chunk,
|
||||||
|
float * encoded_embd,
|
||||||
|
llama_pos n_past,
|
||||||
|
llama_seq_id seq_id,
|
||||||
|
int32_t n_batch,
|
||||||
|
llama_pos * new_n_past);
|
||||||
|
|
||||||
|
#ifdef __cplusplus
|
||||||
|
} // extern "C"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
//
|
||||||
|
// C++ wrappers
|
||||||
|
//
|
||||||
|
|
||||||
|
#endif
|
@ -819,53 +819,12 @@ bool mtmd_support_audio(mtmd_context * ctx) {
|
|||||||
return ctx->ctx_a != nullptr;
|
return ctx->ctx_a != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
// these 2 helpers below use internal clip_image_u8_ptr,
|
int mtmd_get_audio_bitrate(mtmd_context * ctx) {
|
||||||
// so unfortunately they cannot moved to mtmd-helper.h
|
if (!ctx->ctx_a) {
|
||||||
// however, in theory, user can decode image file to bitmap using
|
return -1;
|
||||||
// whichever library they want, and then use mtmd_bitmap_init() to create bitmap
|
|
||||||
|
|
||||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len) {
|
|
||||||
if (audio_helpers::is_audio_file((const char *)buf, len)) {
|
|
||||||
std::vector<float> pcmf32;
|
|
||||||
if (!audio_helpers::decode_audio_from_buf(buf, len, COMMON_SAMPLE_RATE, pcmf32)) {
|
|
||||||
LOG_ERR("Unable to read WAV audio file from buffer\n");
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
return mtmd_bitmap_init_from_audio(pcmf32.size(), pcmf32.data());
|
|
||||||
}
|
}
|
||||||
|
// for now, we assume that all audio models have the same bitrate
|
||||||
clip_image_u8_ptr img_u8(clip_image_u8_init());
|
return 16000; // 16kHz
|
||||||
bool ok = clip_image_load_from_bytes(buf, len, img_u8.get());
|
|
||||||
if (!ok) {
|
|
||||||
LOG_ERR("Unable to load image from buffer\n");
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
uint32_t nx, ny;
|
|
||||||
unsigned char * data = clip_image_u8_get_data(img_u8.get(), &nx, &ny);
|
|
||||||
return mtmd_bitmap_init(nx, ny, data);
|
|
||||||
}
|
|
||||||
|
|
||||||
mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname) {
|
|
||||||
std::vector<unsigned char> buf;
|
|
||||||
FILE * f = fopen(fname, "rb");
|
|
||||||
if (!f) {
|
|
||||||
LOG_ERR("Unable to open file %s: %s\n", fname, strerror(errno));
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
fseek(f, 0, SEEK_END);
|
|
||||||
long file_size = ftell(f);
|
|
||||||
fseek(f, 0, SEEK_SET);
|
|
||||||
buf.resize(file_size);
|
|
||||||
|
|
||||||
size_t n_read = fread(buf.data(), 1, file_size, f);
|
|
||||||
fclose(f);
|
|
||||||
if (n_read != (size_t)file_size) {
|
|
||||||
LOG_ERR("Failed to read entire file %s", fname);
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
return mtmd_helper_bitmap_init_from_buf(buf.data(), buf.size());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
//
|
//
|
||||||
|
@ -109,6 +109,10 @@ MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
|
|||||||
// whether the current model supports audio input
|
// whether the current model supports audio input
|
||||||
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
|
||||||
|
|
||||||
|
// get the audio sample rate in Hz (called "bitrate" in this API), for example 16000 for Whisper
|
||||||
|
// return -1 if audio is not supported
|
||||||
|
MTMD_API int mtmd_get_audio_bitrate(mtmd_context * ctx);
|
||||||
|
|
||||||
// mtmd_bitmap
|
// mtmd_bitmap
|
||||||
//
|
//
|
||||||
// if bitmap is image:
|
// if bitmap is image:
|
||||||
@ -209,75 +213,6 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
|
|||||||
|
|
||||||
/////////////////////////////////////////
|
/////////////////////////////////////////
|
||||||
|
|
||||||
//
|
|
||||||
// Helper functions (can be implemented based on other functions)
|
|
||||||
//
|
|
||||||
// Please note that these helpers are not guaranteed to be stable.
|
|
||||||
// BREAKING CHANGES are expected.
|
|
||||||
//
|
|
||||||
|
|
||||||
// helper function to construct a mtmd_bitmap from a file
|
|
||||||
// it calls mtmd_helper_bitmap_init_from_buf() internally
|
|
||||||
// returns nullptr on failure
|
|
||||||
// this function is thread-safe
|
|
||||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_file(const char * fname);
|
|
||||||
|
|
||||||
// helper function to construct a mtmd_bitmap from a buffer containing a file
|
|
||||||
// supported formats:
|
|
||||||
// image: formats supported by stb_image: jpg, png, bmp, gif, etc.
|
|
||||||
// audio: formats supported by miniaudio: wav, mp3, flac
|
|
||||||
// note: audio files will be auto-detected based on magic bytes
|
|
||||||
// returns nullptr on failure
|
|
||||||
// this function is thread-safe
|
|
||||||
MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(const unsigned char * buf, size_t len);
|
|
||||||
|
|
||||||
// helper to count the total number of tokens from a list of chunks, useful to keep track of KV cache
|
|
||||||
MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
|
|
||||||
|
|
||||||
// helper to count the total position of tokens from a list of chunks, useful to keep track of n_past
|
|
||||||
// normally, n_pos is equal to n_tokens, but for M-RoPE it is different
|
|
||||||
MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
|
|
||||||
|
|
||||||
// helper function that automatically:
|
|
||||||
// 1. run llama_decode() on text chunks
|
|
||||||
// 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
|
|
||||||
// if any of the mtmd_encode() or llama_decode() calls return non-zero, stop and forward the error
|
|
||||||
// otherwise, returns 0 on success
|
|
||||||
// this function is NOT thread-safe
|
|
||||||
MTMD_API int32_t mtmd_helper_eval_chunks(mtmd_context * ctx,
|
|
||||||
struct llama_context * lctx,
|
|
||||||
const mtmd_input_chunks * chunks,
|
|
||||||
llama_pos n_past,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
int32_t n_batch,
|
|
||||||
bool logits_last,
|
|
||||||
llama_pos * new_n_past);
|
|
||||||
|
|
||||||
// works like mtmd_helper_eval_chunks(), but only for a single chunk
|
|
||||||
// this function is NOT thread-safe
|
|
||||||
MTMD_API int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
|
|
||||||
struct llama_context * lctx,
|
|
||||||
const mtmd_input_chunk * chunk,
|
|
||||||
llama_pos n_past,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
int32_t n_batch,
|
|
||||||
bool logits_last,
|
|
||||||
llama_pos * new_n_past);
|
|
||||||
|
|
||||||
// helper function to decode an image whose embeddings have already been calculated
|
|
||||||
// this helper will handle batching and pre/post decoding setup (for ex. gemma 3 requires non-causal attention)
|
|
||||||
// ret 0 on success, -1 on chunk not being a valid image chunk, 1 on decode failure
|
|
||||||
MTMD_API int32_t mtmd_helper_decode_image_chunk(mtmd_context * ctx,
|
|
||||||
struct llama_context * lctx,
|
|
||||||
const mtmd_input_chunk * chunk,
|
|
||||||
float * encoded_embd,
|
|
||||||
llama_pos n_past,
|
|
||||||
llama_seq_id seq_id,
|
|
||||||
int32_t n_batch,
|
|
||||||
llama_pos * new_n_past);
|
|
||||||
|
|
||||||
/////////////////////////////////////////
|
|
||||||
|
|
||||||
// test function, to be used in test-mtmd-c-api.c
|
// test function, to be used in test-mtmd-c-api.c
|
||||||
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
|
MTMD_API mtmd_input_chunks * mtmd_test_create_input_chunks(void);
|
||||||
|
|
||||||
|
@ -36,7 +36,7 @@ install(TARGETS ${TARGET} RUNTIME)
|
|||||||
|
|
||||||
target_include_directories(${TARGET} PRIVATE ../llava)
|
target_include_directories(${TARGET} PRIVATE ../llava)
|
||||||
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
target_include_directories(${TARGET} PRIVATE ${CMAKE_SOURCE_DIR})
|
||||||
target_link_libraries(${TARGET} PRIVATE common mtmd ${CMAKE_THREAD_LIBS_INIT})
|
target_link_libraries(${TARGET} PRIVATE common mtmd mtmd_helper ${CMAKE_THREAD_LIBS_INIT})
|
||||||
|
|
||||||
if (LLAMA_SERVER_SSL)
|
if (LLAMA_SERVER_SSL)
|
||||||
find_package(OpenSSL REQUIRED)
|
find_package(OpenSSL REQUIRED)
|
||||||
|
@ -9,6 +9,7 @@
|
|||||||
#include "sampling.h"
|
#include "sampling.h"
|
||||||
#include "speculative.h"
|
#include "speculative.h"
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
|
#include "mtmd-helper.h"
|
||||||
|
|
||||||
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
// Change JSON_ASSERT from assert() to GGML_ASSERT:
|
||||||
#define JSON_ASSERT GGML_ASSERT
|
#define JSON_ASSERT GGML_ASSERT
|
||||||
@ -4187,7 +4188,7 @@ int main(int argc, char ** argv) {
|
|||||||
throw std::runtime_error("This server does not support multimodal");
|
throw std::runtime_error("This server does not support multimodal");
|
||||||
}
|
}
|
||||||
for (auto & file : files) {
|
for (auto & file : files) {
|
||||||
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(file.data(), file.size()));
|
mtmd::bitmap bmp(mtmd_helper_bitmap_init_from_buf(ctx_server.mctx, file.data(), file.size()));
|
||||||
if (!bmp.ptr) {
|
if (!bmp.ptr) {
|
||||||
throw std::runtime_error("Failed to load image or audio file");
|
throw std::runtime_error("Failed to load image or audio file");
|
||||||
}
|
}
|
||||||
|
@ -6,6 +6,7 @@
|
|||||||
#include "arg.h" // common_remote_get_content
|
#include "arg.h" // common_remote_get_content
|
||||||
#include "base64.hpp"
|
#include "base64.hpp"
|
||||||
#include "mtmd.h"
|
#include "mtmd.h"
|
||||||
|
#include "mtmd-helper.h"
|
||||||
|
|
||||||
// increase max payload length to allow use of larger context size
|
// increase max payload length to allow use of larger context size
|
||||||
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
|
||||||
|
Reference in New Issue
Block a user