mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-07-04 18:16:58 +00:00
mtmd : add ultravox audio input (#13623)
* convert ok, load ok * warmup ok * test * still does not work? * fix padding * temporary give up * fix merge conflict * build_ultravox() * rm test * fix merge conflict * add necessary mtmd APIs * first working version (only 4s of audio) * will this monster compile? * fix compile * please compile * fPIC * fix windows * various fixes * clean up audio_helpers * fix conversion * add some debug stuff * long audio input ok * adapt the api * add --audio arg * final touch UX * add miniaudio to readme * fix typo * refactor kv metadata * mtmd_default_marker()
This commit is contained in:
@ -936,12 +936,18 @@ class GGUFWriter:
|
||||
|
||||
# for vision models
|
||||
|
||||
def add_clip_has_vision_encoder(self, value: bool) -> None:
    """Forward ``value`` to ``self.add_bool`` under ``Keys.Clip.HAS_VISION_ENCODER``."""
    key = Keys.Clip.HAS_VISION_ENCODER
    self.add_bool(key, value)
|
||||
|
||||
def add_clip_has_audio_encoder(self, value: bool) -> None:
    """Forward ``value`` to ``self.add_bool`` under ``Keys.Clip.HAS_AUDIO_ENCODER``."""
    key = Keys.Clip.HAS_AUDIO_ENCODER
    self.add_bool(key, value)
|
||||
|
||||
def add_clip_projector_type(self, value: str) -> None:
    """Forward ``value`` to ``self.add_string`` under ``Keys.Clip.PROJECTOR_TYPE``."""
    key = Keys.Clip.PROJECTOR_TYPE
    self.add_string(key, value)
|
||||
|
||||
def add_vision_projection_dim(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipVision.PROJECTION_DIM``."""
    key = Keys.ClipVision.PROJECTION_DIM
    self.add_uint32(key, value)
|
||||
|
||||
def add_vision_has_vision_encoder(self, value: bool) -> None:
    """Forward ``value`` to ``self.add_bool`` under ``Keys.ClipVision.HAS_VISION_ENCODER``."""
    key = Keys.ClipVision.HAS_VISION_ENCODER
    self.add_bool(key, value)
|
||||
|
||||
def add_vision_patch_size(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipVision.PATCH_SIZE``."""
    key = Keys.ClipVision.PATCH_SIZE
    self.add_uint32(key, value)
|
||||
|
||||
@ -957,9 +963,6 @@ class GGUFWriter:
|
||||
def add_vision_head_count(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipVision.Attention.HEAD_COUNT``."""
    key = Keys.ClipVision.Attention.HEAD_COUNT
    self.add_uint32(key, value)
|
||||
|
||||
def add_vision_projector_type(self, value: str) -> None:
    """Forward ``value`` to ``self.add_string`` under ``Keys.ClipVision.PROJECTOR_TYPE``."""
    key = Keys.ClipVision.PROJECTOR_TYPE
    self.add_string(key, value)
|
||||
|
||||
def add_vision_attention_layernorm_eps(self, value: float) -> None:
    """Forward ``value`` to ``self.add_float32`` under ``Keys.ClipVision.Attention.LAYERNORM_EPS``."""
    key = Keys.ClipVision.Attention.LAYERNORM_EPS
    self.add_float32(key, value)
|
||||
|
||||
@ -987,6 +990,32 @@ class GGUFWriter:
|
||||
def add_vision_n_wa_pattern(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipVision.N_WA_PATTERN``."""
    key = Keys.ClipVision.N_WA_PATTERN
    self.add_uint32(key, value)
|
||||
|
||||
# audio models
|
||||
|
||||
def add_audio_projection_dim(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipAudio.PROJECTION_DIM``."""
    key = Keys.ClipAudio.PROJECTION_DIM
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_embedding_length(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipAudio.EMBEDDING_LENGTH``."""
    key = Keys.ClipAudio.EMBEDDING_LENGTH
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_feed_forward_length(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipAudio.FEED_FORWARD_LENGTH``."""
    key = Keys.ClipAudio.FEED_FORWARD_LENGTH
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_block_count(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipAudio.BLOCK_COUNT``."""
    key = Keys.ClipAudio.BLOCK_COUNT
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_head_count(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipAudio.Attention.HEAD_COUNT``."""
    key = Keys.ClipAudio.Attention.HEAD_COUNT
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_attention_layernorm_eps(self, value: float) -> None:
    """Forward ``value`` to ``self.add_float32`` under ``Keys.ClipAudio.Attention.LAYERNORM_EPS``."""
    key = Keys.ClipAudio.Attention.LAYERNORM_EPS
    self.add_float32(key, value)
|
||||
|
||||
def add_audio_num_mel_bins(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipAudio.NUM_MEL_BINS``."""
    key = Keys.ClipAudio.NUM_MEL_BINS
    self.add_uint32(key, value)
|
||||
|
||||
def add_audio_stack_factor(self, value: int) -> None:
    """Forward ``value`` to ``self.add_uint32`` under ``Keys.ClipAudio.Projector.STACK_FACTOR``."""
    key = Keys.ClipAudio.Projector.STACK_FACTOR
    self.add_uint32(key, value)
|
||||
|
||||
def _pack(self, fmt: str, value: Any, skip_pack_prefix: bool = False) -> bytes:
|
||||
pack_prefix = ''
|
||||
if not skip_pack_prefix:
|
||||
|
Reference in New Issue
Block a user