mtmd : add ultravox audio input (#13623)

* convert ok, load ok
* warmup ok
* test
* still does not work?
* fix padding
* temporarily give up
* fix merge conflict
* build_ultravox()
* rm test
* fix merge conflict
* add necessary mtmd APIs
* first working version (only 4s of audio)
* will this monster compile?
* fix compile
* please compile
* fPIC
* fix windows
* various fixes
* clean up audio_helpers
* fix conversion
* add some debug stuff
* long audio input ok
* adapt the api
* add --audio arg
* final touch UX
* add miniaudio to readme
* fix typo
* refactor kv metadata
* mtmd_default_marker()
2025-06-26 19:55:04 +00:00 · 2025-05-22 20:42:48 +02:00
parent ab86335760
commit 797990c4bc
21 changed files with 95401 additions and 259 deletions
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -39,7 +39,7 @@
 using json = nlohmann::ordered_json;

 std::initializer_list<enum llama_example> mmproj_examples = {
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_SERVER,
 };

@@ -2233,12 +2233,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        }
    ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
    add_opt(common_arg(
-        {"--image"}, "FILE",
-        "path to an image file. use with multimodal models. Specify multiple times for batching",
+        {"--image", "--audio"}, "FILE",
+        "path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
        [](common_params & params, const std::string & value) {
            params.image.emplace_back(value);
        }
-    ).set_examples({LLAMA_EXAMPLE_LLAVA}));
+    ).set_examples({LLAMA_EXAMPLE_MTMD}));
    if (llama_supports_rpc()) {
        add_opt(common_arg(
            {"--rpc"}, "SERVERS",
@@ -2868,7 +2868,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
        [](common_params & params, const std::string & value) {
            params.chat_template = value;
        }
-    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_LLAVA}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
+    ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_CHAT_TEMPLATE"));
    add_opt(common_arg(
        {"--chat-template-file"}, "JINJA_TEMPLATE_FILE",
        string_format(
--- a/common/common.h
+++ b/common/common.h
@@ -76,7 +76,7 @@ enum llama_example {
    LLAMA_EXAMPLE_SERVER,
    LLAMA_EXAMPLE_CVECTOR_GENERATOR,
    LLAMA_EXAMPLE_EXPORT_LORA,
-    LLAMA_EXAMPLE_LLAVA,
+    LLAMA_EXAMPLE_MTMD,
    LLAMA_EXAMPLE_LOOKUP,
    LLAMA_EXAMPLE_PARALLEL,
    LLAMA_EXAMPLE_TTS,