tool-call: refactor common chat / tool-call api (+ tests / fixes) (#11900)
* tool-call refactoring: moved common_chat_* to chat.h, common_chat_templates_init return a unique_ptr to opaque type
* addressed clang-tidy lints in [test-]chat.*
* rm minja deps from util & common & move it to common/minja/
* add name & tool_call_id to common_chat_msg
* add common_chat_tool
* added json <-> tools, msgs conversions to chat.h
* fix double bos/eos jinja avoidance hack (was preventing inner bos/eos tokens)
* fix deepseek r1 slow test (no longer <think> opening w/ new template)
* allow empty tools w/ auto + grammar
* fix & test server grammar & json_schema params w/ & w/o --jinja
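As a rough illustration of the API shape described above: the templates now live behind an opaque pointer returned by common_chat_templates_init, and messages/tools are plain structs (with the new name, tool_call_id and common_chat_tool additions) rather than raw json. This is a minimal sketch only; the unique_ptr alias, the init signature and any field not named in the commit message are assumptions, not the verbatim chat.h interface.

// Sketch of the post-refactor shapes described in this commit message.
// Everything beyond the names mentioned above (exact init signature,
// pointer alias, extra fields) is an assumption, not the real chat.h API.
#include <memory>
#include <string>

struct common_chat_templates;  // opaque type per this refactor
using common_chat_templates_ptr = std::unique_ptr<common_chat_templates>;  // assumed alias

struct common_chat_msg {
    std::string role;
    std::string content;
    std::string name;          // added by this change
    std::string tool_call_id;  // added by this change
};

struct common_chat_tool {      // added by this change
    std::string name;
    std::string description;
    std::string parameters;    // e.g. a serialized JSON schema (assumption)
};

// Hypothetical call site, assuming an init helper that reads the model's
// built-in template unless an override string is given:
//   common_chat_templates_ptr tmpls = common_chat_templates_init(model, /* override */ "");
//   common_chat_msg  user_msg { "user", "What is the weather in Paris?", "", "" };
//   common_chat_tool weather  { "get_weather", "Look up the weather", "{\"type\":\"object\"}" };
//   ... convert msgs/tools to and from json with the new chat.h helpers and render the prompt.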
@@ -12,8 +12,6 @@
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
#include "chat.hpp"
#include "chat-template.hpp"

#include <algorithm>
#include <cinttypes>
@@ -1768,174 +1766,6 @@ std::string common_detokenize(const struct llama_vocab * vocab, const std::vecto
    return text;
}

//
// Chat template utils
//

bool common_chat_verify_template(const std::string & tmpl, bool use_jinja) {
    if (use_jinja) {
        try {
            auto chat_template = common_chat_template(tmpl, "<s>", "</s>");
            common_chat_inputs inputs;
            inputs.messages = json::array({{
                {"role", "user"},
                {"content", "test"},
            }});
            common_chat_params_init(chat_template, inputs);
            return true;
        } catch (const std::exception & e) {
            LOG_ERR("%s: failed to apply template: %s\n", __func__, e.what());
            return false;
        }
    }
    llama_chat_message chat[] = {{"user", "test"}};
    const int res = llama_chat_apply_template(tmpl.c_str(), chat, 1, true, nullptr, 0);
    return res >= 0;
}

std::string common_chat_apply_template(
        const common_chat_template & tmpl,
        const std::vector<common_chat_msg> & msgs,
        bool add_ass,
        bool use_jinja) {
    if (use_jinja) {
        auto messages = json::array();
        for (const auto & msg : msgs) {
            messages.push_back({{"role", msg.role}, {"content", msg.content}});
        }
        common_chat_inputs inputs;
        inputs.messages = messages;
        inputs.add_generation_prompt = add_ass;
        return common_chat_params_init(tmpl, inputs).prompt;
    }

    int alloc_size = 0;
    std::vector<llama_chat_message> chat;
    for (const auto & msg : msgs) {
        chat.push_back({msg.role.c_str(), msg.content.c_str()});
        alloc_size += (msg.role.size() + msg.content.size()) * 1.25;
    }

    std::vector<char> buf(alloc_size);

    // run the first time to get the total output length
    int32_t res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());

    // error: chat template is not supported
    if (res < 0) {
        // if the custom "tmpl" is not supported, we throw an error
        // this is a bit redundant (for good), since we're not sure if user validated the custom template with llama_chat_verify_template()
        throw std::runtime_error("this custom template is not supported");
    }

    // if it turns out that our buffer is too small, we resize it
    if ((size_t) res > buf.size()) {
        buf.resize(res);
        res = llama_chat_apply_template(tmpl.source().c_str(), chat.data(), chat.size(), add_ass, buf.data(), buf.size());
    }

    std::string formatted_chat(buf.data(), res);
    return formatted_chat;
}

std::string common_chat_format_single(
        const common_chat_template & tmpl,
        const std::vector<common_chat_msg> & past_msg,
        const common_chat_msg & new_msg,
        bool add_ass,
        bool use_jinja) {
    std::ostringstream ss;
    auto fmt_past_msg = past_msg.empty() ? "" : common_chat_apply_template(tmpl, past_msg, false, use_jinja);
    std::vector<common_chat_msg> chat_new(past_msg);
    // if the past_msg ends with a newline, we must preserve it in the formatted version
    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
        ss << "\n";
    };
    // format chat with new_msg
    chat_new.push_back(new_msg);
    auto fmt_new_msg = common_chat_apply_template(tmpl, chat_new, add_ass, use_jinja);
    // get the diff part
    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
    return ss.str();
}

std::string common_chat_format_example(const common_chat_template & tmpl, bool use_jinja) {
    std::vector<common_chat_msg> msgs = {
        {"system", "You are a helpful assistant", {}},
        {"user", "Hello", {}},
        {"assistant", "Hi there", {}},
        {"user", "How are you?", {}},
    };
    return common_chat_apply_template(tmpl, msgs, true, use_jinja);
}

#define CHATML_TEMPLATE_SRC \
    "{%- for message in messages -%}\n" \
    "  {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>\n' -}}\n" \
    "{%- endfor -%}\n" \
    "{%- if add_generation_prompt -%}\n" \
    "  {{- '<|im_start|>assistant\n' -}}\n" \
    "{%- endif -%}"

common_chat_templates common_chat_templates_from_model(const struct llama_model * model, const std::string & chat_template_override)
{
    std::string default_template_src;
    std::string template_tool_use_src;

    bool has_explicit_template = !chat_template_override.empty();
    if (chat_template_override.empty()) {
        auto str = llama_model_chat_template(model, /* name */ nullptr);
        if (str) {
            default_template_src = str;
            has_explicit_template = true;
        }
        str = llama_model_chat_template(model, /* name */ "tool_use");
        if (str) {
            template_tool_use_src = str;
            has_explicit_template = true;
        }
    } else {
        default_template_src = chat_template_override;
    }
    if (default_template_src.empty() || default_template_src == "chatml") {
        if (!template_tool_use_src.empty()) {
            default_template_src = template_tool_use_src;
        } else {
            default_template_src = CHATML_TEMPLATE_SRC;
        }
    }
    auto vocab = llama_model_get_vocab(model);
    const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
        if (token == LLAMA_TOKEN_NULL) {
            if (default_template_src.find(jinja_variable_name) != std::string::npos
                || template_tool_use_src.find(jinja_variable_name) != std::string::npos) {
                LOG_WRN("%s: warning: vocab does not have a %s token, jinja template won't work as intended.\n", __func__, name);
            }
            return std::string();
        } else {
            return common_token_to_piece(vocab, token, true);
        }
    };
    auto token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
    auto token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
    try {
        return {
            has_explicit_template,
            std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos),
            template_tool_use_src.empty()
                ? nullptr
                : std::make_unique<minja::chat_template>(template_tool_use_src, token_bos, token_eos),
        };
    } catch (const std::exception & e) {
        LOG_ERR("%s: failed to parse chat template: %s\n", __func__, e.what());
        return {
            has_explicit_template,
            std::make_unique<minja::chat_template>(CHATML_TEMPLATE_SRC, token_bos, token_eos),
            nullptr,
        };
    }
}

//
// KV cache utils
//
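For context on the block removed above: common_chat_format_single works by rendering the conversation twice, once without and once with the newest message, and returning only the suffix that the new message contributes. Below is a standalone sketch of that prefix-diff idea with a toy renderer; it is not the llama.cpp implementation and all names are assumed.

// Minimal illustration of the prefix-diff technique used by the removed
// common_chat_format_single: format the history with and without the new
// message and keep only the newly appended part. Toy code, assumed names.
#include <functional>
#include <iostream>
#include <string>
#include <vector>

using render_fn = std::function<std::string(const std::vector<std::string> &)>;

static std::string format_single(const render_fn & render,
                                 std::vector<std::string> past,
                                 const std::string & new_msg) {
    const std::string fmt_past = past.empty() ? "" : render(past);
    past.push_back(new_msg);
    const std::string fmt_new = render(past);
    // whatever the full render appends after the old prefix belongs to new_msg
    return fmt_new.substr(fmt_past.size());
}

int main() {
    // toy "template": each message becomes "<msg>...</msg>\n"
    const render_fn render = [](const std::vector<std::string> & msgs) {
        std::string out;
        for (const auto & m : msgs) {
            out += "<msg>" + m + "</msg>\n";
        }
        return out;
    };
    std::cout << format_single(render, {"hello"}, "world");  // prints "<msg>world</msg>\n"
}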