server: fix streaming crashes (#13786)

* Add preludes to content on partial regex match.
* Allow all parsers to parse non-tool-call content.
* Tweak the order of `<|python_tag|>` vs. `<function=` parsing for the Functionary v3.1 format. Still not ideal, but hopefully less prone to crashing.
2025-08-12 19:37:53 -04:00 · 2025-05-26 08:03:57 -07:00
parent 88c125f2ac
commit 03f582ae8f
7 changed files with 112 additions and 59 deletions
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -735,8 +735,11 @@ static json oaicompat_chat_params_parse(
    inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
    inputs.reasoning_format      = opt.reasoning_format;
    inputs.enable_thinking       = opt.enable_thinking;
-    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
-        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+    if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE) {
+        if (body.contains("grammar")) {
+            throw std::runtime_error("Cannot use custom grammar constraints with tools.");
+        }
+        llama_params["parse_tool_calls"] = true;
    }

    // if the assistant message appears at the end of list, we do not add end-of-turn token