server : support jinja extra template kwargs (Qwen3 enable_thinking feature) from command line and from client (#13196)

* initial commit for handling extra template kwargs

* enable_thinking and assistant prefill cannot be enabled at the same time

* can set chat_template_kwargs from the command line

* added doc

* fixed formatting

* add support for extra context in generic template init

* coding standard: common/chat.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* coding standard: common/chat.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Apply suggestions from code review

coding standard: cosmetic changes

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* fix merge conflict

* chat.cpp: simplify calls to apply to ensure systematic propagation of extra_context (+ the odd existing additional_context)

* normalize environment variable name

* simplify code

* prefill cannot be used with thinking models

* compatibility with the new reasoning-budget parameter

* fix prefill for non-thinking models

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Olivier Chafik <olivier.chafik@gmail.com>
matteo
2025-06-29 20:02:53 +02:00
committed by GitHub
parent 83790b0e7e
commit caf5681fcb
7 changed files with 69 additions and 20 deletions

View File

@@ -2794,6 +2794,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.ssl_file_cert = value;
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_SSL_CERT_FILE"));
+    add_opt(common_arg(
+        {"--chat-template-kwargs"}, "STRING",
+        string_format("sets additional params for the json template parser"),
+        [](common_params & params, const std::string & value) {
+            auto parsed = json::parse(value);
+            for (const auto & item : parsed.items()) {
+                params.default_template_kwargs[item.key()] = item.value().dump();
+            }
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_CHAT_TEMPLATE_KWARGS"));
     add_opt(common_arg(
         {"-to", "--timeout"}, "N",
         string_format("server read/write timeout in seconds (default: %d)", params.timeout_read),

View File

@@ -17,6 +17,8 @@
 #include <string>
 #include <vector>
 
+using json = nlohmann::ordered_json;
+
 static std::string format_time(const std::chrono::system_clock::time_point & now, const std::string & format) {
     auto time = std::chrono::system_clock::to_time_t(now);
     auto local_time = *std::localtime(&time);
@@ -140,6 +142,7 @@ struct templates_params {
     bool add_generation_prompt = true;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    json extra_context;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -720,16 +723,23 @@ static void foreach_function(const json & tools, const std::function<void(const
 static std::string apply(
     const common_chat_template & tmpl,
-    const nlohmann::ordered_json & messages,
-    const nlohmann::ordered_json & tools,
-    bool add_generation_prompt,
-    const nlohmann::ordered_json & extra_context = nlohmann::ordered_json())
+    const struct templates_params & inputs,
+    const std::optional<json> & messages_override = std::nullopt,
+    const std::optional<json> & tools_override = std::nullopt,
+    const std::optional<json> & additional_context = std::nullopt)
 {
     minja::chat_template_inputs tmpl_inputs;
-    tmpl_inputs.messages = messages;
-    tmpl_inputs.tools = tools;
-    tmpl_inputs.add_generation_prompt = add_generation_prompt;
-    tmpl_inputs.extra_context = extra_context;
+    tmpl_inputs.messages = messages_override ? *messages_override : inputs.messages;
+    if (tools_override) {
+        tmpl_inputs.tools = *tools_override;
+    } else {
+        tmpl_inputs.tools = inputs.tools.empty() ? json() : inputs.tools;
+    }
+    tmpl_inputs.add_generation_prompt = inputs.add_generation_prompt;
+    tmpl_inputs.extra_context = inputs.extra_context;
+    if (additional_context) {
+        tmpl_inputs.extra_context.merge_patch(*additional_context);
+    }
 
     // TODO: add flag to control date/time, if only for testing purposes.
     // tmpl_inputs.now = std::chrono::system_clock::now();
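The rewritten `apply` takes the full `templates_params` and threads `inputs.extra_context` (the user kwargs) into every call, while still letting individual format handlers layer their own keys on top via `additional_context`. A minimal sketch of the `merge_patch` semantics this relies on, assuming nlohmann::json and hypothetical values:

```cpp
#include <iostream>
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // extra_context carries the user-supplied chat_template_kwargs...
    json extra_context = { {"enable_thinking", false}, {"custom_flag", 1} };
    // ...and a format handler can still layer its own keys on top.
    json additional_context = { {"date_string", "29 Jun 2025"} };

    // RFC 7386 merge: keys absent from the patch are kept, keys present in
    // the patch overwrite, so per-format context wins on conflicts but the
    // user kwargs survive otherwise.
    extra_context.merge_patch(additional_context);

    std::cout << extra_context.dump() << "\n";
    // {"enable_thinking":false,"custom_flag":1,"date_string":"29 Jun 2025"}
    return 0;
}
```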
@@ -828,7 +838,7 @@ static common_chat_params common_chat_params_init_generic(const common_chat_temp
         inputs.messages,
         "Respond in JSON format, either with `tool_call` (a request to call tools) or with `response` reply to the user's request");
 
-    data.prompt = apply(tmpl, tweaked_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ tweaked_messages);
     data.format = COMMON_CHAT_FORMAT_GENERIC;
     return data;
 }
@@ -904,7 +914,7 @@ static common_chat_params common_chat_params_init_mistral_nemo(const common_chat
     data.preserved_tokens = {
         "[TOOL_CALLS]",
     };
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_MISTRAL_NEMO;
     return data;
 }
@@ -934,7 +944,7 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
             adjusted_messages.push_back(msg);
         }
     }
-    data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ adjusted_messages);
     data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
     if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
         if (!inputs.enable_thinking) {
@@ -1122,7 +1132,7 @@ static common_chat_params common_chat_params_init_llama_3_x(const common_chat_te
     } else {
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, json {
         {"date_string", format_time(inputs.now, "%d %b %Y")},
         {"tools_in_user_message", false},
         {"builtin_tools", builtin_tools.empty() ? json() : builtin_tools},
@@ -1187,7 +1197,7 @@ static void common_chat_parse_llama_3_1(common_chat_msg_parser & builder, bool w
 static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    auto prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    auto prompt = apply(tmpl, inputs);
 
     // Hacks to fix the official (broken) prompt.
     // It is advisable to use --chat-template-file models/templates/llama-cpp-deepseek-r1.jinja instead,
@@ -1282,7 +1292,7 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, /* tools= */ nullptr, inputs.add_generation_prompt, {
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ json(), json {
         {"datetime", format_time(inputs.now, "%b %d %Y %H:%M:%S GMT")},
         {"functions", json(inputs.tools.empty() ? "" : inputs.tools.dump(2))},
     });
@@ -1338,7 +1348,7 @@ static common_chat_params common_chat_params_init_functionary_v3_2(const common_
     // Using ">>>f1\n", ">>>f2\n"... as trigger words for the grammar
     // If the function is python, we also allow raw python code (if the line after `python\n` doesn't start w/ opening `{`), which the model seems to prefer for multiline code.
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_FUNCTIONARY_V3_2;
     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1465,7 +1475,7 @@ static common_chat_params common_chat_params_init_functionary_v3_1_llama_3_1(con
         data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     }
 
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
 
     // TODO: if (has_raw_python)
     return data;
 }
@@ -1498,14 +1508,15 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    json additional_context = {
+    json extra_context = json {
         {"enable_thinking", inputs.enable_thinking},
     };
+    extra_context.update(inputs.extra_context);
 
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
+    data.prompt = apply(tmpl, inputs, /* messages_override= */ std::nullopt, /* tools_override= */ std::nullopt, extra_context);
     data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
     if (string_ends_with(data.prompt, "<think>\n")) {
-        if (!inputs.enable_thinking) {
+        if (!extra_context["enable_thinking"]) {
             data.prompt += "</think>";
         } else {
             data.thinking_forced_open = true;
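Note the precedence in this handler: `enable_thinking` is seeded from `inputs.enable_thinking`, then `json::update` lets a user-supplied kwarg overwrite it, which is why the `<think>` check now reads `extra_context` instead of `inputs.enable_thinking`. A small sketch of that override, with hypothetical values:

```cpp
#include <nlohmann/json.hpp>

using json = nlohmann::ordered_json;

int main() {
    // Handler default, as in the hunk above.
    json extra_context = json { {"enable_thinking", true} };

    // Hypothetical user kwargs, e.g. from --chat-template-kwargs '{"enable_thinking": false}'.
    json user_kwargs = { {"enable_thinking", false} };

    // json::update overwrites top-level keys, so the user's value wins.
    extra_context.update(user_kwargs);

    bool effective = extra_context["enable_thinking"].get<bool>();  // false
    return effective ? 1 : 0;
}
```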
@@ -1691,7 +1702,7 @@ static void common_chat_parse_hermes_2_pro(common_chat_msg_parser & builder) {
 static common_chat_params common_chat_params_init_without_tools(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;
-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    data.prompt = apply(tmpl, inputs);
     data.format = COMMON_CHAT_FORMAT_CONTENT_ONLY;
     data.grammar_lazy = false;
     if (!inputs.json_schema.is_null()) {
@@ -1722,6 +1733,12 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+
+    params.extra_context = json::object();
+    for (auto el : inputs.chat_template_kwargs) {
+        params.extra_context[el.first] = json::parse(el.second);
+    }
+
     if (!inputs.json_schema.empty()) {
         params.json_schema = json::parse(inputs.json_schema);
     }

View File

@@ -7,6 +7,7 @@
 #include <chrono>
 #include <string>
 #include <vector>
+#include <map>
 
 struct common_chat_templates;
@@ -125,6 +126,7 @@ struct common_chat_templates_inputs {
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
+    std::map<std::string, std::string> chat_template_kwargs;
 };
 
 struct common_chat_params {
struct common_chat_params { struct common_chat_params {

View File

@@ -8,6 +8,7 @@
 #include <string>
 #include <string_view>
 #include <vector>
+#include <map>
 #include <sstream>
 
 #ifdef _WIN32
@@ -381,6 +382,8 @@ struct common_params {
     std::string ssl_file_key = ""; // NOLINT
     std::string ssl_file_cert = ""; // NOLINT
 
+    std::map<std::string, std::string> default_template_kwargs;
+
     // "advanced" endpoints are disabled by default for better security
     bool webui = true;
     bool endpoint_slots = false;

View File

@@ -164,6 +164,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--api-key-file FNAME` | path to file containing API keys (default: none) |
 | `--ssl-key-file FNAME` | path to file a PEM-encoded SSL private key<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
 | `--ssl-cert-file FNAME` | path to file a PEM-encoded SSL certificate<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
+| `--chat-template-kwargs STRING` | JSON object containing additional params for the json template parser. Example: `--chat-template-kwargs "{\"enable_thinking\":false}"`<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
 | `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
 | `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
 | `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
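Both the CLI flag above and the per-request `chat_template_kwargs` field documented in the next hunk accept the same JSON object. As an illustration (hypothetical model name; payload shape per the OpenAI-compatible chat completions endpoint), a request that disables Qwen3's thinking mode could look like:

```json
{
  "model": "qwen3",
  "messages": [
    { "role": "user", "content": "Write a haiku about autumn." }
  ],
  "chat_template_kwargs": { "enable_thinking": false }
}
```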
@@ -1118,6 +1119,8 @@ See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs
 The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": {"type": "string" }, "title": "Participants", "type": "string" } } } }`), similar to other OpenAI-inspired API providers.
 
+`chat_template_kwargs`: Allows sending additional parameters to the json templating system. For example: `{"enable_thinking": false}`
+
 *Examples:*
 
 You can use either Python `openai` library with appropriate checkpoints:

View File

@@ -2110,6 +2110,7 @@ struct server_context {
             /* use_jinja             */ params_base.use_jinja,
             /* prefill_assistant     */ params_base.prefill_assistant,
             /* reasoning_format      */ params_base.reasoning_format,
+            /* chat_template_kwargs  */ params_base.default_template_kwargs,
             /* common_chat_templates */ chat_templates.get(),
             /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
             /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,

View File

@@ -579,6 +579,7 @@ struct oaicompat_parser_options {
     bool use_jinja;
     bool prefill_assistant;
     common_reasoning_format reasoning_format;
+    std::map<std::string, std::string> chat_template_kwargs;
     common_chat_templates * tmpls;
     bool allow_image;
     bool allow_audio;
@@ -756,6 +757,13 @@ static json oaicompat_chat_params_parse(
         llama_params["parse_tool_calls"] = true;
     }
 
+    // merge the template args provided from command line with the args provided in the user request
+    auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
+    inputs.chat_template_kwargs = opt.chat_template_kwargs;
+    for (const auto & item : chat_template_kwargs_object.items()) {
+        inputs.chat_template_kwargs[item.key()] = item.value().dump();
+    }
+
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
     bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
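The merge above gives per-request kwargs precedence: `inputs.chat_template_kwargs` starts as a copy of the command-line defaults, and request entries overwrite on key collision. A standalone sketch of the same overwrite pattern, with hypothetical values:

```cpp
#include <iostream>
#include <map>
#include <string>

int main() {
    // Server-wide defaults from --chat-template-kwargs / LLAMA_CHAT_TEMPLATE_KWARGS.
    std::map<std::string, std::string> cli_kwargs = { {"enable_thinking", "false"} };
    // Per-request chat_template_kwargs from the JSON body.
    std::map<std::string, std::string> request_kwargs = { {"enable_thinking", "true"} };

    // Same pattern as the hunk above: start from the CLI defaults,
    // then let request entries overwrite on key collision.
    auto merged = cli_kwargs;
    for (const auto & [key, value] : request_kwargs) {
        merged[key] = value;
    }
    std::cout << merged["enable_thinking"] << "\n";  // "true" -- the request wins
    return 0;
}
```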
@@ -771,6 +779,11 @@
         /* TODO: test this properly */
         inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;
 
+        if (!inputs.enable_thinking || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
+            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
+        }
+
         inputs.add_generation_prompt = true;
     }