server : support jinja extra template kwargs (Qwen3 enable_thinking feature), from command line and from client (#13196)
* initial commit for handling extra template kwargs
* enable_thinking and assistant prefill cannot be enabled at the same time
* can set chat_template_kwargs in command line
* added doc
* fixed formatting
* add support for extra context in generic template init
* coding standard: common/chat.cpp
* Apply suggestions from code review (coding standard: cosmetic changes)
* fix merge conflict
* chat.cpp: simplify calls to apply to ensure systematic propagation of extra_context (+ the odd existing additional_context)
* normalize environment variable name
* simplify code
* prefill cannot be used with thinking models
* compatibility with the new reasoning-budget parameter
* fix prefill for non thinking models

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Olivier Chafik <olivier.chafik@gmail.com>
@@ -164,6 +164,7 @@ The project is under active development, and we are [looking for feedback and co
| `--api-key-file FNAME` | path to file containing API keys (default: none) |
| `--ssl-key-file FNAME` | path to a PEM-encoded SSL private key file<br/>(env: LLAMA_ARG_SSL_KEY_FILE) |
| `--ssl-cert-file FNAME` | path to a PEM-encoded SSL certificate file<br/>(env: LLAMA_ARG_SSL_CERT_FILE) |
| `--chat-template-kwargs STRING` | JSON object containing additional params for the Jinja template parser. Example: `--chat-template-kwargs "{\"enable_thinking\":false}"`<br/>(env: LLAMA_CHAT_TEMPLATE_KWARGS) |
| `-to, --timeout N` | server read/write timeout in seconds (default: 600)<br/>(env: LLAMA_ARG_TIMEOUT) |
| `--threads-http N` | number of threads used to process HTTP requests (default: -1)<br/>(env: LLAMA_ARG_THREADS_HTTP) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
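Getting the shell quoting for `--chat-template-kwargs` right can be fiddly. As a minimal sketch (the model path and the idea of launching via `subprocess` are illustrative assumptions, not part of this commit), the JSON can be produced with `json.dumps` so no manual backslash-escaping is needed:

```python
import json
import subprocess

# Build the kwargs dict in Python and let json.dumps produce valid JSON,
# avoiding manual escaping on the shell command line.
kwargs = json.dumps({"enable_thinking": False})  # -> '{"enable_thinking": false}'

subprocess.run([
    "llama-server",
    "-m", "model.gguf",                # hypothetical model path
    "--chat-template-kwargs", kwargs,  # passed as one argv entry, no shell quoting needed
])
```

Setting the `LLAMA_CHAT_TEMPLATE_KWARGS` environment variable to the same JSON string is equivalent.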
@@ -1118,6 +1119,8 @@ See [OpenAI Chat Completions API documentation](https://platform.openai.com/docs

The `response_format` parameter supports both plain JSON output (e.g. `{"type": "json_object"}`) and schema-constrained JSON (e.g. `{"type": "json_object", "schema": {"type": "string", "minLength": 10, "maxLength": 100}}` or `{"type": "json_schema", "schema": {"properties": { "name": { "title": "Name", "type": "string" }, "date": { "title": "Date", "type": "string" }, "participants": { "items": { "type": "string" }, "title": "Participants", "type": "array" } } } }`), similar to other OpenAI-inspired API providers.
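For instance, a schema-constrained request could look like the following sketch (it assumes `llama-server` is reachable at `http://localhost:8080`; the prompt and field names mirror the example schema above):

```python
import requests

# Constrain the completion to a JSON object matching the given schema.
response = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Who attended the meeting and when?"}],
        "response_format": {
            "type": "json_schema",
            "schema": {
                "properties": {
                    "name": {"title": "Name", "type": "string"},
                    "date": {"title": "Date", "type": "string"},
                    "participants": {
                        "items": {"type": "string"},
                        "title": "Participants",
                        "type": "array",
                    },
                },
            },
        },
    },
)
print(response.json()["choices"][0]["message"]["content"])
```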
`chat_template_kwargs`: Allows sending additional parameters to the Jinja templating engine. For example: `{"enable_thinking": false}`
*Examples:*
You can use the Python `openai` library with appropriate checkpoints:
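A sketch of such a call (the base URL assumes a local server on port 8080; the API key and model name are placeholders, since `llama-server` serves the loaded model and only checks keys when `--api-key` is set):

```python
import openai

client = openai.OpenAI(
    base_url="http://localhost:8080/v1",  # llama-server's OpenAI-compatible endpoint
    api_key="sk-no-key-required",         # placeholder; only checked if --api-key is set
)

completion = client.chat.completions.create(
    model="gpt-3.5-turbo",  # placeholder; the server responds with its loaded model
    messages=[{"role": "user", "content": "Explain KV cache reuse in one sentence."}],
    # extra_body forwards non-standard fields such as chat_template_kwargs
    extra_body={"chat_template_kwargs": {"enable_thinking": False}},
)
print(completion.choices[0].message.content)
```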
@@ -2110,6 +2110,7 @@ struct server_context {
            /* use_jinja             */ params_base.use_jinja,
            /* prefill_assistant     */ params_base.prefill_assistant,
            /* reasoning_format      */ params_base.reasoning_format,
            /* chat_template_kwargs  */ params_base.default_template_kwargs,
            /* common_chat_templates */ chat_templates.get(),
            /* allow_image           */ mctx ? mtmd_support_vision(mctx) : false,
            /* allow_audio           */ mctx ? mtmd_support_audio (mctx) : false,
@@ -579,6 +579,7 @@ struct oaicompat_parser_options {
    bool use_jinja;
    bool prefill_assistant;
    common_reasoning_format reasoning_format;
    std::map<std::string, std::string> chat_template_kwargs;
    common_chat_templates * tmpls;
    bool allow_image;
    bool allow_audio;
@@ -756,6 +757,13 @@ static json oaicompat_chat_params_parse(
        llama_params["parse_tool_calls"] = true;
    }

    // merge the template args provided from the command line with the args provided in the user request
    auto chat_template_kwargs_object = json_value(body, "chat_template_kwargs", json::object());
    inputs.chat_template_kwargs = opt.chat_template_kwargs;
    for (const auto & item : chat_template_kwargs_object.items()) {
        inputs.chat_template_kwargs[item.key()] = item.value().dump();
    }

    // if the assistant message appears at the end of the list, we do not add an end-of-turn token;
    // for example, this can be useful to modify the reasoning process in reasoning models
    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && opt.prefill_assistant;
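To make the precedence concrete: per-request `chat_template_kwargs` override the command-line defaults, and each value is stored as dumped JSON text (so a boolean `false` reaches the template layer as the string `"false"`, matching `item.value().dump()`). A rough Python sketch of the same semantics, illustrative only and not the server's code:

```python
import json

def merge_template_kwargs(cli_defaults: dict, request_body: dict) -> dict:
    # Start from the command-line defaults (opt.chat_template_kwargs).
    merged = dict(cli_defaults)
    # Per-request values win; like item.value().dump(), each value is
    # stored as serialized JSON text rather than a native type.
    for key, value in request_body.get("chat_template_kwargs", {}).items():
        merged[key] = json.dumps(value)
    return merged

# The request-level setting overrides the CLI default:
print(merge_template_kwargs({"enable_thinking": "true"},
                            {"chat_template_kwargs": {"enable_thinking": False}}))
# -> {'enable_thinking': 'false'}
```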
@@ -771,6 +779,11 @@ static json oaicompat_chat_params_parse(

        /* TODO: test this properly */
        inputs.reasoning_format = COMMON_REASONING_FORMAT_NONE;

        if ((!inputs.enable_thinking) || inputs.chat_template_kwargs.find("enable_thinking") != inputs.chat_template_kwargs.end()) {
            throw std::runtime_error("Assistant response prefill is incompatible with enable_thinking.");
        }

        inputs.add_generation_prompt = true;
    }
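Taken together with the kwargs merge above, a prefill request ends with an assistant message, and per the check above it must not also toggle `enable_thinking` through the template kwargs. A hedged client-side illustration (endpoint, port, and message content are assumptions):

```python
import requests

# Prefill: the conversation ends with an assistant message, so the server
# does not add an end-of-turn token and continues this message instead.
payload = {
    "messages": [
        {"role": "user", "content": "List three prime numbers."},
        {"role": "assistant", "content": "Sure, the first one is"},  # prefilled prefix
    ],
}

# Per the check above, also sending an enable_thinking template kwarg
# (e.g. "chat_template_kwargs": {"enable_thinking": true}) makes the
# server reject the request with:
#   "Assistant response prefill is incompatible with enable_thinking."
response = requests.post("http://localhost:8080/v1/chat/completions", json=payload)
print(response.json()["choices"][0]["message"]["content"])
```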