diff --git a/common/arg.cpp b/common/arg.cpp
index 62eec8337..5ed5a2390 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2848,15 +2848,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
- "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
- "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
- "only supported for non-streamed responses",
+ "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+ "- none: leaves thoughts unparsed in `message.content`\n"
+ "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+ "(default: deepseek)",
[](common_params & params, const std::string & value) {
/**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
- else { std::invalid_argument("invalid value"); }
+ else { throw std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+ add_opt(common_arg(
+ {"--reasoning-budget"}, "N",
+ "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+ [](common_params & params, int value) {
+ if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+ params.reasoning_budget = value;
+ }
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
add_opt(common_arg(
{"--chat-template"}, "JINJA_TEMPLATE",
string_format(
@@ -2955,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
[](common_params & params, const std::string & value) {
/**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
else if (value == "md") { params.batched_bench_output_jsonl = false; }
- else { std::invalid_argument("invalid value"); }
+ else { throw std::invalid_argument("invalid value"); }
}
).set_examples({LLAMA_EXAMPLE_BENCH}));
add_opt(common_arg(
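
Taken together, the two options above split responsibilities: --reasoning-format governs how thought tags are parsed out of responses, while --reasoning-budget governs whether the model is allowed to think at all. A minimal sketch of an invocation combining them (hypothetical model path; flags and env vars as defined above):

    llama-server -m model.gguf --jinja --reasoning-format deepseek --reasoning-budget 0
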
diff --git a/common/chat.cpp b/common/chat.cpp
index 78af5eafa..adfe51db5 100644
--- a/common/chat.cpp
+++ b/common/chat.cpp
@@ -133,6 +133,7 @@ struct templates_params {
bool stream;
std::string grammar;
bool add_generation_prompt = true;
+ bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};
@@ -573,7 +574,7 @@ common_chat_templates_ptr common_chat_templates_init(
return tmpls;
}
-std::string common_chat_format_name(common_chat_format format) {
+const char * common_chat_format_name(common_chat_format format) {
switch (format) {
case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
@@ -591,6 +592,15 @@ std::string common_chat_format_name(common_chat_format format) {
}
}
+const char * common_reasoning_format_name(common_reasoning_format format) {
+ switch (format) {
+ case COMMON_REASONING_FORMAT_NONE: return "none";
+ case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+ default:
+ throw std::runtime_error("Unknown reasoning format");
+ }
+}
+
static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
std::string arguments;
if (builder.is_partial()) {
@@ -918,7 +928,13 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
- data.thinking_forced_open = true;
+ if (!inputs.enable_thinking) {
+ data.prompt += "<|END_THINKING|>";
+ } else {
+ data.thinking_forced_open = true;
+ }
+ } else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) {
+ data.prompt += "<|START_THINKING|><|END_THINKING|>";
}
data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
@@ -1186,7 +1202,11 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
data.prompt = prompt;
data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
if (string_ends_with(data.prompt, "<think>\n")) {
- data.thinking_forced_open = true;
+ if (!inputs.enable_thinking) {
+ data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
}
if (inputs.tools.is_array() && !inputs.tools.empty()) {
@@ -1460,104 +1480,114 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
common_chat_params data;
- data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+ json additional_context = {
+ {"enable_thinking", inputs.enable_thinking},
+ };
+
+ data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
if (string_ends_with(data.prompt, "<think>\n")) {
- data.thinking_forced_open = true;
+ if (!inputs.enable_thinking) {
+ data.prompt += "</think>";
+ } else {
+ data.thinking_forced_open = true;
+ }
}
- // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
- data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
- data.grammar = build_grammar([&](const common_grammar_builder & builder) {
- std::vector<std::string> tool_rules;
- std::vector<std::string> tool_call_alts;
- std::vector<std::string> escaped_names;
- foreach_function(inputs.tools, [&](const json & tool) {
- const auto & function = tool.at("function");
- std::string name = function.at("name");
- auto parameters = function.at("parameters");
- builder.resolve_refs(parameters);
- tool_rules.push_back(builder.add_schema(name + "-call", {
- {"type", "object"},
- {"properties", json {
- {"name", json {{"const", name}}},
- {"arguments", parameters},
- }},
- {"required", json::array({"name", "arguments"})},
- }));
- tool_call_alts.push_back(builder.add_rule(
- name + "-function-tag",
- "\"\" space " +
- builder.add_schema(name + "-args", parameters) + " "
- "\"\" space"));
+ if (!inputs.tools.is_null()) {
+ // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
+ data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
+ data.grammar = build_grammar([&](const common_grammar_builder & builder) {
+ std::vector<std::string> tool_rules;
+ std::vector<std::string> tool_call_alts;
+ std::vector<std::string> escaped_names;
+ foreach_function(inputs.tools, [&](const json & tool) {
+ const auto & function = tool.at("function");
+ std::string name = function.at("name");
+ auto parameters = function.at("parameters");
+ builder.resolve_refs(parameters);
+ tool_rules.push_back(builder.add_schema(name + "-call", {
+ {"type", "object"},
+ {"properties", json {
+ {"name", json {{"const", name}}},
+ {"arguments", parameters},
+ }},
+ {"required", json::array({"name", "arguments"})},
+ }));
+ tool_call_alts.push_back(builder.add_rule(
+ name + "-function-tag",
+ "\"\" space " +
+ builder.add_schema(name + "-args", parameters) + " "
+ "\"\" space"));
- data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
- "<function=" + name + ">",
- });
- auto escaped_name = regex_escape(name);
- data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
- "<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
- });
- escaped_names.push_back(escaped_name);
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_WORD,
+ "<function=" + name + ">",
+ });
+ auto escaped_name = regex_escape(name);
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN,
+ "<function\\s+name\\s*=\\s*\"" + escaped_name + "\"",
+ });
+ escaped_names.push_back(escaped_name);
+ });
+ auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
+ std::vector<std::string> alt_tags {
+ any_tool_call,
+ "\"\" space " + any_tool_call + " \"\"",
+ // The rest is just to accommodate common "good bad" outputs.
+ "\"\" space " + any_tool_call + " \"\"",
+ "\"\" space " + any_tool_call + " \"\"",
+ "\"\" space " + any_tool_call + " \"\"",
+ "\"\" space " + any_tool_call + " \"\"",
+ "\"\" space " + any_tool_call + " \"\"",
+ "\"\" space " + any_tool_call + " \"\"",
+ };
+ auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
+ tool_call_alts.push_back(wrappable_tool_call);
+ tool_call_alts.push_back(
+ "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
+ auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
+ builder.add_rule("root",
+ std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
+ (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
+ // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
+ data.grammar_triggers.push_back({
+ COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
+ // If thinking_forced_open, then we capture the </think> tag in the grammar,
+ // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
+ std::string(data.thinking_forced_open ? "[\\s\\S]*?(</think>\\s*)" : "(?:<think>[\\s\\S]*?</think>\\s*)?") + (
+ "(\\s*"
+ "(?:<tool_call>"
+ "|<function"
+ "|(?:```(?:json|xml)?\n\\s*)?(?:<function_call>|<tools>|<xml><json>|<response>)?"
+ "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
+ ")"
+ ")[\\s\\S]*"
+ ),
+ });
+ data.preserved_tokens = {
+ "<think>",
+ "</think>",
+ "<tool_call>",
+ "</tool_call>",
+ "<function",
+ "<tools>",
+ "</tools>",
+ "<response>",
+ "</response>",
+ "<function_call>",
+ "</function_call>",
+ "<json>",
+ "</json>",
+ "<JSON>",
+ "</JSON>",
+ "```",
+ "```json",
+ "```xml",
+ };
+ });
- });
- auto any_tool_call = builder.add_rule("any_tool_call", "( " + string_join(tool_rules, " | ") + " ) space");
- std::vector<std::string> alt_tags {
- any_tool_call,
- "\"\" space " + any_tool_call + " \"\"",
- // The rest is just to accommodate common "good bad" outputs.
- "\"\" space " + any_tool_call + " \"\"",
- "\"\" space " + any_tool_call + " \"\"",
- "\"\" space " + any_tool_call + " \"\"",
- "\"\" space " + any_tool_call + " \"\"",
- "\"\" space " + any_tool_call + " \"\"",
- "\"\" space " + any_tool_call + " \"\"",
- };
- auto wrappable_tool_call = builder.add_rule("wrappable_tool_call", "( " + string_join(alt_tags, " | ") + " ) space");
- tool_call_alts.push_back(wrappable_tool_call);
- tool_call_alts.push_back(
- "( \"```\\n\" | \"```json\\n\" | \"```xml\\n\" ) space " + wrappable_tool_call + " space \"```\" space ");
- auto tool_call = builder.add_rule("tool_call", string_join(tool_call_alts, " | "));
- builder.add_rule("root",
- std::string(data.thinking_forced_open ? "( \"</think>\" space )? " : "") +
- (inputs.parallel_tool_calls ? "(" + tool_call + ")+" : tool_call));
- // Trigger on some common known "good bad" outputs (only from the start and with a json that's about a specific argument name to avoid false positives)
- data.grammar_triggers.push_back({
- COMMON_GRAMMAR_TRIGGER_TYPE_PATTERN_FULL,
- // If thinking_forced_open, then we capture the </think> tag in the grammar,
- // (important for required tool choice) and in the trigger's first capture (decides what is sent to the grammar)
- std::string(data.thinking_forced_open ? "[\\s\\S]*?(\\s*)" : "(?:[\\s\\S]*?\\s*)?") + (
- "(\\s*"
- "(?:"
- "||||)?"
- "\\s*\\{\\s*\"name\"\\s*:\\s*\"(?:" + string_join(escaped_names, "|") + ")\""
- ")"
- ")[\\s\\S]*"
- ),
- });
- data.preserved_tokens = {
- "<think>",
- "</think>",
- "<tool_call>",
- "</tool_call>",
- "<function",
- "<tools>",
- "</tools>",
- "<response>",
- "</response>",
- "<function_call>",
- "</function_call>",
- "<json>",
- "</json>",
- "<JSON>",
- "</JSON>",
- "```",
- "```json",
- "```xml",
- };
- });
+ }
return data;
}
@@ -1669,6 +1699,7 @@ static common_chat_params common_chat_templates_apply_jinja(
params.messages = common_chat_msgs_to_json_oaicompat(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
params.add_generation_prompt = inputs.add_generation_prompt;
params.tool_choice = inputs.tool_choice;
+ params.enable_thinking = inputs.enable_thinking;
params.grammar = inputs.grammar;
params.now = inputs.now;
if (!inputs.json_schema.empty()) {
@@ -1702,7 +1733,7 @@ static common_chat_params common_chat_templates_apply_jinja(
}
// Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
- if (src.find("") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) {
+ if (src.find("") != std::string::npos && params.json_schema.is_null()) {
return common_chat_params_init_hermes_2_pro(tmpl, params);
}
@@ -1821,7 +1852,7 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
}
static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) {
- LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format).c_str(), builder.input().c_str());
+ LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format), builder.input().c_str());
switch (format) {
case COMMON_CHAT_FORMAT_CONTENT_ONLY:
@@ -1858,7 +1889,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form
common_chat_parse_command_r7b(builder);
break;
default:
- throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
+ throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(format));
}
builder.finish();
}
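
As a rough illustration (not part of this diff) of the two --reasoning-format modes described above, assuming a raw completion that opens with a think block, a hypothetical Python sketch:

    # raw model output containing a thought block
    raw = "<think>User asks for the date.</think>It is May 26th."

    # 'deepseek': thoughts are extracted into message.reasoning_content
    msg_deepseek = {
        "role": "assistant",
        "reasoning_content": "User asks for the date.",
        "content": "It is May 26th.",
    }

    # 'none': thoughts are left unparsed in message.content
    msg_none = {"role": "assistant", "content": raw}
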
diff --git a/common/chat.h b/common/chat.h
index ce926777e..3e2cbbaae 100644
--- a/common/chat.h
+++ b/common/chat.h
@@ -123,6 +123,7 @@ struct common_chat_templates_inputs {
common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
bool parallel_tool_calls = false;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+ bool enable_thinking = true;
std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
};
@@ -181,7 +182,8 @@ std::string common_chat_format_example(
const struct common_chat_templates * tmpls,
bool use_jinja);
-std::string common_chat_format_name(common_chat_format format);
+const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);
common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
diff --git a/common/common.h b/common/common.h
index f0c52c314..92b9533fc 100644
--- a/common/common.h
+++ b/common/common.h
@@ -368,6 +368,7 @@ struct common_params {
bool use_jinja = false; // NOLINT
bool enable_chat_template = true;
common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+ int reasoning_budget = -1;
bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
std::vector api_keys;
diff --git a/models/templates/Qwen-Qwen3-0.6B.jinja b/models/templates/Qwen-Qwen3-0.6B.jinja
new file mode 100644
index 000000000..699ff8df4
--- /dev/null
+++ b/models/templates/Qwen-Qwen3-0.6B.jinja
@@ -0,0 +1,85 @@
+{%- if tools %}
+ {{- '<|im_start|>system\n' }}
+ {%- if messages[0].role == 'system' %}
+ {{- messages[0].content + '\n\n' }}
+ {%- endif %}
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }}
+ {%- for tool in tools %}
+ {{- "\n" }}
+ {{- tool | tojson }}
+ {%- endfor %}
+ {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }}
+{%- else %}
+ {%- if messages[0].role == 'system' %}
+ {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+ {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+ {%- set index = (messages|length - 1) - loop.index0 %}
+ {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+ {%- set ns.multi_step_tool = false %}
+ {%- set ns.last_query_index = index %}
+ {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+ {%- elif message.role == "assistant" %}
+ {%- set content = message.content %}
+ {%- set reasoning_content = '' %}
+ {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+ {%- set reasoning_content = message.reasoning_content %}
+ {%- else %}
+ {%- if '</think>' in message.content %}
+ {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+ {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+ {%- endif %}
+ {%- endif %}
+ {%- if loop.index0 > ns.last_query_index %}
+ {%- if loop.last or (not loop.last and reasoning_content) %}
+ {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- else %}
+ {{- '<|im_start|>' + message.role + '\n' + content }}
+ {%- endif %}
+ {%- if message.tool_calls %}
+ {%- for tool_call in message.tool_calls %}
+ {%- if (loop.first and content) or (not loop.first) %}
+ {{- '\n' }}
+ {%- endif %}
+ {%- if tool_call.function %}
+ {%- set tool_call = tool_call.function %}
+ {%- endif %}
+ {{- '\n{"name": "' }}
+ {{- tool_call.name }}
+ {{- '", "arguments": ' }}
+ {%- if tool_call.arguments is string %}
+ {{- tool_call.arguments }}
+ {%- else %}
+ {{- tool_call.arguments | tojson }}
+ {%- endif %}
+ {{- '}\n</tool_call>' }}
+ {%- endfor %}
+ {%- endif %}
+ {{- '<|im_end|>\n' }}
+ {%- elif message.role == "tool" %}
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+ {{- '<|im_start|>user' }}
+ {%- endif %}
+ {{- '\n<tool_response>\n' }}
+ {{- message.content }}
+ {{- '\n</tool_response>' }}
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+ {{- '<|im_end|>\n' }}
+ {%- endif %}
+ {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+ {{- '<|im_start|>assistant\n' }}
+ {%- if enable_thinking is defined and enable_thinking is false %}
+ {{- '<think>\n\n</think>\n\n' }}
+ {%- endif %}
+{%- endif %}
\ No newline at end of file
diff --git a/models/templates/README.md b/models/templates/README.md
index b8655be9f..35b6386dd 100644
--- a/models/templates/README.md
+++ b/models/templates/README.md
@@ -20,4 +20,5 @@ These templates can be updated with the following commands:
./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
+./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
```
\ No newline at end of file
diff --git a/tests/test-chat.cpp b/tests/test-chat.cpp
index dfcdce350..fb048022a 100644
--- a/tests/test-chat.cpp
+++ b/tests/test-chat.cpp
@@ -737,14 +737,14 @@ static void test_template_output_parsers() {
auto tmpls = read_templates("models/templates/Qwen-QwQ-32B.jinja");
std::vector<std::string> end_tokens{ "<|im_end|>" };
- assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
+ assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
}
{
auto tmpls = read_templates("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja");
std::vector<std::string> end_tokens{ "<|im_end|>" };
- assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
+ assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
assert_equals(
COMMON_CHAT_FORMAT_HERMES_2_PRO,
diff --git a/tools/server/README.md b/tools/server/README.md
index 0b84966ae..06533c172 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -173,7 +173,8 @@ The project is under active development, and we are [looking for feedback and co
| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
| `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 9f0b0ffaa..07b613122 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -178,7 +178,7 @@ struct slot_params {
{"grammar_triggers", grammar_triggers},
{"preserved_tokens", sampling.preserved_tokens},
{"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
- {"reasoning_format", (oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "deepseek" : "none")},
+ {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
{"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
{"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
{"samplers", samplers},
@@ -357,7 +357,7 @@ struct server_task {
auto it = data.find("chat_format");
if (it != data.end()) {
params.oaicompat_chat_syntax.format = static_cast<common_chat_format>(it->get<int>());
- SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format).c_str());
+ SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format));
} else {
params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
}
@@ -2089,6 +2089,7 @@ struct server_context {
/* common_chat_templates */ chat_templates.get(),
/* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
/* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
+ /* enable_thinking */ params_base.reasoning_budget != 0,
};
}
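
A minimal end-to-end sketch of the new wiring (assumes a llama-server started with --jinja --reasoning-budget 0 and listening on localhost:8080; the /apply-template endpoint and payload shape are the same ones the test below uses):

    import requests  # assumes the requests package is available

    res = requests.post("http://localhost:8080/apply-template", json={
        "messages": [{"role": "user", "content": "What is today?"}],
    })
    res.raise_for_status()
    # With reasoning_budget == 0, enable_thinking is false, so e.g. a Qwen3-style
    # template ends the generation prompt with an already-closed think block:
    # "...<|im_start|>assistant\n<think>\n\n</think>\n\n"
    print(res.json()["prompt"])
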
diff --git a/tools/server/tests/unit/test_template.py b/tools/server/tests/unit/test_template.py
index 7bb857b33..c53eda5b8 100644
--- a/tools/server/tests/unit/test_template.py
+++ b/tools/server/tests/unit/test_template.py
@@ -25,6 +25,40 @@ def create_server():
server.n_slots = 1
+@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
+@pytest.mark.parametrize("template_name,reasoning_budget,expected_end", [
+ ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", None, "\n"),
+ ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", -1, "\n"),
+ ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", 0, "\n"),
+
+ ("Qwen-Qwen3-0.6B", -1, "<|im_start|>assistant\n"),
+ ("Qwen-Qwen3-0.6B", 0, "<|im_start|>assistant\n\n\n\n\n"),
+
+ ("Qwen-QwQ-32B", -1, "<|im_start|>assistant\n\n"),
+ ("Qwen-QwQ-32B", 0, "<|im_start|>assistant\n\n"),
+
+ ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", -1, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"),
+ ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", 0, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"),
+])
+def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expected_end: str, tools: list[dict]):
+ global server
+ server.jinja = True
+ server.reasoning_budget = reasoning_budget
+ server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
+ server.start(timeout_seconds=TIMEOUT_SERVER_START)
+
+ res = server.make_request("POST", "/apply-template", data={
+ "messages": [
+ {"role": "user", "content": "What is today?"},
+ ],
+ "tools": tools,
+ })
+ assert res.status_code == 200
+ prompt = res.body["prompt"]
+
+ assert prompt.endswith(expected_end), f"Expected prompt to end with '{expected_end}', got '{prompt}'"
+
+
@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
@pytest.mark.parametrize("template_name,format", [
("meta-llama-Llama-3.3-70B-Instruct", "%d %b %Y"),
diff --git a/tools/server/tests/utils.py b/tools/server/tests/utils.py
index b480801b1..11672f515 100644
--- a/tools/server/tests/utils.py
+++ b/tools/server/tests/utils.py
@@ -84,7 +84,8 @@ class ServerProcess:
draft_max: int | None = None
no_webui: bool | None = None
jinja: bool | None = None
- reasoning_format: Literal['deepseek', 'none'] | None = None
+ reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
+ reasoning_budget: int | None = None
chat_template: str | None = None
chat_template_file: str | None = None
server_path: str | None = None
@@ -191,6 +192,8 @@ class ServerProcess:
server_args.append("--jinja")
if self.reasoning_format is not None:
server_args.extend(("--reasoning-format", self.reasoning_format))
+ if self.reasoning_budget is not None:
+ server_args.extend(("--reasoning-budget", self.reasoning_budget))
if self.chat_template:
server_args.extend(["--chat-template", self.chat_template])
if self.chat_template_file:
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index ee33f76c2..fc9f7071e 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -568,6 +568,7 @@ struct oaicompat_parser_options {
common_chat_templates * tmpls;
bool allow_image;
bool allow_audio;
+ bool enable_thinking = true;
};
// used by /chat/completions endpoint
@@ -733,6 +734,7 @@ static json oaicompat_chat_params_parse(
inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
inputs.reasoning_format = opt.reasoning_format;
+ inputs.enable_thinking = opt.enable_thinking;
if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
throw std::runtime_error("Cannot use custom grammar constraints with tools.");
}