mirror of https://github.com/ggml-org/llama.cpp.git

server : add --reasoning-budget 0 to disable thinking (incl. qwen3 w/ enable_thinking:false) (#13771)

Co-authored-by: ochafik <ochafik@google.com>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
@@ -2848,15 +2848,24 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_JINJA"));
     add_opt(common_arg(
         {"--reasoning-format"}, "FORMAT",
-        "reasoning format (default: deepseek; allowed values: deepseek, none)\n"
-        "controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).\n"
-        "only supported for non-streamed responses",
+        "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
+        "- none: leaves thoughts unparsed in `message.content`\n"
+        "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
+        "(default: deepseek)",
         [](common_params & params, const std::string & value) {
             /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
             else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
+    add_opt(common_arg(
+        {"--reasoning-budget"}, "N",
+        "controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)",
+        [](common_params & params, int value) {
+            if (value != 0 && value != -1) { throw std::invalid_argument("invalid value"); }
+            params.reasoning_budget = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK_BUDGET"));
     add_opt(common_arg(
         {"--chat-template"}, "JINJA_TEMPLATE",
         string_format(
@@ -2955,7 +2964,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
         [](common_params & params, const std::string & value) {
             /**/ if (value == "jsonl") { params.batched_bench_output_jsonl = true; }
             else if (value == "md") { params.batched_bench_output_jsonl = false; }
-            else { std::invalid_argument("invalid value"); }
+            else { throw std::invalid_argument("invalid value"); }
         }
     ).set_examples({LLAMA_EXAMPLE_BENCH}));
     add_opt(common_arg(
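As a usage sketch (the model path and quantization below are placeholders, not part of the patch), the new flag is combined with `--jinja`, since the thinking controls act through the Jinja chat-template path:

```python
# Illustrative launch only; "llama-server" is the server binary, the model path
# is a placeholder for any thinking-capable GGUF (e.g. a Qwen3 build).
import subprocess

subprocess.run([
    "llama-server",
    "-m", "models/Qwen3-0.6B-Q8_0.gguf",
    "--jinja",                      # chat-template based reasoning handling
    "--reasoning-budget", "0",      # 0 disables thinking; -1 (the default) leaves it unrestricted
])
```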
@@ -133,6 +133,7 @@ struct templates_params {
     bool stream;
     std::string grammar;
     bool add_generation_prompt = true;
+    bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };

@@ -573,7 +574,7 @@ common_chat_templates_ptr common_chat_templates_init(
     return tmpls;
 }

-std::string common_chat_format_name(common_chat_format format) {
+const char * common_chat_format_name(common_chat_format format) {
     switch (format) {
         case COMMON_CHAT_FORMAT_CONTENT_ONLY: return "Content-only";
         case COMMON_CHAT_FORMAT_GENERIC: return "Generic";
@@ -591,6 +592,15 @@ std::string common_chat_format_name(common_chat_format format) {
     }
 }

+const char * common_reasoning_format_name(common_reasoning_format format) {
+    switch (format) {
+        case COMMON_REASONING_FORMAT_NONE: return "none";
+        case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
+        default:
+            throw std::runtime_error("Unknown reasoning format");
+    }
+}
+
 static std::string wrap_code_as_arguments(common_chat_msg_parser & builder, const std::string & code) {
     std::string arguments;
     if (builder.is_partial()) {
@@ -918,8 +928,14 @@ static common_chat_params common_chat_params_init_command_r7b(const common_chat_
     data.prompt = apply(tmpl, adjusted_messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, {});
     data.format = COMMON_CHAT_FORMAT_COMMAND_R7B;
     if (string_ends_with(data.prompt, "<|START_THINKING|>")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "<|END_THINKING|>";
+        } else {
         data.thinking_forced_open = true;
     }
+    } else if (!inputs.enable_thinking && string_ends_with(data.prompt, "<|CHATBOT_TOKEN|>")) {
+        data.prompt += "<|START_THINKING|><|END_THINKING|>";
+    }

     data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
     data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -1186,8 +1202,12 @@ static common_chat_params common_chat_params_init_deepseek_r1(const common_chat_
     data.prompt = prompt;
     data.format = COMMON_CHAT_FORMAT_DEEPSEEK_R1;
     if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
         data.thinking_forced_open = true;
     }
+    }

     if (inputs.tools.is_array() && !inputs.tools.empty()) {
         data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED && inputs.json_schema.is_null();
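In both templates the pattern is the same: if the rendered prompt leaves a think tag open and thinking is disabled, the tag is closed immediately so no reasoning block is generated; otherwise the open tag is recorded as `thinking_forced_open`. A rough Python mirror of that logic, for illustration only (not part of the patch):

```python
def close_open_think_tag(prompt: str, enable_thinking: bool,
                         open_tag: str = "<think>\n", close_tag: str = "</think>") -> str:
    """Illustrative mirror of the C++ behaviour above (DeepSeek R1 case)."""
    if prompt.endswith(open_tag) and not enable_thinking:
        # Thinking disabled: close the tag right away so the model skips reasoning.
        return prompt + close_tag
    # Thinking allowed: the tag stays open ("thinking_forced_open" in the C++ code).
    return prompt
```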
@@ -1460,12 +1480,21 @@ static void common_chat_parse_functionary_v3_1_llama_3_1(common_chat_msg_parser
 static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat_template & tmpl, const struct templates_params & inputs) {
     common_chat_params data;

-    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt);
+    json additional_context = {
+        {"enable_thinking", inputs.enable_thinking},
+    };
+
+    data.prompt = apply(tmpl, inputs.messages, inputs.tools.empty() ? json() : inputs.tools, inputs.add_generation_prompt, additional_context);
     data.format = COMMON_CHAT_FORMAT_HERMES_2_PRO;
     if (string_ends_with(data.prompt, "<think>\n")) {
+        if (!inputs.enable_thinking) {
+            data.prompt += "</think>";
+        } else {
         data.thinking_forced_open = true;
     }
+    }

+    if (!inputs.tools.is_null()) {
     // (content)?(<tool_call>{"name": "foo", "arguments": {"a": 1}}</tool_call>)*
     data.grammar_lazy = inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_REQUIRED;
     data.grammar = build_grammar([&](const common_grammar_builder & builder) {
@@ -1558,6 +1587,7 @@ static common_chat_params common_chat_params_init_hermes_2_pro(const common_chat
         "```xml",
     };
     });
+    }

     return data;
 }
@@ -1669,6 +1699,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.messages = common_chat_msgs_to_json_oaicompat<json>(inputs.messages, /* concat_text= */ !tmpl.original_caps().requires_typed_content);
     params.add_generation_prompt = inputs.add_generation_prompt;
     params.tool_choice = inputs.tool_choice;
+    params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
     if (!inputs.json_schema.empty()) {
@@ -1702,7 +1733,7 @@ static common_chat_params common_chat_templates_apply_jinja(
     }

     // Hermes 2/3 Pro, Qwen 2.5 Instruct (w/ tools)
-    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null() && params.tools.is_array() && params.json_schema.is_null()) {
+    if (src.find("<tool_call>") != std::string::npos && params.json_schema.is_null()) {
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }

@@ -1821,7 +1852,7 @@ static void common_chat_parse_content_only(common_chat_msg_parser & builder) {
 }

 static void common_chat_parse(common_chat_msg_parser & builder, common_chat_format format) {
-    LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format).c_str(), builder.input().c_str());
+    LOG_DBG("Parsing input with format %s: %s\n", common_chat_format_name(format), builder.input().c_str());

     switch (format) {
         case COMMON_CHAT_FORMAT_CONTENT_ONLY:
@@ -1858,7 +1889,7 @@ static void common_chat_parse(common_chat_msg_parser & builder, common_chat_form
             common_chat_parse_command_r7b(builder);
             break;
         default:
-            throw std::runtime_error("Unsupported format: " + common_chat_format_name(format));
+            throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(format));
     }
     builder.finish();
 }
@@ -123,6 +123,7 @@ struct common_chat_templates_inputs {
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };

@@ -181,7 +182,8 @@ std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
     bool use_jinja);

-std::string common_chat_format_name(common_chat_format format);
+const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
@@ -368,6 +368,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response

     std::vector<std::string> api_keys;
models/templates/Qwen-Qwen3-0.6B.jinja (new file, 85 lines)
@@ -0,0 +1,85 @@
+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0].role == 'system' %}
+        {{- messages[0].content + '\n\n' }}
+    {%- endif %}
+    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0].role == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
+{%- for message in messages[::-1] %}
+    {%- set index = (messages|length - 1) - loop.index0 %}
+    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
+        {%- set ns.multi_step_tool = false %}
+        {%- set ns.last_query_index = index %}
+    {%- endif %}
+{%- endfor %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {%- set content = message.content %}
+        {%- set reasoning_content = '' %}
+        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
+            {%- set reasoning_content = message.reasoning_content %}
+        {%- else %}
+            {%- if '</think>' in message.content %}
+                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
+                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
+            {%- endif %}
+        {%- endif %}
+        {%- if loop.index0 > ns.last_query_index %}
+            {%- if loop.last or (not loop.last and reasoning_content) %}
+                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
+            {%- else %}
+                {{- '<|im_start|>' + message.role + '\n' + content }}
+            {%- endif %}
+        {%- else %}
+            {{- '<|im_start|>' + message.role + '\n' + content }}
+        {%- endif %}
+        {%- if message.tool_calls %}
+            {%- for tool_call in message.tool_calls %}
+                {%- if (loop.first and content) or (not loop.first) %}
+                    {{- '\n' }}
+                {%- endif %}
+                {%- if tool_call.function %}
+                    {%- set tool_call = tool_call.function %}
+                {%- endif %}
+                {{- '<tool_call>\n{"name": "' }}
+                {{- tool_call.name }}
+                {{- '", "arguments": ' }}
+                {%- if tool_call.arguments is string %}
+                    {{- tool_call.arguments }}
+                {%- else %}
+                    {{- tool_call.arguments | tojson }}
+                {%- endif %}
+                {{- '}\n</tool_call>' }}
+            {%- endfor %}
+        {%- endif %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+    {%- if enable_thinking is defined and enable_thinking is false %}
+        {{- '<think>\n\n</think>\n\n' }}
+    {%- endif %}
+{%- endif %}
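llama.cpp renders this template with its bundled minja engine; as a rough illustration only, the `enable_thinking` behaviour can be approximated with Python's jinja2, assuming the file above has been saved under models/templates/ and jinja2 is installed (this is not how the server itself renders prompts):

```python
# Approximate rendering with jinja2 (llama.cpp uses minja, not jinja2).
import jinja2

with open("models/templates/Qwen-Qwen3-0.6B.jinja") as f:
    template = jinja2.Environment().from_string(f.read())

prompt = template.render(
    messages=[{"role": "user", "content": "What is today?"}],
    tools=None,
    add_generation_prompt=True,
    enable_thinking=False,  # what --reasoning-budget 0 passes to the template
)

# With thinking disabled the assistant turn starts with an empty think block,
# matching the expectation encoded in the new server test further below.
assert prompt.endswith("<|im_start|>assistant\n<think>\n\n</think>\n\n")
```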
@@ -20,4 +20,5 @@ These templates can be updated with the following commands:
 ./scripts/get_chat_template.py NousResearch/Hermes-3-Llama-3.1-8B tool_use > models/templates/NousResearch-Hermes-3-Llama-3.1-8B-tool_use.jinja
 ./scripts/get_chat_template.py Qwen/Qwen2.5-7B-Instruct > models/templates/Qwen-Qwen2.5-7B-Instruct.jinja
 ./scripts/get_chat_template.py Qwen/QwQ-32B > models/templates/Qwen-QwQ-32B.jinja
+./scripts/get_chat_template.py Qwen/Qwen3-0.6B > models/templates/Qwen-Qwen3-0.6B.jinja
 ```
@@ -737,14 +737,14 @@ static void test_template_output_parsers() {
         auto tmpls = read_templates("models/templates/Qwen-QwQ-32B.jinja");
         std::vector<std::string> end_tokens{ "<|im_end|>" };

-        assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
         assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
     }
     {
         auto tmpls = read_templates("models/templates/NousResearch-Hermes-2-Pro-Llama-3-8B-tool_use.jinja");
         std::vector<std::string> end_tokens{ "<|im_end|>" };

-        assert_equals(COMMON_CHAT_FORMAT_CONTENT_ONLY, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
+        assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_no_tools).format);
         assert_equals(COMMON_CHAT_FORMAT_HERMES_2_PRO, common_chat_templates_apply(tmpls.get(), inputs_tools).format);
         assert_equals(
             COMMON_CHAT_FORMAT_HERMES_2_PRO,
@@ -173,7 +173,8 @@ The project is under active development, and we are [looking for feedback and co
 | `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
 | `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
 | `--jinja` | use jinja template for chat (default: disabled)<br/>(env: LLAMA_ARG_JINJA) |
-| `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)<br/>(default: deepseek)<br/>(env: LLAMA_ARG_THINK) |
+| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
 | `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
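As a client-side illustration (hypothetical host/port; assumes a llama-server started with `--jinja` and a thinking-capable model), with the default `--reasoning-format deepseek` the extracted thoughts appear in `message.reasoning_content`, while `--reasoning-budget 0` suppresses them at the template level:

```python
# Illustrative request sketch; localhost:8080 is the default server address.
import requests

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={"messages": [{"role": "user", "content": "What is 7 * 6?"}]},
)
msg = resp.json()["choices"][0]["message"]

# With --reasoning-format deepseek (the default) thoughts are parsed into
# reasoning_content; with --reasoning-budget 0 the model is told not to think,
# so no reasoning block is produced in the first place.
print(msg.get("reasoning_content"))
print(msg["content"])
```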
@@ -178,7 +178,7 @@ struct slot_params {
         {"grammar_triggers", grammar_triggers},
         {"preserved_tokens", sampling.preserved_tokens},
         {"chat_format", common_chat_format_name(oaicompat_chat_syntax.format)},
-        {"reasoning_format", (oaicompat_chat_syntax.reasoning_format == COMMON_REASONING_FORMAT_DEEPSEEK ? "deepseek" : "none")},
+        {"reasoning_format", common_reasoning_format_name(oaicompat_chat_syntax.reasoning_format)},
         {"reasoning_in_content", oaicompat_chat_syntax.reasoning_in_content},
         {"thinking_forced_open", oaicompat_chat_syntax.thinking_forced_open},
         {"samplers", samplers},
@@ -357,7 +357,7 @@ struct server_task {
         auto it = data.find("chat_format");
         if (it != data.end()) {
             params.oaicompat_chat_syntax.format = static_cast<common_chat_format>(it->get<int>());
-            SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format).c_str());
+            SRV_INF("Chat format: %s\n", common_chat_format_name(params.oaicompat_chat_syntax.format));
         } else {
             params.oaicompat_chat_syntax.format = defaults.oaicompat_chat_syntax.format;
         }
@@ -2089,6 +2089,7 @@ struct server_context {
             /* common_chat_templates */ chat_templates.get(),
             /* allow_image */ mctx ? mtmd_support_vision(mctx) : false,
             /* allow_audio */ mctx ? mtmd_support_audio (mctx) : false,
+            /* enable_thinking */ params_base.reasoning_budget != 0,
         };
     }

@@ -25,6 +25,40 @@ def create_server():
     server.n_slots = 1


+@pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
+@pytest.mark.parametrize("template_name,reasoning_budget,expected_end", [
+    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", None, "<think>\n"),
+    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", -1, "<think>\n"),
+    ("deepseek-ai-DeepSeek-R1-Distill-Qwen-32B", 0, "<think>\n</think>"),
+
+    ("Qwen-Qwen3-0.6B", -1, "<|im_start|>assistant\n"),
+    ("Qwen-Qwen3-0.6B", 0, "<|im_start|>assistant\n<think>\n\n</think>\n\n"),
+
+    ("Qwen-QwQ-32B", -1, "<|im_start|>assistant\n<think>\n"),
+    ("Qwen-QwQ-32B", 0, "<|im_start|>assistant\n<think>\n</think>"),
+
+    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", -1, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>"),
+    ("CohereForAI-c4ai-command-r7b-12-2024-tool_use", 0, "<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|><|END_THINKING|>"),
+])
+def test_reasoning_budget(template_name: str, reasoning_budget: int | None, expected_end: str, tools: list[dict]):
+    global server
+    server.jinja = True
+    server.reasoning_budget = reasoning_budget
+    server.chat_template_file = f'../../../models/templates/{template_name}.jinja'
+    server.start(timeout_seconds=TIMEOUT_SERVER_START)
+
+    res = server.make_request("POST", "/apply-template", data={
+        "messages": [
+            {"role": "user", "content": "What is today?"},
+        ],
+        "tools": tools,
+    })
+    assert res.status_code == 200
+    prompt = res.body["prompt"]
+
+    assert prompt.endswith(expected_end), f"Expected prompt to end with '{expected_end}', got '{prompt}'"
+
+
 @pytest.mark.parametrize("tools", [None, [], [TEST_TOOL]])
 @pytest.mark.parametrize("template_name,format", [
     ("meta-llama-Llama-3.3-70B-Instruct", "%d %b %Y"),
@@ -84,7 +84,8 @@ class ServerProcess:
     draft_max: int | None = None
     no_webui: bool | None = None
     jinja: bool | None = None
-    reasoning_format: Literal['deepseek', 'none'] | None = None
+    reasoning_format: Literal['deepseek', 'none', 'nothink'] | None = None
+    reasoning_budget: int | None = None
     chat_template: str | None = None
     chat_template_file: str | None = None
     server_path: str | None = None
@@ -191,6 +192,8 @@ class ServerProcess:
             server_args.append("--jinja")
         if self.reasoning_format is not None:
             server_args.extend(("--reasoning-format", self.reasoning_format))
+        if self.reasoning_budget is not None:
+            server_args.extend(("--reasoning-budget", self.reasoning_budget))
         if self.chat_template:
             server_args.extend(["--chat-template", self.chat_template])
         if self.chat_template_file:
@@ -568,6 +568,7 @@ struct oaicompat_parser_options {
     common_chat_templates * tmpls;
     bool allow_image;
     bool allow_audio;
+    bool enable_thinking = true;
 };

 // used by /chat/completions endpoint
@@ -733,6 +734,7 @@ static json oaicompat_chat_params_parse(
     inputs.parallel_tool_calls = json_value(body, "parallel_tool_calls", false);
     inputs.add_generation_prompt = json_value(body, "add_generation_prompt", true);
     inputs.reasoning_format = opt.reasoning_format;
+    inputs.enable_thinking = opt.enable_thinking;
     if (!inputs.tools.empty() && inputs.tool_choice != COMMON_CHAT_TOOL_CHOICE_NONE && body.contains("grammar")) {
         throw std::runtime_error("Cannot use custom grammar constraints with tools.");
     }