From 6a2bc8bfb7cd502e5ebc72e36c97a6f848c21c2c Mon Sep 17 00:00:00 2001
From: Isaac McFadyen
Date: Sat, 17 May 2025 17:59:48 -0400
Subject: [PATCH] server : added --no-prefill-assistant flag (#13608)

* added no-prefill-assistant flag

* reworded documentation comment

* updated server README.md
---
 common/arg.cpp          | 10 ++++++++++
 common/common.h         |  1 +
 tools/server/README.md  |  2 ++
 tools/server/server.cpp |  2 ++
 tools/server/utils.hpp  |  3 ++-
 5 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/common/arg.cpp b/common/arg.cpp
index 8aa72515d..305168043 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2880,6 +2880,16 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.chat_template = read_file(value);
         }
     ).set_examples({LLAMA_EXAMPLE_MAIN, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
+    add_opt(common_arg(
+        {"--no-prefill-assistant"},
+        string_format(
+            "whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
+            "when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
+        ),
+        [](common_params & params) {
+            params.prefill_assistant = false;
+        }
+    ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
     add_opt(common_arg(
         {"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
         string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
diff --git a/common/common.h b/common/common.h
index a99a36029..da525dd42 100644
--- a/common/common.h
+++ b/common/common.h
@@ -368,6 +368,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
 
diff --git a/tools/server/README.md b/tools/server/README.md
index 17ad93df6..0b84966ae 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -13,6 +13,7 @@ Set of LLM REST APIs and a simple web front end to interact with llama.cpp.
  * Multimodal ([documentation](../../docs/multimodal.md)) / with OpenAI-compatible API support
  * Monitoring endpoints
  * Schema-constrained JSON response format
+ * Prefilling of assistant messages similar to the Claude API
  * [Function calling](../../docs/function-calling.md) / tool use for ~any model
  * Speculative decoding
  * Easy-to-use web UI
@@ -175,6 +176,7 @@ The project is under active development, and we are [looking for feedback and co
 | `--reasoning-format FORMAT` | reasoning format (default: deepseek; allowed values: deepseek, none)<br/>controls whether thought tags are extracted from the response, and in which format they're returned. 'none' leaves thoughts unparsed in `message.content`, 'deepseek' puts them in `message.reasoning_content` (for DeepSeek R1 & Command R7B only).<br/>only supported for non-streamed responses<br/>(env: LLAMA_ARG_THINK) |
 | `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
 | `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, falcon3, gemma, gigachat, glmedge, granite, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, phi3, phi4, rwkv-world, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
+| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
 | `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.50, 0.0 = disabled)<br/> |
 | `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
 | `--draft-max, --draft, --draft-n N` | number of tokens to draft for speculative decoding (default: 16)<br/>(env: LLAMA_ARG_DRAFT_MAX) |
diff --git a/tools/server/server.cpp b/tools/server/server.cpp
index 129d013ac..348588a2c 100644
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -4348,6 +4348,7 @@ int main(int argc, char ** argv) {
         json data = oaicompat_completion_params_parse(
             body,
             params.use_jinja,
+            params.prefill_assistant,
             params.reasoning_format,
             ctx_server.chat_templates.get(),
             ctx_server.mctx,
@@ -4369,6 +4370,7 @@ int main(int argc, char ** argv) {
         json data = oaicompat_completion_params_parse(
             body,
             params.use_jinja,
+            params.prefill_assistant,
             params.reasoning_format,
             ctx_server.chat_templates.get(),
             ctx_server.mctx,
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index 232eef195..3e7733539 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -583,6 +583,7 @@ static json oaicompat_completion_params_parse(const json & body) {
 static json oaicompat_completion_params_parse(
     const json & body, /* openai api json semantics */
     bool use_jinja,
+    bool prefill_assistant,
     common_reasoning_format reasoning_format,
     const struct common_chat_templates * tmpls,
     bool allow_non_text,
@@ -732,7 +733,7 @@ static json oaicompat_completion_params_parse(
 
     // if the assistant message appears at the end of list, we do not add end-of-turn token
     // for ex. this can be useful to modify the reasoning process in reasoning models
-    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
+    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant" && prefill_assistant;
     common_chat_msg last_message;
     if (prefill_assistant_message) {
         last_message = inputs.messages.back();
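
To illustrate the behavior the new flag controls, below is a sketch of a chat request whose last message is an assistant message. It assumes a local `llama-server` listening on the default port 8080; the prompt and the assistant text are placeholders.

```sh
# Sketch: the last message has role "assistant".
# Default: the server prefills it, i.e. the chat template is rendered without an
# end-of-turn token and the model continues "Sure, here are three reasons:".
# With --no-prefill-assistant (or the LLAMA_ARG_NO_PREFILL_ASSISTANT environment
# variable), the same message is treated as a finished turn and the model starts
# a fresh reply instead.
curl http://localhost:8080/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "messages": [
      {"role": "user",      "content": "Give me three reasons to write tests."},
      {"role": "assistant", "content": "Sure, here are three reasons:"}
    ]
  }'
```

As the existing comment in `utils.hpp` notes, prefilling a trailing assistant message can be used, for example, to steer or edit the reasoning prefix of reasoning models; `--no-prefill-assistant` lets a deployment opt out and treat such messages as complete.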