From e2e1ddb93a01ce282e304431b37e60b3cddb6114 Mon Sep 17 00:00:00 2001 From: matteo Date: Tue, 29 Apr 2025 20:33:10 +0200 Subject: [PATCH] server : Prefilling assistant message in openai compatible API (#13174) * Prefilling assistant message in openai compatible API * fixed indentation * fixed code convention * simplify method usage * no more than one assistant message at end of messages * merge checks into prefill code * Update examples/server/utils.hpp --------- Co-authored-by: matteo Co-authored-by: Xuan-Son Nguyen --- examples/server/utils.hpp | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index aba2f27f9..b497959fd 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -642,9 +642,31 @@ static json oaicompat_completion_params_parse( throw std::runtime_error("Cannot use custom grammar constraints with tools."); } + // if the assistant message appears at the end of list, we do not add end-of-turn token + // for ex. 
this can be useful to modify the reasoning process in reasoning models + bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant"; + common_chat_msg last_message; + if (prefill_assistant_message) { + last_message = inputs.messages.back(); + inputs.messages.pop_back(); + + /* sanity check, max one assistant message at the end of the list */ + if (!inputs.messages.empty() && inputs.messages.back().role == "assistant"){ + throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list."); + } + + inputs.extract_reasoning = false; + inputs.add_generation_prompt = true; + } + // Apply chat template to the list of messages auto chat_params = common_chat_templates_apply(tmpls, inputs); + /* Append assistant prefilled message */ + if (prefill_assistant_message) { + chat_params.prompt += last_message.content; + } + llama_params["chat_format"] = static_cast<int>(chat_params.format); llama_params["prompt"] = chat_params.prompt; if (!chat_params.grammar.empty()) {