server : Prefilling assistant message in openai compatible API (#13174)

* Prefilling assistant message in openai compatible API

* fixed indentation

* fixed code convention

* simplify method usage

* no more than one assistant message at end of messages

* merge checks into prefill code

* Update examples/server/utils.hpp

---------

Co-authored-by: matteo <matteo@naspc.lan>
Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
Author: matteo
Date: 2025-04-29 20:33:10 +02:00
Committed by: GitHub
Parent: d9d398f84f
Commit: e2e1ddb93a

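From the client side, the change works like this: if the "messages" array ends with an assistant message, the server removes it from the template input and appends its content to the rendered prompt, so generation continues that message instead of starting a fresh turn. A minimal sketch of such a request, assuming a local llama-server on its default port 8080 and using libcurl for the HTTP POST (the message contents are illustrative):

#include <curl/curl.h>
#include <string>

int main() {
    curl_global_init(CURL_GLOBAL_DEFAULT);
    CURL * curl = curl_easy_init();
    if (!curl) {
        return 1;
    }

    // the trailing "assistant" message is the prefill: the server appends its
    // content to the templated prompt without an end-of-turn token
    const std::string body = R"({
        "messages": [
            {"role": "user",      "content": "Solve 17 * 23 step by step."},
            {"role": "assistant", "content": "Let me start by rewriting 17 * 23 as"}
        ]
    })";

    struct curl_slist * headers = curl_slist_append(nullptr, "Content-Type: application/json");
    curl_easy_setopt(curl, CURLOPT_URL, "http://localhost:8080/v1/chat/completions");
    curl_easy_setopt(curl, CURLOPT_HTTPHEADER, headers);
    curl_easy_setopt(curl, CURLOPT_POSTFIELDS, body.c_str());

    CURLcode res = curl_easy_perform(curl); // default write callback prints the response to stdout

    curl_slist_free_all(headers);
    curl_easy_cleanup(curl);
    curl_global_cleanup();
    return res == CURLE_OK ? 0 : 1;
}

As the sanity check in the diff below enforces, only a single trailing assistant message can act as the prefill; a request ending in two or more assistant messages is rejected.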

@@ -642,9 +642,31 @@ static json oaicompat_completion_params_parse(
        throw std::runtime_error("Cannot use custom grammar constraints with tools.");
    }

    // if the assistant message appears at the end of the list, we do not add the end-of-turn token
    // for example, this can be useful to modify the reasoning process in reasoning models
    bool prefill_assistant_message = !inputs.messages.empty() && inputs.messages.back().role == "assistant";
    common_chat_msg last_message;
    if (prefill_assistant_message) {
        last_message = inputs.messages.back();
        inputs.messages.pop_back();

        /* sanity check: at most one assistant message at the end of the list */
        if (!inputs.messages.empty() && inputs.messages.back().role == "assistant") {
            throw std::runtime_error("Cannot have 2 or more assistant messages at the end of the list.");
        }
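
        // note: disabling reasoning extraction passes any partial reasoning in
        // the prefill through verbatim, while add_generation_prompt re-opens
        // the assistant turn that the prefill text will continue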
        inputs.extract_reasoning = false;
        inputs.add_generation_prompt = true;
    }

    // Apply chat template to the list of messages
    auto chat_params = common_chat_templates_apply(tmpls, inputs);

    /* Append assistant prefilled message */
    if (prefill_assistant_message) {
        chat_params.prompt += last_message.content;
    }
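
    // e.g. with a ChatML-style template (illustrative; the actual markers
    // depend on the model's template), chat_params.prompt now ends with
    //   ...<|im_start|>assistant\n<prefill content>
    // and no closing <|im_end|>, so sampling continues the prefilled turn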
llama_params["chat_format"] = static_cast<int>(chat_params.format);
llama_params["prompt"] = chat_params.prompt;
if (!chat_params.grammar.empty()) {