llama : rework embeddings logic (#14208)

* llama : rework embeddings logic

ggml-ci

* cont : fix rerank

ggml-ci

* cont : engrish [no ci]

* cont : fix rerank

ggml-ci

* server : support both embeddings and completions with single model

ggml-ci

* cont : avoid embeddings_org

ggml-ci
This commit is contained in:
Georgi Gerganov
2025-06-16 14:14:00 +03:00
committed by GitHub
parent 3ba0d843c6
commit d3e64b9f49
16 changed files with 159 additions and 114 deletions

View File

@ -88,6 +88,26 @@ enum error_type {
ERROR_TYPE_NOT_SUPPORTED, // custom error
};
static bool server_task_type_need_embd(server_task_type task_type) {
switch (task_type) {
case SERVER_TASK_TYPE_EMBEDDING:
case SERVER_TASK_TYPE_RERANK:
return true;
default:
return false;
}
}
static bool server_task_type_need_logits(server_task_type task_type) {
switch (task_type) {
case SERVER_TASK_TYPE_COMPLETION:
case SERVER_TASK_TYPE_INFILL:
return true;
default:
return false;
}
}
struct slot_params {
bool stream = true;
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
@ -1330,13 +1350,16 @@ struct server_slot {
n_draft_accepted = 0;
}
bool is_non_causal() const {
return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
bool need_embd() const {
return server_task_type_need_embd(task_type);
}
bool need_logits() const {
return server_task_type_need_logits(task_type);
}
bool can_batch_with(server_slot & other_slot) const {
return is_non_causal() == other_slot.is_non_causal()
&& are_lora_equal(lora, other_slot.lora);
return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
}
bool has_budget(const common_params & global_params) {
@ -1480,7 +1503,6 @@ struct server_slot {
{"n_ctx", n_ctx},
{"speculative", can_speculate()},
{"is_processing", is_processing()},
{"non_causal", is_non_causal()},
{"params", params.to_json()},
{"prompt", prompt_tokens.detokenize(ctx, true)},
{"next_token",
@ -1907,6 +1929,14 @@ struct server_context {
llama_batch_free(batch);
}
// if the context does not have a memory module then all embeddings have to be computed within a single ubatch
// also we cannot split if the pooling would require any past tokens
bool can_split() const {
return
!llama_get_embeddings(ctx) ||
(llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
}
bool load_model(const common_params & params) {
SRV_INF("loading model '%s'\n", params.model.path.c_str());
@ -2730,6 +2760,7 @@ struct server_context {
queue_tasks.defer(std::move(task));
break;
}
if (slot->is_processing()) {
// if requested slot is unavailable, we defer this task for processing later
SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
@ -3092,7 +3123,14 @@ struct server_context {
continue;
}
if (slot.is_non_causal()) {
// TODO: support memory-less logits computation
if (slot.need_logits() && !llama_get_memory(ctx)) {
slot.release();
send_error(slot, "the current context does not logits computation. skipping", ERROR_TYPE_SERVER);
continue;
}
if (!can_split()) {
if (slot.n_prompt_tokens > n_ubatch) {
slot.release();
send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
@ -3227,8 +3265,7 @@ struct server_context {
}
if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
// we have to evaluate at least 1 token to generate logits.
SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
SLT_WRN(slot, "need to evaluate at least 1 token for each active slot, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
slot.n_past--;
}
@ -3236,8 +3273,7 @@ struct server_context {
slot.n_prompt_tokens_processed = 0;
}
// non-causal tasks require to fit the entire prompt in the physical batch
if (slot.is_non_causal()) {
if (!can_split()) {
// cannot fit the prompt in the current batch - will try next iter
if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
continue;
@ -3259,8 +3295,7 @@ struct server_context {
slot.cache_tokens.keep_first(slot.n_past);
// check if we should process the image
if (slot.n_past < slot.n_prompt_tokens
&& slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
if (slot.n_past < slot.n_prompt_tokens && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
// process the image
int32_t new_n_past;
int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
@ -3291,8 +3326,8 @@ struct server_context {
break; // end of text chunk
}
// without pooling, we want to output the embeddings for all the tokens in the batch
const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
// embedding requires all tokens in the batch to be output
const bool need_embd = server_task_type_need_embd(slot.task_type);
common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
slot.cache_tokens.push_back(cur_tok);
@ -3346,17 +3381,15 @@ struct server_context {
SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);
if (slot_batched) {
// make sure we're in the right embedding mode
llama_set_embeddings(ctx, slot_batched->is_non_causal());
// apply lora, only need to do it once per batch
common_set_adapter_lora(ctx, slot_batched->lora);
}
const bool do_encode = (params_base.embedding || params_base.reranking);
llama_set_embeddings(ctx, slot_batched->need_embd());
}
// pad the batch so that batch.n_tokens >= n_slots
// TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
if (do_encode) {
if (slot_batched->need_embd()) {
const int n_slots = slots.size();
if (batch.n_tokens < n_slots) {
@ -3378,8 +3411,11 @@ struct server_context {
SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id);
for (int j = 0; j < n_add; ++j) {
common_batch_add(batch, 0, j, { seq_id }, false);
common_batch_add(batch, 0, j, { seq_id }, true);
}
slots[seq_id].cache_tokens.clear();
llama_memory_seq_rm(llama_get_memory(ctx), seq_id, -1, -1);
}
}
@ -4174,11 +4210,6 @@ int main(int argc, char ** argv) {
oaicompat_type oaicompat) -> void {
GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);
if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
auto completion_id = gen_chatcmplid();
std::unordered_set<int> task_ids;
try {
@ -4433,12 +4464,8 @@ int main(int argc, char ** argv) {
OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
};
const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
LOG_DBG("request: %s\n", req.body.c_str());
if (ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
auto body = json::parse(req.body);
std::vector<raw_buffer> files;
@ -4566,13 +4593,18 @@ int main(int argc, char ** argv) {
};
const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
const json body = json::parse(req.body);
if (!ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
return;
}
if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
return;
}
const json body = json::parse(req.body);
// for the shape of input/content, see tokenize_input_prompts()
json prompt;
if (body.count("input") != 0) {
@ -4662,8 +4694,8 @@ int main(int argc, char ** argv) {
};
const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
return;
}