llama : rework embeddings logic (#14208)

* llama : rework embeddings logic ggml-ci * cont : fix rerank ggml-ci * cont : engrish [no ci] * cont : fix rerank ggml-ci * server : support both embeddings and completions with single model ggml-ci * cont : avoid embeddings_org ggml-ci
2025-06-27 03:55:20 +00:00 · 2025-06-16 14:14:00 +03:00
parent 3ba0d843c6
commit d3e64b9f49
16 changed files with 159 additions and 114 deletions
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@ -88,6 +88,26 @@ enum error_type {
    ERROR_TYPE_NOT_SUPPORTED, // custom error
 };

+static bool server_task_type_need_embd(server_task_type task_type) { // reports whether task_type is one of the embedding-style task types
+    switch (task_type) {
+        case SERVER_TASK_TYPE_EMBEDDING:
+        case SERVER_TASK_TYPE_RERANK:
+            return true;  // embedding and rerank tasks
+        default:
+            return false; // every other task type
+    }
+}
+
+static bool server_task_type_need_logits(server_task_type task_type) { // reports whether task_type is one of the logits-producing task types
+    switch (task_type) {
+        case SERVER_TASK_TYPE_COMPLETION:
+        case SERVER_TASK_TYPE_INFILL:
+            return true;  // completion and infill tasks
+        default:
+            return false; // every other task type
+    }
+}
+
 struct slot_params {
    bool stream        = true;
    bool cache_prompt  = true; // remember the prompt to avoid reprocessing all prompt
@ -1330,13 +1350,16 @@ struct server_slot {
        n_draft_accepted = 0;
    }

-    bool is_non_causal() const {
-        return task_type == SERVER_TASK_TYPE_EMBEDDING || task_type == SERVER_TASK_TYPE_RERANK;
+    bool need_embd() const {
+        return server_task_type_need_embd(task_type);
+    }
+
+    bool need_logits() const {
+        return server_task_type_need_logits(task_type);
    }

    bool can_batch_with(server_slot & other_slot) const {
-        return is_non_causal() == other_slot.is_non_causal()
-            && are_lora_equal(lora, other_slot.lora);
+        return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
    }

    bool has_budget(const common_params & global_params) {
@ -1480,7 +1503,6 @@ struct server_slot {
            {"n_ctx",         n_ctx},
            {"speculative",   can_speculate()},
            {"is_processing", is_processing()},
-            {"non_causal",    is_non_causal()},
            {"params",        params.to_json()},
            {"prompt",        prompt_tokens.detokenize(ctx, true)},
            {"next_token",
@ -1907,6 +1929,14 @@ struct server_context {
        llama_batch_free(batch);
    }

+    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
+    // also we cannot split if the pooling would require any past tokens
+    bool can_split() const { // true when a prompt may be spread over more than one ubatch
+        return
+            !llama_get_embeddings(ctx) || // NOTE(review): presumably null/false when embeddings output is disabled - confirm
+            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST); // otherwise: needs a memory module and LAST pooling
+    }
+
    bool load_model(const common_params & params) {
        SRV_INF("loading model '%s'\n", params.model.path.c_str());

@ -2730,6 +2760,7 @@ struct server_context {
                        queue_tasks.defer(std::move(task));
                        break;
                    }
+
                    if (slot->is_processing()) {
                        // if requested slot is unavailable, we defer this task for processing later
                        SRV_DBG("requested slot is unavailable, defer task, id_task = %d\n", task.id);
@ -3092,7 +3123,14 @@ struct server_context {
                            continue;
                        }

-                        if (slot.is_non_causal()) {
+                        // TODO: support memory-less logits computation
+                        if (slot.need_logits() && !llama_get_memory(ctx)) { // logits-producing tasks are rejected when the context has no memory module
+                            slot.release();
+                            send_error(slot, "the current context does not support logits computation. skipping", ERROR_TYPE_SERVER);
+                            continue; // move on to the next slot
+                        }
+
+                        if (!can_split()) {
                            if (slot.n_prompt_tokens > n_ubatch) {
                                slot.release();
                                send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
@ -3227,8 +3265,7 @@ struct server_context {
                        }

                        if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) {
-                            // we have to evaluate at least 1 token to generate logits.
-                            SLT_WRN(slot, "need to evaluate at least 1 token to generate logits, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);
+                            SLT_WRN(slot, "need to evaluate at least 1 token for each active slot, n_past = %d, n_prompt_tokens = %d\n", slot.n_past, slot.n_prompt_tokens);

                            slot.n_past--;
                        }
@ -3236,8 +3273,7 @@ struct server_context {
                        slot.n_prompt_tokens_processed = 0;
                    }

-                    // non-causal tasks require to fit the entire prompt in the physical batch
-                    if (slot.is_non_causal()) {
+                    if (!can_split()) {
                        // cannot fit the prompt in the current batch - will try next iter
                        if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                            continue;
@ -3259,8 +3295,7 @@ struct server_context {
                    slot.cache_tokens.keep_first(slot.n_past);

                    // check if we should process the image
-                    if (slot.n_past < slot.n_prompt_tokens
-                            && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
+                    if (slot.n_past < slot.n_prompt_tokens && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
                        // process the image
                        int32_t new_n_past;
                        int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
@ -3291,8 +3326,8 @@ struct server_context {
                            break; // end of text chunk
                        }

-                        // without pooling, we want to output the embeddings for all the tokens in the batch
-                        const bool need_embd = slot.task_type == SERVER_TASK_TYPE_EMBEDDING && llama_pooling_type(slot.ctx) == LLAMA_POOLING_TYPE_NONE;
+                        // embedding requires all tokens in the batch to be output
+                        const bool need_embd = server_task_type_need_embd(slot.task_type);

                        common_batch_add(batch, cur_tok, slot.n_past, { slot.id }, need_embd);
                        slot.cache_tokens.push_back(cur_tok);
@ -3346,17 +3381,15 @@ struct server_context {
        SRV_DBG("decoding batch, n_tokens = %d\n", batch.n_tokens);

        if (slot_batched) {
-            // make sure we're in the right embedding mode
-            llama_set_embeddings(ctx, slot_batched->is_non_causal());
            // apply lora, only need to do it once per batch
            common_set_adapter_lora(ctx, slot_batched->lora);
-        }

-        const bool do_encode = (params_base.embedding || params_base.reranking);
+            llama_set_embeddings(ctx, slot_batched->need_embd());
+        }

        // pad the batch so that batch.n_tokens >= n_slots
        // TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
-        if (do_encode) {
+        if (slot_batched->need_embd()) {
            const int n_slots = slots.size();

            if (batch.n_tokens < n_slots) {
@ -3378,8 +3411,11 @@ struct server_context {
                SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id);

                for (int j = 0; j < n_add; ++j) {
-                    common_batch_add(batch, 0, j, { seq_id }, false);
+                    common_batch_add(batch, 0, j, { seq_id }, true);
                }
+
+                slots[seq_id].cache_tokens.clear();
+                llama_memory_seq_rm(llama_get_memory(ctx), seq_id, -1, -1);
            }
        }

@ -4174,11 +4210,6 @@ int main(int argc, char ** argv) {
            oaicompat_type oaicompat) -> void {
        GGML_ASSERT(type == SERVER_TASK_TYPE_COMPLETION || type == SERVER_TASK_TYPE_INFILL);

-        if (ctx_server.params_base.embedding) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }
-
        auto completion_id = gen_chatcmplid();
        std::unordered_set<int> task_ids;
        try {
@ -4433,12 +4464,8 @@ int main(int argc, char ** argv) {
            OAICOMPAT_TYPE_NONE); // infill is not OAI compatible
    };

-    const auto handle_chat_completions = [&ctx_server, &res_error, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
+    const auto handle_chat_completions = [&ctx_server, &handle_completions_impl](const httplib::Request & req, httplib::Response & res) {
        LOG_DBG("request: %s\n", req.body.c_str());
-        if (ctx_server.params_base.embedding) {
-            res_error(res, format_error_response("This server does not support completions. Start it without `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
-            return;
-        }

        auto body = json::parse(req.body);
        std::vector<raw_buffer> files;
@ -4566,13 +4593,18 @@ int main(int argc, char ** argv) {
    };

    const auto handle_embeddings_impl = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res, oaicompat_type oaicompat) {
-        const json body = json::parse(req.body);
+        if (!ctx_server.params_base.embedding) {
+            res_error(res, format_error_response("This server does not support embeddings. Start it with `--embeddings`", ERROR_TYPE_NOT_SUPPORTED));
+            return;
+        }

        if (oaicompat != OAICOMPAT_TYPE_NONE && llama_pooling_type(ctx_server.ctx) == LLAMA_POOLING_TYPE_NONE) {
            res_error(res, format_error_response("Pooling type 'none' is not OAI compatible. Please use a different pooling type", ERROR_TYPE_INVALID_REQUEST));
            return;
        }

+        const json body = json::parse(req.body);
+
        // for the shape of input/content, see tokenize_input_prompts()
        json prompt;
        if (body.count("input") != 0) {
@ -4662,8 +4694,8 @@ int main(int argc, char ** argv) {
    };

    const auto handle_rerank = [&ctx_server, &res_error, &res_ok](const httplib::Request & req, httplib::Response & res) {
-        if (!ctx_server.params_base.reranking || ctx_server.params_base.embedding) {
-            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking` and without `--embedding`", ERROR_TYPE_NOT_SUPPORTED));
+        if (!ctx_server.params_base.embedding || ctx_server.params_base.pooling_type != LLAMA_POOLING_TYPE_RANK) {
+            res_error(res, format_error_response("This server does not support reranking. Start it with `--reranking`", ERROR_TYPE_NOT_SUPPORTED));
            return;
        }