llama : rework embeddings logic (#14208)

* llama : rework embeddings logic ggml-ci * cont : fix rerank ggml-ci * cont : engrish [no ci] * cont : fix rerank ggml-ci * server : support both embeddings and completions with single model ggml-ci * cont : avoid embeddings_org ggml-ci
2025-07-29 05:33:37 -04:00 · 2025-06-16 14:14:00 +03:00
parent 3ba0d843c6
commit d3e64b9f49
16 changed files with 159 additions and 114 deletions
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -728,7 +728,7 @@ int llama_context::encode(const llama_batch & batch_inp) {
    }

    // note: during encode, we always pass the full sequence starting from pos = 0
-    if (!batch_allocr->init(batch_inp, model.vocab, nullptr)) {
+    if (!batch_allocr->init(batch_inp, model.vocab, nullptr, true)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
        return -1;
    }
@@ -894,7 +894,10 @@ int llama_context::decode(const llama_batch & batch_inp) {
        return -1;
    }

-    if (!batch_allocr->init(batch_inp, model.vocab, memory.get())) {
+    // when computing embeddings, all tokens are output
+    const bool embd_all = cparams.embeddings;
+
+    if (!batch_allocr->init(batch_inp, model.vocab, memory.get(), embd_all)) {
        LLAMA_LOG_ERROR("%s: failed to initialize batch\n", __func__);
        return -1;
    }
@@ -911,12 +914,9 @@ int llama_context::decode(const llama_batch & batch_inp) {

    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT

-    // this indicates we are doing pooled embedding
-    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
    const uint32_t n_outputs_all = batch_allocr->get_n_outputs();

-    if (embd_pooled) {
+    if (embd_all) {
        // require that all tokens are output
        if (n_outputs_all != n_tokens_all) {
            LLAMA_LOG_ERROR("%s: pooled embedding requires that all tokens are output (n_outputs_all = %d, n_tokens_all = %d)\n",
@@ -945,7 +945,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
    llama_memory_state_ptr mstate;

    while (true) {
-        mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled);
+        mstate = memory->init_batch(batch, cparams.n_ubatch, embd_all);
        if (!mstate) {
            return -2;
        }
@@ -1058,7 +1058,7 @@ int llama_context::decode(const llama_batch & batch_inp) {
        //    ggml_graph_dump_dot(gf, NULL, "llama.dot");
        //}

-        auto * t_logits = cparams.embeddings ? nullptr         : res->get_logits();
+        auto * t_logits = res->get_logits();
        auto * t_embd   = cparams.embeddings ? res->get_embd() : nullptr;

        if (t_embd && res->get_embd_pooled()) {
@@ -1222,9 +1222,8 @@ uint32_t llama_context::output_reserve(int32_t n_outputs) {
    const auto n_vocab = vocab.n_tokens();
    const auto n_embd  = hparams.n_embd;

-    // TODO: use a per-batch flag for logits presence instead
-    bool has_logits = !cparams.embeddings;
-    bool has_embd   =  cparams.embeddings && (cparams.pooling_type == LLAMA_POOLING_TYPE_NONE);
+    bool has_logits = true;
+    bool has_embd   = cparams.embeddings;

    // TODO: hacky enc-dec support
    if (model.arch == LLM_ARCH_T5) {
@@ -2044,14 +2043,11 @@ void llama_context::opt_epoch_iter(

        n_queued_tokens += n_tokens_all;

-        // this indicates we are doing pooled embedding
-        const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
        embd_seq.clear();

        uint32_t n_outputs_all = n_tokens_all;

-        auto mstate = memory->init_batch(batch, cparams.n_ubatch, embd_pooled);
+        auto mstate = memory->init_batch(batch, cparams.n_ubatch, true);
        if (!mstate || mstate->get_status() != LLAMA_MEMORY_STATUS_SUCCESS) {
            LLAMA_LOG_ERROR("%s: could not initialize batch\n", __func__);
            break;