llama : rework embeddings logic (#14208)

* llama : rework embeddings logic

ggml-ci

* cont : fix rerank

ggml-ci

* cont : engrish [no ci]

* cont : fix rerank

ggml-ci

* server : support both embeddings and completions with single model

ggml-ci

* cont : avoid embeddings_org

ggml-ci
Author:    Georgi Gerganov
Date:      2025-06-16 14:14:00 +03:00
Committer: GitHub
Parent:    3ba0d843c6
Commit:    d3e64b9f49

16 changed files with 159 additions and 114 deletions
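In short: llama_batch_allocr::init() now takes an embd_all flag that controls which tokens are marked as outputs. A minimal standalone C++ sketch of the new default-output selection shown in the diff below (the helper name default_outputs is hypothetical and not part of the patch; it assumes n_tokens > 0):

    #include <cstdint>
    #include <vector>

    // Sketch: when the caller provides no explicit logits mask, an embeddings
    // batch (embd_all == true) marks every token as an output, while a
    // completion batch keeps the old behavior of marking only the last token.
    static std::vector<int8_t> default_outputs(int32_t n_tokens, bool embd_all) {
        std::vector<int8_t> output;
        if (embd_all) {
            output.resize(n_tokens, true);   // output for all tokens
        } else {
            output.resize(n_tokens, false);  // output only for the last token
            output[output.size() - 1] = true;
        }
        return output;
    }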

@@ -299,7 +299,8 @@ llama_batch_allocr::llama_batch_allocr() {
 bool llama_batch_allocr::init(
         const llama_batch & batch_inp,
         const llama_vocab & vocab,
-        const llama_memory_i * memory) {
+        const llama_memory_i * memory,
+        bool embd_all) {
     clear();
 
     batch = batch_inp;
@@ -378,10 +379,31 @@ bool llama_batch_allocr::init(
     }
 
     if (!batch.logits) {
-        // by default return the output only for the last token
-        output.resize(batch.n_tokens);
-        output[output.size() - 1] = true;
+        if (embd_all) {
+            // return the output for all tokens
+            output.resize(batch.n_tokens, true);
+        } else {
+            // return the output only for the last token
+            output.resize(batch.n_tokens, false);
+            output[output.size() - 1] = true;
+        }
+
         batch.logits = output.data();
+    } else if (embd_all) {
+        bool warn = false;
+
+        for (int32_t i = 0; i < batch.n_tokens; ++i) {
+            if (batch.logits[i] == 0) {
+                warn = true;
+            }
+        }
+
+        if (warn) {
+            LLAMA_LOG_WARN("%s: embeddings required but some input tokens were not marked as outputs -> overriding\n", __func__);
+
+            output.resize(batch.n_tokens, true);
+            batch.logits = output.data();
+        }
     }
 
     //
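
When the caller does pass a logits mask but embeddings are required for all tokens, the new code warns and overrides the mask, as the second hunk above shows. A standalone sketch of that path (the scaffolding here is illustrative: the real code logs via LLAMA_LOG_WARN and reuses the allocator's own output buffer):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Sketch: if any input token is not marked as an output while embeddings
    // are required for all tokens, warn once and override the whole mask.
    static bool ensure_all_outputs(std::vector<int8_t> & output,
                                   const int8_t * logits, int32_t n_tokens) {
        bool warn = false;
        for (int32_t i = 0; i < n_tokens; ++i) {
            if (logits[i] == 0) {
                warn = true;
            }
        }
        if (warn) {
            fprintf(stderr, "embeddings required but some input tokens were not "
                            "marked as outputs -> overriding\n");
            output.assign(n_tokens, true);  // mark every token as an output
        }
        return warn;
    }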