Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-06-27 12:05:03 +00:00)
server : fix incorrect usage of llama_get_embeddings() (#14225)
* server : fix incorrect usage of llama_get_embeddings() ggml-ci
* cont : fix the fix ggml-ci
@@ -965,6 +965,7 @@ extern "C" {
     LLAMA_API int32_t llama_n_threads_batch(struct llama_context * ctx);
 
     // Set whether the context outputs embeddings or not
+    // TODO: rename to avoid confusion with llama_get_embeddings()
     LLAMA_API void llama_set_embeddings(struct llama_context * ctx, bool embeddings);
 
     // Set whether to use causal attention or not
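For context (not part of this diff): the accessor that the new TODO warns about is the read-back API for embeddings already computed by the context, declared in the same header along these lines:

    LLAMA_API float * llama_get_embeddings(struct llama_context * ctx);

It returns a pointer to the results of the last decode, i.e. it reflects output state rather than whether the current request needs embeddings, which is the distinction the server-side changes below rely on.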
@@ -1358,6 +1358,14 @@ struct server_slot {
         return server_task_type_need_logits(task_type);
     }
 
+    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
+    // also we cannot split if the pooling would require any past tokens
+    bool can_split() const {
+        return
+            !need_embd() ||
+            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
+    }
+
     bool can_batch_with(server_slot & other_slot) const {
         return task_type == other_slot.task_type && are_lora_equal(lora, other_slot.lora);
     }
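The helper above replaces a context-level can_split() (removed in the next hunk) that keyed off llama_get_embeddings(ctx) instead of the slot's task type. As a rough illustration only, a minimal standalone C++ sketch of the same decision, using hypothetical stand-in types rather than the real server_slot and llama API so that it compiles on its own:

    // sketch.cpp -- hypothetical stand-ins, not the actual llama.cpp server code
    #include <cstdio>

    enum class pooling { none, mean, cls, last };

    struct slot_sketch {
        bool    needs_embd;  // stand-in for server_slot::need_embd() (derived from the task type)
        bool    has_memory;  // stand-in for llama_get_memory(ctx) != nullptr
        pooling pool;        // stand-in for llama_pooling_type(ctx)

        // same condition as the new server_slot::can_split() in this commit
        bool can_split() const {
            return !needs_embd || (has_memory && pool == pooling::last);
        }
    };

    int main() {
        slot_sketch completion { /*needs_embd=*/false, /*has_memory=*/true,  pooling::none };
        slot_sketch embedding  { /*needs_embd=*/true,  /*has_memory=*/false, pooling::mean };

        // completion prompts may be split across ubatches; a mean-pooled embedding
        // prompt without a memory module must fit into a single ubatch
        std::printf("completion can_split: %d\n", completion.can_split());
        std::printf("embedding  can_split: %d\n", embedding.can_split());
        return 0;
    }

The point of moving the check onto the slot is that the decision is tied to the request's task type (via need_embd()) rather than to context-global embedding state.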
@@ -1929,14 +1937,6 @@ struct server_context {
         llama_batch_free(batch);
     }
 
-    // if the context does not have a memory module then all embeddings have to be computed within a single ubatch
-    // also we cannot split if the pooling would require any past tokens
-    bool can_split() const {
-        return
-            !llama_get_embeddings(ctx) ||
-            (llama_get_memory(ctx) && llama_pooling_type(ctx) == LLAMA_POOLING_TYPE_LAST);
-    }
-
     bool load_model(const common_params & params) {
         SRV_INF("loading model '%s'\n", params.model.path.c_str());
 
@@ -3130,7 +3130,7 @@ struct server_context {
                     continue;
                 }
 
-                if (!can_split()) {
+                if (!slot.can_split()) {
                     if (slot.n_prompt_tokens > n_ubatch) {
                         slot.release();
                         send_error(slot, "input is too large to process. increase the physical batch size", ERROR_TYPE_SERVER);
@@ -3273,7 +3273,7 @@ struct server_context {
                         slot.n_prompt_tokens_processed = 0;
                     }
 
-                    if (!can_split()) {
+                    if (!slot.can_split()) {
                         // cannot fit the prompt in the current batch - will try next iter
                         if (batch.n_tokens + slot.n_prompt_tokens > n_batch) {
                             continue;