llama : streamline embeddings from "non-embedding" models (#8087)

2025-08-19 14:31:06 -04:00 · 2024-07-05 02:05:56 -05:00
parent bcefa03bc0
commit d12f781074
4 changed files with 36 additions and 10 deletions
--- a/common/common.h
+++ b/common/common.h
@@ -99,6 +99,7 @@ struct gpt_params {
    enum llama_split_mode        split_mode        = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
    enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
    enum llama_pooling_type      pooling_type      = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type    attention_type    = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

    // // sampling parameters
    struct llama_sampling_params sparams;