llama : fix FA when KV cache is not used (i.e. embeddings) (#12825)

* ggml : FA supports F32 V

* graph : cast KV to F16 when the KV cache is not used

ggml-ci
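The two changes above are complementary: the ggml flash-attention op now accepts V in F32, and the llama graph casts K and V to F16 when no KV cache is involved (the embeddings path), so the FA kernel always sees the types it expects. Below is a minimal plain-numpy stand-in for that cast step — not ggml code; the function name, shapes, and softmax formulation are invented for the illustration.

```python
import numpy as np

def fa_with_cast(q: np.ndarray, k: np.ndarray, v: np.ndarray) -> np.ndarray:
    """Toy attention mirroring the commit's idea: the FA kernel wants F16 K/V,
    so F32 inputs (the no-KV-cache case) are cast before the op runs."""
    if k.dtype == np.float32:
        k = k.astype(np.float16)  # the cast the graph inserts when no KV cache is used
    if v.dtype == np.float32:
        v = v.astype(np.float16)
    # Accumulate in F32, as attention kernels typically do internally.
    scores = (q.astype(np.float32) @ k.astype(np.float32).T) / np.sqrt(q.shape[-1])
    w = np.exp(scores - scores.max(axis=-1, keepdims=True))
    w /= w.sum(axis=-1, keepdims=True)
    return w @ v.astype(np.float32)

# Without a KV cache (e.g. an embedding model), K/V arrive straight from the
# matmuls in F32:
q = np.random.rand(4, 64).astype(np.float32)
k = np.random.rand(4, 64).astype(np.float32)
v = np.random.rand(4, 64).astype(np.float32)
out = fa_with_cast(q, k, v)
assert out.shape == (4, 64)
```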

* server : add test that exercises embeddings with FA enabled

ggml-ci
Author: Georgi Gerganov
Date: 2025-04-08 19:54:51 +03:00
Committed by: GitHub
Parent: 78a1ba0a4f
Commit: a19b5cef16

6 changed files with 59 additions and 6 deletions


@@ -323,6 +323,21 @@ class ServerPreset:
         server.server_embeddings = True
         return server
 
+    @staticmethod
+    def bert_bge_small_with_fa() -> ServerProcess:
+        server = ServerProcess()
+        server.model_hf_repo = "ggml-org/models"
+        server.model_hf_file = "bert-bge-small/ggml-model-f16.gguf"
+        server.model_alias = "bert-bge-small"
+        server.n_ctx = 1024
+        server.n_batch = 300
+        server.n_ubatch = 300
+        server.n_slots = 2
+        server.fa = True
+        server.seed = 42
+        server.server_embeddings = True
+        return server
+
     @staticmethod
     def tinyllama_infill() -> ServerProcess:
         server = ServerProcess()
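For context, here is a minimal sketch of how the new preset could be driven from a pytest test, assuming the server test suite's `utils.py` helpers (`ServerPreset`, `start`, `stop`, `make_request`); the endpoint, payload, and assertions are illustrative, and the test actually added by this commit may differ.

```python
import pytest
from utils import ServerPreset

@pytest.fixture()
def server():
    # Preset defined in the diff above: BERT embedding model with fa = True.
    srv = ServerPreset.bert_bge_small_with_fa()
    srv.start()
    yield srv
    srv.stop()

def test_embeddings_with_fa(server):
    # Requesting embeddings exercises the flash-attention path without a KV
    # cache, which is exactly the case this commit fixes.
    res = server.make_request("POST", "/v1/embeddings", data={
        "input": "I believe the meaning of life is",
    })
    assert res.status_code == 200
    assert len(res.body["data"]) == 1
```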