mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-27 20:05:20 +00:00
llama : fix FA when KV cache is not used (i.e. embeddings) (#12825)
* ggml : FA supports F32 V * graph : cast KV to F16 when the KV cache is not used ggml-ci * server : add test that exercises embeddings with FA enabled ggml-ci
This commit is contained in:
@ -15,7 +15,7 @@ async def main():
|
||||
model_url = "http://127.0.0.1:6900"
|
||||
responses: list[requests.Response] = await asyncio.gather(*[requests_post_async(
|
||||
url= f"{model_url}/embedding",
|
||||
json= {"content": str(0)*1024}
|
||||
json= {"content": "a "*1022}
|
||||
) for i in range(n)])
|
||||
|
||||
for response in responses:
|
||||
|
Reference in New Issue
Block a user