mirror of https://github.com/ggml-org/llama.cpp.git
server : fix cache reuse logic (#12161)
The first kv shift offsets the positions of all tokens after head_c. The next llama_kv_cache_seq_rm call that uses head_c then removes valid tokens, because their positions have already been offset. Shifting only the matched chunk, [head_c, head_c + n_match), avoids this.
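To illustrate the failure mode before the diff below, here is a minimal, self-contained C++ sketch. It is not llama.cpp code: the Cell struct and the seq_rm/seq_add helpers only mimic the position bookkeeping of llama_kv_cache_seq_rm / llama_kv_cache_seq_add (here modelled as acting on cells whose position lies in [p0, p1), with p1 < 0 meaning "to the end"), and the chunk layout (the head_p, head_c, n_match values) is an invented example.

// Minimal, self-contained sketch -- NOT llama.cpp code. It only mimics the
// position bookkeeping of llama_kv_cache_seq_rm / llama_kv_cache_seq_add to
// show why shifting with p1 = -1 breaks the second reused chunk.
#include <cstdio>
#include <vector>

struct Cell { int token; int pos; };   // one KV cell: token id + its position

static std::vector<Cell> cache;

// mimic llama_kv_cache_seq_rm: drop cells with pos in [p0, p1) (p1 < 0 => no upper bound)
static void seq_rm(int p0, int p1) {
    std::vector<Cell> kept;
    for (const Cell & c : cache) {
        if (c.pos < p0 || (p1 >= 0 && c.pos >= p1)) {
            kept.push_back(c);
        }
    }
    cache = kept;
}

// mimic llama_kv_cache_seq_add: shift cells with pos in [p0, p1) by delta
static void seq_add(int p0, int p1, int delta) {
    for (Cell & c : cache) {
        if (c.pos >= p0 && (p1 < 0 || c.pos < p1)) {
            c.pos += delta;
        }
    }
}

// reuse two matching chunks; `buggy` selects the old p1 = -1 behaviour
static int reuse(bool buggy) {
    cache.clear();
    for (int i = 0; i < 12; ++i) {
        cache.push_back({100 + i, i});  // 12 cached tokens at positions 0..11
    }

    // invented example: two chunks of the old cache match the new prompt
    //   chunk 1: cache positions [4, 7)  -> wanted at prompt positions [2, 5)
    //   chunk 2: cache positions [9, 12) -> wanted at prompt positions [5, 8)
    const int head_p[2] = {2, 5}, head_c[2] = {4, 9}, n_match[2] = {3, 3};

    for (int k = 0; k < 2; ++k) {
        const int kv_shift = head_p[k] - head_c[k];
        seq_rm (head_p[k], head_c[k]);                // drop the stale gap
        seq_add(head_c[k],                            // move the matched chunk...
                buggy ? -1 : head_c[k] + n_match[k],  // ...old code also moved everything after it
                kv_shift);
    }
    return (int) cache.size();
}

int main() {
    printf("cells kept, old code   (p1 = -1)              : %d\n", reuse(true));
    printf("cells kept, fixed code (p1 = head_c + n_match): %d\n", reuse(false));
    return 0;
}

With this invented layout, the fixed range keeps all 8 reusable cells (the 2-token prefix plus both 3-token chunks) at contiguous positions, while the old p1 = -1 variant deletes part of the second chunk on the next pass, which is exactly the failure described above.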
@@ -3003,7 +3003,7 @@ struct server_context {
     const int64_t kv_shift = (int64_t) head_p - (int64_t) head_c;
 
     llama_kv_cache_seq_rm (ctx, slot.id, head_p, head_c);
-    llama_kv_cache_seq_add(ctx, slot.id, head_c, -1, kv_shift);
+    llama_kv_cache_seq_add(ctx, slot.id, head_c, head_c + n_match, kv_shift);
 
     for (size_t i = 0; i < n_match; i++) {
         slot.cache_tokens[head_p + i] = slot.cache_tokens[head_c + i];