mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-27 12:05:03 +00:00
@ -3385,38 +3385,6 @@ struct server_context {
|
||||
llama_set_embeddings(ctx, slot_batched->need_embd());
|
||||
}
|
||||
|
||||
// pad the batch so that batch.n_tokens >= n_slots
|
||||
// TODO: temporary workaround for https://github.com/ggml-org/llama.cpp/issues/13689
|
||||
if (slot_batched->need_embd()) {
|
||||
const int n_slots = slots.size();
|
||||
|
||||
if (batch.n_tokens < n_slots) {
|
||||
std::set<llama_seq_id> seq_ids;
|
||||
for (int j = 0; j < batch.n_tokens; ++j) {
|
||||
seq_ids.insert(batch.seq_id[j][0]);
|
||||
}
|
||||
|
||||
// find unused sequence id
|
||||
llama_seq_id seq_id = -1;
|
||||
for (int i = 0; i < n_slots; ++i) {
|
||||
if (seq_ids.find(i) == seq_ids.end()) {
|
||||
seq_id = i;
|
||||
}
|
||||
}
|
||||
|
||||
const int n_add = n_slots - batch.n_tokens;
|
||||
|
||||
SRV_WRN("adding %d dummy tokens to the batch, seq_id = %d\n", n_add, seq_id);
|
||||
|
||||
for (int j = 0; j < n_add; ++j) {
|
||||
common_batch_add(batch, 0, j, { seq_id }, true);
|
||||
}
|
||||
|
||||
slots[seq_id].cache_tokens.clear();
|
||||
llama_memory_seq_rm(llama_get_memory(ctx), seq_id, -1, -1);
|
||||
}
|
||||
}
|
||||
|
||||
int32_t i_next = 0;
|
||||
|
||||
// process the created batch of tokens
|
||||
|
Reference in New Issue
Block a user