context : fix init of n_outputs (#12397)

ggml-ci
Author: Georgi Gerganov
Date: 2025-03-16 19:29:36 +02:00
Committed by: GitHub
Parent: 7b61bcc87c
Commit: dc079cfdff


@@ -285,11 +285,15 @@ llama_context::llama_context(
     // reserve worst-case graph
     if (!hparams.vocab_only) {
-        uint32_t n_seqs = 1; // TODO: worst-case number of sequences
-        uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
+        const uint32_t n_seqs = 1; // TODO: worst-case number of sequences
+        const uint32_t n_tokens = std::min(cparams.n_ctx, cparams.n_ubatch);
         llama_token token = model.vocab.token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
 
+        // restore later
+        // TODO: something cleaner
+        const auto n_outputs_save = n_outputs;
+
         // max number of outputs
         n_outputs = n_tokens;
@@ -341,6 +345,8 @@ llama_context::llama_context(
             }
         }
 
+        n_outputs = n_outputs_save;
+
         for (size_t i = 0; i < backend_ptrs.size(); ++i) {
             ggml_backend_t backend = backend_ptrs[i];
             ggml_backend_buffer_type_t buft = backend_buft[i];
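
The added lines save the current n_outputs, override it with the worst-case token count while the reserve graphs are built, and then restore the saved value, so the context does not keep reporting the worst-case output count after construction. Below is a minimal sketch of that save/override/restore pattern; the names context_sketch and reserve_worst_case_graph are illustrative only, not the actual llama.cpp types.

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Hypothetical sketch of the save/override/restore pattern used in the diff above.
// The struct and member names are illustrative, not the real llama.cpp API.
struct context_sketch {
    uint32_t n_ctx     = 4096;
    uint32_t n_ubatch  = 512;
    int32_t  n_outputs = 0;   // normally set per ubatch during decoding

    void reserve_worst_case_graph() {
        // save the current value so the override below stays local to this call
        const auto n_outputs_save = n_outputs;

        // worst case: assume every token of the largest ubatch produces an output
        const uint32_t n_tokens = std::min(n_ctx, n_ubatch);
        n_outputs = static_cast<int32_t>(n_tokens);

        // ... build and measure the worst-case graphs here ...

        // restore; without this, n_outputs would keep the worst-case value,
        // which is the initialization bug the commit addresses
        n_outputs = n_outputs_save;
    }
};

int main() {
    context_sketch ctx;
    ctx.reserve_worst_case_graph();
    std::printf("n_outputs after reservation: %d\n", ctx.n_outputs); // prints 0, not 512
    return 0;
}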