server : implement universal assisted decoding (#12635)

* llama-server : implement universal assisted decoding

* Erase prompt tail for kv-cache

* set vocab_dft_compatible in common_speculative

* rename ctx_main to ctx_tgt

* move vocab_dft_compatible to spec struct

* clear mem_dft, remove mem

* detokenize id_last for incompatible models

* update comment

* add --spec-replace flag

* accept special tokens when translating between draft/main models

* Escape spec-replace

* clamp draft result to size to params.n_draft

* fix comment

* clean up code

* restore old example

* log common_speculative_are_compatible in speculative example

* fix

* Update common/speculative.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update common/speculative.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update common/speculative.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
g2mt
2025-07-31 05:25:23 -07:00
committed by GitHub
parent c1dacaa99b
commit 94933c8c2e
6 changed files with 168 additions and 62 deletions

View File

@@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
ctx_dft = llama_init_dft.context.get();
if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
return 1;
LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
}
// Tokenize the prompt
@@ -130,7 +130,10 @@ int main(int argc, char ** argv) {
params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
params_spec.p_min = p_min;
struct common_speculative * spec = common_speculative_init(ctx_dft);
struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);
for (auto &pair : params.speculative.replacements) {
common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str());
}
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);