server : implement universal assisted decoding (#12635)

* llama-server : implement universal assisted decoding
* Erase prompt tail for kv-cache
* set vocab_dft_compatible in common_speculative
* rename ctx_main to ctx_tgt
* move vocab_dft_compatible to spec struct
* clear mem_dft, remove mem
* detokenize id_last for incompatible models
* update comment
* add --spec-replace flag
* accept special tokens when translating between draft/main models
* Escape spec-replace
* clamp draft result size to params.n_draft
* fix comment
* clean up code
* restore old example
* log common_speculative_are_compatible in speculative example
* fix
* Update common/speculative.cpp

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update common/speculative.cpp

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* Update common/speculative.cpp

  Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-08-13 20:07:41 -04:00 · 2025-07-31 05:25:23 -07:00
parent c1dacaa99b
commit 94933c8c2e
6 changed files with 168 additions and 62 deletions
--- a/tools/server/server.cpp
+++ b/tools/server/server.cpp
@@ -1929,6 +1929,7 @@ struct server_context {
    mtmd_context * mctx = nullptr;

    const llama_vocab * vocab = nullptr;
+    bool vocab_dft_compatible = true;

    llama_model * model_dft = nullptr;

@@ -2019,10 +2020,9 @@ struct server_context {
                return false;
            }

-            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
-                SRV_ERR("the draft model '%s' is not compatible with the target model '%s'\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
-
-                return false;
+            vocab_dft_compatible = common_speculative_are_compatible(ctx, llama_init_dft.context.get());
+            if (!vocab_dft_compatible) {
+                SRV_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params_base.speculative.model.path.c_str(), params_base.model.path.c_str());
            }

            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
@@ -2112,11 +2112,14 @@ struct server_context {
                    return;
                }

-                slot.spec = common_speculative_init(slot.ctx_dft);
+                slot.spec = common_speculative_init(slot.ctx, slot.ctx_dft);
                if (slot.spec == nullptr) {
                    SRV_ERR("%s", "failed to create speculator\n");
                    return;
                }
+                for (auto &pair : params_base.speculative.replacements) {
+                    common_speculative_add_replacement_tgt_dft(slot.spec, pair.first.c_str(), pair.second.c_str());
+                }
            }

            SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);