mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-14 04:17:53 -04:00
server : implement universal assisted decoding (#12635)
* llama-server : implement universal assisted decoding * Erase prompt tail for kv-cache * set vocab_dft_compatible in common_speculative * rename ctx_main to ctx_tgt * move vocab_dft_compatible to spec struct * clear mem_dft, remove mem * detokenize id_last for incompatible models * update comment * add --spec-replace flag * accept special tokens when translating between draft/main models * Escape spec-replace * clamp draft result to size to params.n_draft * fix comment * clean up code * restore old example * log common_speculative_are_compatible in speculative example * fix * Update common/speculative.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/speculative.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> * Update common/speculative.cpp Co-authored-by: Georgi Gerganov <ggerganov@gmail.com> --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
@@ -65,7 +65,7 @@ int main(int argc, char ** argv) {
|
||||
ctx_dft = llama_init_dft.context.get();
|
||||
|
||||
if (!common_speculative_are_compatible(ctx_tgt, ctx_dft)) {
|
||||
return 1;
|
||||
LOG_INF("the draft model '%s' is not compatible with the target model '%s'. tokens will be translated between the draft and target models.\n", params.speculative.model.path.c_str(), params.model.path.c_str());
|
||||
}
|
||||
|
||||
// Tokenize the prompt
|
||||
@@ -130,7 +130,10 @@ int main(int argc, char ** argv) {
|
||||
params_spec.n_reuse = llama_n_ctx(ctx_dft) - n_draft;
|
||||
params_spec.p_min = p_min;
|
||||
|
||||
struct common_speculative * spec = common_speculative_init(ctx_dft);
|
||||
struct common_speculative * spec = common_speculative_init(ctx_tgt, ctx_dft);
|
||||
for (auto &pair : params.speculative.replacements) {
|
||||
common_speculative_add_replacement_tgt_dft(spec, pair.first.c_str(), pair.second.c_str());
|
||||
}
|
||||
|
||||
llama_batch batch_tgt = llama_batch_init(llama_n_batch(ctx_tgt), 0, 1);
|
||||
|
||||
|
Reference in New Issue
Block a user