diff --git a/ci/run.sh b/ci/run.sh
index 940055705..e1b777c30 100755
--- a/ci/run.sh
+++ b/ci/run.sh
@@ -779,7 +779,7 @@ function gg_run_rerank_tiny {
     model_f16="${path_models}/ggml-model-f16.gguf"
 
     # for this model, the SEP token is ""
-    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?hi\nwhat is panda?it's a bear\nwhat is panda?The giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
+    (time ./bin/llama-embedding --model ${model_f16} -p "what is panda?\thi\nwhat is panda?\tit's a bear\nwhat is panda?\tThe giant panda (Ailuropoda melanoleuca), sometimes called a panda bear or simply panda, is a bear species endemic to China." -ngl 99 -c 0 --pooling rank --embd-normalize -1 --verbose-prompt) 2>&1 | tee -a $OUT/${ci}-rk-f16.log
 
     # sample output
     # rerank score 0: 0.029
diff --git a/common/arg.cpp b/common/arg.cpp
index 3dfaa71ef..c4ad85c47 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2706,6 +2706,13 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.embd_sep = value;
         }
     ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
+    add_opt(common_arg(
+        {"--cls-separator"}, "STRING",
+        "separator of classification sequences (default \\t) for example \"<#seq#>\"",
+        [](common_params & params, const std::string & value) {
+            params.cls_sep = value;
+        }
+    ).set_examples({LLAMA_EXAMPLE_EMBEDDING}));
     add_opt(common_arg(
         {"--host"}, "HOST",
         string_format("ip address to listen, or bind to an UNIX socket if the address ends with .sock (default: %s)", params.hostname.c_str()),
diff --git a/common/common.h b/common/common.h
index 5710c4e97..e08a59eae 100644
--- a/common/common.h
+++ b/common/common.h
@@ -358,6 +358,7 @@ struct common_params {
     int32_t embd_normalize = 2; // normalisation for embeddings (-1=none, 0=max absolute int16, 1=taxicab, 2=euclidean, >2=p-norm)
     std::string embd_out = ""; // empty = default, "array" = [[],[]...], "json" = openai style, "json+" = same "json" + cosine similarity matrix
     std::string embd_sep = "\n"; // separator of embeddings
+    std::string cls_sep = "\t"; // separator of classification sequences
 
     // server params
     int32_t port = 8080; // server listens on this network port
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2e08db345..2fe76589e 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -2145,7 +2145,6 @@ class Llama4Model(LlamaModel):
 
     def set_vocab(self):
         self._set_vocab_gpt2()
-        self.gguf_writer.add_add_bos_token(True)
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -3918,9 +3917,6 @@ class BertModel(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
-
 
 @ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
 class DistilBertModel(BertModel):
@@ -3962,8 +3958,6 @@ class RobertaModel(BertModel):
         bpe_tok_path = self.dir_model / "tokenizer.json"
         if bpe_tok_path.exists():
             self._set_vocab_gpt2()
-            self.gguf_writer.add_add_bos_token(True)
-            self.gguf_writer.add_add_eos_token(True)
 
             # we need this to validate the size of the token_type embeddings
             # though currently we are passing all zeros to the token_type embeddings
@@ -4848,8 +4842,6 @@ class JinaBertV2Model(BertModel):
             self.gguf_writer.add_token_type_count(2)
         else:
             raise NotImplementedError(f'Tokenizer {tokenizer_class} is not supported for JinaBertModel')
-        self.gguf_writer.add_add_bos_token(True)
-        self.gguf_writer.add_add_eos_token(True)
 
 
 @ModelBase.register("OpenELMForCausalLM")
@@ -5451,9 +5443,6 @@ class T5Model(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
@@ -5591,9 +5580,6 @@ class T5EncoderModel(TextModel):
         special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
         special_vocab.add_to_gguf(self.gguf_writer)
 
-        self.gguf_writer.add_add_bos_token(False)
-        self.gguf_writer.add_add_eos_token(True)
-
     def set_gguf_parameters(self):
         if (n_ctx := self.find_hparam(["n_positions"], optional=True)) is None:
             logger.warning("Couldn't find context length in config.json, assuming default value of 512")
diff --git a/examples/embedding/embedding.cpp b/examples/embedding/embedding.cpp
index 681929d27..0ec2999a0 100644
--- a/examples/embedding/embedding.cpp
+++ b/examples/embedding/embedding.cpp
@@ -133,10 +133,36 @@ int main(int argc, char ** argv) {
     // max batch size
     const uint64_t n_batch = params.n_batch;
 
+    // get added sep and eos token, if any
+    const std::string added_sep_token = llama_vocab_get_add_sep(vocab) ? llama_vocab_get_text(vocab, llama_vocab_sep(vocab)) : "";
+    const std::string added_eos_token = llama_vocab_get_add_eos(vocab) ? llama_vocab_get_text(vocab, llama_vocab_eos(vocab)) : "";
+
     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
     for (const auto & prompt : prompts) {
-        auto inp = common_tokenize(ctx, prompt, true, true);
+        std::vector<int32_t> inp;
+
+        // split classification pairs and insert expected separator tokens
+        if (pooling_type == LLAMA_POOLING_TYPE_RANK && prompt.find(params.cls_sep) != std::string::npos) {
+            std::vector<std::string> pairs = split_lines(prompt, params.cls_sep);
+            std::string final_prompt;
+
+            for (size_t i = 0; i < pairs.size(); i++) {
+                final_prompt += pairs[i];
+                if (i != pairs.size() - 1) {
+                    if (!added_eos_token.empty()) {
+                        final_prompt += added_eos_token;
+                    }
+                    if (!added_sep_token.empty()) {
+                        final_prompt += added_sep_token;
+                    }
+                }
+            }
+
+            inp = common_tokenize(ctx, final_prompt, true, true);
+        } else {
+            inp = common_tokenize(ctx, prompt, true, true);
+        }
         if (inp.size() > n_batch) {
             LOG_ERR("%s: number of tokens in input line (%lld) exceeds batch size (%lld), increase batch size and re-run\n",
                     __func__, (long long int) inp.size(), (long long int) n_batch);
@@ -145,11 +171,11 @@ int main(int argc, char ** argv) {
         inputs.push_back(inp);
     }
 
-    // check if the last token is SEP
+    // check if the last token is SEP/EOS
     // it should be automatically added by the tokenizer when 'tokenizer.ggml.add_eos_token' is set to 'true'
     for (auto & inp : inputs) {
-        if (inp.empty() || inp.back() != llama_vocab_sep(vocab)) {
-            LOG_WRN("%s: last token in the prompt is not SEP\n", __func__);
+        if (inp.empty() || (inp.back() != llama_vocab_sep(vocab) && inp.back() != llama_vocab_eos(vocab))) {
+            LOG_WRN("%s: last token in the prompt is not SEP or EOS\n", __func__);
             LOG_WRN("%s: 'tokenizer.ggml.add_eos_token' should be set to 'true' in the GGUF header\n", __func__);
         }
     }
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 834a1d5e1..0429b0aaf 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -198,6 +198,7 @@ class Keys:
         MASK_ID = "tokenizer.ggml.mask_token_id"
         ADD_BOS = "tokenizer.ggml.add_bos_token"
         ADD_EOS = "tokenizer.ggml.add_eos_token"
+        ADD_SEP = "tokenizer.ggml.add_sep_token"
         ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
         REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
         PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 54ca0c33f..b9b63d052 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -891,6 +891,9 @@ class GGUFWriter:
     def add_add_eos_token(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_EOS, value)
 
+    def add_add_sep_token(self, value: bool) -> None:
+        self.add_bool(Keys.Tokenizer.ADD_SEP, value)
+
     def add_add_space_prefix(self, value: bool) -> None:
         self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
 
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py
index 44d066ee7..6c4d3a422 100644
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -119,6 +119,7 @@ class SpecialVocab:
                 logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
 
     def _try_load_from_tokenizer_json(self, path: Path) -> bool:
+        tokenizer = None
        tokenizer_file = path / 'tokenizer.json'
         if tokenizer_file.is_file():
             with open(tokenizer_file, encoding = 'utf-8') as f:
@@ -152,11 +153,87 @@ class SpecialVocab:
             added_tokens = tokenizer.get('added_tokens', {})
         else:
             added_tokens = {}
+        tokenizer_config = None
         tokenizer_config_file = path / 'tokenizer_config.json'
-        if not tokenizer_config_file.is_file():
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, encoding = 'utf-8') as f:
+                tokenizer_config = json.load(f)
+        if tokenizer:
+            special_bos = (tokenizer_config or {}).get('bos_token')
+            special_cls = (tokenizer_config or {}).get('cls_token')
+            special_eos = (tokenizer_config or {}).get('eos_token')
+            special_sep = (tokenizer_config or {}).get('sep_token')
+            if not special_bos and special_cls and tokenizer_config:
+                tokenizer_config['bos_token'] = special_bos = special_cls
+            if not special_eos and special_sep and tokenizer_config:
+                tokenizer_config['eos_token'] = special_eos = special_sep
+            post_processor = tokenizer.get('post_processor', {})
+            for processor in post_processor.get('processors', [post_processor]):
+                if processor.get('type') == 'RobertaProcessing':
+                    self.add_special_token['bos'] = True
+                    self.add_special_token['eos'] = True
+                    self.add_special_token['sep'] = True
+                    if not special_cls and tokenizer_config:
+                        special_cls = processor.get('cls', [special_bos])[0]
+                        tokenizer_config['cls_token'] = special_cls
+                    if not special_sep and tokenizer_config:
+                        special_sep = processor.get('sep', [special_eos])[0]
+                        tokenizer_config['sep_token'] = special_sep
+                    continue
+                # Crude parsing of TemplateProcessing to determine if BOS/SEP/EOS should be added
+                # Only works with simple templates, **will** get it wrong on unusual sequences
+                if processor.get('type') == 'TemplateProcessing':
+                    tmpl_single = processor.get('single', [])
+                    tmpl_pair = processor.get('pair', [])
+                    special_first = None
+                    special_last = None
+                    if len(tmpl_single) > 1:
+                        if special_first := tmpl_single[0].get('SpecialToken', {}).get('id'):
+                            if not tokenizer_config:
+                                special_bos = special_first
+                            self.add_special_token['bos'] = True if special_first in (special_bos, special_cls) else False
+                            if special_first not in (special_bos, special_cls):
+                                logger.warning(f'Unknown leading special token {special_first!r} in TemplateProcessing')
+                        if special_last := tmpl_single[-1].get('SpecialToken', {}).get('id'):
+                            if not tokenizer_config:
+                                special_eos = special_last
+                            self.add_special_token['eos'] = True if special_last == special_eos else False
+                            if special_last != special_eos:
+                                logger.warning(f'Unknown trailing special token {special_last!r} in TemplateProcessing')
+                    if tmpl_pair:
+                        seq_start = 1 if tmpl_pair[0].get('SpecialToken', {}).get('id') == special_first else 0
+                        seq_stop = -1 if tmpl_pair[-1].get('SpecialToken', {}).get('id') == special_last else None
+                        if seq_start == 0 or seq_stop is None:
+                            logger.warning('TemplateProcessing leading/trailing special tokens do not match TemplateProcessing')
+                        if tmpl_pair := tmpl_pair[slice(seq_start, seq_stop)]:
+                            tmpl_a = tmpl_pair[0].get('Sequence', {}).get('id')
+                            tmpl_b = tmpl_pair[-1].get('Sequence', {}).get('id')
+                            if tmpl_a != 'A' or tmpl_b != 'B':
+                                logger.warning(f'Unknown sequence {tmpl_a}...{tmpl_b} in TemplateProcessing')
+                            # A [sep] [eos] B
+                            if tmpl_a == 'A' and tmpl_b == 'B' and (tmpl_pair := tmpl_pair[1:-1]):
+                                add_sep = False
+                                if special_entry := tmpl_pair[0].get('SpecialToken', {}).get('id'):
+                                    if special_entry in (special_sep, special_eos) and not special_last:
+                                        add_sep = True
+                                    if special_entry not in (special_sep, special_eos):
+                                        logger.warning(f'Unknown separator token {special_entry!r} in TemplateProcessing')
+                                else:
+                                    logger.warning(f'Unknown middle sequence {tmpl_pair[0]!r} in TemplateProcessing')
+                                if len(tmpl_pair) == 2:
+                                    if special_entry := tmpl_pair[1].get('SpecialToken', {}).get('id'):
+                                        if special_entry in (special_sep, special_eos):
+                                            add_sep = True
+                                        if special_entry not in (special_sep, special_eos):
+                                            logger.warning(f'Unknown second separator token {special_entry!r} in TemplateProcessing')
+                                    else:
+                                        logger.warning(f'Unknown second middle sequence {tmpl_pair[1]!r} in TemplateProcessing')
+                                self.add_special_token['sep'] = add_sep
+                                if add_sep and not special_sep and tokenizer_config:
+                                    tokenizer_config['sep_token'] = special_eos
+                    continue
+        if not tokenizer_config:
             return True
-        with open(tokenizer_config_file, encoding = 'utf-8') as f:
-            tokenizer_config = json.load(f)
         chat_template_alt = None
         chat_template_file = path / 'chat_template.json'
         if chat_template_file.is_file():
diff --git a/include/llama.h b/include/llama.h
index 635508b10..3475d5965 100644
--- a/include/llama.h
+++ b/include/llama.h
@@ -1044,6 +1044,7 @@ extern "C" {
 
     LLAMA_API bool llama_vocab_get_add_bos(const struct llama_vocab * vocab);
     LLAMA_API bool llama_vocab_get_add_eos(const struct llama_vocab * vocab);
+    LLAMA_API bool llama_vocab_get_add_sep(const struct llama_vocab * vocab);
 
     LLAMA_API llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab);
     LLAMA_API llama_token llama_vocab_fim_suf(const struct llama_vocab * vocab);
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 0bc60565d..8dadef204 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -198,6 +198,7 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TOKENIZER_MASK_ID, "tokenizer.ggml.mask_token_id" },
     { LLM_KV_TOKENIZER_ADD_BOS, "tokenizer.ggml.add_bos_token" },
     { LLM_KV_TOKENIZER_ADD_EOS, "tokenizer.ggml.add_eos_token" },
+    { LLM_KV_TOKENIZER_ADD_SEP, "tokenizer.ggml.add_sep_token" },
     { LLM_KV_TOKENIZER_ADD_PREFIX, "tokenizer.ggml.add_space_prefix" },
     { LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, "tokenizer.ggml.remove_extra_whitespaces" },
     { LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, "tokenizer.ggml.precompiled_charsmap" },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 51b242c66..5b0230c15 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -194,6 +194,7 @@ enum llm_kv {
     LLM_KV_TOKENIZER_MASK_ID,
     LLM_KV_TOKENIZER_ADD_BOS,
     LLM_KV_TOKENIZER_ADD_EOS,
+    LLM_KV_TOKENIZER_ADD_SEP,
     LLM_KV_TOKENIZER_ADD_PREFIX,
     LLM_KV_TOKENIZER_REMOVE_EXTRA_WS,
     LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP,
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index a70b98923..563823dc3 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -228,6 +228,7 @@ void llama_model_saver::add_kv_from_model() {
     // add_kv(LLM_KV_TOKENIZER_MASK_ID, ???);
     add_kv(LLM_KV_TOKENIZER_ADD_BOS, vocab.get_add_bos());
     add_kv(LLM_KV_TOKENIZER_ADD_EOS, vocab.get_add_eos());
+    add_kv(LLM_KV_TOKENIZER_ADD_SEP, vocab.get_add_sep());
     add_kv(LLM_KV_TOKENIZER_ADD_PREFIX, vocab.get_add_space_prefix());
     add_kv(LLM_KV_TOKENIZER_REMOVE_EXTRA_WS, vocab.get_remove_extra_whitespaces());
     add_kv(LLM_KV_TOKENIZER_PRECOMPILED_CHARSMAP, vocab.get_precompiled_charsmap());
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d90f1d6b1..4ab120d9b 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1269,6 +1269,7 @@ struct llama_vocab::impl {
     bool add_space_prefix = false;
     bool add_bos = false;
     bool add_eos = false;
+    bool add_sep = false;
     bool ignore_merges = false;
     bool clean_spaces = false;  // clean_up_tokenization_spaces
     bool remove_extra_whitespaces = false;
@@ -1421,6 +1422,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             special_sep_id  = 102;
             special_pad_id  = 0;
             special_mask_id = 103;
+
+            add_sep = true;
         } else if (tokenizer_model == "gpt2") {
             type = LLAMA_VOCAB_TYPE_BPE;
 
@@ -1550,12 +1553,15 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "jina-es" ||
                 tokenizer_pre == "jina-de" ||
                 tokenizer_pre == "gigachat" ||
-                tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-es" ||
-                tokenizer_pre == "jina-v2-de" ||
+                tokenizer_pre == "jina-v2-de") {
+                pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+            } else if (
+                tokenizer_pre == "jina-v1-en" ||
                 tokenizer_pre == "jina-v2-code" ||
                 tokenizer_pre == "roberta-bpe") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
+                add_sep = true;
             } else if (
                 tokenizer_pre == "refact") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_REFACT;
@@ -1665,6 +1671,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             clean_spaces = true;
             add_bos = true;
             add_eos = false;
+            add_sep = true;
         } else if (type == LLAMA_VOCAB_TYPE_UGM) {
             pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
             add_bos = false;
@@ -1801,7 +1808,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             }
         }
 
-        // Handle add_bos and add_eos
+        // Handle add_bos, add_eos and add_sep
        {
             bool temp = true;
 
@@ -1811,6 +1818,9 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
             if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
                 add_eos = temp;
             }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_SEP, temp, false)) {
+                add_sep = temp;
+            }
         }
 
         // auto-detect special tokens by text
@@ -3000,6 +3010,10 @@ bool llama_vocab::get_add_eos() const {
     return pimpl->add_eos;
 }
 
+bool llama_vocab::get_add_sep() const {
+    return pimpl->add_sep;
+}
+
 bool llama_vocab::get_ignore_merges() const {
     return pimpl->ignore_merges;
 }
@@ -3191,6 +3205,10 @@ bool llama_vocab_get_add_eos(const struct llama_vocab * vocab) {
     return vocab->get_add_eos();
 }
 
+bool llama_vocab_get_add_sep(const struct llama_vocab * vocab) {
+    return vocab->get_add_sep();
+}
+
 llama_token llama_vocab_fim_pre(const struct llama_vocab * vocab) {
     return vocab->token_fim_pre();
 }
diff --git a/src/llama-vocab.h b/src/llama-vocab.h
index daa6cf308..40e4d1c05 100644
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@@ -74,6 +74,7 @@ struct llama_vocab {
     bool get_add_space_prefix () const;
     bool get_add_bos () const;
     bool get_add_eos () const;
+    bool get_add_sep () const;
     bool get_ignore_merges () const;
     bool get_clean_spaces () const;
     bool get_remove_extra_whitespaces () const;
diff --git a/tools/server/utils.hpp b/tools/server/utils.hpp
index f3e0392a4..f8fab2c86 100644
--- a/tools/server/utils.hpp
+++ b/tools/server/utils.hpp
@@ -271,12 +271,20 @@ static llama_tokens format_rerank(const struct llama_vocab * vocab, const llama_
     }
 
     result.reserve(doc.size() + query.size() + 4);
-    result.push_back(llama_vocab_bos(vocab));
+    if (llama_vocab_get_add_bos(vocab)) {
+        result.push_back(llama_vocab_bos(vocab));
+    }
     result.insert(result.end(), query.begin(), query.end());
-    result.push_back(eos_token);
-    result.push_back(llama_vocab_sep(vocab));
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
+    if (llama_vocab_get_add_sep(vocab)) {
+        result.push_back(llama_vocab_sep(vocab));
+    }
     result.insert(result.end(), doc.begin(), doc.end());
-    result.push_back(eos_token);
+    if (llama_vocab_get_add_eos(vocab)) {
+        result.push_back(eos_token);
+    }
 
     return result;
 }
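
Example usage of the new --cls-separator option (a sketch for illustration, not part of the patch; the model path and prompts below are placeholders). With --pooling rank, llama-embedding now splits each prompt on the classification separator (default: a tab) and inserts the tokenizer's expected EOS/SEP tokens between the query and the document, as the embedding.cpp change above does:

    # rerank one query against two documents using the default "\t" separator
    ./bin/llama-embedding --model ggml-model-f16.gguf -ngl 99 -c 0 \
        --pooling rank --embd-normalize -1 \
        -p "what is panda?\thi\nwhat is panda?\tit's a bear"

    # the same with an explicit separator string
    ./bin/llama-embedding --model ggml-model-f16.gguf -ngl 99 -c 0 \
        --pooling rank --embd-normalize -1 --cls-separator "<#seq#>" \
        -p "what is panda?<#seq#>hi\nwhat is panda?<#seq#>it's a bear"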