From b4efd77f8ab407836ca73a5176f041650c5b2411 Mon Sep 17 00:00:00 2001 From: IsaacDynamo <61521674+IsaacDynamo@users.noreply.github.com> Date: Mon, 21 Jul 2025 09:24:51 +0200 Subject: [PATCH] server : add parse_special option to /tokenize endpoint (#14783) --- tools/server/README.md | 2 ++ tools/server/server.cpp | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/server/README.md b/tools/server/README.md index e29511cb1..aa07f1ef5 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -575,6 +575,8 @@ These words will not be included in the completion, so make sure to add them to `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false` +`parse_special`: (Optional) Boolean indicating if special tokens should be tokenized. When `false`, special tokens are treated as plaintext. Default: `true` + `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false` **Response:** diff --git a/tools/server/server.cpp b/tools/server/server.cpp index 0afe213af..256a2928b 100644 --- a/tools/server/server.cpp +++ b/tools/server/server.cpp @@ -4516,9 +4516,10 @@ int main(int argc, char ** argv) { json tokens_response = json::array(); if (body.count("content") != 0) { const bool add_special = json_value(body, "add_special", false); + const bool parse_special = json_value(body, "parse_special", true); const bool with_pieces = json_value(body, "with_pieces", false); - llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, true); + llama_tokens tokens = tokenize_mixed(ctx_server.vocab, body.at("content"), add_special, parse_special); if (with_pieces) { for (const auto& token : tokens) {