llama.cpp/common/chat-parser.h

#pragma once

#include "chat.h"
#include "json-partial.h"
#include "regex-partial.h"

#include <nlohmann/json.hpp>

#include <optional>
#include <stdexcept>
#include <string>
#include <vector>

// Distinct exception type for the chat message parser; it carries nothing but a text message.
// It derives from std::runtime_error, so handlers for std::runtime_error / std::exception catch it too.
// NOTE(review): judging by the name, it signals that parsing stopped on incomplete (partial) input --
// confirm at the throw sites, which are not part of this header.
class common_chat_msg_partial_exception : public std::runtime_error {
  public:
    // `message` is forwarded unchanged to std::runtime_error and is what what() reports.
    // The constructor is explicit so that a std::string is never silently converted into an exception.
    explicit common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}
};

// Cursor-based parser over the text of one chat message.
//
// The object stores the input text, a read offset (`pos_`) into that text and the common_chat_msg
// being built up. Only the accessors and the two cursor helpers are defined inline in this header;
// every other member is declared here and implemented out of line.
//
// The try_* members report failure through their return type (bool / std::optional).
// NOTE(review): the non-try consume_* counterparts presumably throw instead -- confirm in the
// implementation file.
class common_chat_msg_parser {
  private:
    std::string input_;           // text being parsed
    bool is_partial_;             // flag received by the constructor (see is_partial())
    common_chat_syntax syntax_;   // syntax description received by the constructor
    std::string healing_marker_;  // NOTE(review): presumably used when healing truncated JSON -- confirm

    size_t pos_ = 0;              // current read offset into input_
    common_chat_msg result_;      // message assembled by the add_* members

  public:
    common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);

    // Read-only views of the parser state.
    const std::string & input() const {
        return input_;
    }
    size_t pos() const {
        return pos_;
    }
    const std::string & healing_marker() const {
        return healing_marker_;
    }
    const bool & is_partial() const {
        return is_partial_;
    }
    const common_chat_msg & result() const {
        return result_;
    }
    const common_chat_syntax & syntax() const {
        return syntax_;
    }

    // Places the cursor at the absolute offset `pos`.
    // Any offset up to and including input().size() is accepted; a larger one throws std::runtime_error.
    void move_to(size_t pos) {
        const bool within_input = pos <= input_.size();
        if (!within_input) {
            throw std::runtime_error("Invalid position!");
        }
        pos_ = pos;
    }

    // Rewinds the cursor by `n` characters.
    // Throws std::runtime_error when that would move before the start of the input.
    void move_back(size_t n) {
        if (n > pos_) {
            throw std::runtime_error("Can't move back that far!");
        }
        pos_ = pos_ - n;
    }

    // Returns the piece of the input covered by the given range.
    std::string str(const common_string_range & rng) const;

    // Appends `content` to the result's content field.
    void add_content(const std::string & content);

    // Appends `reasoning_content` to the result's reasoning_content field.
    void add_reasoning_content(const std::string & reasoning_content);

    // Records a tool call in the result. A call that is too incomplete (e.g. an empty name) is not added.
    // NOTE(review): the returned bool presumably tells whether the call was added -- confirm.
    bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);

    // Same, taking the "name", "id" and "arguments" fields from a json object.
    bool add_tool_call(const nlohmann::ordered_json & tool_call);

    // Records every element of a json array of tool calls, using each element's
    // "name", "id" and "arguments" fields.
    bool add_tool_calls(const nlohmann::ordered_json & arr);

    // NOTE(review): the members below carry no description in this header; see their definitions
    // for the exact contract (what is consumed, what the bool results mean, what is thrown).
    void finish();

    bool consume_spaces();

    void consume_literal(const std::string & literal);

    bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);

    std::string consume_rest();

    // Result type shared by the regex / literal search members.
    // NOTE(review): `prelude` presumably holds the text that precedes the match and `groups` the
    // matched ranges (usable with str()) -- confirm against the implementation.
    struct find_regex_result {
        std::string prelude;
        std::vector<common_string_range> groups;
    };

    // NOTE(review): `from == std::string::npos` presumably means "search from the current cursor" -- confirm.
    std::optional<find_regex_result> try_find_regex(
        const common_regex & regex,
        size_t from = std::string::npos,
        bool add_prelude_to_content = true
    );

    bool try_consume_literal(const std::string & literal);

    std::optional<find_regex_result> try_find_literal(const std::string & literal);

    find_regex_result consume_regex(const common_regex & regex);

    std::optional<find_regex_result> try_consume_regex(const common_regex & regex);

    std::optional<common_json> try_consume_json();
    common_json consume_json();

    // JSON value produced by the *_with_dumped_args members, together with a flag telling whether
    // it is partial.
    struct consume_json_result {
        nlohmann::ordered_json value;
        bool is_partial;
    };

    /*
        Consumes (possibly partial) JSON and converts specific subtrees to (possibly truncated) JSON strings.

        By default neither object keys nor string values may be truncated (a truncated value is removed
        together with its key), e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`

        Selected subpaths can be allowed to stay truncated, and possibly be json-dumped to truncated JSON strings:
        - with `content_paths={{"foo"}}`: `{"foo": "b` -> `{"foo": "b"}`
        - with `args_paths={{"foo"}}`: `{"foo": {"b` -> `{"foo": "{b"}`
    */
    consume_json_result consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
    std::optional<consume_json_result> try_consume_json_with_dumped_args(
        const std::vector<std::vector<std::string>> & args_paths = {},
        const std::vector<std::vector<std::string>> & content_paths = {}
    );
};
`server`: streaming of tool calls and thoughts when `--jinja` is on (#12379) * add common_json w/ support for truncated json healing * add common_chat_msg_diff * partial common_chat_parse * refactor parser w/ optionals * server: wire chat diffs in stream mode * fix trigger of thinking models (must happen after thoughts are closed) * fix functionary v3.2 raw python! * rename: common_chat_syntax (now contains format) * rm common_regex.at_start * don't return empty <think></think> * accommodate yet another deepseek r1 distill fantasy syntax (`<｜tool▁calls｜>`) * fix QwQ 32B tool call parsing after thoughts (hermes2) * better logs for grammar triggers * consume spaces after parse_json_tool_calls * fix required tool calls w/ thinking models that have pre-opened thinking tags * fix thinking model's initial trigger + test qwq's template * run most test_tool_call tests in stream + non-stream modes * make functionary v3.2 parsing more strict (differentiate first match from others) * send final diff from server, to close off raw python arguments * support partial content streaming in Generic mode * tool-call: allow content prelude before hermes2 tool calls (for Qwen2.5) * Update function-calling.md * Update tool_bench.py * chat-parser: remove input from exception (llm output may contain PII) --------- Co-authored-by: ochafik <ochafik@google.com> Co-authored-by: Olivier Chafik <ochafik@users.noreply.github.com> 2025-05-25 01:48:08 +01:00			`#pragma once`

			`#include "chat.h"`
			`#include "json-partial.h"`
			`#include "regex-partial.h"`

sync : vendor (#13901) * sync : vendor ggml-ci * cont : fix httplib version ggml-ci * cont : fix lint * cont : fix lint * vendor : move to common folder /vendor ggml-ci * cont : fix lint * cont : move httplib to /vendor + use json_fwd.hpp ggml-ci * cont : fix server build ggml-ci * cont : add missing headers ggml-ci * cont : header clean-up ggml-ci 2025-05-30 16:25:45 +03:00			`#include <nlohmann/json.hpp>`

`server`: streaming of tool calls and thoughts when `--jinja` is on (#12379) * add common_json w/ support for truncated json healing * add common_chat_msg_diff * partial common_chat_parse * refactor parser w/ optionals * server: wire chat diffs in stream mode * fix trigger of thinking models (must happen after thoughts are closed) * fix functionary v3.2 raw python! * rename: common_chat_syntax (now contains format) * rm common_regex.at_start * don't return empty <think></think> * accommodate yet another deepseek r1 distill fantasy syntax (`<｜tool▁calls｜>`) * fix QwQ 32B tool call parsing after thoughts (hermes2) * better logs for grammar triggers * consume spaces after parse_json_tool_calls * fix required tool calls w/ thinking models that have pre-opened thinking tags * fix thinking model's initial trigger + test qwq's template * run most test_tool_call tests in stream + non-stream modes * make functionary v3.2 parsing more strict (differentiate first match from others) * send final diff from server, to close off raw python arguments * support partial content streaming in Generic mode * tool-call: allow content prelude before hermes2 tool calls (for Qwen2.5) * Update function-calling.md * Update tool_bench.py * chat-parser: remove input from exception (llm output may contain PII) --------- Co-authored-by: ochafik <ochafik@google.com> Co-authored-by: Olivier Chafik <ochafik@users.noreply.github.com> 2025-05-25 01:48:08 +01:00			`#include <optional>`
			`#include <string>`
			`#include <vector>`

			`class common_chat_msg_partial_exception : public std::runtime_error {`
			`public:`
			`common_chat_msg_partial_exception(const std::string & message) : std::runtime_error(message) {}`
			`};`

			`class common_chat_msg_parser {`
			`std::string input_;`
			`bool is_partial_;`
			`common_chat_syntax syntax_;`
			`std::string healing_marker_;`

			`size_t pos_ = 0;`
			`common_chat_msg result_;`

			`public:`
			`common_chat_msg_parser(const std::string & input, bool is_partial, const common_chat_syntax & syntax);`
			`const std::string & input() const { return input_; }`
			`size_t pos() const { return pos_; }`
			`const std::string & healing_marker() const { return healing_marker_; }`
			`const bool & is_partial() const { return is_partial_; }`
			`const common_chat_msg & result() const { return result_; }`
server: fix streaming crashes (#13786) * add preludes to content on partial regex match * allow all parsers to parse non-tool-call content. * tweak order of <\|python_tag\|> vs <function= parsing for functionary v3.1 format. still not ideal but hopefully less prone to crash 2025-05-26 08:03:57 -07:00			`const common_chat_syntax & syntax() const { return syntax_; }`
`server`: streaming of tool calls and thoughts when `--jinja` is on (#12379) * add common_json w/ support for truncated json healing * add common_chat_msg_diff * partial common_chat_parse * refactor parser w/ optionals * server: wire chat diffs in stream mode * fix trigger of thinking models (must happen after thoughts are closed) * fix functionary v3.2 raw python! * rename: common_chat_syntax (now contains format) * rm common_regex.at_start * don't return empty <think></think> * accommodate yet another deepseek r1 distill fantasy syntax (`<｜tool▁calls｜>`) * fix QwQ 32B tool call parsing after thoughts (hermes2) * better logs for grammar triggers * consume spaces after parse_json_tool_calls * fix required tool calls w/ thinking models that have pre-opened thinking tags * fix thinking model's initial trigger + test qwq's template * run most test_tool_call tests in stream + non-stream modes * make functionary v3.2 parsing more strict (differentiate first match from others) * send final diff from server, to close off raw python arguments * support partial content streaming in Generic mode * tool-call: allow content prelude before hermes2 tool calls (for Qwen2.5) * Update function-calling.md * Update tool_bench.py * chat-parser: remove input from exception (llm output may contain PII) --------- Co-authored-by: ochafik <ochafik@google.com> Co-authored-by: Olivier Chafik <ochafik@users.noreply.github.com> 2025-05-25 01:48:08 +01:00
			`void move_to(size_t pos) {`
			`if (pos > input_.size()) {`
			`throw std::runtime_error("Invalid position!");`
			`}`
			`pos_ = pos;`
			`}`
			`void move_back(size_t n) {`
			`if (pos_ < n) {`
			`throw std::runtime_error("Can't move back that far!");`
			`}`
			`pos_ -= n;`
			`}`

			`// Get the substring of the input at the given range`
			`std::string str(const common_string_range & rng) const;`

			`// Appends to the result.content field`
			`void add_content(const std::string & content);`

			`// Appends to the result.reasoning_content field`
			`void add_reasoning_content(const std::string & reasoning_content);`

			`// Adds a tool call to the result. If the tool call is too incomplete (e.g. name empty), it won't add anything.`
			`bool add_tool_call(const std::string & name, const std::string & id, const std::string & arguments);`

			`// Adds a tool call using the "name", "id" and "arguments" fields of the json object`
			`bool add_tool_call(const nlohmann::ordered_json & tool_call);`

			`// Adds an array of tool calls using their "name", "id" and "arguments" fields.`
			`bool add_tool_calls(const nlohmann::ordered_json & arr);`

			`void finish();`

			`bool consume_spaces();`

			`void consume_literal(const std::string & literal);`

			`bool try_parse_reasoning(const std::string & start_think, const std::string & end_think);`

			`std::string consume_rest();`

			`struct find_regex_result {`
			`std::string prelude;`
			`std::vector<common_string_range> groups;`
			`};`

server: fix streaming crashes (#13786) * add preludes to content on partial regex match * allow all parsers to parse non-tool-call content. * tweak order of <\|python_tag\|> vs <function= parsing for functionary v3.1 format. still not ideal but hopefully less prone to crash 2025-05-26 08:03:57 -07:00			`std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);`
`server`: streaming of tool calls and thoughts when `--jinja` is on (#12379) * add common_json w/ support for truncated json healing * add common_chat_msg_diff * partial common_chat_parse * refactor parser w/ optionals * server: wire chat diffs in stream mode * fix trigger of thinking models (must happen after thoughts are closed) * fix functionary v3.2 raw python! * rename: common_chat_syntax (now contains format) * rm common_regex.at_start * don't return empty <think></think> * accommodate yet another deepseek r1 distill fantasy syntax (`<｜tool▁calls｜>`) * fix QwQ 32B tool call parsing after thoughts (hermes2) * better logs for grammar triggers * consume spaces after parse_json_tool_calls * fix required tool calls w/ thinking models that have pre-opened thinking tags * fix thinking model's initial trigger + test qwq's template * run most test_tool_call tests in stream + non-stream modes * make functionary v3.2 parsing more strict (differentiate first match from others) * send final diff from server, to close off raw python arguments * support partial content streaming in Generic mode * tool-call: allow content prelude before hermes2 tool calls (for Qwen2.5) * Update function-calling.md * Update tool_bench.py * chat-parser: remove input from exception (llm output may contain PII) --------- Co-authored-by: ochafik <ochafik@google.com> Co-authored-by: Olivier Chafik <ochafik@users.noreply.github.com> 2025-05-25 01:48:08 +01:00
			`bool try_consume_literal(const std::string & literal);`

			`std::optional<find_regex_result> try_find_literal(const std::string & literal);`

			`find_regex_result consume_regex(const common_regex & regex);`

			`std::optional<find_regex_result> try_consume_regex(const common_regex & regex);`

			`std::optional<common_json> try_consume_json();`
			`common_json consume_json();`

			`struct consume_json_result {`
			`nlohmann::ordered_json value;`
			`bool is_partial;`
			`};`

			`/*`
			`Consume (possibly partial) json and converts specific subtrees to (possibly truncated) JSON strings.`

			`By default, object keys can't be truncated, nor can string values (their corresponding key is removed,`
			e.g. `{"foo": "bar", "baz": "b` -> `{"foo": "bar"}`

			`But one can allow subpaths to be kept truncated, and possibly json-dumped to truncated json strings`
			- with `content_paths={{"foo"}}` -> `{"foo": "b` -> {"foo": "b"}`
			- with `args_paths={{"foo"}}` -> `{"foo": {"b` -> `{"foo": "{b"}`
			`*/`
			`consume_json_result consume_json_with_dumped_args(`
			`const std::vector<std::vector<std::string>> & args_paths = {},`
			`const std::vector<std::vector<std::string>> & content_paths = {}`
			`);`
			`std::optional<consume_json_result> try_consume_json_with_dumped_args(`
			`const std::vector<std::vector<std::string>> & args_paths = {},`
			`const std::vector<std::vector<std::string>> & content_paths = {}`
			`);`
			`};`