server: streaming of tool calls and thoughts when --jinja is on (#12379)

* add common_json w/ support for truncated json healing
* add common_chat_msg_diff
* partial common_chat_parse
* refactor parser w/ optionals
* server: wire chat diffs in stream mode
* fix trigger of thinking models (must happen after thoughts are closed)
* fix functionary v3.2 raw python!
* rename: common_chat_syntax (now contains format)
* rm common_regex.at_start
* don't return empty <think></think>
* accommodate yet another deepseek r1 distill fantasy syntax (`<|tool▁calls|>`)
* fix QwQ 32B tool call parsing after thoughts (hermes2)
* better logs for grammar triggers
* consume spaces after parse_json_tool_calls
* fix required tool calls w/ thinking models that have pre-opened thinking tags
* fix thinking model's initial trigger + test qwq's template
* run most test_tool_call tests in stream + non-stream modes
* make functionary v3.2 parsing more strict (differentiate first match from others)
* send final diff from server, to close off raw python arguments
* support partial content streaming in Generic mode
* tool-call: allow content prelude before hermes2 tool calls (for Qwen2.5)
* Update function-calling.md
* Update tool_bench.py
* chat-parser: remove input from exception (llm output may contain PII)

---------

Co-authored-by: ochafik <ochafik@google.com>
Co-authored-by: Olivier Chafik <ochafik@users.noreply.github.com>
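For context, a minimal client-side sketch (not part of this commit) of what the streamed output looks like: assuming llama-server was started with --jinja and a tool-capable model, and that it serves its OpenAI-compatible /v1/chat/completions endpoint on the default local port, the incremental tool-call deltas can be reassembled roughly like this. The endpoint URL, model name, and tool schema below are placeholders.

# Hypothetical illustration only: stream a chat completion from a locally
# running llama-server (started with --jinja) and reassemble the tool-call
# deltas that this commit makes the server emit incrementally.
import json
import requests  # assumption: requests is available

resp = requests.post(
    "http://localhost:8080/v1/chat/completions",  # placeholder host/port
    json={
        "model": "placeholder",
        "stream": True,
        "messages": [{"role": "user", "content": "What is the weather in Paris?"}],
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",  # placeholder tool
                "parameters": {
                    "type": "object",
                    "properties": {"location": {"type": "string"}},
                    "required": ["location"],
                },
            },
        }],
    },
    stream=True,
)

tool_calls = {}  # index -> accumulated name / JSON arguments
for line in resp.iter_lines():
    if not line or not line.startswith(b"data: "):
        continue
    payload = line[len(b"data: "):]
    if payload == b"[DONE]":
        break
    chunk = json.loads(payload)
    if not chunk.get("choices"):
        continue
    delta = chunk["choices"][0].get("delta") or {}
    for tc in delta.get("tool_calls") or []:
        slot = tool_calls.setdefault(tc["index"], {"name": "", "arguments": ""})
        fn = tc.get("function") or {}
        slot["name"] += fn.get("name") or ""
        slot["arguments"] += fn.get("arguments") or ""

print(tool_calls)

Per the commit message, most test_tool_call tests now run in both stream and non-stream modes, which exercise this kind of incremental reassembly.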
scripts/tool_bench.py

@@ -12,6 +12,7 @@
    export LLAMA_SERVER_BIN_PATH=$PWD/build/bin/llama-server
    export LLAMA_CACHE=${LLAMA_CACHE:-$HOME/Library/Caches/llama.cpp}

    ./scripts/tool_bench.py run --n 10 --temp -1 --temp 0 --temp 1 --temp 2 --temp 5 --llama-baseline $PWD/buildMaster/bin/llama-server --output qwen14b.jsonl --hf bartowski/Qwen2.5-14B-Instruct-GGUF:Q4_K_L
    ./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 1.5B Q4_K_M" --output qwen1.5b.jsonl --hf bartowski/Qwen2.5-1.5B-Instruct-GGUF --ollama qwen2.5:1.5b-instruct-q4_K_M
    ./scripts/tool_bench.py run --n 30 --temp -1 --temp 0 --temp 1 --model "Qwen 2.5 Coder 7B Q4_K_M" --output qwenc7b.jsonl --hf bartowski/Qwen2.5-Coder-7B-Instruct-GGUF --ollama qwen2.5-coder:7b
@@ -205,6 +206,7 @@ def run(
    model: Annotated[Optional[str], typer.Option(help="Name of the model to test (server agnostic)")] = None,
    hf: Annotated[Optional[str], typer.Option(help="GGUF huggingface model repo id (+ optional quant) to test w/ llama-server")] = None,
    chat_template: Annotated[Optional[str], typer.Option(help="Chat template override for llama-server")] = None,
    chat_template_file: Annotated[Optional[str], typer.Option(help="Chat template file override for llama-server")] = None,
    ollama: Annotated[Optional[str], typer.Option(help="Ollama model tag to test")] = None,
    llama_baseline: Annotated[Optional[str], typer.Option(help="llama-server baseline binary path to use as baseline")] = None,
    n: Annotated[int, typer.Option(help="Number of times to run each test")] = 10,
@@ -229,6 +231,12 @@ def run(
    # n_ctx = 8192
    n_ctx = 2048

    if model is None:
        if hf is not None:
            model = hf.split("/")[-1]
        elif ollama is not None:
            model = ollama

    assert force or append or not output.exists(), f"Output file already exists: {output}; use --force to overwrite"

    with output.open('a' if append else 'w') as output_file:
@@ -320,6 +328,7 @@ def run(
                server.model_hf_repo = hf
                server.model_hf_file = None
                server.chat_template = chat_template
                server.chat_template_file = chat_template_file
                server.server_path = server_path
                if port is not None:
                    server.server_port = port
@@ -335,6 +344,7 @@ def run(
                        temp=t,
                        output_kwargs=dict(
                            chat_template=chat_template,
                            chat_template_file=chat_template_file,
                        ),
                        request_kwargs=dict(
                            ignore_chat_grammar=ignore_chat_grammar,
@@ -355,6 +365,7 @@ def run(
                        temp=t,
                        output_kwargs=dict(
                            chat_template=None,
                            chat_template_file=None,
                        ),
                        request_kwargs=dict(
                            model=ollama,