server: fix regression on streamed non-chat completion w/ stops (#13785)

* more forgiving message diffs: partial stop words aren't erased, full stops are

* Add (slow) server test for completion + stream + stop
Author: Olivier Chafik
Date: 2025-05-26 06:16:37 -07:00
Committed by: GitHub
Parent: 79c137f776
Commit: f13847cfb5

2 changed files with 29 additions and 0 deletions


@@ -31,6 +31,11 @@ static std::string string_diff(const std::string & last, const std::string & current
        return current;
    }
    if (!string_starts_with(current, last)) {
        if (string_starts_with(last, current)) {
            // This happens if the last generation ended on a partial stop word (not erased),
            // and the current ended on a stop word (erased).
            return "";
        }
        throw std::runtime_error("Invalid diff: '" + last + "' not found at start of '" + current + "'");
    }
    return current.substr(last.size());
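
To make the new behavior concrete, the following is a minimal standalone sketch, not the server sources: it re-implements the diff logic shown above (with a local stand-in for string_starts_with, and hypothetical sample strings) to replay the streamed-completion sequence from issue #13780, where the previous cumulative text ends on a partial stop word that was kept and the next cumulative text has the fully matched stop word erased.

#include <iostream>
#include <stdexcept>
#include <string>

// Local stand-in for the llama.cpp helper of the same name.
static bool string_starts_with(const std::string & str, const std::string & prefix) {
    return str.rfind(prefix, 0) == 0;
}

// Re-implementation of the patched diff logic above, for illustration only.
static std::string string_diff(const std::string & last, const std::string & current) {
    if (last.empty()) {
        return current;
    }
    if (!string_starts_with(current, last)) {
        if (string_starts_with(last, current)) {
            // last ended on a partial stop word (kept), current ended on a full stop word (erased)
            return "";
        }
        throw std::runtime_error("Invalid diff: '" + last + "' not found at start of '" + current + "'");
    }
    return current.substr(last.size());
}

int main() {
    // Hypothetical cumulative outputs of two consecutive streamed updates:
    // the first still carries a partial match of the stop word "User:\n",
    // the second has the fully matched stop word trimmed away.
    std::string last    = "Sure, here's one for you.\nUser";
    std::string current = "Sure, here's one for you.\n";

    // Before this commit the strict prefix check threw "Invalid diff: ...";
    // with the early return the delta is simply empty.
    std::cout << "diff: '" << string_diff(last, current) << "'\n";
    return 0;
}

Compiled as C++11 or later, this should print diff: '' instead of raising the runtime error.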


@@ -121,6 +121,30 @@ def test_completion_stream_with_openai_library():
    assert match_regex("(going|bed)+", output_text)


# Test case from https://github.com/ggml-org/llama.cpp/issues/13780
@pytest.mark.slow
def test_completion_stream_with_openai_library_stops():
    global server
    server.model_hf_repo = "bartowski/Phi-3.5-mini-instruct-GGUF:Q4_K_M"
    server.model_hf_file = None
    server.start()
    client = OpenAI(api_key="dummy", base_url=f"http://{server.server_host}:{server.server_port}/v1")
    res = client.completions.create(
        model="davinci-002",
        prompt="System: You are helpfull assistant.\nAssistant:\nHey! How could I help?\nUser:\nTell me a joke.\nAssistant:\n",
        stop=["User:\n", "Assistant:\n"],
        max_tokens=200,
        stream=True,
    )
    output_text = ''
    for data in res:
        choice = data.choices[0]
        if choice.finish_reason is None:
            assert choice.text is not None
            output_text += choice.text
    assert match_regex("Sure, here's one for[\\s\\S]*", output_text), f'Unexpected output: {output_text}'


@pytest.mark.parametrize("n_slots", [1, 2])
def test_consistent_result_same_seed(n_slots: int):
    global server