diff --git a/examples/parallel/README.md b/examples/parallel/README.md index ece3a6641..2468a30d2 100644 --- a/examples/parallel/README.md +++ b/examples/parallel/README.md @@ -4,7 +4,7 @@ Simplified simulation of serving incoming requests in parallel ## Example -Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of 10 junk questions (`-j 10`) followed by the actual question. +Generate 128 client requests (`-ns 128`), simulating 8 concurrent clients (`-np 8`). The system prompt is shared (`-pps`), meaning that it is computed once at the start. The client requests consist of a random number of junk questions (between 0 and 9 with `--junk 10`) followed by the actual question. ```bash llama-parallel -m model.gguf -np 8 -ns 128 --top-k 1 -pps --junk 10 -c 16384 diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index acb1301a2..931ea0035 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -315,7 +315,10 @@ int main(int argc, char ** argv) { } else { client.prompt += k_system; } - for (int i = 0; i < n_junk; ++i) { + + const int n_junk_cur = rand() % n_junk; + + for (int i = 0; i < n_junk_cur; ++i) { const int r = rand() % k_questions.size(); client.prompt += "User:\n" + k_questions[r] + "\nAssistant:\n " + k_answers[r] + "\n"; } @@ -340,7 +343,7 @@ int main(int argc, char ** argv) { client.n_decoded = 0; client.i_batch = batch.n_tokens - 1; - LOG_INF("\033[31mClient %3d, seq %4d, started decoding ...\033[0m\n", client.id, client.seq_id); + LOG_INF("\033[31mClient %3d, seq %4d, junk = %4d, started decoding ...\033[0m\n", client.id, client.seq_id, n_junk_cur); g_seq_id += 1;