From 053b1539c02617eff744f89525ee57497c3c1fbe Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 31 May 2025 15:39:19 -0700 Subject: [PATCH] threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling (#12995) * threading: support for GGML_SCHED_PRIO_LOW, update thread info on Windows to avoid throttling We talked about adding LOW priority for GGML threads in the original threadpool PR. It might be useful for some cases to avoid contention. Latest Windows ARM64 releases started parking (offlining) the CPU cores more aggresively which results in suboptimal performance with n_threads > 4. To deal with that we now disable Power Throttling for our threads for the NORMAL and higher priorities. Co-authored-by: Diego Devesa * threading: disable SetThreadInfo() calls for older Windows versions * Update tools/llama-bench/llama-bench.cpp Co-authored-by: Diego Devesa --------- Co-authored-by: Diego Devesa --- common/arg.cpp | 4 ++-- common/common.cpp | 2 ++ ggml/include/ggml.h | 1 + ggml/src/ggml-cpu/ggml-cpu.c | 23 +++++++++++++++++++++++ tools/llama-bench/llama-bench.cpp | 2 +- 5 files changed, 29 insertions(+), 3 deletions(-) diff --git a/common/arg.cpp b/common/arg.cpp index b6cb0aa91..cfa9878f9 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -1348,9 +1348,9 @@ common_params_context common_params_parser_init(common_params & params, llama_ex )); add_opt(common_arg( {"--prio"}, "N", - string_format("set process/thread priority : 0-normal, 1-medium, 2-high, 3-realtime (default: %d)\n", params.cpuparams.priority), + string_format("set process/thread priority : low(-1), normal(0), medium(1), high(2), realtime(3) (default: %d)\n", params.cpuparams.priority), [](common_params & params, int prio) { - if (prio < 0 || prio > 3) { + if (prio < GGML_SCHED_PRIO_LOW || prio > GGML_SCHED_PRIO_REALTIME) { throw std::invalid_argument("invalid value"); } params.cpuparams.priority = (enum ggml_sched_priority) prio; diff --git a/common/common.cpp b/common/common.cpp index d80c47bc3..4cc40ed8b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -203,6 +203,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { DWORD p = NORMAL_PRIORITY_CLASS; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = BELOW_NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_NORMAL: p = NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_MEDIUM: p = ABOVE_NORMAL_PRIORITY_CLASS; break; case GGML_SCHED_PRIO_HIGH: p = HIGH_PRIORITY_CLASS; break; @@ -228,6 +229,7 @@ bool set_process_priority(enum ggml_sched_priority prio) { int p = 0; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = 5; break; case GGML_SCHED_PRIO_NORMAL: p = 0; break; case GGML_SCHED_PRIO_MEDIUM: p = -5; break; case GGML_SCHED_PRIO_HIGH: p = -10; break; diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 21ee8a141..2226aadcf 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2181,6 +2181,7 @@ extern "C" { // scheduling priorities enum ggml_sched_priority { + GGML_SCHED_PRIO_LOW = -1, GGML_SCHED_PRIO_NORMAL, GGML_SCHED_PRIO_MEDIUM, GGML_SCHED_PRIO_HIGH, diff --git a/ggml/src/ggml-cpu/ggml-cpu.c b/ggml/src/ggml-cpu/ggml-cpu.c index 1dc425fef..c7426df2b 100644 --- a/ggml/src/ggml-cpu/ggml-cpu.c +++ b/ggml/src/ggml-cpu/ggml-cpu.c @@ -2418,12 +2418,32 @@ static bool ggml_thread_apply_priority(int32_t prio) { // This is up to the applications. DWORD p = THREAD_PRIORITY_NORMAL; switch (prio) { + case GGML_SCHED_PRIO_LOW: p = THREAD_PRIORITY_BELOW_NORMAL; break; case GGML_SCHED_PRIO_NORMAL: p = THREAD_PRIORITY_NORMAL; break; case GGML_SCHED_PRIO_MEDIUM: p = THREAD_PRIORITY_ABOVE_NORMAL; break; case GGML_SCHED_PRIO_HIGH: p = THREAD_PRIORITY_HIGHEST; break; case GGML_SCHED_PRIO_REALTIME: p = THREAD_PRIORITY_TIME_CRITICAL; break; } + if (prio != GGML_SCHED_PRIO_LOW) { + // Tell Windows that this thread should not be throttled (needs its own CPU core). + // Newer Windows 11 versions aggresively park (offline) CPU cores and often place + // all our threads onto the first 4 cores which results in terrible performance with + // n_threads > 4 + #if _WIN32_WINNT >= 0x0602 + THREAD_POWER_THROTTLING_STATE t; + ZeroMemory(&t, sizeof(t)); + t.Version = THREAD_POWER_THROTTLING_CURRENT_VERSION; + t.ControlMask = THREAD_POWER_THROTTLING_EXECUTION_SPEED; + t.StateMask = 0; + + if (!SetThreadInformation(GetCurrentThread(), ThreadPowerThrottling, &t, sizeof(t))) { + GGML_LOG_DEBUG("failed to disable thread power throttling %d : (%d)\n", prio, (int) GetLastError()); + return false; + } + #endif + } + if (prio == GGML_SCHED_PRIO_NORMAL) { // Keep inherited policy/priority return true; @@ -2451,6 +2471,8 @@ static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { + // TODO: there seems to be no way to set lower prio on Apple platforms + case GGML_SCHED_PRIO_LOW: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; @@ -2507,6 +2529,7 @@ static bool ggml_thread_apply_priority(int32_t prio) { struct sched_param p; int32_t policy = SCHED_OTHER; switch (prio) { + case GGML_SCHED_PRIO_LOW: policy = SCHED_BATCH; p.sched_priority = 0; break; case GGML_SCHED_PRIO_NORMAL: policy = SCHED_OTHER; p.sched_priority = 0; break; case GGML_SCHED_PRIO_MEDIUM: policy = SCHED_FIFO; p.sched_priority = 40; break; case GGML_SCHED_PRIO_HIGH: policy = SCHED_FIFO; p.sched_priority = 80; break; diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 06196cf24..803630d26 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -315,7 +315,7 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --numa numa mode (default: disabled)\n"); printf(" -r, --repetitions number of times to repeat each test (default: %d)\n", cmd_params_defaults.reps); - printf(" --prio <0|1|2|3> process/thread priority (default: %d)\n", + printf(" --prio <-1|0|1|2|3> process/thread priority (default: %d)\n", cmd_params_defaults.prio); printf(" --delay <0...N> (seconds) delay between each test (default: %d)\n", cmd_params_defaults.delay);