From 6028bf74351d35a06bd98498624f8c2f029f7d1a Mon Sep 17 00:00:00 2001
From: Oliver Simons
Date: Wed, 13 Aug 2025 10:04:46 +0200
Subject: [PATCH] CUDA: Optimize `reduce_rows_f32` kernel, leading to up to
 25x perf improvement at kernel level and a 10% perf increase for Gemma3n
 (#15132)

* Factor out `reduce_rows_f32` from common.cuh

This speeds up the iteration cycle by avoiding recompilation of every
kernel on each change.

* Hide memory latency by loop unrolling in reduce_rows_f32

* Further optimizations to `reduce_rows_f32`

1. Increase threadblock size to better hide latency of memory requests.
   As a consequence of bigger threadblocks, do 2-step summation, using
   shared memory to communicate results between invocations
2. Use sum_temp array to reduce waits on sum
3. Adjust num_unroll to reflect bigger threadblock
4. Improve default block_dims, increase support for more block_dims

* Add perf tests for `reduce_rows_f32` kernel

* Add heuristic to toggle 128/512 threads based on SM count

The break-even point was the minimum of the following multiples.

| GPU Model                    | Nrow SM Count Multiple |
| ---------------------------- | ---------------------- |
| RTX 4000 SFF ADA             | 2.0x                   |
| RTX 6000 ADA                 | 2.5x                   |
| RTX PRO 6000 Blackwell Max-Q | 3.04x                  |
| RTX PRO 4500 Blackwell       | 3.15x                  |

* Ensure perf gains also for small ncols and large nrows

Alternatively, the number of unrolls could have been made a template
parameter, but that would require compiling the kernel multiple times,
increasing binary size unnecessarily.

* Modify perf and unit-tests

* Apply auto-formatting by clang

* Fix CI build failure

See
https://github.com/ggml-org/llama.cpp/actions/runs/16798370266/job/47573716079?pr=15132#step:7:486
Building with the VS generator worked, though.

* Remove sm_count property from `ggml_backend_cuda_context`

Requested by @JohannesGaessler, and should fix remaining CI issues as a
side effect.

* Add CUB-based implementation for GGML_OP_MEAN

Currently this branch is only executed for nrows==1.

* Add heuristics to execute CUB branch only when it brings a perf benefit

Heuristics were determined on the following HW:

* RTX 4000 SFF ADA
* RTX 6000 ADA
* RTX PRO 6000 Blackwell Max-Q
* RTX PRO 4500 Blackwell

* Add unit-test for CUB-based mean

Tests should run with CUDA Graphs enabled by default on NVIDIA GPUs.

* Rename `USE_CUB` to `GGML_CUDA_USE_CUB`

Suggested by @JohannesGaessler

* Unindent preprocessor directives

See https://github.com/ggml-org/llama.cpp/pull/15132#discussion_r2269213506
---
 ggml/src/ggml-cuda/common.cuh      | 24 +++-----
 ggml/src/ggml-cuda/mean.cu         | 52 +++++++++++++++++++++++++++--
 ggml/src/ggml-cuda/reduce_rows.cuh | 53 ++++++++++++++++++++++++++++++
 ggml/src/ggml-cuda/sum.cu          | 16 ++++-----
 ggml/src/ggml-cuda/sumrows.cu      | 25 +++++++++++---
 tests/test-backend-ops.cpp         | 21 ++++++++++++
 6 files changed, 155 insertions(+), 36 deletions(-)
 create mode 100644 ggml/src/ggml-cuda/reduce_rows.cuh

diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index a23da57e3..5a2a3478d 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -87,6 +87,10 @@
 #define GGML_CUDA_CC_IS_QY2(cc) (cc >= GGML_CUDA_CC_QY2 && cc < GGML_CUDA_CC_NG)
 #define GGML_CUDA_CC_IS_NG(cc) (cc >= GGML_CUDA_CC_NG)
 
+#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+# define GGML_CUDA_USE_CUB
+#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+
 #ifdef __CUDA_ARCH_LIST__
 constexpr bool ggml_cuda_has_arch_impl(int) {
     return false;
@@ -420,26 +424,6 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
 #endif // FP16_AVAILABLE
 }
 
-// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
-template <bool norm>
-static __global__ void reduce_rows_f32(const float * x, float * dst, const int ncols) {
-    const int row = blockIdx.x;
-    const int col = threadIdx.x;
-
-    float sum = 0.0f;
-    for (int i = col; i < ncols; i += blockDim.x) {
-        sum += x[row * ncols + i];
-    }
-
-    sum = warp_reduce_sum(sum);
-
-    if (col != 0) {
-        return;
-    }
-
-    dst[row] = norm ? sum / ncols : sum;
-}
-
 template <int width = WARP_SIZE>
 static __device__ __forceinline__ int warp_reduce_all(int x) {
 #ifdef GGML_USE_HIP
diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu
index 4b238a399..2ad493239 100644
--- a/ggml/src/ggml-cuda/mean.cu
+++ b/ggml/src/ggml-cuda/mean.cu
@@ -1,4 +1,14 @@
 #include "mean.cuh"
+#include "reduce_rows.cuh"
+
+#ifdef GGML_CUDA_USE_CUB
+#include <cub/cub.cuh>
+using namespace cub;
+#endif // GGML_CUDA_USE_CUB
+
+template <typename T> __global__ void divide_by_count(T * result, size_t count) {
+    *result /= static_cast<T>(count);
+}
 
 void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
@@ -13,7 +23,45 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-    const dim3 block_dims(WARP_SIZE, 1, 1);
+// Special case for reducing vectors
+#ifdef GGML_CUDA_USE_CUB
+    cudaStreamCaptureStatus iscapturing;
+    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
+    if ((nrows == 1) &&
+        // CUDA_GRAPHS_DISABLED
+        ((ncols > 65536) &&
+             ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
+              ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
+              ctx.cuda_graph->disable_due_to_failed_graph_capture)) ||
+        // CUDA_GRAPHS ENABLED
+        ((ncols > 32768) &&
+         !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
+           ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
+           ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
+        // Single row - use device-wide reduction
+        size_t tmp_size = 0;
+        ggml_cuda_pool & pool = ctx.pool();
+
+        DeviceReduce::Sum(nullptr, tmp_size, src0_d, dst_d, ncols, stream);
+
+        ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
+        DeviceReduce::Sum(tmp_alloc.ptr, tmp_size, src0_d, dst_d, ncols, stream);
+
+        // Divide by ncols
+        divide_by_count<<<1, 1, 0, stream>>>(dst_d, ncols);
+        return;
+    }
+#endif
+
     const dim3 block_nums(nrows, 1, 1);
-    reduce_rows_f32<true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+
+    const int id = ggml_cuda_get_device();
+    const int nsm = ggml_cuda_info().devices[id].nsm;
+    if ((nrows / nsm) < 2) {
+        const dim3 block_dims(512, 1, 1);
+        reduce_rows_f32<true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    } else {
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
+        reduce_rows_f32<true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    }
 }
diff --git a/ggml/src/ggml-cuda/reduce_rows.cuh b/ggml/src/ggml-cuda/reduce_rows.cuh
new file mode 100644
index 000000000..6bee20413
--- /dev/null
+++ b/ggml/src/ggml-cuda/reduce_rows.cuh
@@ -0,0 +1,53 @@
+#include "common.cuh"
+
+// Row reduction kernel template - compute sum (norm=false) or mean (norm=true)
+template <bool norm>
+static __global__ void reduce_rows_f32(const float * __restrict__ x, float * __restrict__ dst, const int ncols) {
+    const int row = blockIdx.x;
+    const int col = threadIdx.x;
+
+    float sum = 0.0f;
+    const int num_unroll = 8;
+    float temp[num_unroll];
+    float sum_temp[num_unroll] = { 0.0f };
+    for (int i = col; i < ncols;) {
+        for (int j = 0; j < num_unroll; ++j) {
+            if (i < ncols) {
+                temp[j] = x[row * ncols + i];
+            } else {
+                temp[j] = 0;
+            }
+            i += blockDim.x;
+        }
+        for (int j = 0; j < num_unroll; ++j) {
+            sum_temp[j] += temp[j];
+        }
+    }
+    for (int j = 0; j < num_unroll; ++j) {
+        sum += sum_temp[j];
+    }
+
+    // sum up partial sums
+    sum = warp_reduce_sum(sum);
+    if (blockDim.x > WARP_SIZE) {
+        assert((blockDim.x <= 1024) && (blockDim.x % WARP_SIZE) == 0);
+        __shared__ float s_sum[32];
+        const int warp_id = threadIdx.x / WARP_SIZE;
+        const int lane_id = threadIdx.x % WARP_SIZE;
+        if (lane_id == 0) {
+            s_sum[warp_id] = sum;
+        }
+        __syncthreads();
+        sum = 0.0f;
+        if (lane_id < (blockDim.x / WARP_SIZE)) {
+            sum = s_sum[lane_id];
+        }
+        sum = warp_reduce_sum(sum);
+    }
+
+    if (col != 0) {
+        return;
+    }
+
+    dst[row] = norm ? sum / ncols : sum;
+}
diff --git a/ggml/src/ggml-cuda/sum.cu b/ggml/src/ggml-cuda/sum.cu
index eb3d7cdba..c56257b44 100644
--- a/ggml/src/ggml-cuda/sum.cu
+++ b/ggml/src/ggml-cuda/sum.cu
@@ -1,19 +1,15 @@
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
-#define USE_CUB
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA) && CUDART_VERSION >= 11070
+#include "sum.cuh"
+#include "sumrows.cuh"
 
-#ifdef USE_CUB
+#ifdef GGML_CUDA_USE_CUB
 #include <cub/cub.cuh>
 using namespace cub;
-#endif // USE_CUB
-
-#include "sumrows.cuh"
-#include "sum.cuh"
+#endif // GGML_CUDA_USE_CUB
 
 #include <cstdint>
 
 void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int64_t ne, cudaStream_t stream) {
-#ifdef USE_CUB
+#ifdef GGML_CUDA_USE_CUB
     size_t tmp_size = 0;
     DeviceReduce::Sum(nullptr, tmp_size, x, dst, ne, stream);
     ggml_cuda_pool_alloc<uint8_t> tmp_alloc(pool, tmp_size);
@@ -23,7 +19,7 @@ void sum_f32_cuda(ggml_cuda_pool & pool, const float * x, float * dst, const int
     // For AMD there is rocPRIM which could be used as a drop-in replacement via hipcub but this would require C++11 -> C++14.
     sum_rows_f32_cuda(x, dst, ne, 1, stream);
     GGML_UNUSED(pool);
-#endif // USE_CUB
+#endif // GGML_CUDA_USE_CUB
 }
 
 void ggml_cuda_op_sum(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
diff --git a/ggml/src/ggml-cuda/sumrows.cu b/ggml/src/ggml-cuda/sumrows.cu
index 2eee08fa0..4025771aa 100644
--- a/ggml/src/ggml-cuda/sumrows.cu
+++ b/ggml/src/ggml-cuda/sumrows.cu
@@ -1,9 +1,17 @@
+#include "reduce_rows.cuh"
 #include "sumrows.cuh"
 
 void sum_rows_f32_cuda(const float * x, float * dst, const int ncols, const int nrows, cudaStream_t stream) {
-    const dim3 block_dims(WARP_SIZE, 1, 1);
+    const int id = ggml_cuda_get_device();
+    const int nsm = ggml_cuda_info().devices[id].nsm;
     const dim3 block_nums(nrows, 1, 1);
-    reduce_rows_f32<false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    if ((nrows / nsm) < 2) {
+        const dim3 block_dims(512, 1, 1);
+        reduce_rows_f32<false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    } else {
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
+        reduce_rows_f32<false><<<block_nums, block_dims, 0, stream>>>(x, dst, ncols);
+    }
 }
 
 void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
@@ -19,8 +27,17 @@ void ggml_cuda_op_sum_rows(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const int64_t ncols = src0->ne[0];
     const int64_t nrows = ggml_nrows(src0);
 
-    const dim3 block_dims(WARP_SIZE, 1, 1);
     const dim3 block_nums(nrows, 1, 1);
-    reduce_rows_f32<false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    const int id = ggml_cuda_get_device();
+    const int nsm = ggml_cuda_info().devices[id].nsm;
+    if ((nrows / nsm) < 2) {
+        // Increase num threads to 512 for small nrows to better hide the latency
+        const dim3 block_dims(512, 1, 1);
+        reduce_rows_f32<false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    } else {
+        // Enough active SMs to hide latency, use smaller blocks to allow better scheduling
+        const dim3 block_dims(ncols < 1024 ? 32 : 128, 1, 1);
+        reduce_rows_f32<false><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
+    }
 }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index d29779cd1..63e03978e 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -5998,6 +5998,15 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
     test_cases.emplace_back(new test_sum());
     test_cases.emplace_back(new test_sum_rows());
     test_cases.emplace_back(new test_mean());
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 1, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 1024, 1, 1 }));
+    test_cases.emplace_back(new test_sum(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 33, 256, 1, 1 }));
+    test_cases.emplace_back(new test_mean(GGML_TYPE_F32, { 32769, 1, 1, 1 }));
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {64, 64, 320, 1}));
     test_cases.emplace_back(new test_group_norm(GGML_TYPE_F32, {9, 9, 1280, 1}));
     test_cases.emplace_back(new test_acc());
@@ -6179,6 +6188,18 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
         test_cases.emplace_back(new test_add_id(GGML_TYPE_F32, GGML_TYPE_F32, 2880, 32, 4, n_token));
     }
 
+    std::vector<std::array<int64_t, 4>> reduce_rows_cases = {
+        { 8192, 1, 1, 1 },
+        { 8192, 8192, 1, 1 },
+        { 128, 8192, 1, 1 },
+    };
+
+    for (auto it : reduce_rows_cases) {
+        test_cases.emplace_back(new test_mean(GGML_TYPE_F32, it));
+        test_cases.emplace_back(new test_sum_rows(GGML_TYPE_F32, it));
+        test_cases.emplace_back(new test_sum(GGML_TYPE_F32, it));
+    }
+
     return test_cases;
 }
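Note on the launch heuristic repeated in sum_rows_f32_cuda, ggml_cuda_op_sum_rows
and ggml_cuda_op_mean: it can be factored as below (an illustrative sketch in a
.cu file, not code from the patch):

    #include <cstdint>

    // Pick block dimensions for reduce_rows_f32 from row count, SM count
    // and row length, mirroring the three launch sites above.
    static dim3 reduce_rows_block_dims(int64_t nrows, int nsm, int64_t ncols) {
        if (nrows / nsm < 2) {
            // Fewer than two rows per SM: too few blocks to keep the GPU
            // busy, so use large 512-thread blocks to hide memory latency
            // inside each block (the kernel's shared-memory step handles
            // block sizes above one warp).
            return dim3(512, 1, 1);
        }
        // Plenty of rows: smaller blocks schedule more flexibly; very
        // short rows (ncols < 1024) get a single 32-thread warp.
        return dim3(ncols < 1024 ? 32 : 128, 1, 1);
    }

For example, on an A100 with 108 SMs, nrows = 128 gives 128 / 108 = 1 < 2, so
the 512-thread path is taken; nrows = 8192 with ncols = 128 gives
8192 / 108 = 75 >= 2 and a short row, so 32-thread blocks are used.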
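Note on the two-call CUB idiom used in sum.cu and mean.cu: DeviceReduce::Sum is
invoked twice because the first call, given a null temporary-storage pointer,
only writes the required scratch size into tmp_size; the second call performs
the actual reduction. A minimal standalone sketch of the pattern (illustrative
only; plain cudaMalloc stands in for the ggml_cuda_pool_alloc used in the
patch):

    // cub_sum_sketch.cu - build with: nvcc cub_sum_sketch.cu
    #include <cub/cub.cuh>
    #include <cstdio>

    int main() {
        const int N = 1 << 20; // illustrative element count
        float * d_in = nullptr;
        float * d_out = nullptr;
        cudaMalloc(&d_in, N * sizeof(float));
        cudaMalloc(&d_out, sizeof(float));
        cudaMemset(d_in, 0, N * sizeof(float)); // all zeros, so the sum is 0

        // Call 1: null scratch pointer, CUB only reports the required
        // temporary-storage size in tmp_size; no kernel is launched.
        void * d_tmp = nullptr;
        size_t tmp_size = 0;
        cub::DeviceReduce::Sum(d_tmp, tmp_size, d_in, d_out, N);

        // Call 2: with scratch allocated, the reduction actually runs.
        cudaMalloc(&d_tmp, tmp_size);
        cub::DeviceReduce::Sum(d_tmp, tmp_size, d_in, d_out, N);

        float h_out = -1.0f;
        cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
        printf("sum = %f\n", h_out); // prints 0.000000

        cudaFree(d_tmp);
        cudaFree(d_out);
        cudaFree(d_in);
        return 0;
    }

The patch follows the same sequence, but routes the scratch allocation through
the ggml CUDA memory pool and passes the ggml stream.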
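Note on the CUB dispatch condition in ggml_cuda_op_mean: it is easier to read
once the graph state is folded into a single flag. A hedged restatement (this
helper is not part of the patch; graphs_active stands for "a CUDA graph
instance exists or the stream is being captured, and none of the
disable_due_to_* flags on ctx.cuda_graph are set"):

    #include <cstdint>

    // Equivalent, simplified form of the branch condition above.
    static bool use_cub_mean_path(int64_t nrows, int64_t ncols, bool graphs_active) {
        // Break-even points from the heuristics in the commit message:
        // with graphs active the CUB path wins above 32768 columns,
        // without graphs only above 65536 columns.
        const int64_t cutoff = graphs_active ? 32768 : 65536;
        return nrows == 1 && ncols > cutoff;
    }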