cuda : fix GGML_CUDA_GRAPHS=OFF (#15300)

* fix USE_CUDA_GRAPH=OFF

ggml-ci

* check capture status

* completely disable capturing check instead
This commit is contained in:
Sigbjørn Skjæret
2025-08-14 12:22:07 +02:00
committed by GitHub
parent 5cdb27e091
commit 4ebd0c125b

View File

@@ -25,9 +25,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
// Special case for reducing vectors
#ifdef GGML_CUDA_USE_CUB
#ifdef USE_CUDA_GRAPH
    cudaStreamCaptureStatus iscapturing;
    CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
#endif // USE_CUDA_GRAPH
    if ((nrows == 1) &&
#ifdef USE_CUDA_GRAPH
        // CUDA_GRAPHS_DISABLED
        ((ncols > 65536) &&
        ((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
@@ -38,6 +41,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
        !((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
        ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
        ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
#else
(ncols > 65536)) {
#endif // USE_CUDA_GRAPH
        // Single row - use device-wide reduction
        size_t tmp_size = 0;
        ggml_cuda_pool & pool = ctx.pool();
@@ -51,7 +57,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
        divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
        return;
    }
#endif // GGML_CUDA_USE_CUB
    const dim3 block_nums(nrows, 1, 1);