mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-14 12:19:48 -04:00
cuda : fix GGML_CUDA_GRAPHS=OFF (#15300)
* fix USE_CUDA_GRAPH=OFF
  ggml-ci
* check capture status
* completely disable capturing check instead
This commit is contained in:
@@ -25,9 +25,12 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||||||
|
|
||||||
// Special case for reducing vectors
|
// Special case for reducing vectors
|
||||||
#ifdef GGML_CUDA_USE_CUB
|
#ifdef GGML_CUDA_USE_CUB
|
||||||
|
#ifdef USE_CUDA_GRAPH
|
||||||
cudaStreamCaptureStatus iscapturing;
|
cudaStreamCaptureStatus iscapturing;
|
||||||
CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
|
CUDA_CHECK(cudaStreamIsCapturing(stream, &iscapturing));
|
||||||
|
#endif // USE_CUDA_GRAPH
|
||||||
if ((nrows == 1) &&
|
if ((nrows == 1) &&
|
||||||
|
#ifdef USE_CUDA_GRAPH
|
||||||
// CUDA_GRAPHS_DISABLED
|
// CUDA_GRAPHS_DISABLED
|
||||||
((ncols > 65536) &&
|
((ncols > 65536) &&
|
||||||
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||||
@@ -38,6 +41,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||||||
!((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
!((ctx.cuda_graph->instance == nullptr) && (iscapturing == cudaStreamCaptureStatusNone) ||
|
||||||
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
ctx.cuda_graph->disable_due_to_gpu_arch || ctx.cuda_graph->disable_due_to_too_many_updates ||
|
||||||
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
|
ctx.cuda_graph->disable_due_to_failed_graph_capture))) {
|
||||||
|
#else
|
||||||
|
(ncols > 65536)) {
|
||||||
|
#endif // USE_CUDA_GRAPH
|
||||||
// Single row - use device-wide reduction
|
// Single row - use device-wide reduction
|
||||||
size_t tmp_size = 0;
|
size_t tmp_size = 0;
|
||||||
ggml_cuda_pool & pool = ctx.pool();
|
ggml_cuda_pool & pool = ctx.pool();
|
||||||
@@ -51,7 +57,7 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||||||
divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
|
divide_by_count<float><<<1, 1, 0, stream>>>(dst_d, ncols);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif // GGML_CUDA_USE_CUB
|
||||||
|
|
||||||
const dim3 block_nums(nrows, 1, 1);
|
const dim3 block_nums(nrows, 1, 1);
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user