CUDA: stream-k decomposition for MMQ (#8018)

* CUDA: stream-k decomposition for MMQ
* fix undefined memory reads for small matrices
2025-08-29 03:28:52 -04:00 · 2024-06-20 14:39:21 +02:00
parent 2075a66a96
commit d50f8897a7
4 changed files with 292 additions and 113 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -635,7 +635,7 @@ static int64_t get_row_rounding(const std::array<float, GGML_CUDA_MAX_DEVICES> &
        }

        const int cc = ggml_cuda_info().devices[id].cc;
-        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc, get_mmq_x_max_host(cc)));
+        row_rounding = std::max(row_rounding, (int64_t)get_mmq_y_host(cc));
    }
    return row_rounding;
 }