CUDA: stream-k decomposition for MMQ (#8018)

* CUDA: stream-k decomposition for MMQ
* fix undefined memory reads for small matrices
2025-08-18 05:56:00 -04:00 · 2024-06-20 14:39:21 +02:00
parent 2075a66a96
commit d50f8897a7
4 changed files with 292 additions and 113 deletions
--- a/ggml-cuda/common.cuh
+++ b/ggml-cuda/common.cuh
@@ -652,8 +652,8 @@ static int get_mmq_x_max_host(const int cc) {
 }

 // Round rows to this value for --split-mode row:
-static int get_mmq_y_host(const int cc, const int mmq_x) {
-    return cc >= CC_VOLTA && mmq_x >= 32 ? 128 : 64;
+static int get_mmq_y_host(const int cc) {
+    return cc >= CC_VOLTA ? 128 : 64;
 }

 //////////////////////