CUDA: revise q8_1 data layout for mul_mat_q (#7824)

2025-08-01 15:09:32 -04:00 · 2024-06-09 09:42:25 +02:00
parent 2decf57bc6
commit 42b53d192f
5 changed files with 282 additions and 151 deletions
--- a/ggml-cuda/mmq.cu
+++ b/ggml-cuda/mmq.cu
@@ -11,6 +11,7 @@ void ggml_cuda_op_mul_mat_q(
    const int64_t nb01 = src0->nb[1];

    const int64_t ne10 = src1->ne[0];
+    const int64_t ne11 = src1->ne[1];
    GGML_ASSERT(ne10 % QK8_1 == 0);

    const int64_t ne0 = dst->ne[0];
@@ -25,7 +26,7 @@ void ggml_cuda_op_mul_mat_q(
    // nrows_dst == nrows of the matrix that the kernel writes into
    const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff;

-    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, nrows_dst};
+    const mmq_args args = {src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stride00, src1_padded_row_size, src1_ncols, ne11, nrows_dst};

    switch (src0->type) {
        case GGML_TYPE_Q4_0: