CUDA: revise q8_1 data layout for mul_mat_q (#7824)

2025-08-17 21:51:27 -04:00 · 2024-06-09 09:42:25 +02:00
parent 2decf57bc6
commit 42b53d192f
5 changed files with 282 additions and 151 deletions
--- a/ggml-cuda/quantize.cuh
+++ b/ggml-cuda/quantize.cuh
@@ -1,5 +1,20 @@
+#pragma once
+
 #include "common.cuh"
+#include "mmq.cuh"
+
+#include <cstdint>

 #define CUDA_QUANTIZE_BLOCK_SIZE 256

-void quantize_row_q8_1_cuda(const float * x, void * vy, const int64_t kx, const int64_t ky, const int64_t kx_padded, cudaStream_t stream);
+typedef void (*quantize_cuda_t)( // function-pointer type; both quantize_*_q8_1_cuda declarations in this header have this exact signature
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);
+
+void quantize_row_q8_1_cuda( // signature matches quantize_cuda_t; NOTE(review): presumably the row-wise q8_1 quantizer - definition not visible here, confirm
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);
+
+void quantize_mmq_q8_1_cuda( // signature matches quantize_cuda_t; NOTE(review): presumably emits the q8_1 layout used by mul_mat_q (per commit title) - confirm in the definition
+    const float * x, void * vy, const int64_t kx0, const int64_t kx1, const int64_t channels, const int64_t kx0_padded,
+    const ggml_type type_x, cudaStream_t stream);