Mirror of https://github.com/ggml-org/llama.cpp.git
CUDA: optimize and refactor MMQ (#8416)
* CUDA: optimize and refactor MMQ
* explicit q8_1 memory layouts, add documentation
@@ -70,6 +70,10 @@ struct mma_int_A_I16K8 {
         }
 #endif // defined(INT8_MMA_AVAILABLE)
     }
+
+    __device__ __forceinline__ void load_low(const int * __restrict__ xs0, const int & stride) {
+        ((mma_int_A_I16K4 *) x)[0].load(xs0, stride);
+    }
 };
 
 struct mma_int_B_J8K4 {
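For context, the added load_low fills only the low (first K4) half of a 16x8 int8 MMA tile by reinterpreting its register storage as a 16x4 tile and delegating to that tile's existing load(). The sketch below illustrates the aliasing idea; the register counts (ne) match the real tiles, but the addressing is a simplified stand-in, not the actual ldmatrix/threadIdx-based indexing in mma.cuh.

// Minimal sketch of the aliasing trick behind load_low.
struct mma_int_A_I16K4 {
    static constexpr int I  = 16;
    static constexpr int K  = 4;
    static constexpr int ne = 2; // ints of register storage per thread

    int x[ne];

    __device__ __forceinline__ void load(const int * __restrict__ xs0, const int & stride) {
#pragma unroll
        for (int l = 0; l < ne; ++l) {
            x[l] = xs0[l*stride]; // illustrative addressing only
        }
    }
};

struct mma_int_A_I16K8 {
    static constexpr int I  = 16;
    static constexpr int K  = 8;
    static constexpr int ne = 4; // twice the registers of I16K4

    int x[ne];

    // Load only the K4 low half: the first ne/2 registers of this tile
    // have the layout of a complete I16K4 tile, so aliasing the storage
    // and reusing its load() fills them in place.
    __device__ __forceinline__ void load_low(const int * __restrict__ xs0, const int & stride) {
        ((mma_int_A_I16K4 *) x)[0].load(xs0, stride);
    }
};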