CUDA: optimize and refactor MMQ (#8416)

* CUDA: optimize and refactor MMQ

* explicit q8_1 memory layouts, add documentation
This commit is contained in:
Johannes Gäßler
2024-07-11 16:47:47 +02:00
committed by GitHub
parent a977c11544
commit 808aba3916
5 changed files with 867 additions and 687 deletions

View File

@@ -70,6 +70,10 @@ struct mma_int_A_I16K8 {
}
#endif // defined(INT8_MMA_AVAILABLE)
}
// Views `x` through a pointer to mma_int_A_I16K4 and calls load() on that view,
// passing xs0 and stride through unchanged.
// NOTE(review): judging by the name, this presumably fills only the low-K half
// of the enclosing tile; it assumes the start of `x` is layout-compatible with
// mma_int_A_I16K4 -- confirm against both struct definitions.
__device__ __forceinline__ void load_low(const int * __restrict__ xs0, const int & stride) {
    mma_int_A_I16K4 * low_view = (mma_int_A_I16K4 *) x;
    low_view->load(xs0, stride);
}
};
struct mma_int_B_J8K4 {