Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-07-30 06:03:37 -04:00
CUDA: use tensor cores for MMQ (#7676)
* CUDA: int8 tensor cores for MMQ (legacy quants)
* fix out-of-bounds writes
* __builtin_assume -> GGML_CUDA_ASSUME
* fix writeback returning too early
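The headline change is MMQ (mul_mat_q) performing its quantized dot products on the int8 tensor cores rather than through per-thread scalar dot-product instructions. The sketch below is not code from this commit: it only illustrates the underlying primitive using the CUDA WMMA API, with a single warp multiplying one 16x16 int8 tile pair into an int32 accumulator. The kernel name, tile shape, and the sm_72 architecture cutoff are assumptions for the example; llama.cpp wraps the lower-level PTX mma instructions itself and uses its own quantized block layouts.

// Minimal sketch, NOT code from this commit: one warp multiplies a 16x16 tile of
// int8 A by a 16x16 tile of int8 B and accumulates into int32 via the WMMA API.
#include <cstdio>
#include <cuda_runtime.h>
#include <mma.h>

using namespace nvcuda;

static __global__ void int8_mma_16x16(const signed char * A, const signed char * B, int * C) {
#if __CUDA_ARCH__ >= 720 // int8 tensor cores need Turing-class hardware or newer (assumed cutoff)
    wmma::fragment<wmma::matrix_a,    16, 16, 16, signed char, wmma::row_major> a;
    wmma::fragment<wmma::matrix_b,    16, 16, 16, signed char, wmma::col_major> b;
    wmma::fragment<wmma::accumulator, 16, 16, 16, int>                          c;

    wmma::fill_fragment(c, 0);
    wmma::load_matrix_sync(a, A, 16);           // leading dimension = 16
    wmma::load_matrix_sync(b, B, 16);
    wmma::mma_sync(c, a, b, c);                 // C += A*B on the tensor cores
    wmma::store_matrix_sync(C, c, 16, wmma::mem_row_major);
#else
    (void) A; (void) B; (void) C;               // no int8 tensor cores on this architecture
#endif
}

int main() {
    signed char *A, *B; int *C;
    cudaMallocManaged(&A, 16*16);
    cudaMallocManaged(&B, 16*16);
    cudaMallocManaged(&C, 16*16*sizeof(int));
    for (int i = 0; i < 16*16; ++i) { A[i] = 1; B[i] = 2; }
    int8_mma_16x16<<<1, 32>>>(A, B, C);         // one warp
    cudaDeviceSynchronize();
    printf("C[0] = %d (expected 32)\n", C[0]);  // 16 products of 1*2 per output element
    return 0;
}

Built with something like nvcc -arch=sm_75, each mma_sync call here issues 16x16x16 = 4096 multiply-accumulates in one warp-wide instruction, which is the kind of work scalar int8 code has to express a few multiply-adds at a time.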
@@ -43,7 +43,7 @@ static __global__ void flash_attn_tile_ext_f16(
         const int ne1,
         const int ne2,
         const int ne3) {
-#if FP16_AVAILABLE
+#ifdef FP16_AVAILABLE
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.
 
     const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
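The hunk above switches the guard from #if FP16_AVAILABLE to #ifdef FP16_AVAILABLE, i.e. the macro is now tested for mere presence rather than evaluated as an integer expression. A minimal sketch of that preprocessor pattern follows; the architecture cutoff and the surrounding guard are assumptions for illustration, not lines from the patch.

// Presence-only feature flag: defined (to nothing) when the target supports FP16,
// left undefined otherwise. The 600 (Pascal) cutoff here is assumed for the example.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
#define FP16_AVAILABLE
#endif

// Consumers check only whether the flag exists:
#ifdef FP16_AVAILABLE
// ... half-precision kernel body ...
#endif

// "#if FP16_AVAILABLE" would instead expand the macro and evaluate the result as an
// integer constant expression; with a value-less flag like the one above that
// expansion is empty and the directive fails to compile, so #ifdef is the right test.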