CUDA : faster k-quant dot kernels (#1862)

* cuda : faster k-quant dot kernels * Imrove Q2_K dot kernel on older GPUs We now have a K_QUANTS_PER_ITERATION macro, which should be set to 1 on older and to 2 on newer GPUs. With this, we preserve the performance of the original PR on RTX-4080, and are faster compared to master on GTX-1660. * Imrove Q6_K dot kernel on older GPUs Using the same K_QUANTS_PER_ITERATION macro as last commit, we preserve performance on RTX-4080 and speed up Q6_K on a GTX-1660. * Add LLAMA_CUDA_KQUANTS_ITER to CMakeLists.txt and Makefile Allowed values are 1 or 2. 2 gives the best performance on modern GPUs and is set as default. On older GPUs 1 may work better. * PR comments --------- Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
2025-06-26 19:55:04 +00:00 · 2023-06-16 20:08:44 +03:00
parent 602c748863
commit 3d01122610
3 changed files with 385 additions and 221 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -70,6 +70,7 @@ set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
 option(LLAMA_CUBLAS                          "llama: use cuBLAS"                                OFF)
 set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
 set(LLAMA_CUDA_DMMV_Y       "1" CACHE STRING "llama: y block size for dmmv CUDA kernels")
+set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
 option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
 option(LLAMA_METAL                           "llama: use Metal"                                 OFF)
 option(LLAMA_K_QUANTS                        "llama: use k-quants"                              ON)
@ -201,6 +202,7 @@ if (LLAMA_CUBLAS)
        add_compile_definitions(GGML_USE_CUBLAS)
        add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
        add_compile_definitions(GGML_CUDA_DMMV_Y=${LLAMA_CUDA_DMMV_Y})
+        add_compile_definitions(K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

        if (LLAMA_STATIC)
            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)