mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-26 11:45:21 +00:00
CUDA: use MMQ instead of cuBLAS by default (#8075)
This commit is contained in:
@ -102,7 +102,8 @@ option(LLAMA_LLAMAFILE "llama: use llamafile SGEMM"
|
||||
option(LLAMA_CUDA "llama: use CUDA" OFF)
|
||||
option(LLAMA_CUBLAS "llama: use CUDA (deprecated, use LLAMA_CUDA)" OFF)
|
||||
option(LLAMA_CUDA_FORCE_DMMV "llama: use dmmv instead of mmvq CUDA kernels" OFF)
|
||||
option(LLAMA_CUDA_FORCE_MMQ "llama: use mmq kernels instead of cuBLAS" OFF)
|
||||
option(LLAMA_CUDA_FORCE_MMQ "llama: always use mmq kernels instead of cuBLAS" OFF)
|
||||
option(LLAMA_CUDA_FORCE_CUBLAS "llama: always use cuBLAS instead of mmq kernels" OFF)
|
||||
set(LLAMA_CUDA_DMMV_X "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
|
||||
set(LLAMA_CUDA_MMV_Y "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
|
||||
option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some calculations" OFF)
|
||||
@ -416,13 +417,14 @@ if (LLAMA_CUDA)
|
||||
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
# 60 == f16 CUDA intrinsics
|
||||
# 60 == FP16 CUDA intrinsics
|
||||
# 61 == integer CUDA intrinsics
|
||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||
# 70 == FP16 tensor cores
|
||||
# 75 == int8 tensor cores
|
||||
if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75")
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
set(CMAKE_CUDA_ARCHITECTURES "52;61;70;75")
|
||||
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
@ -447,6 +449,9 @@ if (LLAMA_CUDA)
|
||||
if (LLAMA_CUDA_FORCE_MMQ)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
|
||||
endif()
|
||||
if (LLAMA_CUDA_FORCE_CUBLAS)
|
||||
add_compile_definitions(GGML_CUDA_FORCE_CUBLAS)
|
||||
endif()
|
||||
if (LLAMA_CUDA_NO_VMM)
|
||||
add_compile_definitions(GGML_CUDA_NO_VMM)
|
||||
endif()
|
||||
|
Reference in New Issue
Block a user