mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-09-02 21:28:38 -04:00
musa: enable building fat binaries, enable unified memory, and disable Flash Attention on QY1 (MTT S80) (#9526)
* mtgpu: add mp_21 support
  Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
* mtgpu: disable flash attention on qy1 (MTT S80); disable q3_k and mul_mat_batched_cublas
  Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
* mtgpu: enable unified memory
  Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
* mtgpu: map cublasOperation_t to mublasOperation_t (sync code to latest)
  Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
---------
Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
This commit is contained in:
2
Makefile
2
Makefile
@@ -611,7 +611,7 @@ ifdef GGML_CUDA
|
||||
|
||||
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include
|
||||
MK_LDFLAGS += -lmusa -lmublas -lmusart -lpthread -ldl -lrt -L$(CUDA_PATH)/lib -L/usr/lib64
|
||||
- MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_22
|
||||
+ MK_NVCCFLAGS += -x musa -mtgpu --cuda-gpu-arch=mp_21 --cuda-gpu-arch=mp_22
|
||||
else
|
||||
ifneq ('', '$(wildcard /opt/cuda)')
|
||||
CUDA_PATH ?= /opt/cuda
|
||||
|
Reference in New Issue
Block a user