CUDA: remove DMMV, consolidate F16 mult mat vec (#10318)

This commit is contained in:
Johannes Gäßler
2024-11-17 09:09:55 +01:00
committed by GitHub
parent 467576b6cc
commit c3ea58aca4
10 changed files with 246 additions and 1000 deletions

View File

@ -635,10 +635,6 @@ else ifndef CUDA_POWER_ARCH
MK_NVCCFLAGS += -arch=native
endif # CUDA_DOCKER_ARCH
ifdef GGML_CUDA_FORCE_DMMV
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # GGML_CUDA_FORCE_MMQ
@ -647,20 +643,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
MK_NVCCFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # GGML_CUDA_FORCE_CUBLAS
ifdef GGML_CUDA_DMMV_X
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
else
MK_NVCCFLAGS += -DGGML_CUDA_DMMV_X=32
endif # GGML_CUDA_DMMV_X
ifdef GGML_CUDA_MMV_Y
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
else ifdef GGML_CUDA_DMMV_Y
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_DMMV_Y) # for backwards compatibility
else
MK_NVCCFLAGS += -DGGML_CUDA_MMV_Y=1
endif # GGML_CUDA_MMV_Y
ifdef GGML_CUDA_F16
MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_F16
@ -669,12 +651,6 @@ ifdef GGML_CUDA_DMMV_F16
MK_NVCCFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_DMMV_F16
ifdef GGML_CUDA_KQUANTS_ITER
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
else
MK_NVCCFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
MK_NVCCFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
else
@ -783,10 +759,6 @@ ifdef GGML_HIPBLAS
AMDGPU_TARGETS ?= $(shell $(ROCM_PATH)/llvm/bin/amdgpu-arch)
endif
GGML_CUDA_DMMV_X ?= 32
GGML_CUDA_MMV_Y ?= 1
GGML_CUDA_KQUANTS_ITER ?= 2
MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
ifdef GGML_HIP_UMA
@ -800,13 +772,6 @@ endif # GGML_HIP_UMA
HIPCC ?= $(CCACHE) $(ROCM_PATH)/bin/hipcc
HIPFLAGS += $(addprefix --offload-arch=,$(AMDGPU_TARGETS))
HIPFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
HIPFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
HIPFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
ifdef GGML_CUDA_FORCE_DMMV
HIPFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ
HIPFLAGS += -DGGML_CUDA_FORCE_MMQ
@ -869,10 +834,6 @@ ifdef GGML_MUSA
MUSAFLAGS += $(addprefix --cuda-gpu-arch=, $(MTGPU_TARGETS))
ifdef GGML_CUDA_FORCE_DMMV
MUSAFLAGS += -DGGML_CUDA_FORCE_DMMV
endif # GGML_CUDA_FORCE_DMMV
ifdef GGML_CUDA_FORCE_MMQ
MUSAFLAGS += -DGGML_CUDA_FORCE_MMQ
endif # GGML_CUDA_FORCE_MMQ
@ -881,18 +842,6 @@ ifdef GGML_CUDA_FORCE_CUBLAS
MUSAFLAGS += -DGGML_CUDA_FORCE_CUBLAS
endif # GGML_CUDA_FORCE_CUBLAS
ifdef GGML_CUDA_DMMV_X
MUSAFLAGS += -DGGML_CUDA_DMMV_X=$(GGML_CUDA_DMMV_X)
else
MUSAFLAGS += -DGGML_CUDA_DMMV_X=32
endif # GGML_CUDA_DMMV_X
ifdef GGML_CUDA_MMV_Y
MUSAFLAGS += -DGGML_CUDA_MMV_Y=$(GGML_CUDA_MMV_Y)
else
MUSAFLAGS += -DGGML_CUDA_MMV_Y=1
endif # GGML_CUDA_MMV_Y
ifdef GGML_CUDA_F16
MUSAFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_F16
@ -901,12 +850,6 @@ ifdef GGML_CUDA_DMMV_F16
MUSAFLAGS += -DGGML_CUDA_F16
endif # GGML_CUDA_DMMV_F16
ifdef GGML_CUDA_KQUANTS_ITER
MUSAFLAGS += -DK_QUANTS_PER_ITERATION=$(GGML_CUDA_KQUANTS_ITER)
else
MUSAFLAGS += -DK_QUANTS_PER_ITERATION=2
endif
ifdef GGML_CUDA_PEER_MAX_BATCH_SIZE
MUSAFLAGS += -DGGML_CUDA_PEER_MAX_BATCH_SIZE=$(GGML_CUDA_PEER_MAX_BATCH_SIZE)
else