Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-07-28 03:55:06 -04:00)
musa: upgrade musa sdk to rc4.2.0 (#14498)
* musa: apply mublas API changes
* musa: update musa version to 4.2.0
* musa: restore MUSA graph settings in CMakeLists.txt
* musa: disable mudnnMemcpyAsync by default
* musa: switch back to non-mudnn images
* minor changes
* musa: restore rc in docker image tag

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
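For reference, a minimal configure sketch that opts into the two new CMake switches introduced below; both default to OFF, and the `GGML_MUSA=ON` backend toggle is the repository's usual MUSA build path, assumed here:

```sh
# Sketch only: GGML_MUSA_GRAPHS and GGML_MUSA_MUDNN_COPY are the new
# options added by this commit; both default to OFF.
cmake -B build -DGGML_MUSA=ON \
    -DGGML_MUSA_GRAPHS=ON \
    -DGGML_MUSA_MUDNN_COPY=ON
cmake --build build --config Release
```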
.devops/musa.Dockerfile
@@ -1,10 +1,10 @@
 ARG UBUNTU_VERSION=22.04
 # This needs to generally match the container host's environment.
-ARG MUSA_VERSION=rc4.0.1
+ARG MUSA_VERSION=rc4.2.0
 # Target the MUSA build image
-ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
 
-ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
+ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
 
 FROM ${BASE_MUSA_DEV_CONTAINER} AS build
 
.github/workflows/build.yml
@@ -515,7 +515,7 @@ jobs:
 
   ubuntu-22-cmake-musa:
     runs-on: ubuntu-22.04
-    container: mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    container: mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
 
     steps:
       - name: Clone
ci/README.md
@@ -54,7 +54,7 @@ docker run --privileged -it \
     -v $HOME/llama.cpp/ci-cache:/ci-cache \
     -v $HOME/llama.cpp/ci-results:/ci-results \
     -v $PWD:/ws -w /ws \
-    mthreads/musa:rc4.0.1-mudnn-devel-ubuntu22.04
+    mthreads/musa:rc4.2.0-devel-ubuntu22.04-amd64
 ```
 
 Inside the container, execute the following commands:
docs/docker.md
@@ -110,7 +110,7 @@ You may want to pass in some different `ARGS`, depending on the MUSA environment
 
 The defaults are:
 
-- `MUSA_VERSION` set to `rc4.0.1`
+- `MUSA_VERSION` set to `rc4.2.0`
 
 The resulting images, are essentially the same as the non-MUSA images:
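As a usage sketch of the `MUSA_VERSION` build argument documented here (the image tag and the use of the Dockerfile shown at the top of this diff are illustrative assumptions, not part of the change):

```sh
# Hypothetical local build; override MUSA_VERSION to pin a different
# mthreads/musa base image than the rc4.2.0 default.
docker build -f .devops/musa.Dockerfile \
    --build-arg MUSA_VERSION=rc4.2.0 \
    -t local/llama.cpp:musa .
```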
ggml/CMakeLists.txt
@@ -174,6 +174,8 @@ option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental,
 option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
 option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
 option(GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 "ggml: enable rocWMMA FlashAttention on GFX12" OFF)
+option(GGML_MUSA_GRAPHS "ggml: use MUSA graph, experimental, unstable" OFF)
+option(GGML_MUSA_MUDNN_COPY "ggml: enable muDNN for accelerated copy" OFF)
 option(GGML_VULKAN "ggml: use Vulkan" OFF)
 option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
 option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
ggml/src/ggml-cuda/common.cuh
@@ -765,7 +765,7 @@ struct ggml_tensor_extra_gpu {
 };
 
 
-#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS))
+#if (defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)) || defined(GGML_MUSA_GRAPHS)
 #define USE_CUDA_GRAPH
 #endif
 
ggml/src/ggml-cuda/cpy.cu
@@ -1,9 +1,9 @@
 #include "cpy.cuh"
 #include "dequantize.cuh"
 #include "cpy-utils.cuh"
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
 #include "ggml-musa/mudnn.cuh"
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
 
 typedef void (*cpy_kernel_t)(const char * cx, char * cdst);
 
@@ -121,7 +121,7 @@ static __global__ void cpy_q_f32(const char * cx, char * cdst_direct, const int
 // Copy destination pointers to GPU to be available when pointer indirection is in use
 
 void ggml_cuda_cpy_dest_ptrs_copy(ggml_cuda_graph * cuda_graph, char ** host_dest_ptrs, const int host_dest_ptrs_size, cudaStream_t stream) {
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if (cuda_graph->dest_ptrs_size < host_dest_ptrs_size) { // (re-)allocate GPU memory for destination pointers
         CUDA_CHECK(cudaStreamSynchronize(stream));
         if (cuda_graph->dest_ptrs_d != nullptr) {
@@ -314,7 +314,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 
     char ** dest_ptrs_d = nullptr;
     int graph_cpynode_index = -1;
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         dest_ptrs_d = ctx.cuda_graph->dest_ptrs_d;
         graph_cpynode_index = ctx.cuda_graph->graph_cpynode_index;
@@ -324,11 +324,11 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
 #endif
     if (src0->type == src1->type && ggml_is_contiguous(src0) && ggml_is_contiguous(src1)) {
         GGML_ASSERT(ggml_nbytes(src0) == ggml_nbytes(src1));
-#ifdef GGML_USE_MUSA
+#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
         if (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16) {
             CUDA_CHECK(mudnnMemcpyAsync(ctx, src1, src0));
         } else
-#endif // GGML_USE_MUSA
+#endif // GGML_USE_MUSA && GGML_MUSA_MUDNN_COPY
         {
             CUDA_CHECK(cudaMemcpyAsync(src1_ddc, src0_ddc, ggml_nbytes(src0), cudaMemcpyDeviceToDevice, main_stream));
         }
@@ -379,7 +379,7 @@ void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, gg
         GGML_ABORT("%s: unsupported type combination (%s to %s)\n", __func__,
                    ggml_type_name(src0->type), ggml_type_name(src1->type));
     }
-#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS)
+#if defined(GGML_CUDA_USE_GRAPHS) || defined(GGML_HIP_GRAPHS) || defined(GGML_MUSA_GRAPHS)
     if(ctx.cuda_graph->use_cpy_indirection && !disable_indirection_for_this_node) {
         ctx.cuda_graph->graph_cpynode_index = graph_cpynode_index;
     }
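To make the gating above concrete, here is a stripped-down sketch of the copy dispatch; the function name and simplified signature are illustrative, only the preprocessor structure mirrors cpy.cu:

```cpp
#include <cuda_runtime.h>

// After this change the muDNN fast path needs GGML_MUSA_MUDNN_COPY in
// addition to GGML_USE_MUSA; every other build falls through to the
// shared cudaMemcpyAsync path.
static void copy_contiguous_sketch(void * dst, const void * src, size_t nbytes,
                                   bool is_f32_or_f16, cudaStream_t stream) {
#if defined(GGML_USE_MUSA) && defined(GGML_MUSA_MUDNN_COPY)
    if (is_f32_or_f16) {
        // the real code calls mudnnMemcpyAsync(ctx, src1, src0) here
        return;
    }
#endif
    (void) is_f32_or_f16; // unused when the muDNN path is compiled out
    cudaMemcpyAsync(dst, src, nbytes, cudaMemcpyDeviceToDevice, stream);
}
```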
ggml/src/ggml-cuda/vendors/musa.h
@@ -13,7 +13,7 @@
 #define CUBLAS_OP_N MUBLAS_OP_N
 #define CUBLAS_OP_T MUBLAS_OP_T
 #define CUBLAS_STATUS_SUCCESS MUBLAS_STATUS_SUCCESS
-#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_MATH_MODE_DEFAULT
+#define CUBLAS_TF32_TENSOR_OP_MATH MUBLAS_TENSOR_OP_MATH
 #define CUDA_R_16F MUSA_R_16F
 #define CUDA_R_16BF MUSA_R_16BF
 #define CUDA_R_32F MUSA_R_32F
@@ -29,7 +29,7 @@
 #define cublasSgemm mublasSgemm
 #define cublasStatus_t mublasStatus_t
 #define cublasOperation_t mublasOperation_t
-#define cublasGetStatusString mublasStatus_to_string
+#define cublasGetStatusString mublasGetStatusString
 #define cudaDataType_t musaDataType_t
 #define cudaDeviceCanAccessPeer musaDeviceCanAccessPeer
 #define cudaDeviceDisablePeerAccess musaDeviceDisablePeerAccess
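The point of this vendor header is that shared backend code keeps using cuBLAS spellings and compiles unchanged on MUSA. A minimal sketch follows; the helper is hypothetical, but `cublasStatus_t`, `CUBLAS_STATUS_SUCCESS`, and `cublasGetStatusString` are real cuBLAS names that the header remaps:

```cpp
#include <cublas_v2.h> // a MUSA build includes vendors/musa.h instead

// Written purely against cuBLAS names; under vendors/musa.h this
// expands to mublasStatus_t / MUBLAS_STATUS_SUCCESS /
// mublasGetStatusString, which is the mapping corrected above.
static const char * status_str(cublasStatus_t status) {
    return status == CUBLAS_STATUS_SUCCESS ? "no error"
                                           : cublasGetStatusString(status);
}
```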
ggml/src/ggml-musa/CMakeLists.txt
@@ -34,8 +34,12 @@ if (MUSAToolkit_FOUND)
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
     file(GLOB SRCS "../ggml-cuda/template-instances/mmq*.cu")
     list(APPEND GGML_SOURCES_MUSA ${SRCS})
-    file(GLOB SRCS "../ggml-musa/*.cu")
-    list(APPEND GGML_SOURCES_MUSA ${SRCS})
+    if (GGML_MUSA_MUDNN_COPY)
+        file(GLOB SRCS "../ggml-musa/*.cu")
+        list(APPEND GGML_SOURCES_MUSA ${SRCS})
+        add_compile_definitions(GGML_MUSA_MUDNN_COPY)
+    endif()
 
     if (GGML_CUDA_FA_ALL_QUANTS)
         file(GLOB SRCS "../ggml-cuda/template-instances/fattn-vec*.cu")
@@ -72,6 +76,10 @@ if (MUSAToolkit_FOUND)
     add_compile_definitions(GGML_USE_MUSA)
     add_compile_definitions(GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
 
+    if (GGML_MUSA_GRAPHS)
+        add_compile_definitions(GGML_MUSA_GRAPHS)
+    endif()
+
     if (GGML_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
@@ -97,10 +105,16 @@ if (MUSAToolkit_FOUND)
     endif()
 
     if (GGML_STATIC)
-        # TODO: mudnn has not provided static libraries yet
         target_link_libraries(ggml-musa PRIVATE MUSA::musart_static MUSA::mublas_static)
+        # TODO: mudnn has not provided static libraries yet
+        # if (GGML_MUSA_MUDNN_COPY)
+        #     target_link_libraries(ggml-musa PRIVATE mudnn_static)
+        # endif()
     else()
-        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas mudnn)
+        target_link_libraries(ggml-musa PRIVATE MUSA::musart MUSA::mublas)
+        if (GGML_MUSA_MUDNN_COPY)
+            target_link_libraries(ggml-musa PRIVATE mudnn)
+        endif()
     endif()
 
     if (GGML_CUDA_NO_VMM)