mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-26 19:55:04 +00:00
cuda : ROCm AMD Unified Memory Architecture (UMA) handling (#4449)
* AMD ROCm: handle UMA memory VRAM expansions This resolves #2797 by allowing ROCm AMD GPU users with a UMA to dynamically expand the VRAM allocated to the GPU. Without this, AMD ROCm users with shared CPU/GPU memory usually are stuck with the BIOS-set (or fixed) framebuffer VRAM, making it impossible to load more than 1-2 layers. Note that the model is duplicated in RAM because it's loaded once for the CPU and then copied into a second set of allocations that are managed by the HIP UMA system. We can fix this later. * clarify build process for ROCm on linux with cmake * avoid using deprecated ROCm hipMallocHost * keep simplifying the change required for UMA * cmake: enable UMA-compatible allocation when LLAMA_HIP_UMA=ON
This commit is contained in:
@ -91,6 +91,7 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
|
||||
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
|
||||
"llama: max. batch size for using peer access")
|
||||
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
|
||||
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
|
||||
option(LLAMA_CLBLAST "llama: use CLBlast" OFF)
|
||||
option(LLAMA_METAL "llama: use Metal" ${LLAMA_METAL_DEFAULT})
|
||||
option(LLAMA_METAL_NDEBUG "llama: disable Metal debugging" OFF)
|
||||
@ -377,6 +378,9 @@ if (LLAMA_HIPBLAS)
|
||||
if (${hipblas_FOUND} AND ${hip_FOUND})
|
||||
message(STATUS "HIP and hipBLAS found")
|
||||
add_compile_definitions(GGML_USE_HIPBLAS GGML_USE_CUBLAS)
|
||||
if (LLAMA_HIP_UMA)
|
||||
add_compile_definitions(GGML_HIP_UMA)
|
||||
endif()
|
||||
add_library(ggml-rocm OBJECT ggml-cuda.cu ggml-cuda.h)
|
||||
if (BUILD_SHARED_LIBS)
|
||||
set_target_properties(ggml-rocm PROPERTIES POSITION_INDEPENDENT_CODE ON)
|
||||
|
Reference in New Issue
Block a user