diff --git a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml index d538fdff1..95a0b5cc7 100644 --- a/.github/ISSUE_TEMPLATE/010-bug-compilation.yml +++ b/.github/ISSUE_TEMPLATE/010-bug-compilation.yml @@ -40,7 +40,7 @@ body: attributes: label: GGML backends description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL] + options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL] multiple: true validations: required: true diff --git a/.github/ISSUE_TEMPLATE/011-bug-results.yml b/.github/ISSUE_TEMPLATE/011-bug-results.yml index 2acec7114..d1034bbb6 100644 --- a/.github/ISSUE_TEMPLATE/011-bug-results.yml +++ b/.github/ISSUE_TEMPLATE/011-bug-results.yml @@ -42,7 +42,7 @@ body: attributes: label: GGML backends description: Which GGML backends do you know to be affected? - options: [AMX, BLAS, CPU, CUDA, HIP, Kompute, Metal, Musa, RPC, SYCL, Vulkan, OpenCL] + options: [AMX, BLAS, CPU, CUDA, HIP, Metal, Musa, RPC, SYCL, Vulkan, OpenCL] multiple: true validations: required: true diff --git a/.github/labeler.yml b/.github/labeler.yml index a4c6e3d33..df6a7a40e 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,10 +1,4 @@ # https://github.com/actions/labeler -Kompute: - - changed-files: - - any-glob-to-any-file: - - ggml/include/ggml-kompute.h - - ggml/src/ggml-kompute/** - - README-kompute.md Apple Metal: - changed-files: - any-glob-to-any-file: diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5d4fb5272..42d63b7c5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -740,9 +740,6 @@ jobs: - build: 'llvm-arm64-opencl-adreno' arch: 'arm64' defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON' - # - build: 'kompute-x64' - # arch: 'x64' - # defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON' steps: - name: Clone @@ -756,12 +753,6 @@ jobs: variant: ccache evict-old-files: 1d - - name: Clone Kompute submodule - id: clone_kompute - if: ${{ matrix.build == 'kompute-x64' }} - run: | - git submodule update --init ggml/src/ggml-kompute/kompute - - name: Download OpenBLAS id: get_openblas if: ${{ matrix.build == 'openblas-x64' }} @@ -777,7 +768,7 @@ jobs: - name: Install Vulkan SDK id: get_vulkan - if: ${{ matrix.build == 'kompute-x64' || matrix.build == 'vulkan-x64' }} + if: ${{ matrix.build == 'vulkan-x64' }} run: | curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/vulkansdk-windows-X64-${env:VULKAN_VERSION}.exe" & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install diff --git a/.gitmodules b/.gitmodules index 23ce5ff05..e69de29bb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +0,0 @@ -[submodule "kompute"] - path = ggml/src/ggml-kompute/kompute - url = https://github.com/nomic-ai/kompute.git diff --git a/CMakeLists.txt b/CMakeLists.txt index d2becb04c..c79ccd09e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -120,7 +120,6 @@ endfunction() llama_option_depr(FATAL_ERROR LLAMA_CUBLAS GGML_CUDA) llama_option_depr(WARNING LLAMA_CUDA GGML_CUDA) -llama_option_depr(WARNING LLAMA_KOMPUTE GGML_KOMPUTE) llama_option_depr(WARNING LLAMA_METAL GGML_METAL) llama_option_depr(WARNING LLAMA_METAL_EMBED_LIBRARY GGML_METAL_EMBED_LIBRARY) llama_option_depr(WARNING LLAMA_NATIVE GGML_NATIVE) diff --git a/ggml/CMakeLists.txt b/ggml/CMakeLists.txt index 675f3bf75..eaba9c704 100644 --- a/ggml/CMakeLists.txt +++ b/ggml/CMakeLists.txt @@ -181,7 +181,6 @@ option(GGML_VULKAN_MEMORY_DEBUG "ggml: enable Vulkan memory debug ou option(GGML_VULKAN_SHADER_DEBUG_INFO "ggml: enable Vulkan shader debug info" OFF) option(GGML_VULKAN_VALIDATE "ggml: enable Vulkan validation" OFF) option(GGML_VULKAN_RUN_TESTS "ggml: run Vulkan tests" OFF) -option(GGML_KOMPUTE "ggml: use Kompute" OFF) option(GGML_METAL "ggml: use Metal" ${GGML_METAL_DEFAULT}) option(GGML_METAL_USE_BF16 "ggml: use bfloat if available" OFF) option(GGML_METAL_NDEBUG "ggml: disable Metal debugging" OFF) @@ -266,7 +265,6 @@ set(GGML_PUBLIC_HEADERS include/ggml-cann.h include/ggml-cpp.h include/ggml-cuda.h - include/ggml-kompute.h include/ggml-opt.h include/ggml-metal.h include/ggml-rpc.h diff --git a/ggml/include/ggml-kompute.h b/ggml/include/ggml-kompute.h deleted file mode 100644 index 154aa56a7..000000000 --- a/ggml/include/ggml-kompute.h +++ /dev/null @@ -1,50 +0,0 @@ -#pragma once - -#include "ggml.h" -#include "ggml-backend.h" - -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -#define GGML_KOMPUTE_MAX_DEVICES 16 - -struct ggml_vk_device { - int index; - int type; // same as VkPhysicalDeviceType - size_t heapSize; - const char * name; - const char * vendor; - int subgroupSize; - uint64_t bufferAlignment; - uint64_t maxAlloc; -}; - -struct ggml_vk_device * ggml_vk_available_devices(size_t memoryRequired, size_t * count); -bool ggml_vk_get_device(struct ggml_vk_device * device, size_t memoryRequired, const char * name); -bool ggml_vk_has_vulkan(void); -bool ggml_vk_has_device(void); -struct ggml_vk_device ggml_vk_current_device(void); - -// -// backend API -// - -// forward declaration -typedef struct ggml_backend * ggml_backend_t; - -GGML_BACKEND_API ggml_backend_t ggml_backend_kompute_init(int device); - -GGML_BACKEND_API bool ggml_backend_is_kompute(ggml_backend_t backend); - -GGML_BACKEND_API ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device); - -GGML_BACKEND_API ggml_backend_reg_t ggml_backend_kompute_reg(void); - -#ifdef __cplusplus -} -#endif diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt index 9cb2c228d..8760c2d35 100644 --- a/ggml/src/CMakeLists.txt +++ b/ggml/src/CMakeLists.txt @@ -365,7 +365,6 @@ ggml_add_backend(BLAS) ggml_add_backend(CANN) ggml_add_backend(CUDA) ggml_add_backend(HIP) -ggml_add_backend(Kompute) ggml_add_backend(METAL) ggml_add_backend(MUSA) ggml_add_backend(RPC) diff --git a/ggml/src/ggml-backend-reg.cpp b/ggml/src/ggml-backend-reg.cpp index 2d93771fd..042ea77ac 100644 --- a/ggml/src/ggml-backend-reg.cpp +++ b/ggml/src/ggml-backend-reg.cpp @@ -61,10 +61,6 @@ #include "ggml-cann.h" #endif -#ifdef GGML_USE_KOMPUTE -#include "ggml-kompute.h" -#endif - // disable C++17 deprecation warning for std::codecvt_utf8 #if defined(__clang__) # pragma clang diagnostic push @@ -189,9 +185,6 @@ struct ggml_backend_registry { #ifdef GGML_USE_RPC register_backend(ggml_backend_rpc_reg()); #endif -#ifdef GGML_USE_KOMPUTE - register_backend(ggml_backend_kompute_reg()); -#endif #ifdef GGML_USE_CPU register_backend(ggml_backend_cpu_reg()); #endif @@ -575,7 +568,6 @@ void ggml_backend_load_all_from_path(const char * dir_path) { ggml_backend_load_best("cann", silent, dir_path); ggml_backend_load_best("cuda", silent, dir_path); ggml_backend_load_best("hip", silent, dir_path); - ggml_backend_load_best("kompute", silent, dir_path); ggml_backend_load_best("metal", silent, dir_path); ggml_backend_load_best("rpc", silent, dir_path); ggml_backend_load_best("sycl", silent, dir_path); diff --git a/ggml/src/ggml-kompute/CMakeLists.txt b/ggml/src/ggml-kompute/CMakeLists.txt deleted file mode 100644 index c9109d5e8..000000000 --- a/ggml/src/ggml-kompute/CMakeLists.txt +++ /dev/null @@ -1,166 +0,0 @@ - -find_package(Vulkan COMPONENTS glslc REQUIRED) -find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc) - -if (NOT glslc_executable) - message(FATAL_ERROR "glslc not found") -endif() - -ggml_add_backend_library(ggml-kompute - ggml-kompute.cpp - ../../include/ggml-kompute.h - ) - -target_link_libraries(ggml-kompute PRIVATE ggml-base kompute) -target_include_directories(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}) - -add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1) - -function(compile_shader) - set(options) - set(oneValueArgs) - set(multiValueArgs SOURCES) - cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - foreach(source ${compile_shader_SOURCES}) - get_filename_component(filename ${source} NAME) - set(spv_file ${filename}.spv) - add_custom_command( - OUTPUT ${spv_file} - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${source} - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/common.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_getrows.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp - ${CMAKE_CURRENT_SOURCE_DIR}/kompute-shaders/op_mul_mv_q_n.comp - COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${CMAKE_CURRENT_SOURCE_DIR}/${source} - COMMENT "Compiling ${source} to ${spv_file}" - ) - - get_filename_component(RAW_FILE_NAME ${spv_file} NAME) - set(FILE_NAME "shader${RAW_FILE_NAME}") - string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME}) - string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE) - string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}") - set(OUTPUT_HEADER_FILE "${HEADER_FILE}") - message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}") - if(CMAKE_GENERATOR MATCHES "Visual Studio") - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/$/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/$/xxd" - ) - else() - add_custom_command( - OUTPUT ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${RAW_FILE_NAME} >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE} - COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE} - DEPENDS ${spv_file} xxd - COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd" - ) - endif() - endforeach() -endfunction() - -if (EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/kompute/CMakeLists.txt") - message(STATUS "Kompute found") - set(KOMPUTE_OPT_LOG_LEVEL Error CACHE STRING "Kompute log level") - add_subdirectory(kompute) - - # Compile our shaders - compile_shader(SOURCES - kompute-shaders/op_scale.comp - kompute-shaders/op_scale_8.comp - kompute-shaders/op_add.comp - kompute-shaders/op_addrow.comp - kompute-shaders/op_mul.comp - kompute-shaders/op_silu.comp - kompute-shaders/op_relu.comp - kompute-shaders/op_gelu.comp - kompute-shaders/op_softmax.comp - kompute-shaders/op_norm.comp - kompute-shaders/op_rmsnorm.comp - kompute-shaders/op_diagmask.comp - kompute-shaders/op_mul_mat_mat_f32.comp - kompute-shaders/op_mul_mat_f16.comp - kompute-shaders/op_mul_mat_q8_0.comp - kompute-shaders/op_mul_mat_q4_0.comp - kompute-shaders/op_mul_mat_q4_1.comp - kompute-shaders/op_mul_mat_q4_k.comp - kompute-shaders/op_mul_mat_q6_k.comp - kompute-shaders/op_getrows_f32.comp - kompute-shaders/op_getrows_f16.comp - kompute-shaders/op_getrows_q4_0.comp - kompute-shaders/op_getrows_q4_1.comp - kompute-shaders/op_getrows_q6_k.comp - kompute-shaders/op_rope_norm_f16.comp - kompute-shaders/op_rope_norm_f32.comp - kompute-shaders/op_rope_neox_f16.comp - kompute-shaders/op_rope_neox_f32.comp - kompute-shaders/op_cpy_f16_f16.comp - kompute-shaders/op_cpy_f16_f32.comp - kompute-shaders/op_cpy_f32_f16.comp - kompute-shaders/op_cpy_f32_f32.comp - ) - - # Create a custom target for our generated shaders - add_custom_target(generated_shaders DEPENDS - shaderop_scale.h - shaderop_scale_8.h - shaderop_add.h - shaderop_addrow.h - shaderop_mul.h - shaderop_silu.h - shaderop_relu.h - shaderop_gelu.h - shaderop_softmax.h - shaderop_norm.h - shaderop_rmsnorm.h - shaderop_diagmask.h - shaderop_mul_mat_mat_f32.h - shaderop_mul_mat_f16.h - shaderop_mul_mat_q8_0.h - shaderop_mul_mat_q4_0.h - shaderop_mul_mat_q4_1.h - shaderop_mul_mat_q4_k.h - shaderop_mul_mat_q6_k.h - shaderop_getrows_f32.h - shaderop_getrows_f16.h - shaderop_getrows_q4_0.h - shaderop_getrows_q4_1.h - shaderop_getrows_q6_k.h - shaderop_rope_norm_f16.h - shaderop_rope_norm_f32.h - shaderop_rope_neox_f16.h - shaderop_rope_neox_f32.h - shaderop_cpy_f16_f16.h - shaderop_cpy_f16_f32.h - shaderop_cpy_f32_f16.h - shaderop_cpy_f32_f32.h - ) - - # Create a custom command that depends on the generated_shaders - add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp - DEPENDS generated_shaders - COMMENT "Ensuring shaders are generated before compiling ggml-kompute.cpp" - ) - - # Add the stamp to the main sources to ensure dependency tracking - target_sources(ggml-kompute PRIVATE ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp) -else() - message(WARNING "Kompute not found") -endif() diff --git a/ggml/src/ggml-kompute/ggml-kompute.cpp b/ggml/src/ggml-kompute/ggml-kompute.cpp deleted file mode 100644 index 505792271..000000000 --- a/ggml/src/ggml-kompute/ggml-kompute.cpp +++ /dev/null @@ -1,2251 +0,0 @@ -#include "ggml-impl.h" -#include "ggml-backend.h" -#include "ggml-backend-impl.h" -#include "ggml-kompute.h" - -// These are generated at build time by cmake custom command -#include "shaderop_scale.h" -#include "shaderop_scale_8.h" -#include "shaderop_add.h" -#include "shaderop_addrow.h" -#include "shaderop_mul.h" -#include "shaderop_silu.h" -#include "shaderop_relu.h" -#include "shaderop_gelu.h" -#include "shaderop_softmax.h" -#include "shaderop_norm.h" -#include "shaderop_rmsnorm.h" -#include "shaderop_diagmask.h" -#include "shaderop_mul_mat_f16.h" -#include "shaderop_mul_mat_q8_0.h" -#include "shaderop_mul_mat_q4_0.h" -#include "shaderop_mul_mat_q4_1.h" -#include "shaderop_mul_mat_q4_k.h" -#include "shaderop_mul_mat_q6_k.h" -#include "shaderop_mul_mat_mat_f32.h" -#include "shaderop_getrows_f32.h" -#include "shaderop_getrows_f16.h" -#include "shaderop_getrows_q4_0.h" -#include "shaderop_getrows_q4_1.h" -#include "shaderop_getrows_q6_k.h" -#include "shaderop_rope_norm_f16.h" -#include "shaderop_rope_norm_f32.h" -#include "shaderop_rope_neox_f16.h" -#include "shaderop_rope_neox_f32.h" -#include "shaderop_cpy_f16_f16.h" -#include "shaderop_cpy_f16_f32.h" -#include "shaderop_cpy_f32_f16.h" -#include "shaderop_cpy_f32_f32.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#ifdef __linux__ -#include // for setenv -#endif - -#define QK4_0 32 -#define QR4_0 2 -#define QK4_1 32 -#define QK_NL 16 - -typedef ggml_fp16_t half; - -static std::string ggml_kompute_format_name(int device) { - return "Kompute" + std::to_string(device); -} - -struct ggml_kompute_context { - int device; - std::string name; - std::shared_ptr pool; - - ggml_kompute_context(int device) - : device(device), name(ggml_kompute_format_name(device)) {} -}; - -// FIXME: It would be good to consolidate the kompute manager and the kompute context into one object -// and consolidate the init functions and simplify object lifetime management. As it currently stands, -// we *have* to have the kompute manager no matter what for device discovery, but the kompute context -// is only created when a device is set and vulkan is explicitly turned on. -static ggml_kompute_context *s_kompute_context = nullptr; - -class kompute_manager { - kp::Manager *s_mgr = nullptr; - -public: - kp::Manager *operator()() { - if (s_mgr && !s_mgr->hasInstance()) { - destroy(); - } - if (!s_mgr) { - s_mgr = new kp::Manager; - } - return s_mgr; - } - - void destroy() { - delete s_mgr; - s_mgr = nullptr; - } -}; - -static kompute_manager komputeManager; - -struct ggml_vk_memory { - void *data = nullptr; - size_t size = 0; - vk::DeviceMemory *primaryMemory = nullptr; - vk::Buffer *primaryBuffer = nullptr; - vk::DeviceMemory *stagingMemory = nullptr; - vk::Buffer *stagingBuffer = nullptr; -}; - -#ifdef __linux__ -__attribute__((constructor)) -static void enable_sam() { - setenv("RADV_PERFTEST", "sam", false); -} -#endif - -static bool ggml_vk_checkPhysicalDeviceFeatures(vk::PhysicalDevice physical_device) { - vk::PhysicalDeviceFeatures availableFeatures; - physical_device.getFeatures(&availableFeatures); - - if (!availableFeatures.shaderInt16) - return false; - - vk::PhysicalDeviceVulkan11Features availableFeatures11; - vk::PhysicalDeviceVulkan12Features availableFeatures12; - - availableFeatures11.pNext = &availableFeatures12; - availableFeatures12.pNext = nullptr; - - vk::PhysicalDeviceFeatures2 features2; - features2.pNext = &availableFeatures11; - - physical_device.getFeatures2(&features2); - - if (!availableFeatures11.uniformAndStorageBuffer16BitAccess || - !availableFeatures11.storageBuffer16BitAccess) { - return false; - } - - if (!availableFeatures12.storageBuffer8BitAccess || - !availableFeatures12.uniformAndStorageBuffer8BitAccess || - !availableFeatures12.shaderFloat16 || - !availableFeatures12.shaderInt8) { - return false; - } - - return true; -} - -static const char * ggml_vk_getVendorName(uint32_t vendorID) { - switch (vendorID) { - case 0x10DE: - return "nvidia"; - case 0x1002: - return "amd"; - case 0x8086: - return "intel"; - default: - return "unknown"; - } -} - -static std::vector ggml_vk_available_devices_internal(size_t memoryRequired) { - std::vector results; - if (!komputeManager()->hasVulkan() || !komputeManager()->hasInstance()) - return results; - - std::vector physical_devices; - try { - physical_devices = komputeManager()->listDevices(); - } catch (vk::SystemError & err) { - std::cerr << __func__ << ": ignoring Vulkan exception: " << err.what() << "\n"; - return results; - } - - uint32_t deviceCount = physical_devices.size(); - if (deviceCount == 0) - return results; - - std::unordered_map count_by_name; - - for (uint32_t i = 0; i < deviceCount; i++) { - const auto & physical_device = physical_devices[i]; - - VkPhysicalDeviceProperties dev_props = physical_device.getProperties(); - VkPhysicalDeviceMemoryProperties memoryProperties = physical_device.getMemoryProperties(); - const uint32_t major = VK_VERSION_MAJOR(dev_props.apiVersion); - const uint32_t minor = VK_VERSION_MINOR(dev_props.apiVersion); - if (major < 1 || minor < 2) - continue; - - if (!ggml_vk_checkPhysicalDeviceFeatures(physical_device)) - continue; - - size_t heapSize = 0; - for (uint32_t j = 0; j < memoryProperties.memoryHeapCount; ++j) { - VkMemoryHeap heap = memoryProperties.memoryHeaps[j]; - if (heap.flags & VK_MEMORY_HEAP_DEVICE_LOCAL_BIT) { - heapSize = heap.size; - break; - } - } - - if (heapSize < memoryRequired) - continue; - - auto ext_props = physical_device.enumerateDeviceExtensionProperties(); - bool has_maintenance4 = false; - - // Check if maintenance4 is supported - for (const auto & properties : ext_props) { - if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { - has_maintenance4 = true; - } - } - - vk::PhysicalDeviceSubgroupProperties subgroup_props; - vk::PhysicalDeviceProperties2 dev_props2; - vk::PhysicalDeviceMaintenance3Properties dev_props3; - vk::PhysicalDeviceMaintenance4Properties dev_props4; - dev_props2.pNext = &dev_props3; - dev_props3.pNext = &subgroup_props; - if (has_maintenance4) { - subgroup_props.pNext = &dev_props4; - } - physical_device.getProperties2(&dev_props2); - - if (subgroup_props.subgroupSize < 32) - continue; - - ggml_vk_device d; - d.index = i; - d.type = dev_props.deviceType; - d.heapSize = heapSize; - d.vendor = strdup(ggml_vk_getVendorName(dev_props.vendorID)); - d.subgroupSize = subgroup_props.subgroupSize; - d.bufferAlignment = dev_props.limits.minStorageBufferOffsetAlignment; - - if (has_maintenance4) { - d.maxAlloc = std::min(dev_props3.maxMemoryAllocationSize, dev_props4.maxBufferSize); - } else { - d.maxAlloc = dev_props3.maxMemoryAllocationSize; - } - - std::string name(dev_props.deviceName); - size_t n_idx = ++count_by_name[name]; - if (n_idx > 1) { - name += " (" + std::to_string(n_idx) + ")"; - } - d.name = strdup(name.c_str()); - - results.push_back(d); - } - - std::stable_sort(results.begin(), results.end(), - [](const ggml_vk_device& lhs, const ggml_vk_device& rhs) -> bool { - if (lhs.type != rhs.type) { - if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return true; - if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_DISCRETE_GPU) return false; - - if (lhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return true; - if (rhs.type == VK_PHYSICAL_DEVICE_TYPE_INTEGRATED_GPU) return false; - } - return lhs.heapSize < rhs.heapSize; - } - ); - - return results; -} - -static std::vector& ggml_vk_available_devices() { - static std::vector devices = ggml_vk_available_devices_internal(0); - return devices; -} - -static void ggml_vk_filterByVendor(std::vector& devices, const std::string& targetVendor) { - devices.erase( - std::remove_if(devices.begin(), devices.end(), - [&targetVendor](const ggml_vk_device& device) { - return device.vendor != targetVendor; - }), - devices.end() - ); -} - -static void ggml_vk_filterByName(std::vector& devices, const std::string& targetName) { - devices.erase( - std::remove_if(devices.begin(), devices.end(), - [&targetName](const ggml_vk_device& device) { - return device.name != targetName; - }), - devices.end() - ); -} - -static bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const std::string & name) { - if (name.empty()) - return false; - - auto devices = ggml_vk_available_devices_internal(memoryRequired); - if (name == "amd" || name == "nvidia" || name == "intel") { - ggml_vk_filterByVendor(devices, name); - } else if (name != "gpu") { - ggml_vk_filterByName(devices, name); - } - - if (devices.empty()) - return false; - - *device = devices.front(); - return true; -} - -bool ggml_vk_get_device(ggml_vk_device * device, size_t memoryRequired, const char * name) { - return ggml_vk_get_device(device, memoryRequired, std::string(name)); -} - -bool ggml_vk_has_vulkan() { - return komputeManager()->hasVulkan(); -} - -bool ggml_vk_has_device() { - return komputeManager()->hasDevice(); -} - -ggml_vk_device ggml_vk_current_device() { - if (!komputeManager()->hasDevice()) - return ggml_vk_device(); - - auto devices = ggml_vk_available_devices(); - ggml_vk_filterByName(devices, komputeManager()->physicalDevice()->getProperties().deviceName.data()); - GGML_ASSERT(!devices.empty()); - return devices.front(); -} - -static -void ggml_vk_allocate_descriptor_pool(struct ggml_kompute_context * ctx, size_t size) { - std::vector descriptorPoolSizes = { - vk::DescriptorPoolSize( - vk::DescriptorType::eStorageBuffer, - 4 * size // Descriptor count is number of possible tensors to pass into an algorithm - ) - }; - - vk::DescriptorPoolCreateInfo descriptorPoolInfo( - vk::DescriptorPoolCreateFlags(), - size, // Max sets - static_cast(descriptorPoolSizes.size()), - descriptorPoolSizes.data()); - - ctx->pool = std::make_shared(); - vk::Result r = komputeManager()->device()->createDescriptorPool( - &descriptorPoolInfo, nullptr, ctx->pool.get()); - if (r != vk::Result::eSuccess) - std::cerr << "Error allocating descriptor pool" << vk::to_string(r); -} - -static -void ggml_vk_free_descriptor_pool(struct ggml_kompute_context * ctx) { - if (ctx->pool) { - komputeManager()->device()->destroy( - *ctx->pool, - (vk::Optional)nullptr); - ctx->pool = nullptr; - } -} - -static -vk::Buffer *ggml_vk_allocate_buffer(size_t size) { - vk::BufferCreateInfo bufferCreateInfo; - bufferCreateInfo.size = size; - bufferCreateInfo.usage = vk::BufferUsageFlagBits::eStorageBuffer | - vk::BufferUsageFlagBits::eTransferSrc | - vk::BufferUsageFlagBits::eTransferDst; - bufferCreateInfo.sharingMode = vk::SharingMode::eExclusive; - - vk::Buffer *vkBuffer = new vk::Buffer; - vk::Result r = komputeManager()->device()->createBuffer(&bufferCreateInfo, nullptr, vkBuffer); - if (r != vk::Result::eSuccess) - std::cerr << "Error allocating buffer " << vk::to_string(r) << std::endl; - return vkBuffer; -} - -static -vk::DeviceMemory *ggml_vk_allocate(size_t size, vk::MemoryPropertyFlags flags, vk::MemoryRequirements requirements, bool *isHostVisible) { - - uint32_t memoryTypeIndex = -1; - bool memoryTypeIndexFound = false; - vk::PhysicalDeviceMemoryProperties memoryProperties = komputeManager()->physicalDevice()->getMemoryProperties(); - for (uint32_t i = 0; i < memoryProperties.memoryTypeCount; i++) { - const vk::MemoryType &memoryType = memoryProperties.memoryTypes[i]; - const vk::MemoryHeap &memoryHeap = memoryProperties.memoryHeaps[memoryType.heapIndex]; - if (memoryHeap.size < size) { - continue; - } - - if (requirements.memoryTypeBits & (1 << i)) { - if (((memoryProperties.memoryTypes[i]).propertyFlags & - flags) == flags) { - memoryTypeIndex = i; - memoryTypeIndexFound = true; - if (isHostVisible && (memoryProperties.memoryTypes[i].propertyFlags & vk::MemoryPropertyFlagBits::eHostVisible)) { - *isHostVisible = true; - } - break; - } - } - } - if (!memoryTypeIndexFound) { - throw std::runtime_error( - "Memory type index for buffer creation not found"); - } - - vk::MemoryAllocateInfo allocInfo; - allocInfo.allocationSize = size; - allocInfo.memoryTypeIndex = memoryTypeIndex; - vk::DeviceMemory *vkDeviceMemory = new vk::DeviceMemory; - vk::Result r = komputeManager()->device()->allocateMemory(&allocInfo, nullptr, vkDeviceMemory); - if (r != vk::Result::eSuccess) { - std::cerr << "Error allocating memory " << vk::to_string(r) << std::endl; - throw std::runtime_error("Error allocating vulkan memory."); - } - return vkDeviceMemory; -} - -static size_t ggml_vk_aligned_offset(ggml_backend_buffer_t buffer, size_t offset) { - size_t minStorageBufferOffsetAlignment = ggml_backend_buffer_get_alignment(buffer); - - // If offset is already aligned, return it directly - if (offset % minStorageBufferOffsetAlignment == 0) { - return offset; - } - - // Otherwise, return the largest multiple of minStorageBufferOffsetAlignment less than offset - return (offset / minStorageBufferOffsetAlignment) * minStorageBufferOffsetAlignment; -} - -static ggml_vk_memory ggml_vk_allocate(size_t size) { - ggml_vk_memory memory; - bool isHostVisible = false; - { - memory.primaryBuffer = ggml_vk_allocate_buffer(size); - vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.primaryBuffer); - vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eDeviceLocal; - memory.primaryMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); - komputeManager()->device()->bindBufferMemory(*memory.primaryBuffer, *memory.primaryMemory, 0); - if (isHostVisible) { - vk::Result r = komputeManager()->device()->mapMemory(*memory.primaryMemory, 0, size, vk::MemoryMapFlags(), &memory.data); - if (r != vk::Result::eSuccess) - std::cerr << "Error mapping memory" << vk::to_string(r); - } - } - - if (!isHostVisible) { - memory.stagingBuffer = ggml_vk_allocate_buffer(size); - vk::MemoryRequirements memoryRequirements = komputeManager()->device()->getBufferMemoryRequirements(*memory.stagingBuffer); - vk::MemoryPropertyFlags memoryPropertyFlags = vk::MemoryPropertyFlagBits::eHostVisible | - vk::MemoryPropertyFlagBits::eHostCoherent | - vk::MemoryPropertyFlagBits::eHostCached; - memory.stagingMemory = ggml_vk_allocate(size, memoryPropertyFlags, memoryRequirements, &isHostVisible); - komputeManager()->device()->bindBufferMemory(*memory.stagingBuffer, *memory.stagingMemory, 0); - vk::Result r = komputeManager()->device()->mapMemory(*memory.stagingMemory, 0, size, vk::MemoryMapFlags(), &memory.data); - if (r != vk::Result::eSuccess) - std::cerr << "Error mapping memory" << vk::to_string(r); - } - - memory.size = size; - return memory; -} - -static void ggml_vk_free_memory(ggml_vk_memory &memory) -{ - komputeManager()->device()->destroy( - *memory.primaryBuffer, - (vk::Optional)nullptr); - if (memory.stagingBuffer) { - komputeManager()->device()->destroy( - *memory.stagingBuffer, - (vk::Optional)nullptr); - } - komputeManager()->device()->freeMemory( - *memory.primaryMemory, - (vk::Optional)nullptr); - if (memory.stagingMemory) { - komputeManager()->device()->freeMemory( - *memory.stagingMemory, - (vk::Optional)nullptr); - } -} - -static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft); - -static -ggml_vk_memory * ggml_vk_find_tensor(const struct ggml_tensor * t, uint64_t & offset) { - ggml_backend_buffer_t buffer = t->view_src ? t->view_src->buffer : t->buffer; - - // compatibility with ggml-backend - GGML_ASSERT(buffer && buffer->buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name); - - ggml_vk_memory * buf_ctx = static_cast(buffer->context); - - const intptr_t ioffs = intptr_t(t->data) - intptr_t(buf_ctx->data); - - GGML_ASSERT(ioffs >= 0 && ioffs + int64_t(ggml_nbytes(t)) <= int64_t(buffer->size)); - - offset = uint64_t(ioffs); - return buf_ctx; -} - -static -const std::shared_ptr ggml_vk_get_tensor(const struct ggml_tensor * t, uint32_t * alignedOffset = nullptr) { - uint64_t originalOffset = 0; - auto * res = ggml_vk_find_tensor(t, originalOffset); - if (!res) { - static std::shared_ptr nullTensor = nullptr; - return nullTensor; - } - - // Create a tensor whose memory will be composed of our buffers at the correct offset - const size_t nelements = ggml_nelements(t); - size_t nbytes = ggml_nbytes(t); - - size_t vulkanOffset = ggml_vk_aligned_offset(t->buffer, originalOffset); - if (alignedOffset) { - *alignedOffset = originalOffset - vulkanOffset; - nbytes += *alignedOffset; - } - - return komputeManager()->tensor( - t->data, - nelements, - nbytes, kp::Tensor::TensorDataTypes::eFloat, - res->primaryMemory, res->primaryBuffer, - res->stagingMemory, res->stagingBuffer, - vulkanOffset); -} - -static std::vector getSpirvShader(const unsigned char* rawData, size_t size) { - if (size % sizeof(uint32_t) != 0) { - throw std::runtime_error("Invalid size: must be divisible by sizeof(uint32_t)"); - } - - const uint32_t* data_ptr = reinterpret_cast(rawData); - size_t count = size / sizeof(uint32_t); - return std::vector(data_ptr, data_ptr + count); -} - -inline static -uint32_t safe_divide(uint32_t a, uint32_t b) { - if (b <= 1) { - return a; - } - if ((a % b) != 0) { - fprintf(stderr, "((%u %% %u) == %u) != 0\n", a, b, a % b); - GGML_ABORT("safe_divide result would've had remainder"); - } - return a / b; -} - -static void ggml_vk_add( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, - int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13, - int32_t ne0, - int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_add_comp_spv, - kp::shader_data::op_add_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00; - int32_t nb00, nb01, nb02, nb03; - int32_t ne10, ne11, ne12, ne13; - int32_t nb10, nb11, nb12, nb13; - int32_t ne0; - int32_t nb0, nb1, nb2, nb3; - } const pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, - nb10, nb11, nb12, nb13, - ne0, - nb0, nb1, nb2, nb3 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_addrow(kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - uint32_t size, uint32_t row = 0) { - - const static auto spirv = getSpirvShader(kp::shader_data::op_addrow_comp_spv, - kp::shader_data::op_addrow_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - uint32_t row; - } const pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - row - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); - else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({size}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, - int32_t nb00, int32_t nb01, int32_t nb02, int32_t nb03, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t nb10, int32_t nb11, int32_t nb12, int32_t nb13, - int32_t ne0, - int32_t nb0, int32_t nb1, int32_t nb2, int32_t nb3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_comp_spv, - kp::shader_data::op_mul_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00; - int32_t nb00, nb01, nb02, nb03; - int32_t ne10, ne11, ne12, ne13; - int32_t nb10, nb11, nb12, nb13; - int32_t ne0; - int32_t nb0, nb1, nb2, nb3; - } const pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, - nb10, nb11, nb12, nb13, - ne0, - nb0, nb1, nb2, nb3 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_scale(kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - uint32_t size, float scale) { - const static auto spirv_1 = getSpirvShader( - kp::shader_data::op_scale_comp_spv, kp::shader_data::op_scale_comp_spv_len - ); - const static auto spirv_8 = getSpirvShader( - kp::shader_data::op_scale_8_comp_spv, kp::shader_data::op_scale_8_comp_spv_len - ); - - struct PushConstants { - uint32_t inOff, outOff; - float scale; - } const pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - scale - }; - - const auto * spirv = &spirv_1; - std::string name(__func__); - if (size % 8 == 0) { - size /= 8; - name += "_8"; - spirv = &spirv_8; - } - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, *spirv, {size}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({size}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_xxlu( - const std::vector& spirv, const char * suffix, kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - uint32_t size -) { - struct PushConstants { - uint32_t inOff, outOff; - } const pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - }; - - auto name = std::string(__func__) + "_" + suffix; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, spirv, {size}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({size}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_silu(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_silu_comp_spv, - kp::shader_data::op_silu_comp_spv_len); - - ggml_vk_xxlu(spirv, "silu", std::forward(args)...); -} - -template -static void ggml_vk_relu(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_relu_comp_spv, - kp::shader_data::op_relu_comp_spv_len); - - ggml_vk_xxlu(spirv, "relu", std::forward(args)...); -} - -template -static void ggml_vk_gelu(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_gelu_comp_spv, - kp::shader_data::op_gelu_comp_spv_len); - - ggml_vk_xxlu(spirv, "gelu", std::forward(args)...); -} - -static void ggml_vk_soft_max( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, uint32_t ne03, - float scale, float max_bias, float m0, float m1, - uint32_t n_head_log2 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_softmax_comp_spv, - kp::shader_data::op_softmax_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02; - float scale, max_bias, m0, m1; - uint32_t n_head_log2; - int32_t mask; - } pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, - scale, max_bias, m0, m1, - n_head_log2, - bool(inB) - }; - - auto & inB_ = inB ? inB : inA; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - // FIXME: The softmax kernel needs to be fixed to use the subgroupsize which can vary by device - const uint32_t local_x = 32; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB_, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {local_x}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB_, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_norm_( - const std::vector& spirv, const char * suffix, kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - int32_t ne00, int32_t nb01, - int32_t nrows, float epsilon -) { - GGML_ASSERT(nb01%sizeof(float) == 0); - GGML_ASSERT(ne00%sizeof(float) == 0); - - struct PushConstants { - uint32_t inOff, outOff; - uint32_t ne00, nb01; - float eps; - } pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - (uint32_t)ne00, (uint32_t)nb01, epsilon - }; - - auto name = std::string(__func__) + "_" + suffix; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, spirv, {(uint32_t)nrows}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({(uint32_t)nrows}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_norm(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_norm_comp_spv, - kp::shader_data::op_norm_comp_spv_len); - - ggml_vk_norm_(spirv, "norm", std::forward(args)...); -} - -template -static void ggml_vk_rms_norm(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_rmsnorm_comp_spv, - kp::shader_data::op_rmsnorm_comp_spv_len); - - ggml_vk_norm_(spirv, "rms", std::forward(args)...); -} - -static void ggml_vk_diag_mask_inf(kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - uint32_t n_past, - int32_t ne00, int32_t ne01, int32_t ne02) { - const static auto spirv = getSpirvShader(kp::shader_data::op_diagmask_comp_spv, - kp::shader_data::op_diagmask_comp_spv_len); - - struct PushConstants { - uint32_t inOff, outOff; - uint32_t n_past; - int32_t ne00, ne01; - } pushConsts { - safe_divide(inOff, 4), safe_divide(outOff, 4), - n_past, - ne00, ne01 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne00), unsigned(ne01), unsigned(ne02)}, {}, {pushConsts}); - else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({unsigned(ne00), unsigned(ne01), unsigned(ne02)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul_mat_f16( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - uint32_t nb10, uint32_t nb11, uint32_t nb12, uint32_t nb13, - int32_t ne0, int32_t ne1, - uint32_t r2, uint32_t r3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_f16_comp_spv, - kp::shader_data::op_mul_mat_f16_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02; - uint32_t nb00, nb01, nb02, nb03; - int32_t ne10, ne11, ne12; - uint32_t nb10, nb11, nb12, nb13; - int32_t ne0, ne1; - uint32_t r2, r3; - } pushConsts { - safe_divide(inAOff, 2), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, - nb10, nb11, nb12, nb13, - ne0, ne1, - r2, r3 - }; - - const unsigned ny = unsigned((ne11 + 4 - 1)/4); - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = ggml_vk_current_device().subgroupSize * 2; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned(ne01), ny, unsigned(ne12*ne13)}, {local_x}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), ny, unsigned(ne12*ne13)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul_mat_mat_f32(kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - uint32_t nb01, uint32_t nb02, - int32_t ne11, int32_t ne12, - uint32_t nb11, uint32_t nb12, - uint32_t nb1, uint32_t nb2) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_mat_f32_comp_spv, - kp::shader_data::op_mul_mat_mat_f32_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02, ne11, ne12; - uint32_t nb01, nb02; - uint32_t nb11, nb12; - uint32_t nb1, nb2; - } pushConsts { - safe_divide(inAOff, 4), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, ne11, ne12, - nb01, nb02, nb11, nb12, - nb1, nb2 - }; - - const uint32_t local_x = ggml_vk_current_device().subgroupSize; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), - {inA, inB, out}, spirv, - {unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)) - }, - {local_x}, - {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned(ne01), - unsigned(ne11), - unsigned(std::max(ne12, ne02)), - }); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul_mat_impl( - const std::vector& spirv, const char * suffix, uint32_t block_size, kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t ne0, int32_t ne1, - uint32_t nb01, uint32_t nb02, uint32_t nb03, - uint32_t nb11, uint32_t nb12, uint32_t nb13, - uint32_t r2, uint32_t r3 -) { - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne01, ne02; - int32_t ne10, ne12; - int32_t ne0, ne1; - uint32_t nb01, nb02, nb03; - uint32_t nb11, nb12, nb13; - uint32_t r2, r3; - } pushConsts { - safe_divide(inAOff, block_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne01, ne02, - ne10, ne12, - ne0, ne1, - nb01, nb02, nb03, - nb11, nb12, nb13, - r2, r3 - }; - - auto name = std::string(__func__) + "_" + suffix; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - const uint32_t local_x = (ggml_vk_current_device().subgroupSize * 2) / 8; - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}, {local_x}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned((ne01 + 7)/8), unsigned(ne11), unsigned(ne12*ne13)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_mul_mat_q4_0(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_0_comp_spv, - kp::shader_data::op_mul_mat_q4_0_comp_spv_len); - - ggml_vk_mul_mat_impl(spirv, "q4_0", 1/*We access blocks unaligned*/, std::forward(args)...); -} - -template -static void ggml_vk_mul_mat_q4_1(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_1_comp_spv, - kp::shader_data::op_mul_mat_q4_1_comp_spv_len); - - ggml_vk_mul_mat_impl(spirv, "q4_1", 1/*We access blocks unaligned*/, std::forward(args)...); -} - -template -static void ggml_vk_mul_mat_q8_0(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q8_0_comp_spv, - kp::shader_data::op_mul_mat_q8_0_comp_spv_len); - - ggml_vk_mul_mat_impl(spirv, "q8_0", 1/*We access blocks unaligned*/, std::forward(args)...); -} - -static void ggml_vk_mul_mat_q4_k( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t ne0, int32_t ne1, - uint32_t nb01, uint32_t nb02, uint32_t nb03, - uint32_t nb11, uint32_t nb12, uint32_t nb13, - uint32_t r2, uint32_t r3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q4_k_comp_spv, - kp::shader_data::op_mul_mat_q4_k_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12; - uint32_t nb01, nb02, nb03, nb11, nb12, nb13; - uint32_t r2, r3; - } pushConsts { - inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne10, ne0, ne1, ne01, ne02, ne12, - nb01, nb02, nb03, nb11, nb12, nb13, - r2, r3 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned((ne01 + 3)/4), unsigned(ne11), unsigned(ne12) * unsigned(ne13)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_mul_mat_q6_k( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, - int32_t ne10, int32_t ne11, int32_t ne12, int32_t ne13, - int32_t ne0, int32_t ne1, - uint32_t nb01, uint32_t nb02, uint32_t nb03, - uint32_t nb11, uint32_t nb12, uint32_t nb13, - uint32_t r2, uint32_t r3 -) { - const static auto spirv = getSpirvShader(kp::shader_data::op_mul_mat_q6_k_comp_spv, - kp::shader_data::op_mul_mat_q6_k_comp_spv_len); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, ne10, ne0, ne1, ne01, ne02, ne12; - uint32_t nb01, nb02, nb03, nb11, nb12, nb13; - uint32_t r2, r3; - } pushConsts { - inAOff, safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, ne10, ne0, ne1, ne01, ne02, ne12, - nb01, nb02, nb03, nb11, nb12, nb13, - r2, r3 - }; - - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(__func__)) { - const uint32_t local_x = 2; - const uint32_t local_y = ggml_vk_current_device().subgroupSize; - s_algo = komputeManager()->algorithm(__func__, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}, {local_x, local_y}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(__func__); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({unsigned((ne01 + 1)/2), unsigned(ne11), unsigned(ne12)*unsigned(ne13)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_get_rows( - const std::vector& spirv, - const char * suffix, - unsigned element_size, unsigned qk, - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t outOff, - int32_t ne00, int32_t nb01, int32_t nb1, - uint32_t size -) { - GGML_ASSERT(nb01%element_size == 0); - GGML_ASSERT(nb1%sizeof(float) == 0); - if (qk) GGML_ASSERT(ne00%qk == 0); - - struct PushConstants { - uint32_t inAOff, inBOff, outOff; - int32_t ne00, nb01, nb1; - } pushConsts { - safe_divide(inAOff, element_size), safe_divide(inBOff, 4), safe_divide(outOff, 4), - ne00, nb01, nb1 - }; - - auto name = std::string(__func__) + "_" + suffix; - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {inA, inB, out}, spirv, {size}, {}, {pushConsts}); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({inA, inB, out}); - s_algo->setWorkgroup({size}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_get_rows_f32(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f32_comp_spv, - kp::shader_data::op_getrows_f32_comp_spv_len); - - ggml_vk_get_rows(spirv, "f32", sizeof(float), 0, std::forward(args)...); -} - -template -static void ggml_vk_get_rows_f16(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_f16_comp_spv, - kp::shader_data::op_getrows_f16_comp_spv_len); - - ggml_vk_get_rows(spirv, "f16", sizeof(half), 0, std::forward(args)...); -} - -template -static void ggml_vk_get_rows_q4_0(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_0_comp_spv, - kp::shader_data::op_getrows_q4_0_comp_spv_len); - - ggml_vk_get_rows(spirv, "q4_0", 1/*We access blocks unaligned*/, QK4_0, std::forward(args)...); -} - -template -static void ggml_vk_get_rows_q4_1(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q4_1_comp_spv, - kp::shader_data::op_getrows_q4_1_comp_spv_len); - - ggml_vk_get_rows(spirv, "q4_1", 1/*We access blocks unaligned*/, QK4_1, std::forward(args)...); -} - -template -static void ggml_vk_get_rows_q6_k(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_getrows_q6_k_comp_spv, - kp::shader_data::op_getrows_q6_k_comp_spv_len); - ggml_vk_get_rows(spirv, "q6_k", 1/*We access blocks unaligned*/, QK_NL, std::forward(args)...); -} - -static void ggml_vk_rope( - kp::Sequence& seq, - const std::shared_ptr& inA, - const std::shared_ptr& inB, - const std::shared_ptr& inC, - const std::shared_ptr& out, - uint32_t inAOff, uint32_t inBOff, uint32_t inCOff, uint32_t outOff, - ggml_type src0t, int32_t n_dims, int32_t mode, int32_t n_ctx_orig, - float freq_base, float freq_scale, bool has_freq_factors, float ext_factor, float attn_factor, float beta_fast, float beta_slow, - int32_t ne01, int32_t ne02, int32_t ne03, - uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, - int32_t ne0, - uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3 -) { - GGML_ASSERT(src0t == GGML_TYPE_F16 || src0t == GGML_TYPE_F32); - - static const auto spirv_norm_f16 = getSpirvShader( - kp::shader_data::op_rope_norm_f16_comp_spv, kp::shader_data::op_rope_norm_f16_comp_spv_len - ); - static const auto spirv_norm_f32 = getSpirvShader( - kp::shader_data::op_rope_norm_f32_comp_spv, kp::shader_data::op_rope_norm_f32_comp_spv_len - ); - static const auto spirv_neox_f16 = getSpirvShader( - kp::shader_data::op_rope_neox_f16_comp_spv, kp::shader_data::op_rope_neox_f16_comp_spv_len - ); - static const auto spirv_neox_f32 = getSpirvShader( - kp::shader_data::op_rope_neox_f32_comp_spv, kp::shader_data::op_rope_neox_f32_comp_spv_len - ); - - int type_size = src0t == GGML_TYPE_F16 ? 2 : 4; - - GGML_ASSERT(nb03 % type_size == 0); - GGML_ASSERT(nb02 % type_size == 0); - GGML_ASSERT(nb01 % type_size == 0); - GGML_ASSERT(nb00 % type_size == 0); - GGML_ASSERT(nb3 % type_size == 0); - GGML_ASSERT(nb2 % type_size == 0); - GGML_ASSERT(nb1 % type_size == 0); - GGML_ASSERT(nb0 % type_size == 0); - - struct PushConstants { - uint32_t inAOff, inBOff, inCOff, outOff; - int32_t n_dims, mode, n_ctx_orig; - float freq_base, freq_scale; - bool has_freq_factors; - float ext_factor, attn_factor, beta_fast, beta_slow; - uint32_t nb00, nb01, nb02, nb03; - int32_t ne0; - uint32_t nb0, nb1, nb2, nb3; - } pushConsts { - safe_divide(inAOff, type_size), safe_divide(inBOff, 4), safe_divide(inCOff, type_size), safe_divide(outOff, type_size), - n_dims, mode, n_ctx_orig, - freq_base, freq_scale, - has_freq_factors, - ext_factor, attn_factor, beta_fast, beta_slow, - nb00, nb01, nb02, nb03, - ne0, - nb0, nb1, nb2, nb3 - }; - - auto & inC_ = inC ? inC : inA; - const bool is_neox = mode & GGML_ROPE_TYPE_NEOX; - const bool is_f16 = src0t == GGML_TYPE_F16; - - auto name = std::string(__func__) + (is_neox ? "_neox" : "_norm") + (src0t == GGML_TYPE_F16 ? "_f16" : "_f32"); - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) { - auto & spirv = is_neox ? is_f16 ? spirv_neox_f16 : spirv_neox_f32 : is_f16 ? spirv_norm_f16 : spirv_norm_f32; - s_algo = komputeManager()->algorithm( - name, s_kompute_context->pool.get(), {inA, inB, inC_, out}, spirv, - {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts} - ); - } else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({inA, inB, inC_, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -static void ggml_vk_cpy( - const std::vector& spirv, - uint32_t in_element_size, uint32_t out_element_size, - kp::Sequence& seq, - const std::shared_ptr& in, - const std::shared_ptr& out, - uint32_t inOff, uint32_t outOff, - int32_t ne00, int32_t ne01, int32_t ne02, int32_t ne03, - uint32_t nb00, uint32_t nb01, uint32_t nb02, uint32_t nb03, - int32_t ne0, int32_t ne1, int32_t ne2, - uint32_t nb0, uint32_t nb1, uint32_t nb2, uint32_t nb3 -) { - struct PushConstants { - uint32_t inOff, outOff; - int32_t ne00, ne01, ne02; - uint32_t nb00, nb01, nb02, nb03; - int32_t ne0, ne1, ne2; - uint32_t nb0, nb1, nb2, nb3; - } pushConsts { - safe_divide(inOff, in_element_size), safe_divide(outOff, out_element_size), - ne00, ne01, ne02, - nb00, nb01, nb02, nb03, - ne0, ne1, ne2, - nb0, nb1, nb2, nb3 - }; - - std::string name = std::string(__func__) - + "_i_" + std::to_string(in_element_size) - + "_o_" + std::to_string(out_element_size); - std::shared_ptr s_algo = nullptr; - if (!komputeManager()->hasAlgorithm(name)) - s_algo = komputeManager()->algorithm(name, s_kompute_context->pool.get(), {in, out}, spirv, {unsigned(ne01), unsigned(ne02), unsigned(ne03)}, {}, {pushConsts}); - else { - s_algo = komputeManager()->getAlgorithm(name); - s_algo->setTensors({in, out}); - s_algo->setWorkgroup({unsigned(ne01), unsigned(ne02), unsigned(ne03)}); - s_algo->setPushConstants({pushConsts}); - s_algo->updateDescriptors(s_kompute_context->pool.get()); - } - seq.record(s_algo); -} - -template -static void ggml_vk_cpy_f32_f16(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f16_comp_spv, - kp::shader_data::op_cpy_f32_f16_comp_spv_len); - ggml_vk_cpy(spirv, 4, 2, std::forward(args)...); -} - -template -static void ggml_vk_cpy_f32_f32(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f32_f32_comp_spv, - kp::shader_data::op_cpy_f32_f32_comp_spv_len); - ggml_vk_cpy(spirv, 4, 4, std::forward(args)...); -} - -template -static void ggml_vk_cpy_f16_f16(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f16_comp_spv, - kp::shader_data::op_cpy_f16_f16_comp_spv_len); - ggml_vk_cpy(spirv, 2, 2, std::forward(args)...); -} - -template -static void ggml_vk_cpy_f16_f32(Args&&... args) { - const static auto spirv = getSpirvShader(kp::shader_data::op_cpy_f16_f32_comp_spv, - kp::shader_data::op_cpy_f16_f32_comp_spv_len); - ggml_vk_cpy(spirv, 2, 4, std::forward(args)...); -} - -static bool ggml_backend_kompute_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) { - int64_t n = ggml_nelements(op); - switch (op->op) { - case GGML_OP_UNARY: - if (n % 4 != 0) return false; - switch (ggml_get_unary_op(op)) { - case GGML_UNARY_OP_GELU: - if (n % 8 != 0) return false; - // fall through - case GGML_UNARY_OP_RELU: - case GGML_UNARY_OP_SILU: - return ggml_is_contiguous(op->src[0]); - default: - ; - } - break; - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - case GGML_OP_ADD: - case GGML_OP_MUL: - case GGML_OP_SCALE: - case GGML_OP_SOFT_MAX: - case GGML_OP_RMS_NORM: - case GGML_OP_NORM: - return true; - case GGML_OP_ROPE: - { - const int mode = ((const int32_t *) op->op_params)[2]; - if (mode & GGML_ROPE_TYPE_MROPE) { - return false; - } - if (mode & GGML_ROPE_TYPE_VISION) { - return false; - } - return true; - } - case GGML_OP_DUP: - case GGML_OP_CPY: - case GGML_OP_CONT: - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - break; - default: - return false; - } - switch (op->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - break; - default: - return false; - } - return true; - case GGML_OP_DIAG_MASK_INF: - return op->ne[3] == 1; - case GGML_OP_GET_ROWS: - switch (op->src[0]->type) { - case GGML_TYPE_F32: - case GGML_TYPE_F16: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q6_K: - return op->ne[2] == 1 && op->ne[3] == 1; - default: - ; - } - return false; - case GGML_OP_MUL_MAT: - if (op->src[1]->type != GGML_TYPE_F32 || ggml_is_transposed(op->src[0]) || ggml_is_transposed(op->src[1])) - return false; - - switch (op->src[0]->type) { - case GGML_TYPE_F32: - return op->ne[3] == 1; - case GGML_TYPE_Q6_K: - case GGML_TYPE_F16: - case GGML_TYPE_Q8_0: - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q4_K: - return true; - default: - ; - } - default: - ; - } - return false; - - GGML_UNUSED(dev); -} - -static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml_cgraph * gf) { - const int n_seq = 8; - - // FIXME: Figure out if we can somehow optimize the size of the pool... right now we're setting - // it to the size of the graph, but I think it can be made smaller? - ggml_vk_allocate_descriptor_pool(ctx, gf->n_nodes); - - std::vector> sequences(n_seq); - - for (auto& sequence : sequences) { - sequence = komputeManager()->sequence(); - } - for (int seq_idx = 0; seq_idx < n_seq; ++seq_idx) { - const int n_nodes_per_seq = (gf->n_nodes + n_seq - 1) / n_seq; - - auto& seq = *sequences[seq_idx]; - - const int node_start = (seq_idx + 0) * n_nodes_per_seq; - const int node_end = std::min((seq_idx == n_seq - 1) ? gf->n_nodes : (seq_idx + 1) * n_nodes_per_seq, gf->n_nodes); - - bool any_commands_recorded = false; - - for (int i = node_start; i < node_end; ++i) { - struct ggml_tensor * src0 = gf->nodes[i]->src[0]; - struct ggml_tensor * src1 = gf->nodes[i]->src[1]; - struct ggml_tensor * src2 = gf->nodes[i]->src[2]; GGML_UNUSED(src2); - struct ggml_tensor * dst = gf->nodes[i]; - GGML_ASSERT(dst->data != nullptr); - - if (ggml_is_empty(dst)) { - continue; - } - - switch (dst->op) { - case GGML_OP_NONE: - case GGML_OP_RESHAPE: - case GGML_OP_VIEW: - case GGML_OP_TRANSPOSE: - case GGML_OP_PERMUTE: - continue; // noop -> next node - default: - break; - } - - any_commands_recorded = true; - - const int32_t ne00 = src0 ? src0->ne[0] : 0; - const int32_t ne01 = src0 ? src0->ne[1] : 0; - const int32_t ne02 = src0 ? src0->ne[2] : 0; - const int32_t ne03 = src0 ? src0->ne[3] : 0; - - const uint32_t nb00 = src0 ? src0->nb[0] : 0; - const uint32_t nb01 = src0 ? src0->nb[1] : 0; - const uint32_t nb02 = src0 ? src0->nb[2] : 0; - const uint32_t nb03 = src0 ? src0->nb[3] : 0; - - const int32_t ne10 = src1 ? src1->ne[0] : 0; - const int32_t ne11 = src1 ? src1->ne[1] : 0; - const int32_t ne12 = src1 ? src1->ne[2] : 0; - const int32_t ne13 = src1 ? src1->ne[3] : 0; - - const uint32_t nb10 = src1 ? src1->nb[0] : 0; - const uint32_t nb11 = src1 ? src1->nb[1] : 0; - const uint32_t nb12 = src1 ? src1->nb[2] : 0; - const uint32_t nb13 = src1 ? src1->nb[3] : 0; - - const int32_t ne0 = dst ? dst->ne[0] : 0; - const int32_t ne1 = dst ? dst->ne[1] : 0; - const int32_t ne2 = dst ? dst->ne[2] : 0; -// const int32_t ne3 = dst ? dst->ne[3] : 0; - - const uint32_t nb0 = dst ? dst->nb[0] : 0; - const uint32_t nb1 = dst ? dst->nb[1] : 0; - const uint32_t nb2 = dst ? dst->nb[2] : 0; - const uint32_t nb3 = dst ? dst->nb[3] : 0; - - const enum ggml_type src0t = src0 ? src0->type : GGML_TYPE_COUNT; - const enum ggml_type src1t = src1 ? src1->type : GGML_TYPE_COUNT; - const enum ggml_type dstt = dst ? dst->type : GGML_TYPE_COUNT; - - const static std::shared_ptr nullTensor = nullptr; - uint32_t off_src0 = 0; - uint32_t off_src1 = 0; - uint32_t off_src2 = 0; - uint32_t off_dst = 0; - const std::shared_ptr& id_src0 = src0 ? ggml_vk_get_tensor(src0, &off_src0) : nullTensor; - const std::shared_ptr& id_src1 = src1 ? ggml_vk_get_tensor(src1, &off_src1) : nullTensor; - const std::shared_ptr& id_src2 = src2 ? ggml_vk_get_tensor(src2, &off_src2) : nullTensor; - const std::shared_ptr& id_dst = dst ? ggml_vk_get_tensor(dst, &off_dst) : nullTensor; - - switch (dst->op) { - case GGML_OP_ADD: - { - if (ggml_nelements(src1) == ne10 && ggml_is_contiguous(src1) && ne00 % 4 == 0 && ne10 % 4 == 0) { - // src1 is a row - ggml_vk_addrow(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ggml_nelements(dst)/4, ne00); - } else { - ggml_vk_add( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne03, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, - nb10, nb11, nb12, nb13, - ne0, - nb0, nb1, nb2, nb3 - ); - } - } break; - case GGML_OP_MUL: - { - ggml_vk_mul( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne03, - nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, - nb10, nb11, nb12, nb13, - ne0, - nb0, nb1, nb2, nb3 - ); - } break; - case GGML_OP_SCALE: - { - float scale; memcpy(&scale, dst->op_params, sizeof(float)); - - ggml_vk_scale(seq, id_src0, id_dst, off_src0, off_dst, ggml_nelements(dst), scale); - } break; - case GGML_OP_UNARY: - { - int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - switch (ggml_get_unary_op(gf->nodes[i])) { - case GGML_UNARY_OP_SILU: - { - ggml_vk_silu(seq, id_src0, id_dst, off_src0, off_dst, n/4); - } break; - case GGML_UNARY_OP_RELU: - { - ggml_vk_relu(seq, id_src0, id_dst, off_src0, off_dst, n/4); - } break; - case GGML_UNARY_OP_GELU: - { - GGML_ASSERT(n % 8 == 0); - ggml_vk_gelu(seq, id_src0, id_dst, off_src0, off_dst, n/8); - } break; - default: - { - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - GGML_ABORT("fatal error"); - } - } - } break; - case GGML_OP_SOFT_MAX: - { - float scale; - float max_bias; - - memcpy(&scale, (float *)dst->op_params + 0, sizeof(float)); - memcpy(&max_bias, (float *)dst->op_params + 1, sizeof(float)); - -#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support") -#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") - GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32); - - const int64_t nrows_x = ggml_nrows(src0); - const int64_t nrows_y = src0->ne[1]; - - const uint32_t n_head = nrows_x/nrows_y; - const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); - - const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); - const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - - ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale, max_bias, m0, m1, n_head_log2); - } break; - case GGML_OP_DIAG_MASK_INF: - { - const int n_past = ((int32_t *)(dst->op_params))[0]; - ggml_vk_diag_mask_inf(seq, id_src0, id_dst, off_src0, off_dst, n_past, ne00, ne01, ne02); - } break; - case GGML_OP_NORM: - { - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - ggml_vk_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); - } break; - case GGML_OP_RMS_NORM: - { - GGML_ASSERT(ne00 % 4 == 0); - - float eps; - memcpy(&eps, dst->op_params, sizeof(float)); - ggml_vk_rms_norm(seq, id_src0, id_dst, off_src0, off_dst, ne00, nb01, ggml_nrows(src0), eps); - } break; - case GGML_OP_MUL_MAT: - { - GGML_ASSERT(ne00 == ne10); - - GGML_ASSERT(ne12 % ne02 == 0); - GGML_ASSERT(ne13 % ne03 == 0); - - const uint32_t r2 = ne12/ne02; - const uint32_t r3 = ne13/ne03; - - if (src1t != GGML_TYPE_F32) { - fprintf(stderr, "%s: %s: Unsupported src1 type: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; - } - - if (ggml_is_transposed(src0) || - ggml_is_transposed(src1)) { - fprintf(stderr, "%s: %s: matmul on tranposed tensor not supported: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; - } - - switch (src0t) { - case GGML_TYPE_F32: - ggml_vk_mul_mat_mat_f32( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, nb01, nb02, ne11, ne12, nb11, nb12, nb1, nb2 - ); - break; - case GGML_TYPE_F16: - ggml_vk_mul_mat_f16( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, nb00, nb01, nb02, nb03, - ne10, ne11, ne12, ne13, nb10, nb11, nb12, nb13, - ne0, ne1, r2, r3 - ); - break; - case GGML_TYPE_Q8_0: - ggml_vk_mul_mat_q8_0( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - case GGML_TYPE_Q4_0: - ggml_vk_mul_mat_q4_0( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - case GGML_TYPE_Q4_1: - ggml_vk_mul_mat_q4_1( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - case GGML_TYPE_Q4_K: - ggml_vk_mul_mat_q4_k( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - case GGML_TYPE_Q6_K: - ggml_vk_mul_mat_q6_k( - seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, - ne00, ne01, ne02, ne10, ne11, ne12, ne13, ne0, ne1, - nb01, nb02, nb03, nb11, nb12, nb13, r2, r3 - ); - break; - default: { - fprintf(stderr, "%s: %s: Unsupported quantization: %u/%u\n", __func__, ggml_op_name(dst->op), src0t, src1t); - goto not_implemented; - } - } - - } break; - case GGML_OP_GET_ROWS: - { - if (src0t == GGML_TYPE_F32) { - ggml_vk_get_rows_f32(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0t == GGML_TYPE_F16) { - ggml_vk_get_rows_f16(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0t == GGML_TYPE_Q4_0) { - ggml_vk_get_rows_q4_0(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0t == GGML_TYPE_Q4_1) { - ggml_vk_get_rows_q4_1(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else if (src0t == GGML_TYPE_Q6_K) { - ggml_vk_get_rows_q6_k(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, nb01, nb1, ggml_nelements(src1)); - } else { - fprintf(stderr, "%s: %s: Unsupported quantization: %u\n", __func__, ggml_op_name(dst->op), src0t); - goto not_implemented; - } - } break; - case GGML_OP_ROPE: - { - GGML_ASSERT(ne10 == ne02); - GGML_ASSERT(src0t == dstt); - // const int n_past = ((int32_t *) dst->op_params)[0]; - const int n_dims = ((int32_t *) dst->op_params)[1]; - const int mode = ((int32_t *) dst->op_params)[2]; - // skip 3, n_ctx used in GLM RoPE, unimplemented in Vulkan - const int n_ctx_orig = ((int32_t *) dst->op_params)[4]; - - const bool has_freq_factors = dst->src[2] != nullptr; - - float freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow; - memcpy(&freq_base, (int32_t *) dst->op_params + 5, sizeof(float)); - memcpy(&freq_scale, (int32_t *) dst->op_params + 6, sizeof(float)); - memcpy(&ext_factor, (int32_t *) dst->op_params + 7, sizeof(float)); - memcpy(&attn_factor, (int32_t *) dst->op_params + 8, sizeof(float)); - memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float)); - memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float)); - ggml_vk_rope( - seq, id_src0, id_src1, id_src2, id_dst, off_src0, off_src1, off_src2, off_dst, src0t, n_dims, mode, n_ctx_orig, - freq_base, freq_scale, has_freq_factors, ext_factor, attn_factor, beta_fast, beta_slow, - ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, nb0, nb1, nb2, nb3 - ); - } break; - case GGML_OP_DUP: - case GGML_OP_CPY: - case GGML_OP_CONT: - { - switch (src0t) { - case GGML_TYPE_F32: - { - switch (dstt) { - case GGML_TYPE_F16: ggml_vk_cpy_f32_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; - case GGML_TYPE_F32: ggml_vk_cpy_f32_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; - default: goto not_implemented; - } - } break; - case GGML_TYPE_F16: - { - switch (dstt) { - case GGML_TYPE_F16: ggml_vk_cpy_f16_f16(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; - case GGML_TYPE_F32: ggml_vk_cpy_f16_f32(seq, id_src0, id_dst, off_src0, off_dst, ne00, ne01, ne02, ne03, nb00, nb01, nb02, nb03, ne0, ne1, ne2, nb0, nb1, nb2, nb3); break; - default: goto not_implemented; - } break; - default: goto not_implemented; - } - } - } break; - default: goto not_implemented; - } - continue; - not_implemented: {} - fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(dst->op)); - //GGML_ABORT("fatal error"); - } - - // Evaluate sequence - if (any_commands_recorded) { - seq.evalAsync(); - } - } - - // Wait for all sequences to finish - for (auto& sequence : sequences) { - if (sequence->isRunning()) - sequence->evalAwait(); - } - - ggml_vk_free_descriptor_pool(ctx); -} - -template<> -kp::Tensor::TensorDataTypes -kp::TensorT::dataType() -{ - return TensorDataTypes::eFloat; -} - -template<> -kp::Tensor::TensorDataTypes -kp::TensorT::dataType() -{ - return TensorDataTypes::eUnsignedInt; -} - -//////////////////////////////////////////////////////////////////////////////// - -// backend interface - -struct ggml_backend_kompute_buffer_type_context { - int device; - int device_ref = 0; - uint64_t buffer_alignment; - uint64_t max_alloc; - std::string name; - - ggml_backend_kompute_buffer_type_context(int device, uint64_t buffer_alignment, uint64_t max_alloc) - : device(device), buffer_alignment(buffer_alignment), max_alloc(max_alloc), name(ggml_kompute_format_name(device)) {} -}; - -static void ggml_backend_kompute_device_ref(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - - if (!ctx->device_ref) { - komputeManager()->initializeDevice( - ctx->device, {}, { - "VK_KHR_shader_float16_int8", "VK_KHR_8bit_storage", - "VK_KHR_16bit_storage", "VK_KHR_shader_non_semantic_info" - } - ); - } - - assert(ggml_vk_has_device()); - ctx->device_ref++; -} - -static void ggml_backend_kompute_device_unref(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - - assert(ctx->device_ref > 0); - - ctx->device_ref--; - - if (!ctx->device_ref) { - komputeManager.destroy(); - } -} - -static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) { - auto * memory = (ggml_vk_memory *)buffer->context; - if (ggml_vk_has_device()) { - ggml_vk_free_memory(*memory); - } - delete memory; -} - -static void * ggml_backend_kompute_buffer_get_base(ggml_backend_buffer_t buffer) { - return ((ggml_vk_memory *)buffer->context)->data; -} - -static void ggml_backend_kompute_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_UNUSED(buffer); - - const auto res = ggml_vk_get_tensor(tensor); - GGML_ASSERT(res); - - memcpy((char *)tensor->data + offset, data, size); - - komputeManager()->sequence()->eval({res}); -} - -static void ggml_backend_kompute_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - GGML_UNUSED(buffer); - - const auto res = ggml_vk_get_tensor(tensor); - GGML_ASSERT(res); - - komputeManager()->sequence()->eval({res}); - - memcpy(data, (const char *)tensor->data + offset, size); -} - -static void ggml_backend_kompute_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - auto * memory = (ggml_vk_memory *)buffer->context; - memset(memory->data, value, buffer->size); - - if (memory->stagingBuffer) - komputeManager()->sequence()->eval(memory->primaryBuffer, memory->stagingBuffer, memory->size); -} - -static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = { - /* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer, - /* .get_base = */ ggml_backend_kompute_buffer_get_base, - /* .init_tensor = */ NULL, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ ggml_backend_kompute_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_kompute_buffer_get_tensor, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_kompute_buffer_clear, - /* .reset = */ NULL, -}; - -// default buffer type - -static const char * ggml_backend_kompute_buffer_type_get_name(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - return ctx->name.c_str(); -} - -static ggml_backend_buffer_t ggml_backend_kompute_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_kompute_device_ref(buft); - auto * ctx = new ggml_vk_memory(ggml_vk_allocate(size)); - return ggml_backend_buffer_init(buft, ggml_backend_kompute_buffer_i, ctx, size); -} - -static size_t ggml_backend_kompute_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - return ctx->buffer_alignment; -} - -static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { - auto * ctx = static_cast(buft->context); - return ctx->max_alloc; -} - -static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = { - /* .get_name = */ ggml_backend_kompute_buffer_type_get_name, - /* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment, - /* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes - /* .is_host = */ NULL, -}; - -ggml_backend_buffer_type_t ggml_backend_kompute_buffer_type(int device) { - static std::mutex mutex; - std::lock_guard lock(mutex); - - auto devices = ggml_vk_available_devices(); - int32_t device_count = (int32_t) devices.size(); - GGML_ASSERT(device < device_count); - GGML_ASSERT(devices.size() <= GGML_KOMPUTE_MAX_DEVICES); - - static ggml_backend_buffer_type - ggml_backend_kompute_buffer_types[GGML_KOMPUTE_MAX_DEVICES]; - - static bool ggml_backend_kompute_buffer_type_initialized = false; - - if (!ggml_backend_kompute_buffer_type_initialized) { - for (int32_t i = 0; i < device_count; i++) { - ggml_backend_kompute_buffer_types[i] = { - /* .iface = */ ggml_backend_kompute_buffer_type_interface, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), i), - /* .context = */ new ggml_backend_kompute_buffer_type_context{ i, devices[i].bufferAlignment, devices[i].maxAlloc }, - }; - } - ggml_backend_kompute_buffer_type_initialized = true; - } - - return &ggml_backend_kompute_buffer_types[device]; -} - -// backend - -static const char * ggml_backend_kompute_name(ggml_backend_t backend) { - auto * ctx = static_cast(backend->context); - return ctx->name.c_str(); -} - -static void ggml_backend_kompute_free(ggml_backend_t backend) { - auto * ctx = static_cast(backend->context); - - assert(ctx == s_kompute_context); - s_kompute_context = nullptr; - if (ctx != nullptr) { - delete ctx; - } - - delete backend; -} - -static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - auto * ctx = static_cast(backend->context); - ggml_vk_graph_compute(ctx, cgraph); - return GGML_STATUS_SUCCESS; -} - -static struct ggml_backend_i kompute_backend_i = { - /* .get_name = */ ggml_backend_kompute_name, - /* .free = */ ggml_backend_kompute_free, - /* .set_tensor_async = */ NULL, - /* .get_tensor_async = */ NULL, - /* .cpy_tensor_async = */ NULL, - /* .synchronize = */ NULL, - /* .graph_plan_create = */ NULL, - /* .graph_plan_free = */ NULL, - /* .graph_plan_update = */ NULL, - /* .graph_plan_compute = */ NULL, - /* .graph_compute = */ ggml_backend_kompute_graph_compute, - /* .event_record = */ NULL, - /* .event_wait = */ NULL, -}; - -static ggml_guid_t ggml_backend_kompute_guid() { - static ggml_guid guid = { 0x7b, 0x57, 0xdc, 0xaf, 0xde, 0x12, 0x1d, 0x49, 0xfb, 0x35, 0xfa, 0x9b, 0x18, 0x31, 0x1d, 0xca }; - return &guid; -} - -ggml_backend_t ggml_backend_kompute_init(int device) { - GGML_ASSERT(s_kompute_context == nullptr); - s_kompute_context = new ggml_kompute_context(device); - - ggml_backend_t kompute_backend = new ggml_backend { - /* .guid = */ ggml_backend_kompute_guid(), - /* .interface = */ kompute_backend_i, - /* .device = */ ggml_backend_reg_dev_get(ggml_backend_kompute_reg(), device), - /* .context = */ s_kompute_context, - }; - - return kompute_backend; -} - -bool ggml_backend_is_kompute(ggml_backend_t backend) { - return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_kompute_guid()); -} - -static size_t ggml_backend_kompute_get_device_count() { - auto devices = ggml_vk_available_devices(); - return devices.size(); -} - -static void ggml_backend_kompute_get_device_description(int device, char * description, size_t description_size) { - auto devices = ggml_vk_available_devices(); - GGML_ASSERT((size_t) device < devices.size()); - snprintf(description, description_size, "%s", devices[device].name); -} - -static void ggml_backend_kompute_get_device_memory(int device, size_t * free, size_t * total) { - auto devices = ggml_vk_available_devices(); - GGML_ASSERT((size_t) device < devices.size()); - *total = devices[device].heapSize; - *free = devices[device].heapSize; -} - -////////////////////////// - -struct ggml_backend_kompute_device_context { - int device; - std::string name; - std::string description; -}; - -static const char * ggml_backend_kompute_device_get_name(ggml_backend_dev_t dev) { - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - return ctx->name.c_str(); -} - -static const char * ggml_backend_kompute_device_get_description(ggml_backend_dev_t dev) { - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - return ctx->description.c_str(); -} - -static void ggml_backend_kompute_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) { - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - ggml_backend_kompute_get_device_memory(ctx->device, free, total); -} - -static ggml_backend_buffer_type_t ggml_backend_kompute_device_get_buffer_type(ggml_backend_dev_t dev) { - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - return ggml_backend_kompute_buffer_type(ctx->device); -} - -static bool ggml_backend_kompute_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) { - if (buft->iface.get_name != ggml_backend_kompute_buffer_type_get_name) { - return false; - } - - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - ggml_backend_kompute_buffer_type_context * buft_ctx = (ggml_backend_kompute_buffer_type_context *)buft->context; - - return buft_ctx->device == ctx->device; -} - -static enum ggml_backend_dev_type ggml_backend_kompute_device_get_type(ggml_backend_dev_t dev) { - GGML_UNUSED(dev); - return GGML_BACKEND_DEVICE_TYPE_GPU; -} - -static void ggml_backend_kompute_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) { - props->name = ggml_backend_kompute_device_get_name(dev); - props->description = ggml_backend_kompute_device_get_description(dev); - props->type = ggml_backend_kompute_device_get_type(dev); - ggml_backend_kompute_device_get_memory(dev, &props->memory_free, &props->memory_total); - props->caps = { - /* async = */ false, - /* host_buffer = */ false, - /* .buffer_from_host_ptr = */ false, - /* events = */ false, - }; -} - -static ggml_backend_t ggml_backend_kompute_device_init(ggml_backend_dev_t dev, const char * params) { - GGML_UNUSED(params); - ggml_backend_kompute_device_context * ctx = (ggml_backend_kompute_device_context *)dev->context; - return ggml_backend_kompute_init(ctx->device); -} - -static bool ggml_backend_kompute_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) { - const int min_batch_size = 32; - - return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) || - (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID); - - GGML_UNUSED(dev); -} - -static const struct ggml_backend_device_i ggml_backend_kompute_device_i = { - /* .get_name = */ ggml_backend_kompute_device_get_name, - /* .get_description = */ ggml_backend_kompute_device_get_description, - /* .get_memory = */ ggml_backend_kompute_device_get_memory, - /* .get_type = */ ggml_backend_kompute_device_get_type, - /* .get_props = */ ggml_backend_kompute_device_get_props, - /* .init_backend = */ ggml_backend_kompute_device_init, - /* .get_buffer_type = */ ggml_backend_kompute_device_get_buffer_type, - /* .get_host_buffer_type = */ NULL, - /* .buffer_from_host_ptr = */ NULL, - /* .supports_op = */ ggml_backend_kompute_device_supports_op, - /* .supports_buft = */ ggml_backend_kompute_device_supports_buft, - /* .offload_op = */ ggml_backend_kompute_device_offload_op, - /* .event_new = */ NULL, - /* .event_free = */ NULL, - /* .event_synchronize = */ NULL, -}; - -static const char * ggml_backend_kompute_reg_get_name(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return "Kompute"; -} - -static size_t ggml_backend_kompute_reg_get_device_count(ggml_backend_reg_t reg) { - GGML_UNUSED(reg); - return ggml_backend_kompute_get_device_count(); -} - -static ggml_backend_dev_t ggml_backend_kompute_reg_get_device(ggml_backend_reg_t reg, size_t device) { - static std::vector devices; - - static bool initialized = false; - - { - static std::mutex mutex; - std::lock_guard lock(mutex); - if (!initialized) { - for (size_t i = 0; i < ggml_backend_kompute_get_device_count(); i++) { - ggml_backend_kompute_device_context * ctx = new ggml_backend_kompute_device_context; - char desc[256]; - ggml_backend_kompute_get_device_description(i, desc, sizeof(desc)); - ctx->device = i; - ctx->name = "Kompute" + std::to_string(i); - ctx->description = desc; - devices.push_back(new ggml_backend_device { - /* .iface = */ ggml_backend_kompute_device_i, - /* .reg = */ reg, - /* .context = */ ctx, - }); - } - initialized = true; - } - } - - GGML_ASSERT(device < devices.size()); - return devices[device]; -} - -static const struct ggml_backend_reg_i ggml_backend_kompute_reg_i = { - /* .get_name = */ ggml_backend_kompute_reg_get_name, - /* .get_device_count = */ ggml_backend_kompute_reg_get_device_count, - /* .get_device = */ ggml_backend_kompute_reg_get_device, - /* .get_proc_address = */ NULL, -}; - -ggml_backend_reg_t ggml_backend_kompute_reg() { - static ggml_backend_reg reg = { - /* .api_version = */ GGML_BACKEND_API_VERSION, - /* .iface = */ ggml_backend_kompute_reg_i, - /* .context = */ nullptr, - }; - - return ® -} - -GGML_BACKEND_DL_IMPL(ggml_backend_kompute_reg) diff --git a/ggml/src/ggml-kompute/kompute b/ggml/src/ggml-kompute/kompute deleted file mode 160000 index 4565194ed..000000000 --- a/ggml/src/ggml-kompute/kompute +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4565194ed7c32d1d2efa32ceab4d3c6cae006306 diff --git a/ggml/src/ggml-kompute/kompute-shaders/common.comp b/ggml/src/ggml-kompute/kompute-shaders/common.comp deleted file mode 100644 index dbe4cf804..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/common.comp +++ /dev/null @@ -1,112 +0,0 @@ -#extension GL_EXT_shader_16bit_storage: require -#extension GL_EXT_shader_8bit_storage: require -#extension GL_EXT_shader_explicit_arithmetic_types_float16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int8: require -#extension GL_EXT_shader_explicit_arithmetic_types_int16: require -#extension GL_EXT_shader_explicit_arithmetic_types_int64: require -#extension GL_EXT_control_flow_attributes: enable -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -#define QK4_0 32 -#define QK4_1 32 - -#define GELU_COEF_A 0.044715 -#define SQRT_2_OVER_PI 0.79788456080286535587989211986876 -#define TWOPI_F 6.283185307179586f - -#define QK_K 256 -#define K_SCALE_SIZE 12 - -#define u8BufToU16(buf, idx) (((uint16_t(buf[idx + 1]) << 8)) | buf[idx]) -#define u8BufToFloat16(buf, idx) uint16BitsToHalf u8BufToU16(buf, idx) -#define u8BufToU32(buf, idx) (((uint32_t u8BufToU16(buf, idx + 2) << 8 | buf[idx + 1]) << 8) | buf[idx]) -#define u8BufToFloat(buf, idx) uintBitsToFloat u8BufToU32(buf, idx) - -#define sizeof_block_q4_0 0x12 -struct block_q4_0 { - float16_t d; - uint8_t qs[QK4_0 / 2]; -}; -mat4 dequantize_q4_0(const block_q4_0 xb, uint il) { - const float d1 = il != 0 ? (xb.d / 16.f) : xb.d; - const float d2 = d1 / 256.f; - const float md = -8.f * xb.d; - const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F); - const uint16_t mask1 = mask0 << 8; - - mat4 reg; - for (int i=0;i<8;i++) { - uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]); - reg[i/2][2*(i%2)+0] = d1 * (b & mask0) + md; - reg[i/2][2*(i%2)+1] = d2 * (b & mask1) + md; - } - return reg; -} - -#define sizeof_block_q4_1 0x14 -struct block_q4_1 { - float16_t d; - float16_t m; - uint8_t qs[QK4_1 / 2]; -}; -mat4 dequantize_q4_1(const block_q4_1 xb, uint il) { - const float d1 = il != 0 ? (xb.d / 16.f) : xb.d; - const float d2 = d1 / 256.f; - const float m = xb.m; - const uint16_t mask0 = il != 0 ? uint16_t(0x00F0) : uint16_t(0x000F); - const uint16_t mask1 = mask0 << 8; - - mat4 reg; - for (int i=0;i<8;i++) { - uint16_t b = (uint16_t(xb.qs[2 * i + 1]) << 8) | uint16_t(xb.qs[2 * i]); - reg[i/2][2*(i%2)+0] = ((b & mask0) * d1) + m; - reg[i/2][2*(i%2)+1] = ((b & mask1) * d2) + m; - } - return reg; -} - -#define sizeof_block_q4_k 144 -struct block_q4_k { - float16_t d; - float16_t dmin; - uint8_t scales[K_SCALE_SIZE]; - uint8_t qs[QK_K/2]; -}; - -#define sizeof_block_q6_k 210 -struct block_q6_k { - uint8_t ql[QK_K/2]; // quants, lower 4 bits - uint8_t qh[QK_K/4]; // quants, upper 2 bits - int8_t scales[QK_K/16]; // scales, quantized with 8 bits - float16_t d; // super-block scale -}; -mat4 dequantize_q6_k(const block_q6_k xb, uint il) { - const float16_t d_all = xb.d; - - const uint qlIndex = 64*(il/8) + 32*((il/2)&1) + 16*(il&1); - const uint qhIndex = 32*(il/8) + 16*(il&1); - float16_t sc = xb.scales[(il%2) + 2 * ((il/2))]; - il = (il/2) & 3; - - const uint16_t kmask1 = il>1 ? uint16_t(il>2 ? 192 : 48) : uint16_t(il>0 ? 12 : 3); - const uint16_t kmask2 = il>1 ? uint8_t(0xF0) : uint8_t(0x0F); - const float16_t coef = il>1 ? float16_t(1.f/16.f) : float16_t(1.f); - const float16_t ml = float16_t(d_all * sc * 32.f); - const float16_t dl = float16_t(d_all * sc * coef); - mat4 reg; - for (int i = 0; i < 16; ++i) { - const float16_t q = (il&1) != 0 ? ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 2)) - : ((xb.ql[qlIndex + i] & kmask2) | ((xb.qh[qhIndex + i] & kmask1) << 4)); - reg[i/4][i%4] = dl * q - ml; - } - return reg; -} - - -#define QK8_0 32 -// struct block_q8_0 { -// float16_t d; // delta -// int8_t qs[QK8_0]; // quants -// }; -#define sizeof_block_q8_0 34 diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_add.comp b/ggml/src/ggml-kompute/kompute-shaders/op_add.comp deleted file mode 100644 index b7b76a79d..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_add.comp +++ /dev/null @@ -1,58 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1024) in; - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb00; - int nb01; - int nb02; - int nb03; - int ne10; - int ne11; - int ne12; - int ne13; - int nb10; - int nb11; - int nb12; - int nb13; - int ne0; - int nb0; - int nb1; - int nb2; - int nb3; - //int offs; // TODO: needed for GGML_OP_ACC, see metal code -} pcs; - -// general-purpose kernel for addition of two tensors -// pros: works for non-contiguous tensors, supports broadcast across dims 1, 2 and 3 -// cons: not very efficient -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const uint i13 = i03 % pcs.ne13; - const uint i12 = i02 % pcs.ne12; - const uint i11 = i01 % pcs.ne11; - - int offs = 0; // TMP (see above) - - uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + offs) / 4); - uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11 ) / 4); - uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1 + offs) / 4); - - for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { - const uint i10 = i0 % pcs.ne10; - out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] + inB[pcs.inBOff + src1_off + i10]; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp b/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp deleted file mode 100644 index 2376a6b8f..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_addrow.comp +++ /dev/null @@ -1,25 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inAOff; - uint inBOff; - uint outOff; - uint row; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 4; - - for (uint x = 0; x < 4; x++) { - const uint i = baseIndex + x; - out_[i + pcs.outOff] = inA[i + pcs.inAOff] + inB[(i % pcs.row) + pcs.inBOff]; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp deleted file mode 100644 index d57247d2d..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f16.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -#define IN_TYPE float16_t -#define IN_TYPE_SIZE 2 -#define OUT_TYPE float16_t -#define OUT_TYPE_SIZE 2 - -layout(local_size_x = 1024) in; - -layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; - -layout (push_constant) uniform parameter { - uint inOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - int ne1; - int ne2; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; - - const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); - const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); - const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; - const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); - - const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ - out_[dst_data+i00] = OUT_TYPE(in_[src]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp deleted file mode 100644 index b568bcd7b..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f16_f32.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -#define IN_TYPE float16_t -#define IN_TYPE_SIZE 2 -#define OUT_TYPE float -#define OUT_TYPE_SIZE 4 - -layout(local_size_x = 1024) in; - -layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; - -layout (push_constant) uniform parameter { - uint inOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - int ne1; - int ne2; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; - - const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); - const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); - const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; - const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); - - const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ - out_[dst_data+i00] = OUT_TYPE(in_[src]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp deleted file mode 100644 index 99b228343..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f16.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -#define IN_TYPE float -#define IN_TYPE_SIZE 4 -#define OUT_TYPE float16_t -#define OUT_TYPE_SIZE 2 - -layout(local_size_x = 1024) in; - -layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; - -layout (push_constant) uniform parameter { - uint inOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - int ne1; - int ne2; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; - - const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); - const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); - const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; - const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); - - const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ - out_[dst_data+i00] = OUT_TYPE(in_[src]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp deleted file mode 100644 index 2fc998492..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_cpy_f32_f32.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -#define IN_TYPE float -#define IN_TYPE_SIZE 4 -#define OUT_TYPE float -#define OUT_TYPE_SIZE 4 - -layout(local_size_x = 1024) in; - -layout (binding = 0) readonly buffer tensorIn { IN_TYPE in_[]; }; -layout (binding = 1) writeonly buffer tensorOut { OUT_TYPE out_[]; }; - -layout (push_constant) uniform parameter { - uint inOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - int ne1; - int ne2; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const int n = int(i03)*pcs.ne02*pcs.ne01*pcs.ne00 + int(i02)*pcs.ne01*pcs.ne00 + int(i01)*pcs.ne00; - - const int i3 = n / (pcs.ne2*pcs.ne1*pcs.ne0); - const int i2 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0) / (pcs.ne1*pcs.ne0); - const int i1 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0) / pcs.ne0; - const int i0 = (n - i3*pcs.ne2*pcs.ne1*pcs.ne0 - i2*pcs.ne1*pcs.ne0 - i1*pcs.ne0); - - const uint dst_data = (i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / OUT_TYPE_SIZE + pcs.outOff; // Based from out_ - - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - const uint src = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01 + i00*pcs.nb00) / IN_TYPE_SIZE) + pcs.inOff; // Based from in_ - out_[dst_data+i00] = OUT_TYPE(in_[src]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp b/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp deleted file mode 100644 index 291c3fc18..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_diagmask.comp +++ /dev/null @@ -1,30 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - uint n_past; - int ne00; - int ne01; -} pcs; - -void main() { - const uint i02 = gl_WorkGroupID.z; - const uint i01 = gl_WorkGroupID.y; - const uint i00 = gl_WorkGroupID.x; - - const uint index = i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00 + i00; - - if (i00 > pcs.n_past + i01) { - out_[index + pcs.outOff] = uintBitsToFloat(0xFF800000); - } else { - out_[index + pcs.outOff] = in_[index + pcs.inOff]; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp deleted file mode 100644 index 9d8c53710..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_gelu.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 8; - - for (uint x = 0; x < 8; x++) { - const uint i = baseIndex + x; - const float y = in_[i + pcs.inOff]; - out_[i + pcs.outOff] = 0.5*y*(1.0 + tanh(clamp(SQRT_2_OVER_PI*y*(1.0 + GELU_COEF_A*y*y), -15.0, 15.0))); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp deleted file mode 100644 index 1a5581b23..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows.comp +++ /dev/null @@ -1,17 +0,0 @@ -void main() { - const uint i = gl_WorkGroupID.x; - const int r = inB[i + pcs.inBOff]; - - int z = 0; - for (uint ind = gl_LocalInvocationID.x; ind < pcs.ne00/16; ind += gl_WorkGroupSize.x) { - const uint inIndex = (r * pcs.nb01 + pcs.inAOff) + ind/NL * SIZE_OF_BLOCK; - const mat4 result = dequantize_block(inIndex, ind%NL); - for (uint j = 0; j < 4; ++j) { - for (uint k = 0; k < 4; ++k) { - const uint outIndex = i * pcs.nb1/BYTES_FOR_TYPE + pcs.outOff + z; - out_[outIndex] = result[j][k]; - ++z; - } - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp deleted file mode 100644 index 48c936108..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f16.comp +++ /dev/null @@ -1,31 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -void dequantize_row_f16(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { - for (int j = 0; j < k; j++) { - out_[y + j] = inA[x + j]; - } -} - -void main() { - const uint i = gl_WorkGroupID.x; - const int r = inB[i + pcs.inBOff]; - - dequantize_row_f16(r*pcs.nb01/2/*bytes for float16*/ + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp deleted file mode 100644 index 9d7acdaf8..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_f32.comp +++ /dev/null @@ -1,31 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { float inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -void dequantize_row_f32(uint x /*Based from inA unaligned*/, uint y /*Based from out_*/, int k) { - for (int j = 0; j < k; j++) { - out_[y + j] = inA[x + j]; - } -} - -void main() { - const uint i = gl_WorkGroupID.x; - const int r = inB[i + pcs.inBOff]; - - dequantize_row_f32(r*pcs.nb01/4 + pcs.inAOff, i*pcs.nb1/4 + pcs.outOff, pcs.ne00); -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp deleted file mode 100644 index 32b2e891e..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_0.comp +++ /dev/null @@ -1,38 +0,0 @@ -#version 450 - -#include "common.comp" - -#define NL 2 -#define BYTES_FOR_TYPE 4 /*bytes for float*/ -#define SIZE_OF_BLOCK sizeof_block_q4_0 - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -block_q4_0 get_unaligned_block_q4_0(uint index) { - block_q4_0 fres; - fres.d = u8BufToFloat16(inA, index); - [[unroll]] for (uint it = 0; it != QK4_0 / 2; it++) { - fres.qs[it] = inA[index+2+it]; - } - return fres; -} - -mat4 dequantize_block(uint index, uint il) { - const block_q4_0 block = get_unaligned_block_q4_0(index); - return dequantize_q4_0(block, il); -} - -#include "op_getrows.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp deleted file mode 100644 index 87f2fbe17..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q4_1.comp +++ /dev/null @@ -1,39 +0,0 @@ -#version 450 - -#include "common.comp" - -#define NL 2 -#define BYTES_FOR_TYPE 4 /*bytes for float*/ -#define SIZE_OF_BLOCK sizeof_block_q4_1 - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -block_q4_1 get_unaligned_block_q4_1(uint index) { - block_q4_1 fres; - fres.d = u8BufToFloat16(inA, index); - fres.m = u8BufToFloat16(inA, index+2); - [[unroll]] for (uint it = 0; it != QK4_1 / 2; it++) { - fres.qs[it] = inA[index+4+it]; - } - return fres; -} - -mat4 dequantize_block(uint index, uint il) { - const block_q4_1 block = get_unaligned_block_q4_1(index); - return dequantize_q4_1(block, il); -} - -#include "op_getrows.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp deleted file mode 100644 index 9ce3545d1..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_getrows_q6_k.comp +++ /dev/null @@ -1,44 +0,0 @@ -#version 450 - -#include "common.comp" - -#define NL 16 -#define BYTES_FOR_TYPE 4 /*bytes for float*/ -#define SIZE_OF_BLOCK sizeof_block_q6_k - -layout(local_size_x = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { int inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb01; - int nb1; -} pcs; - -block_q6_k get_unaligned_block_q6_k(uint index) { - block_q6_k fres; - [[unroll]] for (uint it = 0; it != QK_K / 2; it++) { - fres.ql[it] = inA[index + it]; - } - [[unroll]] for (uint it = 0; it != QK_K / 4; it++) { - fres.qh[it] = inA[index + QK_K/2 + it]; - } - [[unroll]] for (uint it = 0; it != QK_K / 16; it++) { - fres.scales[it] = int8_t(inA[index + QK_K/2 + QK_K/4 + it]); - } - fres.d = u8BufToFloat16(inA, index + QK_K/2 + QK_K/4 + QK_K/16); - return fres; -} - -mat4 dequantize_block(uint index, uint il) { - const block_q6_k block = get_unaligned_block_q6_k(index); - return dequantize_q6_k(block, il); -} - -#include "op_getrows.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp deleted file mode 100644 index c92647c4d..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1024) in; - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int nb00; - int nb01; - int nb02; - int nb03; - int ne10; - int ne11; - int ne12; - int ne13; - int nb10; - int nb11; - int nb12; - int nb13; - int ne0; - int nb0; - int nb1; - int nb2; - int nb3; -} pcs; - -void main() { - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const uint i13 = i03 % pcs.ne13; - const uint i12 = i02 % pcs.ne12; - const uint i11 = i01 % pcs.ne11; - - uint src0_off = uint((i03*pcs.nb03 + i02*pcs.nb02 + i01*pcs.nb01) / 4); - uint src1_off = uint((i13*pcs.nb13 + i12*pcs.nb12 + i11*pcs.nb11) / 4); - uint dst_off = uint((i03*pcs.nb3 + i02*pcs.nb2 + i01*pcs.nb1) / 4); - - for (uint i0 = gl_LocalInvocationID.x; i0 < pcs.ne0; i0 += gl_WorkGroupSize.x) { - const uint i10 = i0 % pcs.ne10; - out_[pcs.outOff + dst_off + i0] = inA[pcs.inAOff + src0_off + i0] * inB[pcs.inBOff + src1_off + i10]; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp deleted file mode 100644 index 0ab1b2fc2..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_f16.comp +++ /dev/null @@ -1,69 +0,0 @@ -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require - -layout(local_size_x_id = 0) in; - -layout (binding = 0) readonly buffer tensorInA { float16_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne10; - int ne11; - int ne12; - uint nb10; - uint nb11; - uint nb12; - uint nb13; - int ne0; - int ne1; - uint r2; - uint r3; -} pcs; - -#define N_F16_F32 4 - -void main() { - const uint r0 = gl_WorkGroupID.x; - const uint rb = gl_WorkGroupID.y*N_F16_F32; - const uint im = gl_WorkGroupID.z; - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - const uint offset0 = r0*pcs.nb01 + (i12/pcs.r2)*pcs.nb02 + (i13/pcs.r3)*pcs.nb03; - - const uint x = offset0 / 2 + pcs.inAOff; // Based from inA - - for (uint row = 0; row < N_F16_F32; ++row) { - uint r1 = rb + row; - if (r1 >= pcs.ne11) { - break; - } - - const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; - - float sumf = 0; - for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { - sumf += float(inA[x+i]) * float(inB[y+i]); - } - - const float all_sum = subgroupAdd(sumf); - if (subgroupElect()) { - out_[im*pcs.ne1*pcs.ne0 + r1*pcs.ne0 + r0 + pcs.outOff] = all_sum; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp deleted file mode 100644 index d1ca4ad6c..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_mat_f32.comp +++ /dev/null @@ -1,51 +0,0 @@ -#version 450 - -#include "common.comp" - -#extension GL_KHR_shader_subgroup_arithmetic : require -#extension GL_EXT_debug_printf : enable - -// device subgroup size -layout (local_size_x_id = 0) in; - -layout(binding = 0) readonly buffer tensorInA { float inA[]; }; -layout(binding = 1) readonly buffer tensorInB { float inB[]; }; -layout(binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout(push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne11; - int ne12; - uint nb01; - uint nb02; - uint nb11; - uint nb12; - uint nb1; - uint nb2; -} -pcs; - - -void main() { - uvec3 gid = gl_WorkGroupID; - - uint bc_ab = pcs.ne12 > pcs.ne02 ? gid.z / (pcs.ne12 / pcs.ne02) : gid.z; - uint bc_ba = pcs.ne02 > pcs.ne12 ? gid.z / (pcs.ne02 / pcs.ne12) : gid.z; - - const uint x = (gid.x*pcs.nb01 + bc_ab*pcs.nb02) / 4 + pcs.inAOff; // Based from inA - const uint y = (gid.y*pcs.nb11 + bc_ba*pcs.nb12) / 4 + pcs.inBOff; // based from inB - float sum = 0.0f; - for (uint i = gl_SubgroupInvocationID.x; i < pcs.ne00; i += gl_SubgroupSize) { - sum += float(inA[x+i]) * float(inB[y+i]); - } - - const float all_sum = subgroupAdd(sum); - if (subgroupElect()) { - out_[gid.z*(pcs.nb2/4) + gid.y*(pcs.nb1/4) + gid.x + pcs.outOff] = all_sum; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp deleted file mode 100644 index b0cea8bbe..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_0.comp +++ /dev/null @@ -1,33 +0,0 @@ -#version 450 - -#include "common.comp" - -#define BLOCKS_IN_QUANT QK4_0 -#define SIZE_OF_BLOCK sizeof_block_q4_0 -#define N_ROWS 4 - -#include "op_mul_mv_q_n_pre.comp" - -// The q4_0 version of this function -float block_q_n_dot_y(uint block_index, uint yb, uint il) { - vec2 acc = vec2(0.0, 0.0); - const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; - float d = float(u8BufToFloat16(inA, index)); - float sumy = 0.0f; - for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { - const uint16_t b = u8BufToU16(inA, index + 2 + il + i); - - const float yl0 = inB[yb + i]; - const float yl1 = inB[yb + i + 1]; - const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; - const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; - - sumy += yl0 + yl1 + yl8 + yl9; - - acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); - acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); - } - return d * (sumy * -8.f + acc[0] + acc[1]); -} - -#include "op_mul_mv_q_n.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp deleted file mode 100644 index 8582c61a3..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_1.comp +++ /dev/null @@ -1,35 +0,0 @@ -#version 450 - -#include "common.comp" - -#define BLOCKS_IN_QUANT QK4_1 -#define SIZE_OF_BLOCK sizeof_block_q4_1 -#define N_ROWS 4 - -#include "op_mul_mv_q_n_pre.comp" - -// The q4_1 version of this function -float block_q_n_dot_y(uint block_index, uint yb, uint il) { - vec2 acc = vec2(0.0, 0.0); - const uint index = (block_index) * SIZE_OF_BLOCK + pcs.inAOff; - float d = float(u8BufToFloat16(inA, index)); - float m = float(u8BufToFloat16(inA, index+2)); - - float sumy = 0.0f; - for (int i = 0; i < BLOCKS_IN_QUANT/4; i+=2) { - const uint16_t b = u8BufToU16(inA, index + 4 + il + i); - - const float yl0 = inB[yb + i]; - const float yl1 = inB[yb + i + 1]; - const float yl8 = inB[yb + i + BLOCKS_IN_QUANT/2]; - const float yl9 = inB[yb + i + BLOCKS_IN_QUANT/2 + 1]; - - sumy += yl0 + yl1 + yl8 + yl9; - - acc[0] += yl0 * (b & 0x000F) + yl1 / 256.f * (b & 0x0F00); - acc[1] += yl8 / 16.f * (b & 0x00F0) + yl9 / 4096.f * (b & 0xF000); - } - return d * (acc[0] + acc[1]) + sumy * m; -} - -#include "op_mul_mv_q_n.comp" diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp deleted file mode 100644 index a5752a3a0..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q4_k.comp +++ /dev/null @@ -1,140 +0,0 @@ -#version 450 - -#include "common.comp" - -#define N_DST 4 -#define SIZE_OF_BLOCK sizeof_block_q4_k - -layout(local_size_x = 4) in; -layout(local_size_y = 8) in; -layout(local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { block_q4_k inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne10; - int ne0; - int ne1; - int ne01; - int ne02; - int ne12; - uint nb01; - uint nb02; - uint nb03; - uint nb11; - uint nb12; - uint nb13; - uint r2; - uint r3; -} pcs; - -void main() { - const uint16_t kmask1 = uint16_t(0x3f3f); - const uint16_t kmask2 = uint16_t(0x0f0f); - const uint16_t kmask3 = uint16_t(0xc0c0); - - const uint ix = gl_SubgroupInvocationID/8; // 0...3 - const uint it = gl_SubgroupInvocationID%8; // 0...7 - const uint iq = it/4; // 0 or 1 - const uint ir = it%4; // 0...3 - - const uint nb = pcs.ne00/QK_K; - - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - const uint im = gl_WorkGroupID.z; - - const uint first_row = r0 * N_DST; - const uint ib_row = first_row * nb; - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - const uint offset0 = first_row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); - const uint offset1 = r1*pcs.nb11 + (i12 )*pcs.nb12 + (i13 )*pcs.nb13; - - const uint xblk = offset0 + pcs.inAOff; - const uint y = (offset1 / 4) + pcs.inBOff; - - float yl[16]; - float yh[16]; - float sumf[N_DST] = {0.f, 0.f, 0.f, 0.f}; - float all_sum = 0.f; - - uint y4 = y + ix * QK_K + 64 * iq + 8 * ir; - - for (uint ib = ix; ib < nb; ib += 4) { - const uint blk_idx = ib + xblk; - - float sumy[4] = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; ++i) { - yl[i+0] = inB[y4+i+ 0]; sumy[0] += yl[i+0]; - yl[i+8] = inB[y4+i+ 32]; sumy[1] += yl[i+8]; - yh[i+0] = inB[y4+i+128]; sumy[2] += yh[i+0]; - yh[i+8] = inB[y4+i+160]; sumy[3] += yh[i+8]; - } - - for (int row = 0; row < N_DST; row++) { - uint row_idx = row * (pcs.nb01 / SIZE_OF_BLOCK); - - uint16_t sc_0 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 0); - uint16_t sc_1 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 2); - uint16_t sc_2 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 4); - uint16_t sc_3 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 6); - uint16_t sc_4 = u8BufToU16(inA[blk_idx + row_idx].scales, iq * 2 + 8); - - uint16_t sc16[4]; - sc16[0] = sc_0 & kmask1; - sc16[1] = sc_2 & kmask1; - sc16[2] = ((sc_4 >> 0) & kmask2) | ((sc_0 & kmask3) >> 2); - sc16[3] = ((sc_4 >> 4) & kmask2) | ((sc_2 & kmask3) >> 2); - - float acc1[4] = {0.f, 0.f, 0.f, 0.f}; - float acc2[4] = {0.f, 0.f, 0.f, 0.f}; - for (int i = 0; i < 8; i += 2) { - uint16_t q1 = u8BufToU16(inA[blk_idx + row_idx].qs, 32 * iq + 8 * ir + i); - uint16_t q2 = u8BufToU16(inA[blk_idx + row_idx].qs, 64 + 32 * iq + 8 * ir + i); - acc1[0] += yl[i+0] * (q1 & 0x000F); - acc1[1] += yl[i+1] * (q1 & 0x0F00); - acc1[2] += yl[i+8] * (q1 & 0x00F0); - acc1[3] += yl[i+9] * (q1 & 0xF000); - acc2[0] += yh[i+0] * (q2 & 0x000F); - acc2[1] += yh[i+1] * (q2 & 0x0F00); - acc2[2] += yh[i+8] * (q2 & 0x00F0); - acc2[3] += yh[i+9] * (q2 & 0xF000); - } - - uint8_t sc8_0 = uint8_t(sc16[0] & 0xFF); - uint8_t sc8_1 = uint8_t(sc16[0] >> 8 ); - uint8_t sc8_2 = uint8_t(sc16[1] & 0xFF); - uint8_t sc8_3 = uint8_t(sc16[1] >> 8 ); - uint8_t sc8_4 = uint8_t(sc16[2] & 0xFF); - uint8_t sc8_5 = uint8_t(sc16[2] >> 8 ); - uint8_t sc8_6 = uint8_t(sc16[3] & 0xFF); - uint8_t sc8_7 = uint8_t(sc16[3] >> 8 ); - - float dall = float(inA[blk_idx + row_idx].d); - float dmin = float(inA[blk_idx + row_idx].dmin); - sumf[row] += dall * ((acc1[0] + 1.f/256.f * acc1[1]) * sc8_0 + - (acc1[2] + 1.f/256.f * acc1[3]) * sc8_1 * 1.f/16.f + - (acc2[0] + 1.f/256.f * acc2[1]) * sc8_4 + - (acc2[2] + 1.f/256.f * acc2[3]) * sc8_5 * 1.f/16.f) - - dmin * (sumy[0] * sc8_2 + sumy[1] * sc8_3 + sumy[2] * sc8_6 + sumy[3] * sc8_7); - } - - y4 += 4 * QK_K; - } - - for (int row = 0; row < N_DST; ++row) { - all_sum = subgroupAdd(sumf[row]); - if (subgroupElect()) { - out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = all_sum; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp deleted file mode 100644 index d331d1a70..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q6_k.comp +++ /dev/null @@ -1,106 +0,0 @@ -#version 450 - -#include "common.comp" - -#define SIZE_OF_BLOCK sizeof_block_q6_k - -layout(local_size_x_id = 0) in; -layout(local_size_y_id = 1) in; -layout(local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne10; - int ne0; - int ne1; - int ne01; - int ne02; - int ne12; - uint nb01; - uint nb02; - uint nb03; - uint nb11; - uint nb12; - uint nb13; - uint r2; - uint r3; -} pcs; - -void main() { - const uint8_t kmask1 = uint8_t(0x03); - const uint8_t kmask2 = uint8_t(0x0C); - const uint8_t kmask3 = uint8_t(0x30); - const uint8_t kmask4 = uint8_t(0xC0); - - const uint nb = pcs.ne00/QK_K; - - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - const uint im = gl_WorkGroupID.z; - - const uint row = (r0 * gl_NumSubgroups + gl_SubgroupID); - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - const uint x = row*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); - const uint yy = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; - - float sumf = 0; - - // bits of invocation ID for gl_SubgroupSize=32: - // x x x x x - // 4 3 2 1 0 - // ( tid ) ix - // ip ( il ) - - const uint block_stride = gl_SubgroupSize / 16; // number of blocks each subgroup processes - const uint tid = gl_SubgroupInvocationID/block_stride; // first block_stride groups have tid=0 - const uint ix = gl_SubgroupInvocationID%block_stride; // first block is 0..block_stride-1 - const uint ip = tid/8; // first or second half of block (0 or 1) - const uint il = tid%8; // each half has 8 parts, one per scale - const uint n = 4; // 4 scales at a time (and 4 sums) - const uint l0 = n*il; // offset into half-block, 0..28 - const uint is = 8*ip + l0/16; // 0, 1, 8, 9 - - const uint y_offset = 128*ip + l0; - const uint q_offset_l = 64*ip + l0; - const uint q_offset_h = 32*ip + l0; - - for (uint i = ix; i < nb; i += block_stride) { - - const uint baseIndex = (x + i) * SIZE_OF_BLOCK + pcs.inAOff; - - const uint qlIndex = q_offset_l; - const uint q2Index = qlIndex + QK_K/8; - const uint qhIndex = q_offset_h; - const uint y = yy + i * QK_K + y_offset; - - float sums[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (uint l = 0; l < n; ++l) { - const uint8_t currentQ1 = inA[baseIndex + qlIndex + l]; - const uint8_t currentQ2 = inA[baseIndex + q2Index + l]; - const uint8_t currentQh = inA[baseIndex + QK_K/2 + qhIndex + l]; - - sums[0] += inB[y+l+ 0] * (int8_t((currentQ1 & 0xF) | ((currentQh & kmask1) << 4)) - 32); - sums[1] += inB[y+l+32] * (int8_t((currentQ2 & 0xF) | ((currentQh & kmask2) << 2)) - 32); - sums[2] += inB[y+l+64] * (int8_t((currentQ1 >> 4) | ((currentQh & kmask3) << 0)) - 32); - sums[3] += inB[y+l+96] * (int8_t((currentQ2 >> 4) | ((currentQh & kmask4) >> 2)) - 32); - } - - float d = u8BufToFloat16(inA, baseIndex + QK_K/2 + QK_K/4 + QK_K/16); - sumf += d * (sums[0] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + is]) + sums[1] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 2 + is]) + sums[2] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 4 + is]) + sums[3] * int8_t(inA[baseIndex + QK_K/2 + QK_K/4 + 6 + is])); - } - - const float tot = subgroupAdd(sumf); - if (subgroupElect()) { - out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + row + pcs.outOff] = tot; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp deleted file mode 100644 index 34d015e90..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mat_q8_0.comp +++ /dev/null @@ -1,73 +0,0 @@ -#version 450 - -#include "common.comp" - -#include "op_mul_mv_q_n_pre.comp" - -#define SIZE_OF_D 2 - -#define N_DST 4 // each SIMD group works on 4 rows -#define N_SIMDGROUP 2 // number of SIMD groups in a thread group -#define N_SIMDWIDTH 32 // assuming SIMD group size is 32 - -#define NB_Q8_0 8 - -void main() { - // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64 - if (gl_SubgroupInvocationID > 31) - return; - - const int nr = N_DST; - const int nsg = N_SIMDGROUP; - const int nw = N_SIMDWIDTH; - - const int nb = pcs.ne00/QK8_0; - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - const uint im = gl_WorkGroupID.z; - - const uint first_row = (r0 * nsg + gl_SubgroupID) * nr; - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - const uint offset0 = first_row * nb + (i12/pcs.r2)*(nb*pcs.ne01) + (i13/pcs.r3)*(nb*pcs.ne01*pcs.ne02); - - const uint x = offset0*sizeof_block_q8_0 + pcs.inAOff; // Based from inA - const uint y = r1*pcs.ne10 + im*pcs.ne00*pcs.ne1 + pcs.inBOff; // based from inB - - float yl[NB_Q8_0]; - float sumf[N_DST]={0.f, 0.f, 0.f, 0.f}; - - const uint ix = gl_SubgroupInvocationID.x/4; - const uint il = gl_SubgroupInvocationID.x%4; - - uint yb = y + ix * QK8_0 + NB_Q8_0*il; - - // each thread in a SIMD group deals with NB_Q8_0 quants at a time - for (uint ib = ix; ib < nb; ib += nw/4) { - for (int i = 0; i < NB_Q8_0; ++i) { - yl[i] = inB[yb + i]; - } - - for (int row = 0; row < nr; row++) { - const uint block_offset = (ib+row*nb) * sizeof_block_q8_0; - float sumq = 0.f; - for (int iq = 0; iq < NB_Q8_0; ++iq) { - const int8_t qs_iq = int8_t(inA[x + block_offset + SIZE_OF_D + NB_Q8_0*il + iq]); - sumq += qs_iq * yl[iq]; - } - const float16_t d = u8BufToFloat16(inA, x + block_offset); - sumf[row] += sumq*d; - } - - yb += NB_Q8_0 * nw; - } - - for (int row = 0; row < nr; ++row) { - const float tot = subgroupAdd(sumf[row]); - if (subgroupElect() && first_row + row < pcs.ne01) { - out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row] = tot; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp deleted file mode 100644 index a6517cc1f..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n.comp +++ /dev/null @@ -1,52 +0,0 @@ -void main() { - // NB: hack to make compatible with AMD GPUs that have a subgroup size of 64 - if (gl_SubgroupInvocationID > 31) - return; - - const uint nb = uint(pcs.ne00/BLOCKS_IN_QUANT); - - const uint r0 = gl_WorkGroupID.x; - const uint r1 = gl_WorkGroupID.y; - const uint im = gl_WorkGroupID.z; - - const uint first_row = (r0 * gl_NumSubgroups + gl_SubgroupID) * N_ROWS; - - const uint i12 = im%pcs.ne12; - const uint i13 = im/pcs.ne12; - - // pointers to src0 rows - uint ax[N_ROWS]; - for (int row = 0; row < N_ROWS; ++row) { - const uint offset0 = (first_row + row)*(pcs.nb01/SIZE_OF_BLOCK) + (i12/pcs.r2)*(pcs.nb02/SIZE_OF_BLOCK) + (i13/pcs.r3)*(pcs.nb03/SIZE_OF_BLOCK); - - ax[row] = offset0 + pcs.inAOff; - } - - const uint y = (r1*pcs.nb11 + i12*pcs.nb12 + i13*pcs.nb13) / 4 + pcs.inBOff; - - float sumf[N_ROWS] = {0.0f, 0.0f, 0.0f, 0.0f}; - - const uint ix = gl_SubgroupInvocationID/2; - const uint il = (BLOCKS_IN_QUANT/4)*(gl_SubgroupInvocationID%2); - - uint yb = y + ix * BLOCKS_IN_QUANT + il; - - //debugPrintfEXT("gl_NumSubgroups=%d, gl_SubgroupID=%d, gl_SubgroupInvocationID=%d, glSubgroupSize=%d, gl_WorkGroupSize.x=%d, gl_WorkGroupSize.y=%d, gl_WorkGroupSize.z=%d\n", - // gl_NumSubgroups, gl_SubgroupID, gl_SubgroupInvocationID, gl_SubgroupSize, - // gl_WorkGroupSize.x, gl_WorkGroupSize.y, gl_WorkGroupSize.z); - - for (uint ib = ix; ib < nb; ib += 16) { - for (int row = 0; row < N_ROWS; row++) { - sumf[row] += block_q_n_dot_y(ax[row] + ib, yb, il); - } - - yb += BLOCKS_IN_QUANT * 16; - } - - for (int row = 0; row < N_ROWS; ++row) { - const float tot = subgroupAdd(sumf[row]); - if (first_row + row < pcs.ne01 && subgroupElect()) { - out_[r1*pcs.ne0 + im*pcs.ne0*pcs.ne1 + first_row + row + pcs.outOff] = tot; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp b/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp deleted file mode 100644 index a9a2f2218..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_mul_mv_q_n_pre.comp +++ /dev/null @@ -1,28 +0,0 @@ -layout(local_size_x_id = 0) in; -layout(local_size_y = 8) in; -layout(local_size_z = 1) in; - -layout (binding = 0) readonly buffer tensorInA { uint8_t inA[]; }; -layout (binding = 1) readonly buffer tensorInB { float inB[]; }; -layout (binding = 2) writeonly buffer tensorOut { float out_[]; }; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - int ne10; - int ne12; - int ne0; - int ne1; - uint nb01; - uint nb02; - uint nb03; - uint nb11; - uint nb12; - uint nb13; - uint r2; - uint r3; -} pcs; diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp b/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp deleted file mode 100644 index ad0c3c01b..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_norm.comp +++ /dev/null @@ -1,84 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 256) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - uint ne00; - uint nb01; - float eps; -} pcs; - -shared float sum[gl_WorkGroupSize.x]; - -void main() { - const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ - // MEAN - // parallel sum - sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - sum[gl_LocalInvocationID.x] += in_[x+i00]; - } - - // reduce - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; - } - barrier(); - memoryBarrierShared(); - } - - // broadcast - if (gl_LocalInvocationID.x == 0) { - sum[0] /= float(pcs.ne00); - } - barrier(); - memoryBarrierShared(); - const float mean = sum[0]; - - // recenter - const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - out_[y+i00] = in_[x+i00] - mean; - } - - // VARIANCE - // parallel sum - sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - sum[gl_LocalInvocationID.x] += out_[y+i00] * out_[y+i00]; - } - - // reduce - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; - } - barrier(); - memoryBarrierShared(); - } - - // broadcast - if (gl_LocalInvocationID.x == 0) { - sum[0] /= float(pcs.ne00); - } - barrier(); - memoryBarrierShared(); - const float variance = sum[0]; - - const float scale = 1.0f/sqrt(variance + pcs.eps); - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - out_[y+i00] *= scale; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp deleted file mode 100644 index 52a601fe6..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_relu.comp +++ /dev/null @@ -1,21 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 4; - - for (uint x = 0; x < 4; x++) { - const uint i = baseIndex + x; - out_[i + pcs.outOff] = max(0.0, in_[i + pcs.inOff]); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp deleted file mode 100644 index da658c160..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rmsnorm.comp +++ /dev/null @@ -1,53 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 512) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - uint ne00; - uint nb01; - float eps; -} pcs; - -shared float sum[gl_WorkGroupSize.x]; - -void main() { - const uint x = (gl_WorkGroupID.x*pcs.nb01/4) + pcs.inOff; // Based from in_ - - // parallel sum - sum[gl_LocalInvocationID.x] = 0.0; - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - sum[gl_LocalInvocationID.x] += in_[x+i00] * in_[x+i00]; - } - - // reduce - barrier(); - memoryBarrierShared(); - [[unroll]] for (uint i = gl_WorkGroupSize.x/2; i > 0; i /= 2) { - if (gl_LocalInvocationID.x < i) { - sum[gl_LocalInvocationID.x] += sum[gl_LocalInvocationID.x + i]; - } - barrier(); - memoryBarrierShared(); - } - - // broadcast - if (gl_LocalInvocationID.x == 0) { - sum[0] /= float(pcs.ne00); - } - barrier(); - memoryBarrierShared(); - - const float scale = 1.0f/sqrt(sum[0] + pcs.eps); - - const uint y = (gl_WorkGroupID.x*pcs.ne00) + pcs.outOff; // Based from out_ - for (uint i00 = gl_LocalInvocationID.x; i00 < pcs.ne00; i00 += gl_WorkGroupSize.x) { - out_[y+i00] = in_[x+i00] * scale; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp deleted file mode 100644 index 63659cbfe..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f16.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; -layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - float theta_base = float(inB[pcs.inBOff + i2]); - float inv_ndims = -1.f/pcs.n_dims; - - float cos_theta; - float sin_theta; - - for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { - if (i0 < pcs.n_dims) { - uint ic = i0/2; - - float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); - - const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; - - rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - const float x0 = float(inA[src]); - const float x1 = float(inA[src+pcs.n_dims/2]); - - out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); - out_[dst_data+pcs.n_dims/2] = float16_t(x0*sin_theta + x1*cos_theta); - } else { - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - out_[dst_data] = inA[src]; - out_[dst_data+1] = inA[src+1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp deleted file mode 100644 index 4df56204d..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_neox_f32.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; -layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - float theta_base = float(inB[pcs.inBOff + i2]); - float inv_ndims = -1.f/pcs.n_dims; - - float cos_theta; - float sin_theta; - - for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { - if (i0 < pcs.n_dims) { - uint ic = i0/2; - - float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); - - const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; - - rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + ic*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + ic*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - const float x0 = inA[src]; - const float x1 = inA[src+pcs.n_dims/2]; - - out_[dst_data] = x0*cos_theta - x1*sin_theta; - out_[dst_data+pcs.n_dims/2] = x0*sin_theta + x1*cos_theta; - } else { - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - out_[dst_data] = inA[src]; - out_[dst_data+1] = inA[src+1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp deleted file mode 100644 index a3c0eda8b..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f16.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float16_t inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; -layout(binding = 3) buffer restrict writeonly tensorOut { float16_t out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - float theta_base = float(inB[pcs.inBOff + i2]); - float inv_ndims = -1.f/pcs.n_dims; - - float cos_theta; - float sin_theta; - - for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { - if (i0 < pcs.n_dims) { - uint ic = i0/2; - - float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); - - const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; - - rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - const float x0 = float(inA[src]); - const float x1 = float(inA[src+1]); - - out_[dst_data] = float16_t(x0*cos_theta - x1*sin_theta); - out_[dst_data+1] = float16_t(x0*sin_theta + x1*cos_theta); - } else { - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 2) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 2) + pcs.outOff; // Based from out_ - - out_[dst_data] = inA[src]; - out_[dst_data+1] = inA[src+1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp b/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp deleted file mode 100644 index b7963ae72..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_rope_norm_f32.comp +++ /dev/null @@ -1,52 +0,0 @@ -#version 450 - -#include "rope_common.comp" - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { int inB[]; }; -layout(binding = 2) buffer restrict readonly tensorInC { float inC[]; }; -layout(binding = 3) buffer restrict writeonly tensorOut { float out_[]; }; - -void main() { - const uint i3 = gl_WorkGroupID.z; - const uint i2 = gl_WorkGroupID.y; - const uint i1 = gl_WorkGroupID.x; - - float corr_dims[2]; - rope_yarn_corr_dims(pcs.n_dims, pcs.n_ctx_orig, pcs.freq_base, pcs.beta_fast, pcs.beta_slow, corr_dims); - - const float theta_scale = pow(pcs.freq_base, -2.0/pcs.n_dims); - - float theta_base = float(inB[pcs.inBOff + i2]); - float inv_ndims = -1.f/pcs.n_dims; - - float cos_theta; - float sin_theta; - - for (uint i0 = 2*gl_LocalInvocationIndex; i0 < pcs.ne0; i0 += 2*gl_WorkGroupSize.x) { - if (i0 < pcs.n_dims) { - uint ic = i0/2; - - float theta = theta_base * pow(pcs.freq_base, inv_ndims*i0); - - const float freq_factor = pcs.has_freq_factors ? inC[pcs.inCOff + ic] : 1.0f; - - rope_yarn(theta/freq_factor, pcs.freq_scale, corr_dims, i0, pcs.ext_factor, pcs.attn_factor, cos_theta, sin_theta); - - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - const float x0 = inA[src]; - const float x1 = inA[src+1]; - - out_[dst_data] = x0*cos_theta - x1*sin_theta; - out_[dst_data+1] = x0*sin_theta + x1*cos_theta; - } else { - const uint src = uint((i3*pcs.nb03 + i2*pcs.nb02 + i1*pcs.nb01 + i0*pcs.nb00) / 4) + pcs.inAOff; // Based from in - const uint dst_data = uint((i3*pcs.nb3 + i2*pcs.nb2 + i1*pcs.nb1 + i0*pcs.nb0) / 4) + pcs.outOff; // Based from out_ - - out_[dst_data] = inA[src]; - out_[dst_data+1] = inA[src+1]; - } - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp b/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp deleted file mode 100644 index bdae26738..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_scale.comp +++ /dev/null @@ -1,19 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - float scale; -} pcs; - -void main() { - const uint i = gl_WorkGroupID.x; - out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp b/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp deleted file mode 100644 index ada69754b..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_scale_8.comp +++ /dev/null @@ -1,23 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; - float scale; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 8; - - for (uint x = 0; x < 8; x++) { - const uint i = baseIndex + x; - out_[i + pcs.outOff] = in_[i + pcs.inOff] * pcs.scale; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp b/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp deleted file mode 100644 index 0fb8e4b74..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_silu.comp +++ /dev/null @@ -1,22 +0,0 @@ -#version 450 - -#include "common.comp" - -layout(local_size_x = 1) in; - -layout(binding = 0) buffer restrict readonly tensorIn { float in_[]; }; -layout(binding = 1) buffer restrict writeonly tensorOut { float out_[]; }; -layout(push_constant) uniform PushConstants { - uint inOff; - uint outOff; -} pcs; - -void main() { - const uint baseIndex = gl_WorkGroupID.x * 4; - - for (uint x = 0; x < 4; x++) { - const uint i = baseIndex + x; - const float y = in_[i + pcs.inOff]; - out_[i + pcs.outOff] = y / (1.0 + exp(-y)); - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp b/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp deleted file mode 100644 index 4165295bf..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/op_softmax.comp +++ /dev/null @@ -1,72 +0,0 @@ -// TODO: implement multi-simd softmax (llama.cpp commit e16b9fa4) - -#version 450 - -#include "common.comp" - -layout(local_size_x_id = 0) in; - -layout(binding = 0) buffer restrict readonly tensorInA { float inA[]; }; -layout(binding = 1) buffer restrict readonly tensorInB { float inB[]; }; -layout(binding = 2) buffer restrict writeonly tensorOut { float out_[]; }; - -layout(push_constant) uniform PushConstants { - uint inAOff; - uint inBOff; - uint outOff; - int ne00; - int ne01; - int ne02; - float scale; - float max_bias; - float m0; - float m1; - uint n_head_log2; - int mask; -} pcs; - -void main() { - if (gl_SubgroupInvocationID > 31) - return; - - const uint i03 = gl_WorkGroupID.z; - const uint i02 = gl_WorkGroupID.y; - const uint i01 = gl_WorkGroupID.x; - - const uint extra_off = i03*pcs.ne02*pcs.ne01*pcs.ne00 + i02*pcs.ne01*pcs.ne00 + i01*pcs.ne00; - const uint psrc0 = extra_off + pcs.inAOff; // Based from inA - const uint pmask = i01*pcs.ne00 + pcs.inBOff; // Based from inB - const uint pdst = extra_off + pcs.outOff; // Based from out_ - - float slope = 1.0f; - - // ALiBi - if (pcs.max_bias > 0.0f) { - int64_t h = i02; - - float base = h < pcs.n_head_log2 ? pcs.m0 : pcs.m1; - int64_t exp = h < pcs.n_head_log2 ? h + 1 : 2*(h - pcs.n_head_log2) + 1; - - slope = pow(base, float(exp)); - } - - // parallel max - float localMax = uintBitsToFloat(0xFF800000); - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { - localMax = max(localMax, inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f)); - } - float max_ = subgroupMax(localMax); - - // parallel sum - float localSum = 0.0f; - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { - const float exp_psrc0 = exp(inA[psrc0 + i00]*pcs.scale + (pcs.mask!=0 ? slope*inB[pmask + i00] : 0.0f) - max_); - localSum += exp_psrc0; - out_[pdst + i00] = exp_psrc0; - } - - const float sum = subgroupAdd(localSum); - for (uint i00 = gl_SubgroupInvocationID.x; i00 < pcs.ne00; i00 += 32) { - out_[pdst + i00] /= sum; - } -} diff --git a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp b/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp deleted file mode 100644 index 0fca640dc..000000000 --- a/ggml/src/ggml-kompute/kompute-shaders/rope_common.comp +++ /dev/null @@ -1,71 +0,0 @@ -#include "common.comp" - -#define GGML_ROPE_TYPE_NEOX 2 - -// TODO: use a local size of 32 or more (Metal uses 1024) -layout(local_size_x = 1) in; - -layout (push_constant) uniform parameter { - uint inAOff; - uint inBOff; - uint inCOff; - uint outOff; - int n_dims; - int mode; - int n_ctx_orig; - float freq_base; - float freq_scale; - bool has_freq_factors; - float ext_factor; - float attn_factor; - float beta_fast; - float beta_slow; - uint nb00; - uint nb01; - uint nb02; - uint nb03; - int ne0; - uint nb0; - uint nb1; - uint nb2; - uint nb3; -} pcs; - -float rope_yarn_ramp(const float low, const float high, const float i0) { - const float y = (i0 / 2 - low) / max(0.001f, high - low); - return 1.0f - min(1.0f, max(0.0f, y)); -} - -// YaRN algorithm based on LlamaYaRNScaledRotaryEmbedding.py from https://github.com/jquesnelle/yarn -// MIT licensed. Copyright (c) 2023 Jeffrey Quesnelle and Bowen Peng. -void rope_yarn( - float theta_extrap, float freq_scale, float corr_dims[2], float i0, float ext_factor, float mscale, - out float cos_theta, out float sin_theta -) { - // Get n-d rotational scaling corrected for extrapolation - float theta_interp = freq_scale * theta_extrap; - float theta = theta_interp; - if (ext_factor != 0.0f) { - float ramp_mix = rope_yarn_ramp(corr_dims[0], corr_dims[1], i0) * ext_factor; - theta = theta_interp * (1 - ramp_mix) + theta_extrap * ramp_mix; - - // Get n-d magnitude scaling corrected for interpolation - mscale *= 1.0f + 0.1f * log(1.0f / freq_scale); - } - cos_theta = cos(theta) * mscale; - sin_theta = sin(theta) * mscale; -} - -// Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get -// `corr_fac(n_rot) = n_dims * log(max_pos_emb / (n_rot * 2pi)) / (2 * log(base))` -float rope_yarn_corr_factor(int n_dims, int n_ctx_orig, float n_rot, float base) { - return n_dims * log(n_ctx_orig / (n_rot * TWOPI_F)) / (2 * log(base)); -} - -void rope_yarn_corr_dims( - int n_dims, int n_ctx_orig, float freq_base, float beta_fast, float beta_slow, out float dims[2] -) { - // start and end correction dims - dims[0] = max(0.0f, floor(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_fast, freq_base))); - dims[1] = min(n_dims - 1.0f, ceil(rope_yarn_corr_factor(n_dims, n_ctx_orig, beta_slow, freq_base))); -} diff --git a/scripts/sync-ggml-am.sh b/scripts/sync-ggml-am.sh index 73d4eec0b..29d30e0a1 100755 --- a/scripts/sync-ggml-am.sh +++ b/scripts/sync-ggml-am.sh @@ -83,7 +83,6 @@ while read c; do src/ggml-cpu/* \ src/ggml-cuda/* \ src/ggml-hip/* \ - src/ggml-kompute/* \ src/ggml-metal/* \ src/ggml-musa/* \ src/ggml-opencl/* \ @@ -141,7 +140,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then # src/ggml-cpu/* -> ggml/src/ggml-cpu/* # src/ggml-cuda/* -> ggml/src/ggml-cuda/* # src/ggml-hip/* -> ggml/src/ggml-hip/* - # src/ggml-kompute/* -> ggml/src/ggml-kompute/* # src/ggml-metal/* -> ggml/src/ggml-metal/* # src/ggml-musa/* -> ggml/src/ggml-musa/* # src/ggml-opencl/* -> ggml/src/ggml-opencl/* @@ -174,7 +172,6 @@ if [ -f $SRC_LLAMA/ggml-src.patch ]; then -e 's/([[:space:]]| [ab]\/)src\/ggml-cpu\//\1ggml\/src\/ggml-cpu\//g' \ -e 's/([[:space:]]| [ab]\/)src\/ggml-cuda\//\1ggml\/src\/ggml-cuda\//g' \ -e 's/([[:space:]]| [ab]\/)src\/ggml-hip\//\1ggml\/src\/ggml-hip\//g' \ - -e 's/([[:space:]]| [ab]\/)src\/ggml-kompute\//\1ggml\/src\/ggml-kompute\//g' \ -e 's/([[:space:]]| [ab]\/)src\/ggml-metal\//\1ggml\/src\/ggml-metal\//g' \ -e 's/([[:space:]]| [ab]\/)src\/ggml-opencl\//\1ggml\/src\/ggml-opencl\//g' \ -e 's/([[:space:]]| [ab]\/)src\/ggml-rpc\//\1ggml\/src\/ggml-rpc\//g' \ diff --git a/scripts/sync-ggml.sh b/scripts/sync-ggml.sh index 6460a77f1..9b98329e0 100755 --- a/scripts/sync-ggml.sh +++ b/scripts/sync-ggml.sh @@ -15,7 +15,6 @@ cp -rpv ../ggml/src/ggml-cann/* ./ggml/src/ggml-cann/ cp -rpv ../ggml/src/ggml-cpu/* ./ggml/src/ggml-cpu/ cp -rpv ../ggml/src/ggml-cuda/* ./ggml/src/ggml-cuda/ cp -rpv ../ggml/src/ggml-hip/* ./ggml/src/ggml-hip/ -cp -rpv ../ggml/src/ggml-kompute/* ./ggml/src/ggml-kompute/ cp -rpv ../ggml/src/ggml-metal/* ./ggml/src/ggml-metal/ cp -rpv ../ggml/src/ggml-musa/* ./ggml/src/ggml-musa/ cp -rpv ../ggml/src/ggml-opencl/* ./ggml/src/ggml-opencl/ diff --git a/tests/test-c.c b/tests/test-c.c index 95ba73df3..a05071080 100644 --- a/tests/test-c.c +++ b/tests/test-c.c @@ -1,7 +1,3 @@ #include "llama.h" -#ifdef GGML_USE_KOMPUTE -#include "ggml-kompute.h" -#endif - int main(void) {}