From ef6198b5a57ef00b75c73e7d19024db7bd582546 Mon Sep 17 00:00:00 2001
From: chen fan <350211548@qq.com>
Date: Wed, 23 Jul 2025 11:58:00 +0800
Subject: [PATCH] CANN: weight format to NZ for Ascend310P3 (#14407)

* weight format to nz for 310p

* remove quant weight format to nz

* clean code

* fix

* make the conditions for converting weights to NZ format consistent

* clean code
---
 ggml/src/ggml-cann/aclnn_ops.cpp |  23 ++++++-
 ggml/src/ggml-cann/aclnn_ops.h   |  33 ++++++++++
 ggml/src/ggml-cann/ggml-cann.cpp | 102 +++++++++++++++++++++++++++++++
 3 files changed, 156 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 4d5c2c182..76bed4e8c 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -1785,8 +1785,27 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
     size_t transpose_nb[] = {bcast_weight_nb[1], bcast_weight_nb[0],
                              bcast_weight_nb[2], bcast_weight_nb[3],
                              bcast_weight_nb[4], bcast_weight_nb[5]};
-    aclTensor* acl_weight_tensor =
-        ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims);
+    aclTensor* acl_weight_tensor;
+
+    bool weightToNZ = false;
+#ifdef ASCEND_310P
+    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
+    if (weightToNZ && is_matmul_weight(weight)) {
+        int64_t acl_stride[2] = {1, transpose_ne[1]};
+
+        // Reverse ne.
+        std::reverse(transpose_ne, transpose_ne + n_dims);
+
+        std::vector<int64_t> storageDims = {transpose_ne[0], transpose_ne[1]};
+
+        acl_weight_tensor = aclCreateTensor(
+            transpose_ne, n_dims, ggml_cann_type_mapping(weight->type), acl_stride,
+            0, ACL_FORMAT_FRACTAL_NZ, storageDims.data(), 2, weight->data);
+    } else {
+        acl_weight_tensor =
+            ggml_cann_create_tensor(weight, transpose_ne, transpose_nb, n_dims, ACL_FORMAT_ND);
+    }
     aclTensor* acl_dst =
         ggml_cann_create_tensor(dst, bcast_dst_ne, bcast_dst_nb, n_dims);
 
diff --git a/ggml/src/ggml-cann/aclnn_ops.h b/ggml/src/ggml-cann/aclnn_ops.h
index 80ce80bae..924da66ed 100755
--- a/ggml/src/ggml-cann/aclnn_ops.h
+++ b/ggml/src/ggml-cann/aclnn_ops.h
@@ -23,6 +23,7 @@
 #ifndef CANN_ACLNN_OPS
 #define CANN_ACLNN_OPS
 
+#include <unordered_set>
 #include <functional>
 #include <aclnnop/aclnn_abs.h>
 #include <aclnnop/aclnn_neg.h>
@@ -1020,6 +1021,38 @@ inline void ggml_cann_async_memset(ggml_backend_cann_context & ctx, void * buffe
  */
 void ggml_cann_mul_mat_id(ggml_backend_cann_context& ctx, ggml_tensor* dst);
 
+/**
+ * @brief Check whether a tensor is a weight tensor for matrix multiplication.
+ *
+ * @details Checks whether the given tensor serves as weight parameters in matrix multiplication operations,
+ *          typically within neural network layers. The function maintains a static set of canonical weight
+ *          naming suffixes from Transformer-based architectures. Uses substring matching to identify weight
+ *          tensors even with hierarchical naming patterns.
+ *
+ * @param tensor Pointer to the target ggml_tensor object (const-qualified).
+ * @return true if the tensor name contains one of the known weight names, false otherwise.
+ */
+inline bool is_matmul_weight(const ggml_tensor* tensor) {
+    const std::string name = ggml_get_name(tensor);
+    static const std::unordered_set<std::string> weight_suffixes{
+        "output.weight",
+        "attn_q.weight",
+        "attn_k.weight",
+        "attn_v.weight",
+        "attn_output.weight",
+        "ffn_gate.weight",
+        "ffn_up.weight",
+        "ffn_down.weight"
+    };
+
+    for (const auto& suffix : weight_suffixes) {
+        if (name.find(suffix) != std::string::npos) {
+            return true;
+        }
+    }
+    return false;
+}
+
 /**
  * @brief   Applies a element-wise operation to two input tensors using the CANN
  *          backend.
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index e5e11d4cd..f30241aca 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -24,6 +24,7 @@
 
 #include <acl/acl.h>
 #include <stdarg.h>
+#include <aclnnop/aclnn_trans_matmul_weight.h>
 
 #include <cmath>
 #include <cstdio>
@@ -1115,6 +1116,100 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
+/**
+ * @brief Upload a host weight matrix to a new device buffer and wrap it in an ND aclTensor.
+ *
+ * The device buffer is sized with aclnnCalculateMatmulWeightSizeV2 so that it can hold the
+ * weight after aclnnTransMatmulWeight has converted it; only the bytes that exist on the
+ * host are copied into it.
+ *
+ * @param hostData   Host pointer to the contiguous weight data.
+ * @param shape      View shape of the weight.
+ * @param deviceAddr Receives the allocated device buffer; the caller releases it with aclrtFree.
+ * @param dataType   ACL element type of the weight.
+ * @param tensor     Receives the created tensor; the caller releases it with aclDestroyTensor.
+ * @return Always 0; ACL failures are reported through ACL_CHECK.
+ */
+static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
+                                 aclDataType dataType, aclTensor **tensor)
+{
+    uint64_t hostElements = 1;
+    for (auto dim : shape) {
+        hostElements *= dim;
+    }
+
+    // Element count required by the converted weight; it may be larger than hostElements.
+    uint64_t nzElements = 0;
+    aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
+    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &nzElements));
+    ACL_CHECK(aclDestroyIntArray(mat2Size));
+
+    // Use the real element width instead of assuming a 2-byte type.
+    const size_t elementSize = aclDataTypeSize(dataType);
+    const size_t hostBytes   = hostElements * elementSize;
+    const size_t nzBytes     = nzElements * elementSize;
+    // Never allocate less than what is uploaded.
+    const size_t deviceBytes = nzBytes > hostBytes ? nzBytes : hostBytes;
+
+    ACL_CHECK(aclrtMalloc(deviceAddr, deviceBytes, ACL_MEM_MALLOC_HUGE_FIRST));
+    // Copy only what the host buffer holds; requesting deviceBytes could read past its end.
+    ACL_CHECK(aclrtMemcpy(*deviceAddr, deviceBytes, hostData, hostBytes, ACL_MEMCPY_HOST_TO_DEVICE));
+
+    std::vector<int64_t> strides(shape.size(), 1);
+    for (int64_t i = shape.size() - 2; i >= 0; i--) {
+        strides[i] = shape[i + 1] * strides[i + 1];
+    }
+
+    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
+                              shape.data(), shape.size(), *deviceAddr);
+    return 0;
+}
+
+/**
+ * @brief Convert a matmul weight to the NZ layout and store the result in the tensor buffer.
+ *
+ * @param tensor Destination tensor whose device buffer receives the converted weight.
+ * @param data   Host pointer to the weight in its original layout.
+ * @param offset Byte offset into tensor->data at which the converted weight is written.
+ */
+static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
+    aclrtStream stream = nullptr;
+    ACL_CHECK(aclrtCreateStream(&stream));
+
+    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
+    void *weightTransposedDeviceAddr = nullptr;
+    aclTensor *weightTransposed = nullptr;
+    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
+                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
+
+    uint64_t workspaceSize = 0;
+    aclOpExecutor *executor = nullptr;
+    void *workspaceAddr = nullptr;
+
+    // TransMatmulWeight
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
+    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
+    if (workspaceSize > 0) {
+        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
+        workspaceAddrPtrTrans.reset(workspaceAddr);
+    }
+    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+
+    // The conversion is only enqueued on the stream: wait for it before reading the result
+    // or releasing the workspace.
+    ACL_CHECK(aclrtSynchronizeStream(stream));
+
+    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
+
+    // Both buffers are in device memory.
+    ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size,
+                          weightTransposedDeviceAddr, size, ACL_MEMCPY_DEVICE_TO_DEVICE));
+    ACL_CHECK(aclDestroyTensor(weightTransposed));
+    ACL_CHECK(aclrtFree(weightTransposedDeviceAddr));
+    // A stream is created for every converted weight, so it has to be released here.
+    ACL_CHECK(aclrtDestroyStream(stream));
+}
+
 // TODO: need handle tensor which has paddings.
 /**
  * @brief Set tensor data in a CANN buffer.
@@ -1139,9 +1234,16 @@ static void ggml_backend_cann_buffer_set_tensor(
     // For acl, synchronous functions use this default stream.
     // Why aclrtSynchronizeDevice?
 
+    bool weightToNZ = false;
+#ifdef ASCEND_310P
+    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
+#endif
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
+        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+            weight_format_to_nz(tensor, data, offset);
+        }
     } else {
         void *transform_buffer = malloc(size);
         ggml_backend_cann_transform(tensor, data, transform_buffer);