From 11490b36723d511d75fb601995c79b5c363ba3a2 Mon Sep 17 00:00:00 2001
From: hipudding
Date: Thu, 31 Jul 2025 19:47:20 +0800
Subject: [PATCH] CANN: Improve loading efficiency after converting weights to NZ format. (#14985)

* CANN: Improve loading efficiency after converting weights to NZ format.

* CANN: fix typo
---
 docs/backend/CANN.md             |   6 +-
 ggml/src/ggml-cann/aclnn_ops.cpp |   8 +--
 ggml/src/ggml-cann/ggml-cann.cpp | 114 +++++++++++++++++--------------
 3 files changed, 70 insertions(+), 58 deletions(-)

diff --git a/docs/backend/CANN.md b/docs/backend/CANN.md
index 2b001f09a..325e09bd3 100755
--- a/docs/backend/CANN.md
+++ b/docs/backend/CANN.md
@@ -310,5 +310,7 @@ Specifies the memory pool management strategy:
 
 Controls automatic cleanup of the memory pool. This option is only effective when using the prio or leg memory pool strategies.
 
-## TODO
-- Support more models and data types.
+### GGML_CANN_WEIGHT_NZ
+
+Converting the matmul weight format from ND to NZ can significantly improve performance on the 310I DUO NPU.
+
diff --git a/ggml/src/ggml-cann/aclnn_ops.cpp b/ggml/src/ggml-cann/aclnn_ops.cpp
index 316055193..07d6b8b67 100755
--- a/ggml/src/ggml-cann/aclnn_ops.cpp
+++ b/ggml/src/ggml-cann/aclnn_ops.cpp
@@ -1913,11 +1913,9 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
                              bcast_weight_nb[4], bcast_weight_nb[5]};
 
     aclTensor* acl_weight_tensor;
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
-    if (weightToNZ && is_matmul_weight(weight)) {
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+    if (weight_to_nz && is_matmul_weight(weight)) {
         int64_t acl_stride[2] = {1, transpose_ne[1]};
 
         // Reverse ne.
diff --git a/ggml/src/ggml-cann/ggml-cann.cpp b/ggml/src/ggml-cann/ggml-cann.cpp
index 4dac2e8b7..49f55891d 100755
--- a/ggml/src/ggml-cann/ggml-cann.cpp
+++ b/ggml/src/ggml-cann/ggml-cann.cpp
@@ -1116,61 +1116,59 @@ static enum ggml_status ggml_backend_cann_buffer_init_tensor(
     return GGML_STATUS_SUCCESS;
 }
 
-static int CreateAclTensorWeight(const void *hostData, const std::vector<int64_t> &shape, void **deviceAddr,
-                                 aclDataType dataType, aclTensor **tensor)
-{
-    uint64_t size = 1;
-    for (auto i : shape) {
-        size *= i;
+// ND to NZ Workspace Cache Management. Thread-safety: Not guaranteed
+namespace {
+    void* g_nz_workspace = nullptr;
+    size_t g_nz_workspace_allocated = 0;
+
+    void release_nz_workspace() {
+        if (g_nz_workspace) {
+            aclrtFree(g_nz_workspace);
+            g_nz_workspace = nullptr;
+            g_nz_workspace_allocated = 0;
+        }
     }
 
-    const aclIntArray *mat2Size = aclCreateIntArray(shape.data(), shape.size());
-    ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(mat2Size, dataType, &size));
-
-    size *= sizeof(int16_t);
-
-    ACL_CHECK(aclrtMalloc(deviceAddr, size, ACL_MEM_MALLOC_HUGE_FIRST));
-    aclrtMemcpy(*deviceAddr, size, hostData, size, ACL_MEMCPY_HOST_TO_DEVICE);
-
-    std::vector<int64_t> strides(shape.size(), 1);
-    for (int64_t i = shape.size() - 2; i >= 0; i--) {
-        strides[i] = shape[i + 1] * strides[i + 1];
+    void relloc_nz_workspace(size_t new_size) {
+        if (new_size > g_nz_workspace_allocated) {
+            if (g_nz_workspace) {
+                aclrtFree(g_nz_workspace);
+                g_nz_workspace = nullptr;
+            }
+            ACL_CHECK(aclrtMalloc(&g_nz_workspace, new_size, ACL_MEM_MALLOC_HUGE_FIRST));
+            g_nz_workspace_allocated = new_size;
+        }
     }
-
-    *tensor = aclCreateTensor(shape.data(), shape.size(), dataType, strides.data(), 0, aclFormat::ACL_FORMAT_ND,
-                              shape.data(), shape.size(), *deviceAddr);
-    return 0;
 }
 
+/**
+ * @brief Convert tensor weights to NZ format using Ascend CANN API.
+ *
+ * This function creates a transposed tensor descriptor and performs the
+ * TransMatmulWeight operation. Converting tensor formats can significantly
+ * improve performance on certain hardware.
+ *
+ * @param tensor Pointer to the input ggml_tensor containing the weights.
+ * @param data Pointer to the raw data buffer for the tensor weights.
+ * @param offset Byte offset within the tensor data buffer where weights start.
+ *
+ * @note The workspace buffer used in this function is managed globally and reused
+ *       across calls. This reduces overhead from repeated memory allocation and deallocation.
+ */
 static void weight_format_to_nz(ggml_tensor *tensor, const void *data, size_t offset) {
-    aclrtStream stream;
-    ACL_CHECK(aclrtCreateStream(&stream));
-
-    std::vector<int64_t> weightTransposedShape = {tensor->ne[1], tensor->ne[0]};
-    void *weightTransposedDeviceAddr = nullptr;
-    aclTensor *weightTransposed = nullptr;
-    CreateAclTensorWeight(data, weightTransposedShape, &weightTransposedDeviceAddr,
-                          ggml_cann_type_mapping(tensor->type), &weightTransposed);
-
+    aclTensor* weightTransposed = ggml_cann_create_tensor(tensor, tensor->ne,
+                                        tensor->nb, 2, ACL_FORMAT_ND, offset);
     uint64_t workspaceSize = 0;
     aclOpExecutor *executor;
-    void *workspaceAddr = nullptr;
 
     // TransMatmulWeight
-    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed, &workspaceSize, &executor));
-    std::unique_ptr<void, aclError (*)(void *)> workspaceAddrPtrTrans(nullptr, aclrtFree);
-    if (workspaceSize > 0) {
-        ACL_CHECK(aclrtMalloc(&workspaceAddr, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
-        workspaceAddrPtrTrans.reset(workspaceAddr);
-    }
-    ACL_CHECK(aclnnTransMatmulWeight(workspaceAddr, workspaceSize, executor, stream));
+    ACL_CHECK(aclnnTransMatmulWeightGetWorkspaceSize(weightTransposed,
+                                                     &workspaceSize, &executor));
+    // Avoid frequent malloc/free of the workspace.
+    relloc_nz_workspace(workspaceSize);
 
-    size_t size = ggml_nelements(tensor) * ggml_element_size(tensor);
-
-    aclrtMemcpy((char *)tensor->data + offset, size,
-                weightTransposedDeviceAddr, size, ACL_MEMCPY_HOST_TO_DEVICE);
+    ACL_CHECK(aclnnTransMatmulWeight(g_nz_workspace, workspaceSize, executor, nullptr));
     ACL_CHECK(aclDestroyTensor(weightTransposed));
-    aclrtFree(weightTransposedDeviceAddr);
 }
 
 // TODO: need handle tensor which has paddings.
@@ -1197,14 +1195,14 @@ static void ggml_backend_cann_buffer_set_tensor(
     // For acl, synchronous functions use this default stream.
     // Why aclrtSynchronizeDevice?
 
-    bool weightToNZ = false;
-#ifdef ASCEND_310P
-    weightToNZ = (getenv("GGML_CANN_WEIGHT_NZ") != nullptr);
-#endif
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
     if (!need_transform(tensor->type)) {
         ACL_CHECK(aclrtMemcpy((char *)tensor->data + offset, size, data, size,
                               ACL_MEMCPY_HOST_TO_DEVICE));
-        if (weightToNZ && is_matmul_weight((const ggml_tensor*)tensor)) {
+        if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+            GGML_ASSERT(tensor->ne[2] == 1);
+            GGML_ASSERT(tensor->ne[3] == 1);
             weight_format_to_nz(tensor, data, offset);
         }
     } else {
@@ -1440,20 +1438,32 @@ static size_t ggml_backend_cann_buffer_type_get_alloc_size(
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
 
+    // Only check env once.
+    static bool weight_to_nz = parse_bool(get_env("GGML_CANN_WEIGHT_NZ").value_or(""));
+
     // last line must bigger than 32, because every single op deal at
     // least 32 bytes.
     // TODO: quantized type?
     // int64_t line_size = ne0 * ggml_element_size(tensor);
     // int64_t line_size_align_32 = (line_size + 31) & ~31;
     // size += (line_size_align_32 - line_size);
-
-    // TODO: not support quantized yet.
-    // TODO: consider un-continue tensor.
     if (ggml_is_quantized(tensor->type)) {
         if (ne0 % MATRIX_ROW_PADDING != 0) {
             size += ggml_row_size(
                 tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
         }
+    } else if (weight_to_nz && is_matmul_weight((const ggml_tensor*)tensor)) {
+        // NZ format weights do not support quantized types yet.
+        // If an ND tensor is transformed to NZ, its size may change.
+        int64_t shape[] = {tensor->ne[1], tensor->ne[0]};
+        GGML_ASSERT(tensor->ne[2] == 1);
+        GGML_ASSERT(tensor->ne[3] == 1);
+        const aclIntArray *acl_shape = aclCreateIntArray(shape, 2);
+        size_t new_size;
+        ACL_CHECK(aclnnCalculateMatmulWeightSizeV2(acl_shape,
+            ggml_cann_type_mapping(tensor->type), &new_size));
+        ACL_CHECK(aclDestroyIntArray(acl_shape));
+        size = std::max(size, new_size);
     }
 
     return size;
@@ -2080,6 +2090,8 @@ static enum ggml_status ggml_backend_cann_graph_compute(
         (ggml_backend_cann_context*)backend->context;
 
     ggml_cann_set_device(cann_ctx->device);
+    // Release the temp buffer created by set_tensor.
+    release_nz_workspace();
 
     for (int i = 0; i < cgraph->n_nodes; i++) {
         ggml_tensor* node = cgraph->nodes[i];
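For reviewers, the buffer handling added above boils down to a grow-and-reuse scratch workspace that is kept alive across all weight conversions during model load and released once graph execution starts. A minimal standalone sketch of that pattern is shown below (illustrative names only; plain malloc/free stand in for aclrtMalloc/aclrtFree, and ACL error checking is omitted):

// Sketch: grow-and-reuse workspace (illustrative; not part of the patch).
#include <cstdlib>
#include <cstddef>

static void*  g_workspace       = nullptr;
static size_t g_workspace_bytes = 0;

// Grow the buffer only when a larger request arrives; smaller or equal
// requests reuse the existing allocation, avoiding per-tensor malloc/free.
static void* workspace_reserve(size_t bytes) {
    if (bytes > g_workspace_bytes) {
        std::free(g_workspace);
        g_workspace       = std::malloc(bytes);
        g_workspace_bytes = (g_workspace != nullptr) ? bytes : 0;
    }
    return g_workspace;
}

// Drop the cached buffer once all weights have been converted, mirroring
// release_nz_workspace() called at the start of graph compute.
static void workspace_release() {
    std::free(g_workspace);
    g_workspace       = nullptr;
    g_workspace_bytes = 0;
}

Keeping a single allocation alive while every matmul weight is converted is where the loading-time improvement in this patch comes from; the per-weight aclrtMalloc/aclrtFree pair in the old CreateAclTensorWeight path is replaced by one reusable buffer.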