mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-13 20:07:41 -04:00
CANN: add support for ACL Graph (#15065)
* feat(cann): add optional support for ACL Graph execution This commit adds support for executing ggml computational graphs using Huawei's ACL graph mode via the USE_CANN_GRAPH flag. The support can be enabled at compile time using the CMake option: -DUSE_CANN_GRAPH=ON By default, ACL graph execution is **disabled**, and the fallback path uses node-by-node execution. Key additions: - CMake option to toggle graph mode - Graph capture and execution logic using - Tensor property matching to determine whether graph update is required - Safe fallback and logging if the environment variable LLAMA_SET_ROWS is unset or invalid This prepares the backend for performance improvements in repetitive graph execution scenarios on Ascend devices. Signed-off-by: noemotiovon <757486878@qq.com> * Fix review comments Signed-off-by: noemotiovon <757486878@qq.com> * remane USE_CANN_GRAPH to USE_ACL_GRAPH Signed-off-by: noemotiovon <757486878@qq.com> * fix typo Signed-off-by: noemotiovon <757486878@qq.com> --------- Signed-off-by: noemotiovon <757486878@qq.com>
This commit is contained in:
@@ -31,6 +31,13 @@ string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
|
||||
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
|
||||
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
|
||||
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
|
||||
option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)
|
||||
|
||||
if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
|
||||
message(FATAL_ERROR
|
||||
"CANN Graph (ACL graph mode) is not supported on 310P devices. "
|
||||
"Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
|
||||
endif()
|
||||
|
||||
if (CANN_INSTALL_DIR)
|
||||
# Only Support Linux.
|
||||
@@ -68,6 +75,13 @@ if (CANN_INSTALL_DIR)
|
||||
|
||||
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
||||
|
||||
if (USE_ACL_GRAPH)
|
||||
target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
|
||||
message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
|
||||
else()
|
||||
message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
|
||||
endif()
|
||||
|
||||
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
||||
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
||||
else()
|
||||
|
@@ -337,6 +337,29 @@ private:
|
||||
int32_t device_;
|
||||
};
|
||||
|
||||
#ifdef USE_ACL_GRAPH
|
||||
struct ggml_graph_node_properties {
|
||||
void * node_address;
|
||||
ggml_op node_op;
|
||||
int64_t ne[GGML_MAX_DIMS];
|
||||
size_t nb[GGML_MAX_DIMS];
|
||||
void * src_address[GGML_MAX_SRC];
|
||||
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
||||
};
|
||||
|
||||
struct ggml_cann_graph {
|
||||
~ggml_cann_graph() {
|
||||
if (graph != nullptr) {
|
||||
aclmdlRIDestroy(graph);
|
||||
}
|
||||
}
|
||||
|
||||
aclmdlRI graph = nullptr;
|
||||
|
||||
std::vector<ggml_graph_node_properties> ggml_graph_properties;
|
||||
};
|
||||
#endif // USE_ACL_GRAPH
|
||||
|
||||
/**
|
||||
* @brief Context for managing CANN backend operations.
|
||||
*/
|
||||
@@ -345,8 +368,13 @@ struct ggml_backend_cann_context {
|
||||
std::string name; /**< Name of the device. */
|
||||
std::string description; /**< Description of the device. */
|
||||
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
||||
#ifdef USE_ACL_GRAPH
|
||||
/// Cached CANN ACL graph used for executing the current ggml computation graph.
|
||||
std::unique_ptr<ggml_cann_graph> cann_graph;
|
||||
#endif
|
||||
cann_task_queue task_queue;
|
||||
bool async_mode;
|
||||
bool support_set_rows;
|
||||
|
||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
||||
|
||||
@@ -362,6 +390,14 @@ struct ggml_backend_cann_context {
|
||||
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
|
||||
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
|
||||
device, async_mode ? "ON" : "OFF");
|
||||
|
||||
support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
|
||||
GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");
|
||||
|
||||
if (!support_set_rows) {
|
||||
GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
|
||||
"Falling back to eager mode.\n", __func__);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
@@ -2075,6 +2075,160 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||
}
|
||||
|
||||
#ifdef USE_ACL_GRAPH
|
||||
/**
|
||||
* @brief Populate the internal CANN graph node properties from the ggml computation graph.
|
||||
*
|
||||
* This function copies all node attributes (operation type, dimensions, strides, input sources,
|
||||
* and operation parameters) into the cached CANN graph structure for later reuse or comparison.
|
||||
*
|
||||
* @param cann_ctx The CANN backend context.
|
||||
* @param cgraph The ggml computational graph.
|
||||
*/
|
||||
static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
||||
for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
|
||||
ggml_tensor * node = cgraph->nodes[node_idx];
|
||||
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;
|
||||
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;
|
||||
|
||||
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
|
||||
cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
|
||||
cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
|
||||
}
|
||||
for (int src = 0; src < GGML_MAX_SRC; src++) {
|
||||
cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
|
||||
node->src[src] ? node->src[src]->data : nullptr;
|
||||
}
|
||||
memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Check if a ggml tensor node matches a previously captured CANN graph node.
|
||||
*
|
||||
* This function compares all relevant fields (address, op type, shape, source inputs, op params)
|
||||
* to determine whether the current node matches a previously recorded version.
|
||||
*
|
||||
* @param node The current ggml tensor node.
|
||||
* @param graph_node_properties The stored properties of a CANN graph node.
|
||||
* @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
|
||||
*/
|
||||
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||
if (node->data != graph_node_properties->node_address &&
|
||||
node->op != GGML_OP_VIEW) {
|
||||
return false;
|
||||
}
|
||||
if (node->op != graph_node_properties->node_op) {
|
||||
return false;
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||
if (node->ne[i] != graph_node_properties->ne[i]) {
|
||||
return false;
|
||||
}
|
||||
if (node->nb[i] != graph_node_properties->nb[i]) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||
if (node->src[i] &&
|
||||
node->src[i]->data != graph_node_properties->src_address[i] &&
|
||||
node->op != GGML_OP_VIEW
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
if (node->op == GGML_OP_SCALE &&
|
||||
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
|
||||
*
|
||||
* This checks whether the number or properties of ggml graph nodes have changed
|
||||
* compared to the last captured CANN graph. If so, the CANN graph must be re-captured.
|
||||
*
|
||||
* @param cann_ctx The CANN backend context.
|
||||
* @param cgraph The current ggml computation graph.
|
||||
* @return true if an update is required; false otherwise.
|
||||
*/
|
||||
static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
||||
// The number of nodes is different, so the graph needs to be reconstructed.
|
||||
if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
|
||||
cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
||||
return true;
|
||||
}
|
||||
|
||||
// The number of nodes is the same; iterate over each node to check whether they match.
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
bool has_matching_properties = ggml_graph_node_has_matching_properties(
|
||||
cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
|
||||
if(!has_matching_properties) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif // USE_ACL_GRAPH
|
||||
|
||||
/**
|
||||
* @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
|
||||
*
|
||||
* If CANN graph execution is enabled and graph capture is required, this function begins
|
||||
* graph capture, runs the graph, ends capture, and stores the captured graph.
|
||||
*
|
||||
* Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
|
||||
*
|
||||
* @param cann_ctx The CANN backend context.
|
||||
* @param cgraph The ggml computation graph.
|
||||
* @param use_cann_graph Whether to use CANN graph execution.
|
||||
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
|
||||
*/
|
||||
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
|
||||
bool & use_cann_graph, bool & cann_graph_update_required) {
|
||||
#ifdef USE_ACL_GRAPH
|
||||
if (use_cann_graph && cann_graph_update_required) {
|
||||
if (cann_ctx->cann_graph->graph != nullptr) {
|
||||
ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
|
||||
cann_ctx->cann_graph->graph = nullptr;
|
||||
}
|
||||
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
|
||||
}
|
||||
#endif // USE_ACL_GRAPH
|
||||
|
||||
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
|
||||
// With the use of CANN graphs, the execution will be performed by the graph launch.
|
||||
if (!use_cann_graph || cann_graph_update_required) {
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor * node = cgraph->nodes[i];
|
||||
|
||||
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||
}
|
||||
GGML_ASSERT(ok);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef USE_ACL_GRAPH
|
||||
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
|
||||
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
|
||||
}
|
||||
|
||||
if (use_cann_graph) {
|
||||
// Execute graph
|
||||
ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));
|
||||
}
|
||||
#endif // USE_ACL_GRAPH
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* @brief Computes a computational graph using a CANN backend.
|
||||
*
|
||||
@@ -2091,27 +2245,38 @@ static enum ggml_status ggml_backend_cann_graph_compute(
|
||||
ggml_backend_t backend, ggml_cgraph* cgraph) {
|
||||
ggml_backend_cann_context* cann_ctx =
|
||||
(ggml_backend_cann_context*)backend->context;
|
||||
|
||||
ggml_cann_set_device(cann_ctx->device);
|
||||
//release temp buffer create by set tensor.
|
||||
release_nz_workspace();
|
||||
#ifdef USE_ACL_GRAPH
|
||||
bool use_cann_graph = true;
|
||||
bool cann_graph_update_required = false;
|
||||
|
||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||
ggml_tensor* node = cgraph->nodes[i];
|
||||
|
||||
if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
|
||||
continue;
|
||||
}
|
||||
|
||||
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
||||
|
||||
if (!ok) {
|
||||
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
|
||||
node->name, ggml_op_name(node->op));
|
||||
}
|
||||
GGML_ASSERT(ok);
|
||||
// check environment LLAMA_SET_ROWS
|
||||
if (!cann_ctx->support_set_rows) {
|
||||
use_cann_graph = false;
|
||||
}
|
||||
|
||||
if (use_cann_graph) {
|
||||
if (cann_ctx->cann_graph == nullptr) {
|
||||
cann_ctx->cann_graph.reset(new ggml_cann_graph());
|
||||
cann_graph_update_required = true;
|
||||
}
|
||||
|
||||
cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
|
||||
set_ggml_graph_node_properties(cann_ctx, cgraph);
|
||||
}
|
||||
#else
|
||||
bool use_cann_graph = false;
|
||||
bool cann_graph_update_required = false;
|
||||
#endif // USE_ACL_GRAPH
|
||||
|
||||
evaluate_and_capture_cann_graph(
|
||||
cann_ctx,
|
||||
cgraph,
|
||||
use_cann_graph,
|
||||
cann_graph_update_required
|
||||
);
|
||||
|
||||
return GGML_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
@@ -2226,12 +2391,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
||||
// only support F32 and F16.
|
||||
return false;
|
||||
}
|
||||
|
||||
if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
|
||||
// unsupport dst is not contiguous.
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
} break;
|
||||
case GGML_OP_CONT: {
|
||||
|
Reference in New Issue
Block a user