mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-15 12:42:40 -04:00
CANN: add support for ACL Graph (#15065)
* feat(cann): add optional support for ACL Graph execution This commit adds support for executing ggml computational graphs using Huawei's ACL graph mode via the USE_CANN_GRAPH flag. The support can be enabled at compile time using the CMake option: -DUSE_CANN_GRAPH=ON By default, ACL graph execution is **disabled**, and the fallback path uses node-by-node execution. Key additions: - CMake option to toggle graph mode - Graph capture and execution logic using - Tensor property matching to determine whether graph update is required - Safe fallback and logging if the environment variable LLAMA_SET_ROWS is unset or invalid This prepares the backend for performance improvements in repetitive graph execution scenarios on Ascend devices. Signed-off-by: noemotiovon <757486878@qq.com> * Fix review comments Signed-off-by: noemotiovon <757486878@qq.com> * remane USE_CANN_GRAPH to USE_ACL_GRAPH Signed-off-by: noemotiovon <757486878@qq.com> * fix typo Signed-off-by: noemotiovon <757486878@qq.com> --------- Signed-off-by: noemotiovon <757486878@qq.com>
This commit is contained in:
@@ -31,6 +31,13 @@ string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
|
|||||||
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
|
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
|
||||||
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
|
string(TOUPPER ${SOC_TYPE_COMPILE_OPTION} SOC_TYPE_COMPILE_OPTION)
|
||||||
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
|
message(STATUS "CANN: SOC_VERSION = ${SOC_VERSION}")
|
||||||
|
option(USE_ACL_GRAPH "Enable CANN graph execution (ACL graph mode)" OFF)
|
||||||
|
|
||||||
|
if(USE_ACL_GRAPH AND (SOC_TYPE_MAJOR_SN STREQUAL "310P" OR SOC_TYPE_COMPILE_OPTION STREQUAL "ASCEND_310P"))
|
||||||
|
message(FATAL_ERROR
|
||||||
|
"CANN Graph (ACL graph mode) is not supported on 310P devices. "
|
||||||
|
"Please build with -DUSE_ACL_GRAPH=OFF or use a supported SOC.")
|
||||||
|
endif()
|
||||||
|
|
||||||
if (CANN_INSTALL_DIR)
|
if (CANN_INSTALL_DIR)
|
||||||
# Only Support Linux.
|
# Only Support Linux.
|
||||||
@@ -68,6 +75,13 @@ if (CANN_INSTALL_DIR)
|
|||||||
|
|
||||||
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
|
||||||
|
|
||||||
|
if (USE_ACL_GRAPH)
|
||||||
|
target_compile_definitions(ggml-cann PRIVATE USE_ACL_GRAPH)
|
||||||
|
message(STATUS "CANN: USE_ACL_GRAPH is enabled.")
|
||||||
|
else()
|
||||||
|
message(STATUS "CANN: USE_ACL_GRAPH is disabled.")
|
||||||
|
endif()
|
||||||
|
|
||||||
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
|
||||||
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
|
||||||
else()
|
else()
|
||||||
|
@@ -337,6 +337,29 @@ private:
|
|||||||
int32_t device_;
|
int32_t device_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
#ifdef USE_ACL_GRAPH
|
||||||
|
struct ggml_graph_node_properties {
|
||||||
|
void * node_address;
|
||||||
|
ggml_op node_op;
|
||||||
|
int64_t ne[GGML_MAX_DIMS];
|
||||||
|
size_t nb[GGML_MAX_DIMS];
|
||||||
|
void * src_address[GGML_MAX_SRC];
|
||||||
|
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_cann_graph {
|
||||||
|
~ggml_cann_graph() {
|
||||||
|
if (graph != nullptr) {
|
||||||
|
aclmdlRIDestroy(graph);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
aclmdlRI graph = nullptr;
|
||||||
|
|
||||||
|
std::vector<ggml_graph_node_properties> ggml_graph_properties;
|
||||||
|
};
|
||||||
|
#endif // USE_ACL_GRAPH
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Context for managing CANN backend operations.
|
* @brief Context for managing CANN backend operations.
|
||||||
*/
|
*/
|
||||||
@@ -345,8 +368,13 @@ struct ggml_backend_cann_context {
|
|||||||
std::string name; /**< Name of the device. */
|
std::string name; /**< Name of the device. */
|
||||||
std::string description; /**< Description of the device. */
|
std::string description; /**< Description of the device. */
|
||||||
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
aclrtEvent copy_event = nullptr; /**< Event for managing copy operations. */
|
||||||
|
#ifdef USE_ACL_GRAPH
|
||||||
|
/// Cached CANN ACL graph used for executing the current ggml computation graph.
|
||||||
|
std::unique_ptr<ggml_cann_graph> cann_graph;
|
||||||
|
#endif
|
||||||
cann_task_queue task_queue;
|
cann_task_queue task_queue;
|
||||||
bool async_mode;
|
bool async_mode;
|
||||||
|
bool support_set_rows;
|
||||||
|
|
||||||
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
aclrtStream streams[GGML_CANN_MAX_STREAMS] = {nullptr}; /**< Array of streams for the device. */
|
||||||
|
|
||||||
@@ -362,6 +390,14 @@ struct ggml_backend_cann_context {
|
|||||||
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
|
async_mode = parse_bool(get_env("GGML_CANN_ASYNC_MODE").value_or(""));
|
||||||
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
|
GGML_LOG_INFO("%s: device %d async operator submission is %s\n", __func__,
|
||||||
device, async_mode ? "ON" : "OFF");
|
device, async_mode ? "ON" : "OFF");
|
||||||
|
|
||||||
|
support_set_rows = parse_bool(get_env("LLAMA_SET_ROWS").value_or(""));
|
||||||
|
GGML_LOG_INFO("%s: LLAMA_SET_ROWS is %s\n", __func__, support_set_rows ? "ON" : "OFF");
|
||||||
|
|
||||||
|
if (!support_set_rows) {
|
||||||
|
GGML_LOG_INFO("%s: CANN Graph currently only supports execution when LLAMA_SET_ROWS is ON. "
|
||||||
|
"Falling back to eager mode.\n", __func__);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@@ -2075,6 +2075,160 @@ static void ggml_backend_cann_synchronize(ggml_backend_t backend) {
|
|||||||
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
ACL_CHECK(aclrtSynchronizeStream(cann_ctx->stream()));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef USE_ACL_GRAPH
|
||||||
|
/**
|
||||||
|
* @brief Populate the internal CANN graph node properties from the ggml computation graph.
|
||||||
|
*
|
||||||
|
* This function copies all node attributes (operation type, dimensions, strides, input sources,
|
||||||
|
* and operation parameters) into the cached CANN graph structure for later reuse or comparison.
|
||||||
|
*
|
||||||
|
* @param cann_ctx The CANN backend context.
|
||||||
|
* @param cgraph The ggml computational graph.
|
||||||
|
*/
|
||||||
|
static void set_ggml_graph_node_properties(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
||||||
|
for (int node_idx = 0; node_idx < cgraph->n_nodes; node_idx++) {
|
||||||
|
ggml_tensor * node = cgraph->nodes[node_idx];
|
||||||
|
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_address = node->data;
|
||||||
|
cann_ctx->cann_graph->ggml_graph_properties[node_idx].node_op = node->op;
|
||||||
|
|
||||||
|
for (int dim = 0; dim < GGML_MAX_DIMS; dim++) {
|
||||||
|
cann_ctx->cann_graph->ggml_graph_properties[node_idx].ne[dim] = node->ne[dim];
|
||||||
|
cann_ctx->cann_graph->ggml_graph_properties[node_idx].nb[dim] = node->nb[dim];
|
||||||
|
}
|
||||||
|
for (int src = 0; src < GGML_MAX_SRC; src++) {
|
||||||
|
cann_ctx->cann_graph->ggml_graph_properties[node_idx].src_address[src] =
|
||||||
|
node->src[src] ? node->src[src]->data : nullptr;
|
||||||
|
}
|
||||||
|
memcpy(cann_ctx->cann_graph->ggml_graph_properties[node_idx].op_params, node->op_params, GGML_MAX_OP_PARAMS);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Check if a ggml tensor node matches a previously captured CANN graph node.
|
||||||
|
*
|
||||||
|
* This function compares all relevant fields (address, op type, shape, source inputs, op params)
|
||||||
|
* to determine whether the current node matches a previously recorded version.
|
||||||
|
*
|
||||||
|
* @param node The current ggml tensor node.
|
||||||
|
* @param graph_node_properties The stored properties of a CANN graph node.
|
||||||
|
* @return true if all fields match (excluding GGML_OP_VIEW); false otherwise.
|
||||||
|
*/
|
||||||
|
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||||
|
if (node->data != graph_node_properties->node_address &&
|
||||||
|
node->op != GGML_OP_VIEW) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (node->op != graph_node_properties->node_op) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||||
|
if (node->ne[i] != graph_node_properties->ne[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (node->nb[i] != graph_node_properties->nb[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
|
if (node->src[i] &&
|
||||||
|
node->src[i]->data != graph_node_properties->src_address[i] &&
|
||||||
|
node->op != GGML_OP_VIEW
|
||||||
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (node->op == GGML_OP_SCALE &&
|
||||||
|
memcmp(graph_node_properties->op_params, node->op_params, GGML_MAX_OP_PARAMS) != 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Determine if the CANN graph needs to be rebuilt due to graph changes.
|
||||||
|
*
|
||||||
|
* This checks whether the number or properties of ggml graph nodes have changed
|
||||||
|
* compared to the last captured CANN graph. If so, the CANN graph must be re-captured.
|
||||||
|
*
|
||||||
|
* @param cann_ctx The CANN backend context.
|
||||||
|
* @param cgraph The current ggml computation graph.
|
||||||
|
* @return true if an update is required; false otherwise.
|
||||||
|
*/
|
||||||
|
static bool is_cann_graph_update_required(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph) {
|
||||||
|
// The number of nodes is different, so the graph needs to be reconstructed.
|
||||||
|
if (cann_ctx->cann_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
|
||||||
|
cann_ctx->cann_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// The number of nodes is the same; iterate over each node to check whether they match.
|
||||||
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
|
bool has_matching_properties = ggml_graph_node_has_matching_properties(
|
||||||
|
cgraph->nodes[i], &cann_ctx->cann_graph->ggml_graph_properties[i]);
|
||||||
|
if(!has_matching_properties) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#endif // USE_ACL_GRAPH
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @brief Evaluate the computation graph and optionally capture or execute it using CANN graph API.
|
||||||
|
*
|
||||||
|
* If CANN graph execution is enabled and graph capture is required, this function begins
|
||||||
|
* graph capture, runs the graph, ends capture, and stores the captured graph.
|
||||||
|
*
|
||||||
|
* Otherwise, it falls back to op-by-op execution using the CANN compute kernel dispatcher.
|
||||||
|
*
|
||||||
|
* @param cann_ctx The CANN backend context.
|
||||||
|
* @param cgraph The ggml computation graph.
|
||||||
|
* @param use_cann_graph Whether to use CANN graph execution.
|
||||||
|
* @param cann_graph_update_required Whether graph capture is needed due to graph changes.
|
||||||
|
*/
|
||||||
|
static void evaluate_and_capture_cann_graph(ggml_backend_cann_context * cann_ctx, ggml_cgraph * cgraph,
|
||||||
|
bool & use_cann_graph, bool & cann_graph_update_required) {
|
||||||
|
#ifdef USE_ACL_GRAPH
|
||||||
|
if (use_cann_graph && cann_graph_update_required) {
|
||||||
|
if (cann_ctx->cann_graph->graph != nullptr) {
|
||||||
|
ACL_CHECK(aclmdlRIDestroy(cann_ctx->cann_graph->graph));
|
||||||
|
cann_ctx->cann_graph->graph = nullptr;
|
||||||
|
}
|
||||||
|
ACL_CHECK(aclmdlRICaptureBegin(cann_ctx->stream(), ACL_MODEL_RI_CAPTURE_MODE_GLOBAL));
|
||||||
|
}
|
||||||
|
#endif // USE_ACL_GRAPH
|
||||||
|
|
||||||
|
// Only perform the graph execution if CANN graphs are not enabled, or we are capturing the graph.
|
||||||
|
// With the use of CANN graphs, the execution will be performed by the graph launch.
|
||||||
|
if (!use_cann_graph || cann_graph_update_required) {
|
||||||
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
|
if (ggml_is_empty(node) || node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
||||||
|
if (!ok) {
|
||||||
|
GGML_LOG_ERROR("%s: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
|
||||||
|
}
|
||||||
|
GGML_ASSERT(ok);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef USE_ACL_GRAPH
|
||||||
|
if (use_cann_graph && cann_graph_update_required) { // End CANN graph capture
|
||||||
|
ACL_CHECK(aclmdlRICaptureEnd(cann_ctx->stream(), &cann_ctx->cann_graph->graph));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (use_cann_graph) {
|
||||||
|
// Execute graph
|
||||||
|
ACL_CHECK(aclmdlRIExecuteAsync(cann_ctx->cann_graph->graph, cann_ctx->stream()));
|
||||||
|
}
|
||||||
|
#endif // USE_ACL_GRAPH
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @brief Computes a computational graph using a CANN backend.
|
* @brief Computes a computational graph using a CANN backend.
|
||||||
*
|
*
|
||||||
@@ -2091,27 +2245,38 @@ static enum ggml_status ggml_backend_cann_graph_compute(
|
|||||||
ggml_backend_t backend, ggml_cgraph* cgraph) {
|
ggml_backend_t backend, ggml_cgraph* cgraph) {
|
||||||
ggml_backend_cann_context* cann_ctx =
|
ggml_backend_cann_context* cann_ctx =
|
||||||
(ggml_backend_cann_context*)backend->context;
|
(ggml_backend_cann_context*)backend->context;
|
||||||
|
|
||||||
ggml_cann_set_device(cann_ctx->device);
|
ggml_cann_set_device(cann_ctx->device);
|
||||||
//release temp buffer create by set tensor.
|
|
||||||
release_nz_workspace();
|
release_nz_workspace();
|
||||||
|
#ifdef USE_ACL_GRAPH
|
||||||
|
bool use_cann_graph = true;
|
||||||
|
bool cann_graph_update_required = false;
|
||||||
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
// check environment LLAMA_SET_ROWS
|
||||||
ggml_tensor* node = cgraph->nodes[i];
|
if (!cann_ctx->support_set_rows) {
|
||||||
|
use_cann_graph = false;
|
||||||
if (ggml_is_empty(node) || node->op == GGML_OP_NONE) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool ok = ggml_cann_compute_forward(*cann_ctx, node);
|
|
||||||
|
|
||||||
if (!ok) {
|
|
||||||
GGML_LOG_ERROR("%s: error: op not supported %s (%s)\n", __func__,
|
|
||||||
node->name, ggml_op_name(node->op));
|
|
||||||
}
|
|
||||||
GGML_ASSERT(ok);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (use_cann_graph) {
|
||||||
|
if (cann_ctx->cann_graph == nullptr) {
|
||||||
|
cann_ctx->cann_graph.reset(new ggml_cann_graph());
|
||||||
|
cann_graph_update_required = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
cann_graph_update_required = is_cann_graph_update_required(cann_ctx, cgraph);
|
||||||
|
set_ggml_graph_node_properties(cann_ctx, cgraph);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
bool use_cann_graph = false;
|
||||||
|
bool cann_graph_update_required = false;
|
||||||
|
#endif // USE_ACL_GRAPH
|
||||||
|
|
||||||
|
evaluate_and_capture_cann_graph(
|
||||||
|
cann_ctx,
|
||||||
|
cgraph,
|
||||||
|
use_cann_graph,
|
||||||
|
cann_graph_update_required
|
||||||
|
);
|
||||||
|
|
||||||
return GGML_STATUS_SUCCESS;
|
return GGML_STATUS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -2226,12 +2391,6 @@ static bool ggml_backend_cann_supports_op(ggml_backend_dev_t dev,
|
|||||||
// only support F32 and F16.
|
// only support F32 and F16.
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!ggml_are_same_shape(op, src) && !ggml_is_contiguous(op)) {
|
|
||||||
// unsupport dst is not contiguous.
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
} break;
|
} break;
|
||||||
case GGML_OP_CONT: {
|
case GGML_OP_CONT: {
|
||||||
|
Reference in New Issue
Block a user