mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-18 14:18:50 -04:00
vulkan: Add fusion support for RMS_NORM+MUL (#14366)
* vulkan: Add fusion support for RMS_NORM+MUL - Add a use_count to ggml_tensor, so we can detect if an output is used more than once. - Change the ggml-vulkan rms_norm shader to optionally multiply by another tensor. - Add detection logic and basic fusion logic in ggml-vulkan. - Add some testing support for fusion. Rather than computing one node at a time, allow for computing the whole graph and just testing one node's results. Add rms_norm_mul tests and enable a llama test. * extract some common fusion logic * fix -Winconsistent-missing-override * move ggml_can_fuse to a common function * build fix * C and C++ versions of can_fuse * move use count to the graph to avoid data races and double increments when used in multiple threads * use hash table lookup to find node index * change use_counts to be indexed by hash table slot * minimize hash lookups * style fixes * last node doesn't need single use. fix type. handle mul operands being swapped. * remove redundant parameter --------- Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
@@ -5841,19 +5841,32 @@ static void ggml_compute_backward(
|
||||
GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
|
||||
}
|
||||
|
||||
static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
|
||||
static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
|
||||
// check if already visited
|
||||
if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
|
||||
return;
|
||||
size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
|
||||
GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL);
|
||||
if (!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
|
||||
// This is the first time we see this node in the current graph.
|
||||
cgraph->visited_hash_set.keys[node_hash_pos] = node;
|
||||
ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
|
||||
cgraph->use_counts[node_hash_pos] = 0;
|
||||
} else {
|
||||
// already visited
|
||||
return node_hash_pos;
|
||||
}
|
||||
|
||||
for (int i = 0; i < GGML_MAX_SRC; ++i) {
|
||||
const int k =
|
||||
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
|
||||
(cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
|
||||
/* unknown order, just fall back to using i*/ i;
|
||||
if (node->src[k]) {
|
||||
ggml_visit_parents(cgraph, node->src[k]);
|
||||
/* unknown order, just fall back to using i */ i;
|
||||
|
||||
struct ggml_tensor * src = node->src[k];
|
||||
if (src) {
|
||||
size_t src_hash_pos = ggml_visit_parents(cgraph, src);
|
||||
|
||||
// Update the use count for this operand.
|
||||
cgraph->use_counts[src_hash_pos]++;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5877,6 +5890,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
|
||||
cgraph->nodes[cgraph->n_nodes] = node;
|
||||
cgraph->n_nodes++;
|
||||
}
|
||||
|
||||
return node_hash_pos;
|
||||
}
|
||||
|
||||
static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
|
||||
@@ -6014,6 +6029,7 @@ static size_t ggml_graph_nbytes(size_t size, bool grads) {
|
||||
incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
|
||||
incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
|
||||
incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
|
||||
incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t)); // use_counts
|
||||
incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
|
||||
if (grads) {
|
||||
incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
|
||||
@@ -6043,11 +6059,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
|
||||
|
||||
void * p = cgraph + 1;
|
||||
|
||||
struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
|
||||
struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
|
||||
struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
|
||||
struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
|
||||
struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
|
||||
struct ggml_tensor ** nodes_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
|
||||
struct ggml_tensor ** leafs_ptr = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
|
||||
int32_t * use_counts_ptr = incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t));
|
||||
struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
|
||||
struct ggml_tensor ** grads_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
|
||||
struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
|
||||
|
||||
ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
|
||||
|
||||
@@ -6062,6 +6079,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
|
||||
/*.grads =*/ grads_ptr,
|
||||
/*.grad_accs =*/ grad_accs_ptr,
|
||||
/*.leafs =*/ leafs_ptr,
|
||||
/*.use_counts =*/ use_counts_ptr,
|
||||
/*.hash_table =*/ { hash_size, hash_used, hash_keys_ptr },
|
||||
/*.order =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
|
||||
};
|
||||
@@ -6088,7 +6106,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
|
||||
/*.grads =*/ NULL, // gradients would need visited_hash_set
|
||||
/*.grad_accs =*/ NULL,
|
||||
/*.leafs =*/ NULL,
|
||||
/*.visited_hash_set =*/ { 0, NULL, NULL },
|
||||
/*.use_counts =*/ cgraph0->use_counts,
|
||||
/*.visited_hash_set =*/ cgraph0->visited_hash_set,
|
||||
/*.order =*/ cgraph0->order,
|
||||
};
|
||||
|
||||
@@ -6115,7 +6134,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
|
||||
for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
|
||||
// copy all hashset keys (tensors) that are in use
|
||||
if (ggml_bitset_get(src->visited_hash_set.used, i)) {
|
||||
ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
|
||||
size_t new_hash_pos = ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
|
||||
dst->use_counts[new_hash_pos] = src->use_counts[i];
|
||||
}
|
||||
}
|
||||
|
||||
|
Reference in New Issue
Block a user