vulkan: Add fusion support for RMS_NORM+MUL (#14366)

* vulkan: Add fusion support for RMS_NORM+MUL

- Add a use_count to ggml_tensor, so we can detect if an output is used more than once.
- Change the ggml-vulkan rms_norm shader to optionally multiply by another tensor (see the sketch below for the fused semantics).
- Add detection logic and basic fusion logic in ggml-vulkan.
- Add some testing support for fusion: rather than computing one node at a time, allow computing the whole graph and testing just one node's results. Add rms_norm_mul tests and enable a llama test.
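
For reference, the fused operation computes an RMS normalization over each row and multiplies the result elementwise by the second tensor in the same pass. A minimal scalar sketch of the intended semantics (illustrative only, not the Vulkan shader; the function name, single-row scope, and eps parameter are assumptions):

#include <math.h>
#include <stddef.h>

// Sketch of the fused RMS_NORM+MUL semantics for one row: y = rms_norm(x) * m.
static void rms_norm_mul_row(const float * x, const float * m, float * y,
                             size_t n, float eps) {
    float sum = 0.0f;
    for (size_t i = 0; i < n; i++) {
        sum += x[i] * x[i];                  // accumulate sum of squares
    }
    const float scale = 1.0f / sqrtf(sum / (float) n + eps);
    for (size_t i = 0; i < n; i++) {
        y[i] = x[i] * scale * m[i];          // MUL fused into the same pass
    }
}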

* extract some common fusion logic

* fix -Winconsistent-missing-override

* move ggml_can_fuse to a common function

* build fix

* C and C++ versions of can_fuse

* move use count to the graph to avoid data races and double increments when used in multiple threads

* use hash table lookup to find node index

* change use_counts to be indexed by hash table slot (see the lookup sketch after this changelog)

* minimize hash lookups; style fixes

* last node doesn't need single use; fix a typo; handle mul operands being swapped

* remove redundant parameter
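
Because the counts live in the graph and are indexed by hash-table slot, a reader first locates the tensor's slot in visited_hash_set. A hypothetical helper sketching that lookup (get_use_count is an illustrative name; ggml_hash_find, ggml_bitset_get, and GGML_HASHSET_FULL are the existing hash-set primitives from ggml-impl.h):

// Hypothetical helper: read a node's use count from the graph.
// use_counts is indexed by the node's slot in visited_hash_set,
// not by its position in cgraph->nodes.
static int32_t get_use_count(const struct ggml_cgraph * cgraph,
                             const struct ggml_tensor * node) {
    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
    GGML_ASSERT(hash_pos != GGML_HASHSET_FULL);                            // table must not be full
    GGML_ASSERT(ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos)); // node must be in the graph
    return cgraph->use_counts[hash_pos];
}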

---------

Co-authored-by: slaren <slarengh@gmail.com>

Author:   Jeff Bolz
Date:     2025-06-29 02:43:36 -05:00 (committed by GitHub)
Parent:   27208bf657
Commit:   bd9c981d72
8 changed files with 263 additions and 56 deletions

ggml/src/ggml.c

@@ -5841,19 +5841,32 @@ static void ggml_compute_backward(
     GGML_ASSERT(!src2_needs_grads || ggml_are_same_shape(src2, cgraph->grads[isrc2]));
 }
 
-static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
+static size_t ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor * node) {
     // check if already visited
-    if (ggml_hash_insert(&cgraph->visited_hash_set, node) == GGML_HASHSET_ALREADY_EXISTS) {
-        return;
+    size_t node_hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
+    GGML_ASSERT(node_hash_pos != GGML_HASHSET_FULL);
+    if (!ggml_bitset_get(cgraph->visited_hash_set.used, node_hash_pos)) {
+        // This is the first time we see this node in the current graph.
+        cgraph->visited_hash_set.keys[node_hash_pos] = node;
+        ggml_bitset_set(cgraph->visited_hash_set.used, node_hash_pos);
+        cgraph->use_counts[node_hash_pos] = 0;
+    } else {
+        // already visited
+        return node_hash_pos;
     }
 
     for (int i = 0; i < GGML_MAX_SRC; ++i) {
         const int k =
             (cgraph->order == GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT) ? i :
             (cgraph->order == GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT) ? (GGML_MAX_SRC-1-i) :
-            /* unknown order, just fall back to using i*/ i;
-        if (node->src[k]) {
-            ggml_visit_parents(cgraph, node->src[k]);
+            /* unknown order, just fall back to using i */ i;
+
+        struct ggml_tensor * src = node->src[k];
+        if (src) {
+            size_t src_hash_pos = ggml_visit_parents(cgraph, src);
+
+            // Update the use count for this operand.
+            cgraph->use_counts[src_hash_pos]++;
         }
     }
@@ -5877,6 +5890,8 @@ static void ggml_visit_parents(struct ggml_cgraph * cgraph, struct ggml_tensor *
         cgraph->nodes[cgraph->n_nodes] = node;
         cgraph->n_nodes++;
     }
+
+    return node_hash_pos;
 }
 
 static void ggml_build_forward_impl(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor, bool expand) {
@@ -6014,6 +6029,7 @@ static size_t ggml_graph_nbytes(size_t size, bool grads) {
     incr_ptr_aligned(&p, sizeof(struct ggml_cgraph), 1);
     incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // nodes
     incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // leafs
+    incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t)); // use_counts
     incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // hash keys
 
     if (grads) {
         incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)); // grads
@@ -6043,11 +6059,12 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
     void * p = cgraph + 1;
 
-    struct ggml_tensor ** nodes_ptr     = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
-    struct ggml_tensor ** leafs_ptr     = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
-    struct ggml_tensor ** hash_keys_ptr = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
-    struct ggml_tensor ** grads_ptr     = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
-    struct ggml_tensor ** grad_accs_ptr = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+    struct ggml_tensor ** nodes_ptr      = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** leafs_ptr      = incr_ptr_aligned(&p, size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    int32_t *             use_counts_ptr = incr_ptr_aligned(&p, hash_size * sizeof(int32_t), sizeof(int32_t));
+    struct ggml_tensor ** hash_keys_ptr  = incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *));
+    struct ggml_tensor ** grads_ptr      = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
+    struct ggml_tensor ** grad_accs_ptr  = grads ? incr_ptr_aligned(&p, hash_size * sizeof(struct ggml_tensor *), sizeof(struct ggml_tensor *)) : NULL;
 
     ggml_bitset_t * hash_used = incr_ptr_aligned(&p, ggml_bitset_size(hash_size) * sizeof(ggml_bitset_t), sizeof(ggml_bitset_t));
@@ -6062,6 +6079,7 @@ struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t siz
         /*.grads        =*/ grads_ptr,
         /*.grad_accs    =*/ grad_accs_ptr,
         /*.leafs        =*/ leafs_ptr,
+        /*.use_counts   =*/ use_counts_ptr,
         /*.hash_table   =*/ { hash_size, hash_used, hash_keys_ptr },
         /*.order        =*/ GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT,
     };
@@ -6088,7 +6106,8 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph0, int i0, int i1)
         /*.grads            =*/ NULL, // gradients would need visited_hash_set
         /*.grad_accs        =*/ NULL,
         /*.leafs            =*/ NULL,
-        /*.visited_hash_set =*/ { 0, NULL, NULL },
+        /*.use_counts       =*/ cgraph0->use_counts,
+        /*.visited_hash_set =*/ cgraph0->visited_hash_set,
         /*.order            =*/ cgraph0->order,
     };
@@ -6115,7 +6134,8 @@ void ggml_graph_cpy(struct ggml_cgraph * src, struct ggml_cgraph * dst) {
     for (size_t i = 0; i < src->visited_hash_set.size; ++i) {
         // copy all hashset keys (tensors) that are in use
         if (ggml_bitset_get(src->visited_hash_set.used, i)) {
-            ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
+            size_t new_hash_pos = ggml_hash_insert(&dst->visited_hash_set, src->visited_hash_set.keys[i]);
+            dst->use_counts[new_hash_pos] = src->use_counts[i];
         }
     }
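
With per-slot use counts available, a backend can gate fusion on the intermediate result having exactly one consumer. A simplified sketch of such a check for the RMS_NORM+MUL pair (the common ggml_can_fuse helper added by this change is more general; can_fuse_rms_norm_mul is an illustrative name, and the sketch assumes the ggml internal headers for the types and hash helpers):

// Sketch: can nodes[i] (RMS_NORM) and nodes[i+1] (MUL) be fused?
// The MUL must consume the RMS_NORM result (as either operand, since the
// mul operands may be swapped), and the intermediate must have no other
// users; the last node itself doesn't need to be single-use.
static bool can_fuse_rms_norm_mul(const struct ggml_cgraph * cgraph, int i) {
    if (i + 1 >= cgraph->n_nodes) {
        return false;
    }
    const struct ggml_tensor * norm = cgraph->nodes[i];
    const struct ggml_tensor * mul  = cgraph->nodes[i + 1];
    if (norm->op != GGML_OP_RMS_NORM || mul->op != GGML_OP_MUL) {
        return false;
    }
    if (mul->src[0] != norm && mul->src[1] != norm) {
        return false;
    }
    // The intermediate result must be used only by the MUL.
    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, norm);
    return cgraph->use_counts[hash_pos] == 1;
}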