vulkan: Optimize mul_mat_vec p021 and nc shaders (#12505)

* tests: add mul_mat perf/functional tests for p021/nc vulkan shaders * vulkan: Optimize mul_mat_vec p021 and nc shaders. These shaders are used in attention calculations, and when the KV cache grows large they start to dominate the run time. For the nc shader (which is called with large 'k' dimension), use unrolling and vector loads. For the p021 shader (which is called with large 'm' and small 'k' dimensions), take advantage of grouped query attention to reuse loads from the A matrix for the whole group, and reduce the number of workgroups (too much overhead from tiny dispatches). Using subgroupAdd in the p021 shader also helps, use that conditionally.
2025-06-27 03:55:20 +00:00 · 2025-03-22 03:40:11 -05:00
parent 4375415b4a
commit eddfb43850
5 changed files with 228 additions and 53 deletions
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -1964,9 +1964,10 @@ struct test_mul_mat : public test_case {
    const std::array<int64_t, 2> bs;  // dims 3 and 4
    const std::array<int64_t, 2> nr;  // repeat in dims 3 and 4
    const std::array<int64_t, 4> per; // permutation of dimensions
+    const bool v; // whether a is a non-contiguous view

    std::string vars() override {
-        return VARS_TO_STR8(type_a, type_b, m, n, k, bs, nr, per);
+        return VARS_TO_STR9(type_a, type_b, m, n, k, bs, nr, per, v);
    }

    double max_nmse_err() override {
@ -1986,8 +1987,9 @@ struct test_mul_mat : public test_case {
            int64_t m = 32, int64_t n = 32, int64_t k = 32,
            std::array<int64_t, 2> bs = {10, 10},
            std::array<int64_t, 2> nr = {2, 2},
-            std::array<int64_t, 4> per = {0, 1, 2, 3})
-        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per) {}
+            std::array<int64_t, 4> per = {0, 1, 2, 3},
+            bool v = false)
+        : type_a(type_a), type_b(type_b), m(m), n(n), k(k), bs(bs), nr(nr), per(per), v(v) {}

    ggml_tensor * build_graph(ggml_context * ctx) override {
        // C^T = A * B^T: (k, m) * (k, n) => (m, n)
@ -1997,6 +1999,7 @@ struct test_mul_mat : public test_case {
        const int npermuted = (per[0] != 0) + (per[1] != 1) + (per[2] != 2) + (per[3] != 3);
        if (npermuted > 0) {
            GGML_ASSERT(npermuted == 2);
+            GGML_ASSERT(!v); // not handled
            GGML_ASSERT(!ggml_is_quantized(type_a) || per[0] == 0);
            GGML_ASSERT(!ggml_is_quantized(type_b) || per[0] == 0);

@ -2020,7 +2023,13 @@ struct test_mul_mat : public test_case {
            ggml_set_name(a, "a_permuted");
            ggml_set_name(b, "b_permuted");
        } else {
-            a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0],       bs[1]);
+
+            if (v) {
+                a = ggml_new_tensor_4d(ctx, type_a, k*2, m, bs[0], bs[1]);
+                a = ggml_view_4d(ctx, a, k, m, bs[0], bs[1], a->nb[1], a->nb[2], a->nb[3], 0);
+            } else {
+                a = ggml_new_tensor_4d(ctx, type_a, k, m, bs[0],       bs[1]);
+            }
            b = ggml_new_tensor_4d(ctx, type_b, k, n, bs[0]*nr[0], bs[1]*nr[1]);
            if (!ggml_is_quantized(type_a)) {
                if (bs[1] == 1 && nr[1] == 1) {
@ -4176,6 +4185,17 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 45, 128, { 8,  1}, {4, 1}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45,  64, { 8,  1}, {4, 1}));

+    for (auto bs : {1,2,4,8}) {
+        for (auto nr : {1,4}) {
+            for (uint32_t m = 0; m < 2; ++m) {
+                for (uint32_t k = 0; k < 2; ++k) {
+                    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056 + m, 1, 128 + k,  {bs,  1}, {nr, 1}, {0, 2, 1, 3}));
+                    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128 + m,  1, 1056 + k, {bs,  1}, {nr, 1}, {0, 1, 2, 3}, true));
+                }
+            }
+        }
+    }
+
    // sycl backend will limit task global_range < MAX_INT
    // test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
    // however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
@ -4444,6 +4464,9 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_perf() {
    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {1024, 10, 1, 1}));
    test_cases.emplace_back(new test_argmax(GGML_TYPE_F32, {32000, 512, 1, 1}));

+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 16416, 1, 128, {8,  1}, {4, 1}, {0, 2, 1, 3}));
+    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 1, 16416, {8,  1}, {4, 1}, {0, 1, 2, 3}, true));
+
    for (int bs : {1, 2, 3, 4, 5, 8, 512}) {
        for (ggml_type type_a : all_types) {
            for (ggml_type type_b : {GGML_TYPE_F32}) {