vulkan: fix mul_mat_vec failure in backend tests (#12529)

The out-of-bounds (OOB) calculation could be wrong if the last iteration fell inside one of the unrolled loops. Adjust the unrolling counts to avoid this. Add a couple of new backend tests that hit this failure on NVIDIA GPUs.
2025-06-29 12:35:16 +00:00 · 2025-03-24 01:56:17 -05:00
parent 77f9c6bbe5
commit 9b169a4d4e
2 changed files with 22 additions and 0 deletions
--- a/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec.comp
@ -105,6 +105,16 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
    int unroll_count = 4;
    uint unrolled_iters = num_iters & ~(unroll_count - 1);
 #if K_PER_ITER == 2
    // If the K dimension is odd, we need lastiter==true on the last iteration
    // so OOB is computed correctly. Skip some unrolling to make that happen.
    if ((p.ncols & 1) != 0 &&
        unrolled_iters == num_iters &&
        unrolled_iters > 0) {
        unrolled_iters -= unroll_count;
    }
 #endif
    uint i = 0;
    while (i < unrolled_iters) {
        // Manually partially unroll the loop
@ -113,8 +123,18 @@ void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
            i++;
        }
    }
    unroll_count = 2;
    unrolled_iters = num_iters & ~(unroll_count - 1);
 #if K_PER_ITER == 2
    if ((p.ncols & 1) != 0 &&
        unrolled_iters == num_iters &&
        unrolled_iters > 0) {
        unrolled_iters -= unroll_count;
    }
 #endif
    while (i < unrolled_iters) {
        // Manually partially unroll the loop
        [[unroll]] for (uint k = 0; k < unroll_count; ++k) {
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@ -4204,6 +4204,8 @@ static std::vector<std::unique_ptr<test_case>> make_test_cases_eval() {
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  83, 2,   64, { 8,  1}, {4, 1}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32,  64, 45, 128, { 8,  1}, {4, 1}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45,  64, { 8,  1}, {4, 1}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1,  1}, {4, 1}, {0, 2, 1, 3}));
    test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67,  {1,  1}, {4, 1}, {0, 2, 1, 3}));
    for (auto bs : {1,2,4,8}) {
        for (auto nr : {1,4}) {