llama.cpp/ggml/src/ggml-vulkan/vulkan-shaders/mul_mat_vec_q3_k.comp

#version 450
#extension GL_EXT_shader_explicit_arithmetic_types : require

#include "mul_mat_vec_base.comp"

// Workgroup size along x comes from specialization constant 0; y and z are fixed at 1.
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;

// Specialization constant 0 -- the same id that sizes the workgroup in x above,
// so BLOCK_SIZE tracks gl_WorkGroupSize.x. 32 is only the default value.
layout (constant_id = 0) const uint BLOCK_SIZE = 32;
// Specialization constant 1: maximum number of output rows one workgroup handles. Default 1.
layout (constant_id = 1) const uint NUM_ROWS = 1;

// Scratch for the final reduction: one partial sum per invocation, per row.
shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];

// Computes `num_rows` consecutive output elements, starting at row `first_row`,
// of a matrix-vector product between quantized blocks in data_a and the vector
// in data_b, and stores them to data_d.
//
//   first_row : index of the first row handled by this workgroup.
//   num_rows  : number of rows to process; indexes temp[]/tmpsh[], both sized
//               NUM_ROWS, so it must not exceed NUM_ROWS.
//
// Work split: every group of 16 invocations shares one QUANT_K-sized block per
// loop iteration. Within a group, each invocation handles 2 adjacent elements
// (l = 0, 1) at 8 positions spaced 16 elements apart, in either the first or the
// second 128-element half of the block. Per-invocation partial sums are then
// reduced through shared memory and invocation 0 writes the results.
//
// Contains barrier() calls, so all invocations of the workgroup must call it
// together.
void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {
    // Base offsets into A, B and D; get_offsets is defined outside this file
    // (presumably in mul_mat_vec_base.comp -- confirm there for exact semantics).
    uint a_offset, b_offset, d_offset;
    get_offsets(a_offset, b_offset, d_offset);

    const uint num_blocks_per_row = p.ncols / QUANT_K;

    // 16 threads are used to process each block
    const uint it_size = gl_WorkGroupSize.x/16;   // blocks processed per loop iteration by the workgroup
    const uint tid = gl_LocalInvocationID.x;
    const uint itid = tid%16;  // 0...15: position inside the 16-invocation group
    const uint ix  = tid/16;   // which 16-invocation group, i.e. the first block index for this invocation

    const uint step = 8;

    const uint v_im = itid/step;                            // 0 or 1. 0 computes 0..., 1 computes 128...
    const uint v_in = itid - step*v_im;                     // 0...7 (itid modulo step)

    // Lowest hmask bit used by this half: bit 0 when v_im == 0, bit 4 when v_im == 1.
    // (m << k), k = 0..3, then selects one of the four bits belonging to this half.
    const uint8_t m = uint8_t(1 << (4 * v_im));

    const uint l0 = 2*v_in;                                 // even values 0...14
    const uint q_offset = 32*v_im + l0;                     // byte index into qs for this invocation
    const uint y_offset = 128*v_im + l0;                    // element index into the B block for this invocation

    // One running dot-product accumulator per row.
    FLOAT_TYPE temp[NUM_ROWS];

    [[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {
        temp[i] = FLOAT_TYPE(0);
    }

    // Selects the low (v_im == 0) or high (v_im == 1) nibble of each packed scale byte.
    const uint s_shift = 4 * v_im;

    // Walk the blocks of a row, it_size blocks apart.
    [[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {
        const uint y_idx = i * QUANT_K + y_offset;

        // B is read as 2-component vectors; the index is in vec2 units, so each
        // "+ 8" advances 16 scalar elements. bNN holds elements y_idx + NN + {0, 1}.
        // NOTE(review): the "/ 2" assumes b_offset + y_idx is even -- confirm b_offset parity.
        // The B values are loaded once here and reused for every row below.
        B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];
        B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];
        B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];
        B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];
        B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];
        B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];
        B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];
        B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];

        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
            // Index of the first block of row (first_row + n) in data_a.
            const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;
            // Per-block scale factor, applied once to the whole partial sum below.
            const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);

            // Read the scales as six 16-bit words through the packed16 alias, then
            // split each word into its two bytes. sK[0] / sK[1] are scale bytes K / K+1.
            uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];
            uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];
            uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];
            uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];
            uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];
            uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];
            u8vec2 s0 = unpack8(s0_16);
            u8vec2 s2 = unpack8(s2_16);
            u8vec2 s4 = unpack8(s4_16);
            u8vec2 s6 = unpack8(s6_16);
            u8vec2 s8 = unpack8(s8_16);
            u8vec2 s10 = unpack8(s10_16);

            FLOAT_TYPE sum = FLOAT_TYPE(0.0);
            [[unroll]] for (int l = 0; l < 2; ++l) {
                // Each of the 8 terms is  b * scale * q  folded in with fma, where
                //   scale = (4 low bits from a nibble of s0..s6, chosen by s_shift)
                //         | (2 high bits from s8/s10, at shift s_shift or s_shift + 2) << 4,
                //           then biased by -32;
                //   q     = a 2-bit field of a qs byte (shift 0/2/4/6), minus 4 when the
                //           matching hmask bit (m << 0..3) is clear.
                // Rows 1-4 use qs/hmask byte [.. + l] with b0/b32/b64/b96;
                // rows 5-8 use byte [.. + l + 16] with b16/b48/b80/b112.
                // The chain is nested, so the last listed term is added to `sum` first.
                sum = fma(FLOAT_TYPE(b0[l])   * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 0)) != 0) ? 0 : 4)),
                      fma(FLOAT_TYPE(b32[l])  * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 1)) != 0) ? 0 : 4)),
                      fma(FLOAT_TYPE(b64[l])  * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) | ((s8[0]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 2)) != 0) ? 0 : 4)),
                      fma(FLOAT_TYPE(b96[l])  * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) | ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l   ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l   ] & (m << 3)) != 0) ? 0 : 4)),
                      fma(FLOAT_TYPE(b16[l])  * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16]     ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),
                      fma(FLOAT_TYPE(b48[l])  * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),
                      fma(FLOAT_TYPE(b80[l])  * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) | ((s8[1]  >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),
                      fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) | ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));
            }
            // Apply the block scale once and accumulate into this row's running total.
            temp[n] = fma(d, sum, temp[n]);
        }
    }

    // sum up partial sums and write back result
    [[unroll]] for (uint n = 0; n < num_rows; ++n) {
        tmpsh[n][tid] = temp[n];
    }
    barrier();
    // Halving tree reduction in shared memory. Every slot is only folded in when
    // BLOCK_SIZE is a power of two. The barrier sits outside the `if` so that all
    // invocations reach it on every step.
    [[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {
        if (tid < s) {
            [[unroll]] for (uint n = 0; n < num_rows; ++n) {
                tmpsh[n][tid] += tmpsh[n][tid + s];
            }
        }
        barrier();
    }
    // Invocation 0 holds the full sums and writes one output element per row.
    if (tid == 0) {
        [[unroll]] for (uint n = 0; n < num_rows; ++n) {
            data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);
        }
    }
}

// Entry point. Each workgroup owns a tile of up to NUM_ROWS consecutive output
// rows, located from its x and z workgroup coordinates, and hands that tile to
// compute_outputs. p.stride_d is the row bound used here: tiles that start at or
// beyond it do nothing, and a tile that straddles it is shortened.
void main() {
    // Flatten the (x, z) workgroup coordinates into one tile index.
    const uint tile_index = gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z;
    const uint first_row = NUM_ROWS * tile_index;

    // Full tile: pass the constant NUM_ROWS so the callee's row loops can be
    // unrolled against a known trip count.
    if (first_row + NUM_ROWS <= p.stride_d) {
        compute_outputs(first_row, NUM_ROWS);
        return;
    }

    // Partial tile: only the rows that remain below the bound, if there are any.
    if (first_row < p.stride_d) {
        compute_outputs(first_row, p.stride_d - first_row);
    }
}
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`#version 450`
vulkan: optimize Q2_K and Q3_K mul_mat_vec (#10459) 2024-11-27 01:00:50 -06:00			`#extension GL_EXT_shader_explicit_arithmetic_types : require`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00
			`#include "mul_mat_vec_base.comp"`

vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants 2024-12-10 19:33:23 +00:00			`layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00
vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants 2024-12-10 19:33:23 +00:00			`layout (constant_id = 0) const uint BLOCK_SIZE = 32;`
vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`layout (constant_id = 1) const uint NUM_ROWS = 1;`
vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants 2024-12-10 19:33:23 +00:00
vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`shared FLOAT_TYPE tmpsh[NUM_ROWS][BLOCK_SIZE];`
vulkan: further optimize mul_mat_vec using larger loads (#10387) * vulkan: Use pipeline_robustness to disable robustness in mul_mat_vec. Add some early returns for nonexistent rows in mul_mat_vec shaders. These can only be hit when dispatching a 2D grid of workgroups. Fix the logic for the 2D grid of workgroups to round up. Enable the pipeline robustness extension if it's available, and use it to disable robustness for these pipelines. The instructions to do the bounds checking contend for the same ALU resources as the bit twiddling dequant instructions. * vulkan: Add GLSL structure aliases for quant types to allow larger loads In Vulkan it's not possible to cast pointer types, so instead you have to declare an aliased binding for the memory with a different type. This commit adds aliases for the quant formats using 16b ints, and in a few places where the struct size is a multiple of 4 also using 32b ints. Currently only q4_k's aliases are used, but others will be used in subsequent commits. * vulkan: use larger loads in q5_k and q6_k shaders. Similar to the optimization I did in q4_k recently, this vectorizes some loads and reduces the number of bit twiddling instructions. * vulkan: use larger K step per iteration in mul_mat_vec. Add vec4 dequantization functions, and use them to do K=8 per iteration in mul_mat_vec. This uses 16b loads for the quant values and 128b loads for B which helps reduce the load on the memory system. The K_PER_ITER==2 logic is still there, just for F16/F32, and really only because they support unaligned sizes. Tweak the num_iters/unrolling logic to be simpler and catch a couple missed unrolling opportunities. 2024-11-20 01:11:00 -06:00
vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`void compute_outputs(const uint32_t first_row, const uint32_t num_rows) {`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`uint a_offset, b_offset, d_offset;`
			`get_offsets(a_offset, b_offset, d_offset);`

			`const uint num_blocks_per_row = p.ncols / QUANT_K;`

vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants 2024-12-10 19:33:23 +00:00			`// 16 threads are used to process each block`
			`const uint it_size = gl_WorkGroupSize.x/16;`
			`const uint tid = gl_LocalInvocationID.x;`
			`const uint itid = tid%16; // 0...16`
			`const uint ix = tid/16;`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00
vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants 2024-12-10 19:33:23 +00:00			`const uint step = 8;`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00
vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants 2024-12-10 19:33:23 +00:00			`const uint v_im = itid/step; // 0 or 1. 0 computes 0..., 1 computes 128...`
			`const uint v_in = itid - step*v_im; // 0...15 or 0...7`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00
			`const uint8_t m = uint8_t(1 << (4 * v_im));`

vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`const uint l0 = 2*v_in; // 0...15`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`const uint q_offset = 32*v_im + l0;`
			`const uint y_offset = 128*v_im + l0;`

vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`FLOAT_TYPE temp[NUM_ROWS];`

			`[[unroll]] for (uint i = 0; i < NUM_ROWS; ++i) {`
			`temp[i] = FLOAT_TYPE(0);`
			`}`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00
			`const uint s_shift = 4 * v_im;`

vulkan: dynamic subgroup size for the remaining k quants (#10745) * q5_k q4_k q3_k q2_k q6_k multi row example * revert as multi row isnt faster for k quants 2024-12-10 19:33:23 +00:00			`[[unroll]] for (uint i = ix; i < num_blocks_per_row; i += it_size) {`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`const uint y_idx = i * QUANT_K + y_offset;`

vulkan: optimize Q2_K and Q3_K mul_mat_vec (#10459) 2024-11-27 01:00:50 -06:00			`B_TYPE_VEC2 b0 = data_b_v2[(b_offset + y_idx) / 2 + 0];`
			`B_TYPE_VEC2 b16 = data_b_v2[(b_offset + y_idx) / 2 + 8];`
			`B_TYPE_VEC2 b32 = data_b_v2[(b_offset + y_idx) / 2 + 16];`
			`B_TYPE_VEC2 b48 = data_b_v2[(b_offset + y_idx) / 2 + 24];`
			`B_TYPE_VEC2 b64 = data_b_v2[(b_offset + y_idx) / 2 + 32];`
			`B_TYPE_VEC2 b80 = data_b_v2[(b_offset + y_idx) / 2 + 40];`
			`B_TYPE_VEC2 b96 = data_b_v2[(b_offset + y_idx) / 2 + 48];`
			`B_TYPE_VEC2 b112 = data_b_v2[(b_offset + y_idx) / 2 + 56];`

vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`[[unroll]] for (uint n = 0; n < num_rows; ++n) {`
			`const uint ib0 = a_offset / QUANT_K + (first_row+n)*num_blocks_per_row;`
			`const FLOAT_TYPE d = FLOAT_TYPE(data_a[ib0 + i].d);`

			`uint16_t s0_16 = data_a_packed16[ib0 + i].scales[0];`
			`uint16_t s2_16 = data_a_packed16[ib0 + i].scales[1];`
			`uint16_t s4_16 = data_a_packed16[ib0 + i].scales[2];`
			`uint16_t s6_16 = data_a_packed16[ib0 + i].scales[3];`
			`uint16_t s8_16 = data_a_packed16[ib0 + i].scales[4];`
			`uint16_t s10_16 = data_a_packed16[ib0 + i].scales[5];`
			`u8vec2 s0 = unpack8(s0_16);`
			`u8vec2 s2 = unpack8(s2_16);`
			`u8vec2 s4 = unpack8(s4_16);`
			`u8vec2 s6 = unpack8(s6_16);`
			`u8vec2 s8 = unpack8(s8_16);`
			`u8vec2 s10 = unpack8(s10_16);`

			`FLOAT_TYPE sum = FLOAT_TYPE(0.0);`
			`[[unroll]] for (int l = 0; l < 2; ++l) {`
			`sum = fma(FLOAT_TYPE(b0[l]) * FLOAT_TYPE(int8_t(((s0[0] >> s_shift) & 0xF) \| ((s8[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 0)) != 0) ? 0 : 4)),`
			`fma(FLOAT_TYPE(b32[l]) * FLOAT_TYPE(int8_t(((s2[0] >> s_shift) & 0xF) \| ((s10[0] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 1)) != 0) ? 0 : 4)),`
			`fma(FLOAT_TYPE(b64[l]) * FLOAT_TYPE(int8_t(((s4[0] >> s_shift) & 0xF) \| ((s8[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 2)) != 0) ? 0 : 4)),`
			`fma(FLOAT_TYPE(b96[l]) * FLOAT_TYPE(int8_t(((s6[0] >> s_shift) & 0xF) \| ((s10[0] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l ] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l ] & (m << 3)) != 0) ? 0 : 4)),`
			`fma(FLOAT_TYPE(b16[l]) * FLOAT_TYPE(int8_t(((s0[1] >> s_shift) & 0xF) \| ((s8[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] ) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 0)) != 0) ? 0 : 4)),`
			`fma(FLOAT_TYPE(b48[l]) * FLOAT_TYPE(int8_t(((s2[1] >> s_shift) & 0xF) \| ((s10[1] >> (s_shift + 0) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 2) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 1)) != 0) ? 0 : 4)),`
			`fma(FLOAT_TYPE(b80[l]) * FLOAT_TYPE(int8_t(((s4[1] >> s_shift) & 0xF) \| ((s8[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 4) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 2)) != 0) ? 0 : 4)),`
			`fma(FLOAT_TYPE(b112[l]) * FLOAT_TYPE(int8_t(((s6[1] >> s_shift) & 0xF) \| ((s10[1] >> (s_shift + 2) & 0x3) << 4)) - 32), FLOAT_TYPE(((data_a[ib0 + i].qs[q_offset + l+16] >> 6) & 3) - (((data_a[ib0 + i].hmask[l0 + l+16] & (m << 3)) != 0) ? 0 : 4)), sum))))))));`
			`}`
			`temp[n] = fma(d, sum, temp[n]);`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`}`
			`}`

			`// sum up partial sums and write back result`
vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`[[unroll]] for (uint n = 0; n < num_rows; ++n) {`
			`tmpsh[n][tid] = temp[n];`
			`}`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`barrier();`
vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`[[unroll]] for (uint s = BLOCK_SIZE/2; s > 0; s >>= 1) {`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`if (tid < s) {`
vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`[[unroll]] for (uint n = 0; n < num_rows; ++n) {`
			`tmpsh[n][tid] += tmpsh[n][tid + s];`
			`}`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`}`
			`barrier();`
			`}`
			`if (tid == 0) {`
vulkan: multi-row k quants (#10846) * multi row k quant shaders! * better row selection * more row choices * readjust row selection * rm_kq=2 by default 2024-12-26 10:54:44 -05:00			`[[unroll]] for (uint n = 0; n < num_rows; ++n) {`
			`data_d[d_offset + first_row + n] = D_TYPE(tmpsh[n][0]);`
			`}`
			`}`
			`}`

			`void main() {`
			`const uint first_row = NUM_ROWS * (gl_WorkGroupID.x + gl_NumWorkGroups.x * gl_WorkGroupID.z);`

			`// do NUM_ROWS at a time, unless there aren't enough remaining rows`
			`if (first_row + NUM_ROWS <= p.stride_d) {`
			`compute_outputs(first_row, NUM_ROWS);`
			`} else {`
			`if (first_row >= p.stride_d) {`
			`return;`
			`}`
			`compute_outputs(first_row, p.stride_d - first_row);`
Vulkan Shader Refactor, Memory Debugging Option (#7947) * Refactor shaders, extract GLSL code from ggml_vk_generate_shaders.py into vulkan-shaders directory * Improve debug log code * Add memory debug output option * Fix flake8 * Fix unnecessary high llama-3 VRAM use 2024-06-16 07:17:31 +02:00			`}`
			`}`