|
|
|
@ -32,6 +32,13 @@
|
|
|
|
|
layout(local_size_x_id = 0, local_size_y = 1, local_size_z = 1) in;
|
|
|
|
|
|
|
|
|
|
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
|
|
|
|
|
#if defined(A_TYPE_PACKED16)
|
|
|
|
|
layout (binding = 0) readonly buffer A_PACKED16 {A_TYPE_PACKED16 data_a_packed16[];};
|
|
|
|
|
#endif
|
|
|
|
|
#if defined(A_TYPE_PACKED32)
|
|
|
|
|
layout (binding = 0) readonly buffer A_PACKED32 {A_TYPE_PACKED32 data_a_packed32[];};
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
layout (binding = 1) readonly buffer B {B_TYPE data_b[];};
|
|
|
|
|
layout (binding = 2) writeonly buffer D {D_TYPE data_d[];};
|
|
|
|
|
|
|
|
|
@ -243,74 +250,100 @@ void main() {
|
|
|
|
|
#endif
|
|
|
|
|
#elif defined(DATA_A_Q4_0)
|
|
|
|
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a;
|
|
|
|
|
|
|
|
|
|
const uint ib = idx / 16;
|
|
|
|
|
const uint iqs = idx & 0xF;
|
|
|
|
|
const uint ib = idx / 4;
|
|
|
|
|
const uint iqs = idx & 0x03;
|
|
|
|
|
|
|
|
|
|
const float d = float(data_a[ib].d);
|
|
|
|
|
const uint vui = uint(data_a[ib].qs[iqs]);
|
|
|
|
|
const vec2 v = (vec2(vui & 0xF, vui >> 4) - 8.0f) * d;
|
|
|
|
|
const float d = float(data_a_packed16[ib].d);
|
|
|
|
|
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
|
|
|
|
|
const vec4 v0 = (vec4(unpack8(vui & 0x0F0F0F0F)) - 8.0f) * d;
|
|
|
|
|
const vec4 v1 = (vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) - 8.0f) * d;
|
|
|
|
|
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
|
|
|
|
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
|
|
|
|
|
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
|
|
|
|
|
buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
|
|
|
|
|
buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
|
|
|
|
|
buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
|
|
|
|
|
buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
|
|
|
|
|
buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
|
|
|
|
|
buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
|
|
|
|
|
#elif defined(DATA_A_Q4_1)
|
|
|
|
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 4 * loadr_a;
|
|
|
|
|
|
|
|
|
|
const uint ib = idx / 16;
|
|
|
|
|
const uint iqs = idx & 0xF;
|
|
|
|
|
const uint ib = idx / 4;
|
|
|
|
|
const uint iqs = idx & 0x03;
|
|
|
|
|
|
|
|
|
|
const float d = float(data_a[ib].d);
|
|
|
|
|
const float m = float(data_a[ib].m);
|
|
|
|
|
const uint vui = uint(data_a[ib].qs[iqs]);
|
|
|
|
|
const vec2 v = vec2(vui & 0xF, vui >> 4) * d + m;
|
|
|
|
|
const float d = float(data_a_packed16[ib].d);
|
|
|
|
|
const float m = float(data_a_packed16[ib].m);
|
|
|
|
|
const uint vui = uint(data_a_packed16[ib].qs[2*iqs]) | (uint(data_a_packed16[ib].qs[2*iqs + 1]) << 16);
|
|
|
|
|
const vec4 v0 = vec4(unpack8(vui & 0x0F0F0F0F)) * d + m;
|
|
|
|
|
const vec4 v1 = vec4(unpack8((vui >> 4) & 0x0F0F0F0F)) * d + m;
|
|
|
|
|
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
|
|
|
|
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(v0.x);
|
|
|
|
|
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v0.y);
|
|
|
|
|
buf_a[buf_idx + 2 ] = FLOAT_TYPE(v0.z);
|
|
|
|
|
buf_a[buf_idx + 3 ] = FLOAT_TYPE(v0.w);
|
|
|
|
|
buf_a[buf_idx + 16] = FLOAT_TYPE(v1.x);
|
|
|
|
|
buf_a[buf_idx + 17] = FLOAT_TYPE(v1.y);
|
|
|
|
|
buf_a[buf_idx + 18] = FLOAT_TYPE(v1.z);
|
|
|
|
|
buf_a[buf_idx + 19] = FLOAT_TYPE(v1.w);
|
|
|
|
|
#elif defined(DATA_A_Q5_0)
|
|
|
|
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
|
|
|
|
|
|
|
|
|
|
const uint ib = idx / 16;
|
|
|
|
|
const uint iqs = idx & 0xF;
|
|
|
|
|
const uint ib = idx / 8;
|
|
|
|
|
const uint iqs = idx & 0x07;
|
|
|
|
|
|
|
|
|
|
const float d = float(data_a[ib].d);
|
|
|
|
|
const uint uint_qh = uint(data_a[ib].qh[1]) << 16 | data_a[ib].qh[0];
|
|
|
|
|
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
|
|
|
|
const uint vui = uint(data_a[ib].qs[iqs]);
|
|
|
|
|
const vec2 v = (vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) - 16.0f) * d;
|
|
|
|
|
const float d = float(data_a_packed16[ib].d);
|
|
|
|
|
const uint uint_qh = uint(data_a_packed16[ib].qh[1]) << 16 | uint(data_a_packed16[ib].qh[0]);
|
|
|
|
|
const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
|
|
|
|
|
const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
|
|
|
|
|
|
|
|
|
|
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
|
|
|
|
const vec4 v = (vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) - 16.0f) * d;
|
|
|
|
|
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
|
|
|
|
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
|
|
|
|
|
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
|
|
|
|
buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
|
|
|
|
|
#elif defined(DATA_A_Q5_1)
|
|
|
|
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
|
|
|
|
|
|
|
|
|
|
const uint ib = idx / 16;
|
|
|
|
|
const uint iqs = idx & 0xF;
|
|
|
|
|
const uint ib = idx / 8;
|
|
|
|
|
const uint iqs = idx & 0x07;
|
|
|
|
|
|
|
|
|
|
const float d = float(data_a[ib].d);
|
|
|
|
|
const float m = float(data_a[ib].m);
|
|
|
|
|
const uint uint_qh = data_a[ib].qh;
|
|
|
|
|
const ivec2 qh = ivec2(((uint_qh >> iqs) << 4) & 0x10, (uint_qh >> (iqs + 12)) & 0x10);
|
|
|
|
|
const uint vui = uint(data_a[ib].qs[iqs]);
|
|
|
|
|
const vec2 v = vec2((vui & 0xF) | qh.x, (vui >> 4) | qh.y) * d + m;
|
|
|
|
|
const float d = float(data_a_packed16[ib].d);
|
|
|
|
|
const float m = float(data_a_packed16[ib].m);
|
|
|
|
|
const uint uint_qh = data_a_packed16[ib].qh;
|
|
|
|
|
const ivec2 qh0 = ivec2(((uint_qh >> 2*iqs) << 4) & 0x10, (uint_qh >> (2*iqs + 12)) & 0x10);
|
|
|
|
|
const ivec2 qh1 = ivec2(((uint_qh >> (2*iqs + 1)) << 4) & 0x10, (uint_qh >> (2*iqs + 13)) & 0x10);
|
|
|
|
|
|
|
|
|
|
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
|
|
|
|
const vec4 v = vec4((vui & 0xF) | qh0.x, ((vui >> 4) & 0xF) | qh0.y, ((vui >> 8) & 0xF) | qh1.x, (vui >> 12) | qh1.y) * d + m;
|
|
|
|
|
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
|
|
|
|
buf_a[buf_idx + 1 ] = FLOAT_TYPE(v.z);
|
|
|
|
|
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
|
|
|
|
buf_a[buf_idx + 17] = FLOAT_TYPE(v.w);
|
|
|
|
|
#elif defined(DATA_A_Q8_0)
|
|
|
|
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
|
|
|
|
|
|
|
|
|
const uint ib = idx / 16;
|
|
|
|
|
const uint iqs = (idx & 0xF) * 2;
|
|
|
|
|
const uint ib = idx / 8;
|
|
|
|
|
const uint iqs = idx & 0x07;
|
|
|
|
|
|
|
|
|
|
const float d = float(data_a[ib].d);
|
|
|
|
|
const vec2 v = vec2(int(data_a[ib].qs[iqs]), int(data_a[ib].qs[iqs + 1])) * d;
|
|
|
|
|
const float d = float(data_a_packed16[ib].d);
|
|
|
|
|
const i8vec2 v0 = unpack8(data_a_packed16[ib].qs[2*iqs]);
|
|
|
|
|
const i8vec2 v1 = unpack8(data_a_packed16[ib].qs[2*iqs + 1]);
|
|
|
|
|
const vec4 v = vec4(v0.x, v0.y, v1.x, v1.y) * d;
|
|
|
|
|
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
|
|
|
|
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
|
|
|
|
buf_a[buf_idx + 2] = FLOAT_TYPE(v.z);
|
|
|
|
|
buf_a[buf_idx + 3] = FLOAT_TYPE(v.w);
|
|
|
|
|
#elif defined(DATA_A_Q2_K)
|
|
|
|
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a * LOAD_VEC_A;
|
|
|
|
@ -623,17 +656,18 @@ void main() {
|
|
|
|
|
buf_a[buf_idx + 1] = FLOAT_TYPE(v.y);
|
|
|
|
|
#elif defined(DATA_A_IQ4_NL)
|
|
|
|
|
const uint idx = pos_a + (loadc_a + l) * p.stride_a / LOAD_VEC_A + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + loadr_a;
|
|
|
|
|
const uint buf_idx = (loadc_a + l) * SHMEM_STRIDE + 2 * loadr_a;
|
|
|
|
|
|
|
|
|
|
const uint ib = idx / 16;
|
|
|
|
|
const uint iqs = idx & 0xF;
|
|
|
|
|
const uint ib = idx / 8;
|
|
|
|
|
const uint iqs = idx & 0x07;
|
|
|
|
|
|
|
|
|
|
const float d = float(data_a[ib].d);
|
|
|
|
|
const uint vui = uint(data_a[ib].qs[iqs]);
|
|
|
|
|
const vec2 v = vec2(kvalues_iq4nl[vui & 0xF], kvalues_iq4nl[vui >> 4]) * d;
|
|
|
|
|
const FLOAT_TYPE d = FLOAT_TYPE(data_a_packed16[ib].d);
|
|
|
|
|
const uint vui = uint(data_a_packed16[ib].qs[iqs]);
|
|
|
|
|
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(v.x);
|
|
|
|
|
buf_a[buf_idx + 16] = FLOAT_TYPE(v.y);
|
|
|
|
|
buf_a[buf_idx ] = FLOAT_TYPE(kvalues_iq4nl[vui & 0xF]) * d;
|
|
|
|
|
buf_a[buf_idx + 1 ] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 8, 4)]) * d;
|
|
|
|
|
buf_a[buf_idx + 16] = FLOAT_TYPE(kvalues_iq4nl[bitfieldExtract(vui, 4, 4)]) * d;
|
|
|
|
|
buf_a[buf_idx + 17] = FLOAT_TYPE(kvalues_iq4nl[vui >> 12]) * d;
|
|
|
|
|
#endif
|
|
|
|
|
}
|
|
|
|
|
[[unroll]] for (uint l = 0; l < BN; l += loadstride_b) {
|
|
|
|
|