Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-07-30 22:23:31 -04:00)
musa: fix all warnings, re-enable -DLLAMA_FATAL_WARNINGS=ON in ci and update doc (#12611)

* musa: fix all warnings
* musa: enable -DLLAMA_FATAL_WARNINGS=ON in run.sh
* musa: update ci doc (install ccache)
* fix Windows build issue
* Address review comments
* Address review comments

Signed-off-by: Xiaodong Ye <xiaodong.ye@mthreads.com>
@@ -945,7 +945,7 @@ static __device__ __forceinline__ void vec_dot_q8_0_16_q8_1_mma(
         }
     }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
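The GGML_UNUSED hunks in this diff all address the same warning: when NEW_MMA_AVAILABLE is not defined the function body is compiled out, so every parameter, including k00, has to be explicitly marked as used to keep -Wunused-parameter quiet once -DLLAMA_FATAL_WARNINGS=ON turns warnings into errors. A minimal sketch of the pattern, using a local UNUSED macro as a stand-in rather than ggml's real GGML_UNUSED definition:

// Minimal sketch: silencing unused-parameter warnings in a stubbed-out code path.
// UNUSED and MMA_AVAILABLE are illustrative stand-ins, not the ggml macros.
#define UNUSED(x) (void)(x)
#define MMA_AVAILABLE 0  // pretend the fast path is unavailable on this backend

static float dot_stub(const float * x, const float * y, int k00) {
#if MMA_AVAILABLE
    return x[k00] * y[k00];                 // real work would go here
#else
    UNUSED(x); UNUSED(y); UNUSED(k00);      // mark every parameter as used
    return 0.0f;                            // placeholder, analogous to NO_DEVICE_CODE
#endif
}

int main(void) {
    const float x[1] = {1.0f};
    const float y[1] = {2.0f};
    return (int) dot_stub(x, y, 0);
}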
@@ -1024,7 +1024,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
     }
 
 #pragma unroll
-    for (int k01 = 0; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+    for (int k01 = 0; k01 < WARP_SIZE/2; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
         const int k0 = k00 + k01;
 
 #pragma unroll
@@ -1035,19 +1035,34 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_dp4a(
             for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
                 const int i = i0 + threadIdx.x;
 
-                if (k01 < WARP_SIZE/2) {
-                    constexpr int ns = 2;
-                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
-                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
-                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
-                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
-                } else {
-                    constexpr int ns = 1;
-                    sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
-                        &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
-                        &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
-                        &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
-                }
+                constexpr int ns = 2;
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                    &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
             }
         }
     }
+
+    // Some compilers fail to unroll the loop over k01 if there is a conditional statement for ns in the inner loop.
+    // As a workaround 2 separate loops are used instead.
+#pragma unroll
+    for (int k01 = WARP_SIZE/2; k01 < WARP_SIZE; k01 += QR2_K*VDR_Q2_K_Q8_1_MMQ) {
+        const int k0 = k00 + k01;
+
+#pragma unroll
+        for (int j0 = 0; j0 < mmq_x; j0 += nwarps) {
+            const int j = j0 + threadIdx.y;
+
+#pragma unroll
+            for (int i0 = 0; i0 < mmq_y; i0 += WARP_SIZE) {
+                const int i = i0 + threadIdx.x;
+
+                constexpr int ns = 1;
+                sum[j0/nwarps*mmq_y/WARP_SIZE + i0/WARP_SIZE] += vec_dot_q2_K_q8_1_impl_mmq<ns>(
+                    &x_qs[i*(2*WARP_SIZE + 1) + k0], &y_qs[j*MMQ_TILE_Y_K + k01],
+                    &x_dm[i*(WARP_SIZE + 1) + k0/4], k01 < WARP_SIZE/2 ? y_df[j0/nwarps].x : y_df[j0/nwarps].y,
+                    &y_ds[j*MMQ_TILE_Y_K + (1 + k01/QI8_1)]);
+            }
+        }
+    }
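The comment added in the hunk above describes the shape of this change: instead of selecting ns with an if/else inside one unrolled loop over k01, the k01 range is split into two loops so that each call site passes a plain constant template argument. A standalone sketch of that workaround pattern; accumulate<ns>(), STEP, and run_tile() are illustrative stand-ins for the real mmq code, and in the actual kernel each loop additionally carries #pragma unroll:

// Sketch: split the iteration range instead of branching on a constexpr value
// inside the loop body, which some compilers refuse to unroll.
#include <cstdio>

static const int WARP_SIZE = 32;
static const int STEP      = 8;   // stands in for QR2_K*VDR_Q2_K_Q8_1_MMQ

template <int ns>
static float accumulate(int k01) {
    // dummy math; the point is that ns must be a compile-time constant
    return static_cast<float>(ns) + 0.01f*static_cast<float>(k01);
}

static float run_tile(void) {
    float sum = 0.0f;

    // first half of the k01 range: ns is unconditionally 2
    for (int k01 = 0; k01 < WARP_SIZE/2; k01 += STEP) {
        sum += accumulate<2>(k01);
    }

    // second half: ns is unconditionally 1; no conditional on ns remains in either body
    for (int k01 = WARP_SIZE/2; k01 < WARP_SIZE; k01 += STEP) {
        sum += accumulate<1>(k01);
    }
    return sum;
}

int main(void) {
    printf("%.2f\n", run_tile());
    return 0;
}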
@@ -1176,7 +1191,7 @@ static __device__ __forceinline__ void vec_dot_q2_K_q8_1_mma(
         }
     }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -1253,7 +1268,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const float d = bxi->d;
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             x_df[i*MMQ_MMA_TILE_X_K_Q3_K + sizeof(int)*(threadIdx.x % (WARP_SIZE/8)) + l] = d*sc8[l];
         }
 #else
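This hunk and the two that follow (at 1376 and 1517) are the same warning fix: sizeof yields size_t, which is unsigned, so the comparison l < sizeof(int) mixes a signed int with an unsigned value and trips -Wsign-compare, fatal once warnings are treated as errors. A tiny before/after sketch, independent of the ggml code:

#include <cstdio>

int main(void) {
    int acc = 0;

    // Before: 'l' is int, 'sizeof(int)' is size_t -> signed/unsigned comparison,
    // flagged by -Wsign-compare (an error under warnings-as-errors).
    // for (int l = 0; l < sizeof(int); ++l) { acc += l; }

    // After: cast the bound to int so both sides of the comparison are signed.
    for (int l = 0; l < int(sizeof(int)); ++l) {
        acc += l;
    }

    printf("%d\n", acc);
    return 0;
}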
@@ -1376,7 +1391,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
         }
     }
@@ -1517,7 +1532,7 @@ template <int mmq_y, int nwarps, bool need_check> static __device__ __forceinlin
         const half2 dm = bxi->dm * make_half2(1.0f, -1.0f);
 
 #pragma unroll
-        for (int l = 0; l < sizeof(int); ++l) {
+        for (int l = 0; l < int(sizeof(int)); ++l) {
             x_dm[i*MMQ_MMA_TILE_X_K_Q8_1 + sizeof(int)*ksc + l] = dm*make_half2(sc8[l], m8[l]);
         }
     }
@@ -1810,7 +1825,7 @@ static __device__ __forceinline__ void vec_dot_q6_K_q8_1_mma(
         }
     }
 #else
-    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum);
+    GGML_UNUSED(x); GGML_UNUSED(y); GGML_UNUSED(sum); GGML_UNUSED(k00);
     NO_DEVICE_CODE;
 #endif // NEW_MMA_AVAILABLE
 }
@@ -2570,6 +2585,8 @@ static __device__ void mul_mat_q_process_tile(
     } else {
         write_back(sum, dst + jt*mmq_x*ne0 + it*mmq_y, ne0, tile_x_max_i, tile_y_max_j);
     }
+
+    GGML_UNUSED(ne00); GGML_UNUSED(ne10);
 }
@@ -2695,7 +2712,7 @@ static __global__ void mul_mat_q_stream_k_fixup(
         const int it = (kbc_stop - jt*(blocks_per_ne00*nty)) / blocks_per_ne00;
 
         // Skip fixup tile if it's unrelated to the output tile assigned to this CUDA block:
-        if (it != blockIdx.x || jt != blockIdx.y) {
+        if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) {
             continue;
         }
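The mul_mat_q_stream_k_fixup change above is the device-side variant of the same class of warning: blockIdx.x and blockIdx.y are unsigned, while it and jt are int, so the comparison needs an explicit cast to stay clean under -Wsign-compare. A minimal kernel-only sketch; the kernel and buffer names here are made up for illustration, not taken from the diff:

// Illustrative CUDA kernel: casting the signed side before comparing against
// the unsigned built-in blockIdx fields mirrors the fix above.
__global__ void mark_matching_tiles(const int * tile_it, const int * tile_jt, int * out) {
    const int it = tile_it[blockIdx.x];
    const int jt = tile_jt[blockIdx.y];

    // if (it != blockIdx.x || jt != blockIdx.y)  // would warn: int vs. unsigned
    if ((unsigned)it != blockIdx.x || (unsigned)jt != blockIdx.y) {
        return;  // this block is not responsible for the tile, skip it
    }

    out[blockIdx.y*gridDim.x + blockIdx.x] = 1;
}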
@@ -2825,7 +2842,6 @@ static void launch_mul_mat_q(ggml_backend_cuda_context & ctx, const mmq_args & a
 template <ggml_type type>
 void mul_mat_q_case(ggml_backend_cuda_context & ctx, const mmq_args & args, cudaStream_t stream) {
     const int id = ggml_cuda_get_device();
-    const int nsm = ggml_cuda_info().devices[id].nsm;
     const int cc = ggml_cuda_info().devices[id].cc;
     const int smpbo = ggml_cuda_info().devices[id].smpbo;