sycl: fixed semantics of block offset calculation (#14814)

This commit is contained in:
Alberto Cabrera Pérez
2025-07-24 11:09:57 +01:00
committed by Aaron Teo
parent 6286ad25d1
commit 07a49304ad
2 changed files with 10 additions and 15 deletions

View File

@@ -48,11 +48,11 @@ template <> struct block_q_t<GGML_TYPE_Q4_0> {
}; };
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) { static constexpr std::pair<int, int> get_block_offset(const int block_index, const int /* nblocks */) {
return { block_index * (traits::qk / traits::qr), 0 }; return { block_index * (QK4_0 / QR4_0), 0 };
} }
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) { static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
return { (ncols / traits::qr * nrows) + block_index * sizeof(ggml_half), 0 }; return { (ncols / QR4_0 * nrows) + block_index * sizeof(ggml_half), 0 };
} }
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
@@ -71,14 +71,12 @@ template <> struct block_q_t<GGML_TYPE_Q4_K> {
} }
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) { static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
auto nblocks = (nrows * (ncols / traits::qk)); auto nblocks = (nrows * (ncols / QK_K));
return { nblocks * (QK_K / 2), return { nblocks * (QK_K / 2) + (block_index * K_SCALE_SIZE),
(nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) }; (nblocks * QK_K / 2) + (nblocks * K_SCALE_SIZE) + (block_index * sizeof(ggml_half2)) };
} }
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
constexpr size_t get_total_qs_bytes(int nblocks) { return nblocks * QK_K / 2; }
}; };
template <> struct block_q_t<GGML_TYPE_Q6_K> { template <> struct block_q_t<GGML_TYPE_Q6_K> {
@@ -90,22 +88,23 @@ template <> struct block_q_t<GGML_TYPE_Q6_K> {
}; };
static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) { static constexpr std::pair<int, int> get_block_offset(const int block_index, const int n_blocks) {
auto low_bits_index = block_index * (traits::qk / traits::qr); auto low_bits_index = block_index * (QK_K / QR6_K);
// the index of high bits it's after all low bits // the index of high bits it's after all low bits
auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4)); auto high_bits_index = n_blocks * (QK_K / 2) + (block_index * (QK_K / 4));
return { low_bits_index, high_bits_index }; return { low_bits_index, high_bits_index };
} }
static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) { static constexpr std::pair<int, int> get_d_offset(int nrows, int ncols, const int block_index) {
auto nblocks = (nrows * (ncols / traits::qk)); auto nblocks = (nrows * (ncols / QK_K));
auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4); auto total_qs_bytes = nblocks * (QK_K / 2) + nblocks * (QK_K / 4);
auto block_scales = total_qs_bytes + block_index * (QK_K / 16); auto block_scales = total_qs_bytes + block_index * (QK_K / 16);
auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16); auto sb_scale = total_qs_bytes + nblocks * (QK_K / 16) + block_index * sizeof(ggml_half);
return { block_scales, sb_scale }; return { block_scales, sb_scale };
} }
static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; } static constexpr int block_to_q8_1_ratio() { return traits::qk / QK8_1; }
}; };
} // namespace ggml_sycl_reordered } // namespace ggml_sycl_reordered
#endif // GGML_SYCL_QUANTS_HPP #endif // GGML_SYCL_QUANTS_HPP

View File

@@ -350,11 +350,9 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q4_K> {
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset, __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr,
const sycl::half2 * q8_1_ds, const int & iqs) { const sycl::half2 * q8_1_ds, const int & iqs) {
const int ib = ibx_offset.first / (QK_K / 2);
const uint8_t * base = static_cast<const uint8_t *>(vbq); const uint8_t * base = static_cast<const uint8_t *>(vbq);
const uint8_t * qs = base + ibx_offset.first; const uint8_t * qs = base + ibx_offset.first;
const uint8_t * scs = base + d_offset.first + ib * K_SCALE_SIZE; const uint8_t * scs = base + d_offset.first;
const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second); const ggml_half2 * dms = reinterpret_cast<const ggml_half2 *>(base + d_offset.second);
const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2)); const int bq8_offset = QR4_K * ((iqs / 2) / (QI8_1 / 2));
@@ -427,13 +425,11 @@ template <> struct reorder_vec_dot_q_sycl<GGML_TYPE_Q6_K> {
__dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset, __dpct_inline__ float operator()(const void * __restrict__ vbq, const std::pair<int, int> ibx_offset,
const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds, const std::pair<int, int> d_offset, const int8_t * q8_1_quant_ptr, const sycl::half2 * q8_1_ds,
const int iqs) { const int iqs) {
const int ib = ibx_offset.first / (QK_K / 2);
const uint8_t * base = static_cast<const uint8_t *>(vbq); const uint8_t * base = static_cast<const uint8_t *>(vbq);
const uint8_t * ql = base + ibx_offset.first; const uint8_t * ql = base + ibx_offset.first;
const uint8_t * qh = base + ibx_offset.second; const uint8_t * qh = base + ibx_offset.second;
const int8_t * scales = reinterpret_cast<const int8_t *>(base + d_offset.first); const int8_t * scales = reinterpret_cast<const int8_t *>(base + d_offset.first);
const ggml_half * d = (const ggml_half *) (base + d_offset.second) + ib; const ggml_half * d = (const ggml_half *) (base + d_offset.second);
const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4); const int bq8_offset = 2 * QR6_K * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 4);
const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8); const int scale_offset = (QI6_K / 4) * (iqs / (QI6_K / 2)) + (iqs % (QI6_K / 2)) / (QI6_K / 8);