llama.cpp/ggml/src/ggml-cuda/quantize.cuh

#pragma once

#include "common.cuh"
#include "mmq.cuh"

#include <cstdint>

// Block-size constants for the quantization entry points declared in this header.
// NOTE(review): presumably the number of threads per CUDA block used when launching the
// corresponding kernels (plain q8_1 vs. the MMQ variant) -- confirm against the definitions,
// which are not part of this header.
#define CUDA_QUANTIZE_BLOCK_SIZE     256
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128

// Compile-time guards: MATRIX_ROW_PADDING (not defined here; it must come from an included
// header) has to be an exact multiple of CUDA_QUANTIZE_BLOCK_SIZE and of
// 4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ. Per the assertion message, violating this risks
// out-of-bounds accesses.
// NOTE(review): the factor 4 presumably reflects how many values each thread handles in the
// MMQ path -- confirm against the kernel implementation.
static_assert(MATRIX_ROW_PADDING %    CUDA_QUANTIZE_BLOCK_SIZE      == 0, "Risk of out-of-bounds access.");
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");

// Function-pointer type whose signature matches quantize_row_q8_1_cuda and
// quantize_mmq_q8_1_cuda as declared in this header, so either can be stored in / passed as
// a quantize_cuda_t.
//
// NOTE(review): the parameter semantics are not visible from this header; the following is
// inferred from names and types only and should be confirmed against the definitions:
//   x              - float input data to quantize
//   ids            - int32 indices, presumably an optional row-selection/indirection table
//   vy             - untyped destination buffer for the quantized output
//   type_src0      - ggml type of the other matmul operand (src0)
//   ne00           - presumably the row length of the source (element count along dim 0)
//   s01, s02, s03  - presumably source strides for dims 1..3
//   ne0 .. ne3     - presumably the extents for dims 0..3
//   stream         - CUDA stream handle passed through to the implementation
typedef void (*quantize_cuda_t)(
        const float * x, const int32_t * ids, void * vy,
        ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
        int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

// Declaration only; the definition is not part of this header.
// The parameter list is identical to the quantize_cuda_t function-pointer type, so this
// function can be used wherever a quantize_cuda_t is expected.
//
// NOTE(review): judging by the name, this quantizes float input `x` into q8_1 data written
// to `vy`, with the work issued on `stream` -- confirm the exact output layout, the role of
// `ids`, and the meaning of the ne*/s* arguments against the definition.
void quantize_row_q8_1_cuda(
        const float * x, const int32_t * ids, void * vy,
        ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
        int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);

// Declaration only; the definition is not part of this header.
// The parameter list is identical to the quantize_cuda_t function-pointer type, so this
// function can be used wherever a quantize_cuda_t is expected.
//
// NOTE(review): judging by the name and the "mmq.cuh" include, this is the q8_1 quantization
// variant whose output in `vy` is laid out for the MMQ (mul_mat_q) kernels, presumably
// selected via `type_src0` -- confirm the layout, the role of `ids`, and the meaning of the
// ne*/s* arguments against the definition.
void quantize_mmq_q8_1_cuda(
        const float * x, const int32_t * ids, void * vy,
        ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
        int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
CUDA: revise q8_1 data layout for mul_mat_q (#7824) 2024-06-09 09:42:25 +02:00			`#pragma once`

cuda : refactor into multiple files (#6269) 2024-03-25 13:50:23 +01:00			`#include "common.cuh"`
CUDA: revise q8_1 data layout for mul_mat_q (#7824) 2024-06-09 09:42:25 +02:00			`#include "mmq.cuh"`

			`#include <cstdint>`
cuda : refactor into multiple files (#6269) 2024-03-25 13:50:23 +01:00
CUDA: optimize and refactor MMQ (#8416) * CUDA: optimize and refactor MMQ * explicit q8_1 memory layouts, add documentation 2024-07-11 16:47:47 +02:00			`#define CUDA_QUANTIZE_BLOCK_SIZE 256`
			`#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128`

			`static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");`
			`static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");`
cuda : refactor into multiple files (#6269) 2024-03-25 13:50:23 +01:00
CUDA: revise q8_1 data layout for mul_mat_q (#7824) 2024-06-09 09:42:25 +02:00			`typedef void (*quantize_cuda_t)(`
CUDA: batched+noncont MMQ, refactor bs>1 MoE code (#13199) 2025-04-30 23:12:59 +02:00			`const float * x, const int32_t * ids, void * vy,`
			`ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,`
			`int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);`
CUDA: revise q8_1 data layout for mul_mat_q (#7824) 2024-06-09 09:42:25 +02:00
			`void quantize_row_q8_1_cuda(`
CUDA: batched+noncont MMQ, refactor bs>1 MoE code (#13199) 2025-04-30 23:12:59 +02:00			`const float * x, const int32_t * ids, void * vy,`
			`ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,`
			`int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);`
CUDA: revise q8_1 data layout for mul_mat_q (#7824) 2024-06-09 09:42:25 +02:00
			`void quantize_mmq_q8_1_cuda(`
CUDA: batched+noncont MMQ, refactor bs>1 MoE code (#13199) 2025-04-30 23:12:59 +02:00			`const float * x, const int32_t * ids, void * vy,`
			`ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,`
			`int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);`