2024-06-09 09:42:25 +02:00
|
|
|
#pragma once
|
|
|
|
|
2024-03-25 13:50:23 +01:00
|
|
|
#include "common.cuh"
|
2024-06-09 09:42:25 +02:00
|
|
|
#include "mmq.cuh"
|
|
|
|
|
|
|
|
#include <cstdint>
|
2024-03-25 13:50:23 +01:00
|
|
|
|
2024-07-11 16:47:47 +02:00
|
|
|
#define CUDA_QUANTIZE_BLOCK_SIZE 256
|
|
|
|
#define CUDA_QUANTIZE_BLOCK_SIZE_MMQ 128
|
|
|
|
|
|
|
|
static_assert(MATRIX_ROW_PADDING % CUDA_QUANTIZE_BLOCK_SIZE == 0, "Risk of out-of-bounds access.");
|
|
|
|
static_assert(MATRIX_ROW_PADDING % (4*CUDA_QUANTIZE_BLOCK_SIZE_MMQ) == 0, "Risk of out-of-bounds access.");
|
2024-03-25 13:50:23 +01:00
|
|
|
|
2024-06-09 09:42:25 +02:00
|
|
|
typedef void (*quantize_cuda_t)(
|
2025-04-30 23:12:59 +02:00
|
|
|
const float * x, const int32_t * ids, void * vy,
|
|
|
|
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
|
|
|
|
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
|
2024-06-09 09:42:25 +02:00
|
|
|
|
|
|
|
void quantize_row_q8_1_cuda(
|
2025-04-30 23:12:59 +02:00
|
|
|
const float * x, const int32_t * ids, void * vy,
|
|
|
|
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
|
|
|
|
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
|
2024-06-09 09:42:25 +02:00
|
|
|
|
|
|
|
void quantize_mmq_q8_1_cuda(
|
2025-04-30 23:12:59 +02:00
|
|
|
const float * x, const int32_t * ids, void * vy,
|
|
|
|
ggml_type type_src0, int64_t ne00, int64_t s01, int64_t s02, int64_t s03,
|
|
|
|
int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3, cudaStream_t stream);
|