CUDA: MMQ code deduplication + iquant support (#8495)

* CUDA: MMQ code deduplication + iquant support

* 1 less parallel job for CI build
This commit is contained in:
Johannes Gäßler
2024-07-20 22:25:26 +02:00
committed by GitHub
parent 07283b1a90
commit 69c487f4ed
11 changed files with 800 additions and 639 deletions

View File

@ -23,7 +23,8 @@ SOURCE_FATTN_WMMA_CASE = "DECL_FATTN_WMMA_F16_CASE({head_size}, {cols_per_block}
TYPES_MMQ = [
"GGML_TYPE_Q4_0", "GGML_TYPE_Q4_1", "GGML_TYPE_Q5_0", "GGML_TYPE_Q5_1", "GGML_TYPE_Q8_0",
"GGML_TYPE_Q2_K", "GGML_TYPE_Q3_K", "GGML_TYPE_Q4_K", "GGML_TYPE_Q5_K", "GGML_TYPE_Q6_K",
"GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
"GGML_TYPE_IQ2_XXS", "GGML_TYPE_IQ2_XS", "GGML_TYPE_IQ2_S", "GGML_TYPE_IQ3_XXS", "GGML_TYPE_IQ3_S",
"GGML_TYPE_IQ1_S", "GGML_TYPE_IQ4_NL", "GGML_TYPE_IQ4_XS"
]
SOURCE_MMQ = """// This file has been autogenerated by generate_cu_files.py, do not edit manually.