quantize: Handle user-defined quantization levels for additional tensors (#12511)

* Add llama_model_quantize_params parameters * Add new quantize parameters parsing and validation * Update usage * Add new parameters defaults * Add new quantization parameters logic * Add llama_model_quantize_params parameters * Add new quantize parameters parsing and validation * Update usage * Add new parameters defaults * Add new quantization parameters logic * Minor refactoring as per the contributors' coding guidelines * Update descriptions to match existing style * Add llama_model_quantize_params parameters * Add new quantize parameters parsing and validation * Update usage * Add new parameters defaults * Add new quantization parameters logic * Minor refactoring as per the contributors' guidelines * Implement general --tensor-type instead of tensor-specific command option * Fix implied type bug * Restore missing #includes * Add regex capability for tensor selection * Refactor function name and update ALLOWED_TENSOR_TYPE * Add missing #include * Handle edge case when tensor name is cls.output * Minor logging improvement
2025-08-15 04:33:06 -04:00 · 2025-04-13 19:29:28 +01:00
parent bc091a4dc5
commit 71e90e8813
3 changed files with 155 additions and 20 deletions
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -10,6 +10,7 @@
 #include <cinttypes>
 #include <fstream>
 #include <mutex>
+#include <regex>
 #include <thread>
 #include <unordered_map>

@@ -47,8 +48,14 @@ struct quantize_state_impl {
        {}
 };

+// changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void llama_tensor_dequantize_impl(
-    struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+    ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
    const size_t nelements, const int nthread
 ) {
    if (output.size() < nelements) {
@@ -536,7 +543,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    model.load_hparams(ml);
    model.load_stats  (ml);

-    struct quantize_state_impl qs(model, params);
+    quantize_state_impl qs(model, params);

    if (params->only_copy) {
        ftype = ml.ftype;
@@ -661,7 +668,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    // populate the original tensors so we get an initial meta data
    for (const auto * it : tensors) {
        uint16_t i_split = params->keep_split ? it->idx : 0;
-        struct ggml_tensor * tensor = it->tensor;
+        ggml_tensor * tensor = it->tensor;
        if (!ctx_outs[i_split]) {
            ctx_outs[i_split].reset(gguf_init_empty());
        }
@@ -710,7 +717,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
    new_ofstream(0);
    for (const auto * it : tensors) {
        const auto & weight = *it;
-        struct ggml_tensor * tensor = weight.tensor;
+        ggml_tensor * tensor = weight.tensor;
        if (weight.idx != cur_split && params->keep_split) {
            close_ofstream();
            new_ofstream(weight.idx);
@@ -776,7 +783,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
        // do not quantize relative position bias (T5)
        quantize &= name.find("attn_rel_b.weight") == std::string::npos;

-        enum ggml_type new_type;
+        ggml_type new_type;
        void * new_data;
        size_t new_size;

@@ -786,6 +793,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
            // get more optimal quantization type based on the tensor shape, layer, etc.
            if (!params->pure && ggml_is_quantized(default_type)) {
                new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
+                // unless the user specifies a type
+                if (params->tensor_types) {
+                    const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    for (const auto & [tname, qtype] : tensor_types) {
+                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
+                            if (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                            }
+                            new_type = qtype;
+                            break;
+                        }
+                    }
+                }
            }
            if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                new_type = params->token_embedding_type;
@@ -910,8 +930,8 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 // interface implementation
 //

-struct llama_model_quantize_params llama_model_quantize_default_params() {
-    struct llama_model_quantize_params result = {
+llama_model_quantize_params llama_model_quantize_default_params() {
+    llama_model_quantize_params result = {
        /*.nthread                     =*/ 0,
        /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
        /*.output_tensor_type          =*/ GGML_TYPE_COUNT,
@@ -923,6 +943,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
        /*.keep_split                  =*/ false,
        /*.imatrix                     =*/ nullptr,
        /*.kv_overrides                =*/ nullptr,
+        /*.tensor_type                 =*/ nullptr,
    };

    return result;