quantize : improve tensor-type pattern matching (#13033)

Ed Addario
2025-05-13 18:12:31 +01:00
committed by GitHub
parent 71bdbdb587
commit e5c834f718
2 changed files with 26 additions and 87 deletions

src/llama-quant.cpp

@@ -14,6 +14,12 @@
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
     {}
 };
 
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
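
The struct now sits near the top of the file, ahead of its first use in llama_model_quantize_impl. For orientation, here is a minimal sketch (not part of the commit, and make_quantize_params is a hypothetical helper) of how a caller hands these overrides to the quantizer; it assumes the llama.h API, where llama_model_quantize_params carries the vector through the opaque tensor_types pointer that the next hunk casts back:

```cpp
#include <vector>

#include "llama.h"

// Hypothetical helper: build quantize params with one per-tensor override.
// The caller keeps ownership of the vector; it must outlive the quantize call.
static llama_model_quantize_params make_quantize_params(std::vector<tensor_quantization> & overrides) {
    // after this commit the name is treated as a regex, not a fixed identifier
    overrides.push_back({ "ffn_down", GGML_TYPE_Q5_K });

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.tensor_types = &overrides; // stored as an opaque void *
    return params;
}
```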
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std:
             // unless the user specifies a type
             if (params->tensor_types) {
                 const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                const std::string tensor_name(tensor->name);
                 for (const auto & [tname, qtype] : tensor_types) {
-                    if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
+                    if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
                         if (qtype != new_type) {
-                            LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                            LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
                         }
                         new_type = qtype;
-                        break;
+                        break; // if two or more types are specified for the tensor, first match wins
                     }
                 }
             }
         }
 
         if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
             new_type = params->token_embedding_type;
         }
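
The override loop now snapshots the tensor name into a std::string and tests each user-supplied pattern with std::regex_search, stopping at the first hit. A self-contained sketch of that first-match-wins behavior (the tensor name, patterns, and types below are illustrative, not taken from the commit):

```cpp
#include <cstdio>
#include <regex>
#include <string>
#include <utility>
#include <vector>

int main() {
    // patterns are tried in the order given, mirroring the loop above
    const std::vector<std::pair<std::string, std::string>> rules = {
        { "blk\\.0\\.ffn_down", "q8_0" }, // more specific pattern listed first...
        { "ffn_down",           "q4_k" }, // ...otherwise this catch-all would win
    };

    const std::string tensor_name = "blk.0.ffn_down.weight";
    for (const auto & [pattern, type] : rules) {
        if (std::regex re(pattern); std::regex_search(tensor_name, re)) {
            std::printf("%s -> %s\n", tensor_name.c_str(), type.c_str());
            break; // first match wins
        }
    }
    return 0;
}
```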

tools/quantize/quantize.cpp

@@ -57,6 +57,12 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "COPY",   LLAMA_FTYPE_ALL_F32,  "only copy tensors, no quantizing", },
 };
 
+// Quantization types. Changes to this struct must be replicated in llama-quant.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
@@ -244,56 +250,10 @@ static ggml_type parse_ggml_type(const char * arg) {
             return type;
         }
     }
-    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    fprintf(stderr, "\n%s: invalid ggml_type '%s'\n\n", __func__, arg);
     return GGML_TYPE_COUNT;
 }
 
-// Allowed tensors for arbitrary quantization with --tensor-type option
-static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
-    "attn_k",
-    "attn_kv_a_mqa",
-    "attn_kv_b",
-    "attn_o",
-    "attn_output",
-    "attn_q",
-    "attn_q_a",
-    "attn_q_b",
-    "attn_qkv",
-    "attn_v",
-    "channel_mix_key",
-    "channel_mix_receptance",
-    "channel_mix_value",
-    "cls",
-    "cls.output",
-    "cross_attn_k",
-    "cross_attn_o",
-    "cross_attn_q",
-    "cross_attn_v",
-    "ffn_act",
-    "ffn_down",
-    "ffn_down_exps",
-    "ffn_down_shexp",
-    "ffn_gate",
-    "ffn_gate_exps",
-    "ffn_gate_shexp",
-    "ffn_up",
-    "ffn_up_exps",
-    "ffn_up_shexp",
-    "ssm_in",
-    "ssm_out",
-    "time_mix_gate",
-    "time_mix_key",
-    "time_mix_output",
-    "time_mix_receptance",
-    "time_mix_value",
-};
-
-// changes to this struct must be replicated in llama-quant.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
     const char * sep = strchr(data, '=');
     if (sep == nullptr) {
@@ -306,7 +266,6 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
         printf("\n%s: missing tensor name\n\n", __func__);
         return false;
     }
-
     if (const size_t qt_len = strlen(sep); qt_len == 1) {
         printf("\n%s: missing quantization type\n\n", __func__);
         return false;
@@ -315,37 +274,15 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
     std::string tn(data, tn_len);
     std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
     sep++;
-    const std::string qt(sep);
-
-    bool found = false;
-    for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
-        std::string tensor;
-        tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
-        // handle special case of cls.output
-        std::string cls_output = "cls.output";
-        if (tn.find(cls_output) != std::string::npos) {
-            tensor = "cls.output";
-        }
-        // check if an allowed tensor exists and it's at the end of the kv string
-        if (tensor == allowed) {
-            found = true;
-            break;
-        }
-    }
-    if (!found) {
-        printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
-        return false;
-    }
-    if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
-        printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
-        return false;
-    }
     tensor_quantization tqz;
     tqz.name = tn;
-    tqz.quant = parse_ggml_type(qt.c_str());
+    tqz.quant = parse_ggml_type(sep);
     tensor_type.emplace_back(std::move(tqz));
+    if (tqz.quant == GGML_TYPE_COUNT) {
+        printf("\n%s: invalid quantization type '%s'\n\n", __func__, sep);
+        return false;
+    }
     return true;
 }
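
With the whitelist gone, the NAME half of a NAME=TYPE argument is accepted as-is and later treated as a regular expression; only the TYPE half is validated, by letting parse_ggml_type report GGML_TYPE_COUNT. A standalone sketch of that flow (parse_my_type and parse_rule are hypothetical stand-ins for quantize.cpp's parse_ggml_type and parse_tensor_type):

```cpp
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstring>
#include <string>

// hypothetical stand-in for parse_ggml_type: accept two known type names
static bool parse_my_type(const char * qt) {
    return strcmp(qt, "q4_k") == 0 || strcmp(qt, "q8_0") == 0;
}

static bool parse_rule(const char * data) {
    const char * sep = strchr(data, '=');
    if (sep == nullptr || sep == data || strlen(sep) == 1) {
        std::printf("malformed rule '%s'\n", data);
        return false;
    }
    std::string tn(data, sep - data);
    std::transform(tn.begin(), tn.end(), tn.begin(),
                   [](unsigned char c) { return (char) std::tolower(c); });
    // no whitelist lookup any more: tn may be any regular expression
    if (!parse_my_type(sep + 1)) {
        std::printf("invalid quantization type '%s'\n", sep + 1);
        return false;
    }
    std::printf("rule: /%s/ -> %s\n", tn.c_str(), sep + 1);
    return true;
}

int main() {
    parse_rule("ffn_(up|down)=q4_k"); // accepted now; the old whitelist rejected this name
    parse_rule("attn_v=");            // still rejected: missing type
    return 0;
}
```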