llama.cpp/tools/imatrix/imatrix.cpp

#include "arg.h"
#include "common.h"
#include "log.h"
#include "llama.h"
#include "gguf.h"

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <ctime>
#include <thread>
#include <mutex>
#include <vector>
#include <fstream>
#include <unordered_map>
#include <map>
#include <regex>
#include <numeric>

#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

static void print_usage(int, char ** argv) {
    LOG("\nexample usage:\n");
    LOG("\n    %s \\\n"
            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \\\n"
            "       [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
            "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
            "       [--show-statistics] [...]\n" , argv[0]);
    LOG("\n");
}

static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";

struct Stats {
    std::vector<float>   values;
    std::vector<int64_t> counts;
};

struct tensor_statistics {
    std::string tensor;
    Stats stats;
    float total_sqract = 0.0f;
    float mean_sqract  = 0.0f;
    float max_sqract   = 0.0f;
    float min_sqract   = 0.0f;
    int elements       = 0;
    float stddev       = 0.0f;
    float active       = 0.0f;
    float entropy      = 0.0f;
    float zd           = 0.0f;
    float cossim       = 0.0f;
};

class IMatrixCollector {
public:
    IMatrixCollector() = default;
    void set_params(common_params params) { m_params = std::move(params); }
    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
    void save_imatrix_legacy(int32_t ncall = -1) const;
    void save_imatrix(int32_t n_chunk = -1) const;
    bool load_imatrix_legacy(const char * fname);
    bool load_imatrix(const char * file_name);
    const std::unordered_map<std::string, Stats> & get_mstats() const { return m_stats; }
private:
    std::unordered_map<std::string, Stats> m_stats;
    common_params                          m_params;
    std::mutex                             m_mutex;
    std::vector<std::string>               m_datasets;
    int32_t                                m_last_chunk = 0;
    std::vector<char>                      m_src1_data;
    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
};

// remove any prefix and suffixes from the name
// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
static std::string filter_tensor_name(const char * name) {
    std::string wname;
    const char * p = strchr(name, '#');
    if (p != NULL) {
        p = p + 1;
        const char * q = strchr(p, '#');
        if (q != NULL) {
            wname = std::string(p, q - p);
        } else {
            wname = p;
        }
    } else {
        wname = name;
    }
    return wname;
}

static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) {
    std::vector<std::string> name;
    std::istringstream stream(input);
    std::string item;

    while (std::getline(stream, item, '.')) {
        name.push_back(item);
    }
    for (size_t i = 0; i < name.size(); ++i) {
        if (name[i] == "blk" && i + 1 < name.size()) {
            layer = name[i + 1];
            break;
        }
    }
    for (size_t i = 0; i < name.size(); ++i) {
        if (name[i] == "weight" && i > 0) {
            tensor = name[i - 1];
            break;
        }
    }

    if (tensor.empty()) {
        tensor = input;
    }
    if (layer.empty()) {
        layer = "-";
    }
}

static void compute_statistics(std::vector<tensor_statistics> & tstats, const std::string & name, const Stats & e) {
    if (e.values.size() % e.counts.size() != 0) {
        LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size());
        return;
    }
    if (e.counts.empty()) {
        LOG_ERR("%s: there are no activations for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str());
        return;
    }

    const int n_mat = e.counts.size();
    const int row_size = e.values.size() / n_mat;

    std::vector<float> activations;
    activations.reserve(e.values.size());

    for (int i = 0; i < n_mat; ++i) {
        for (int j = 0; j < row_size; ++j) {
            activations.push_back(e.values[i*row_size + j] / e.counts[i]);
        }
    }

    const float act_total     = std::accumulate(activations.begin(), activations.end(), 0.0f);
    const float act_max       = *std::max_element(activations.begin(), activations.end());
    const float act_min       = *std::min_element(activations.begin(), activations.end());
    const float act_mean      = act_total / activations.size();
    const float act_sqr_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f);
    const float act_var       = (act_sqr_total / activations.size()) - (act_mean * act_mean);
    const float act_dev       = std::sqrt(std::max(0.0f, act_var));
    float threshold           = 1e-5f;
    const int inactive_count  = std::count_if(activations.begin(), activations.end(),
                                               [threshold](const float v) { return fabsf(v) <= threshold; });
    const float active_ratio  = 1 - static_cast<float>(inactive_count) / activations.size();

    float entropy = 0;
    if (act_total > 0) {
        for (const auto act : activations) {
            if (const float p = act / act_total; p > 0) {
                entropy -= p * std::log2(p);
            }
        }
    }

    int z_score = 0;
    if (act_dev > 0.0f) {
        for (const auto act : activations) {
            if (const float p = (act - act_mean) / act_dev; p > 1) {
                z_score++;
            }
        }
    }

    auto & ts = tstats.emplace_back();
    ts.tensor     = name;
    ts.stats      = e;
    ts.total_sqract = act_total;
    ts.mean_sqract  = act_mean;
    ts.max_sqract   = act_max;
    ts.min_sqract   = act_min;
    ts.elements   = static_cast<int>(activations.size());
    ts.stddev     = act_dev;
    ts.active     = active_ratio;
    ts.entropy    = entropy;
    ts.zd         = static_cast<float>(z_score) / ts.elements;
}

static void compute_cossim(std::vector<tensor_statistics> & tstats) {
    static const std::regex pattern(R"(blk\.(\d+)\.)");
    for (auto & ts : tstats) {
        if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) {
            const int blk = std::stoi(match[1]);
            std::string tname(ts.tensor);
            tname.replace(match.position(1), match.length(1), std::to_string(blk-1));
            auto prev = std::find_if(tstats.begin(), tstats.end(),
                [tname](const tensor_statistics & t) { return t.tensor == tname; });
            if (prev != tstats.end()) {
                const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
                    prev->stats.values.begin(), 0.0f);
                const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
                    ts.stats.values.begin(), 0.0f));
                const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(),
                    prev->stats.values.begin(), 0.0f));
                const float cs = dp / (curr_mag * prev_mag);
                ts.cossim = cs;
            }
        } else {
            ts.cossim = 0;
        }
    }
}

bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    GGML_UNUSED(user_data);

    const struct ggml_tensor * src0 = t->src[0];
    const struct ggml_tensor * src1 = t->src[1];
    std::string wname = filter_tensor_name(src0->name);

    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;

    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
    if (ask) {
        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
        if (t->op != GGML_OP_MUL_MAT) return false;
        // why are small batches ignored (<16 tokens)?
        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
        return true;
    }

    std::lock_guard<std::mutex> lock(m_mutex);

    // copy the data from the GPU memory if needed
    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);

    if (!is_host) {
        const size_t src1_nbytes = ggml_nbytes(src1);
        m_src1_data.resize(src1_nbytes);
        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
    }

    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));

    // TODO: 4d? (is that even used in practice?)
    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
        GGML_ASSERT(false);
    }

    // this has been adapted to the new format of storing merged experts in a single 3d tensor
    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
    if (t->op == GGML_OP_MUL_MAT_ID) {
        //   ids  -> [n_experts_used, n_tokens]
        //   src1 -> [cols, n_expert_used, n_tokens]
        const ggml_tensor * ids = t->src[2];
        const int64_t n_as = src0->ne[2];
        const int64_t n_ids = ids->ne[0];

        // the top-k selected expert ids are stored in the ids tensor
        // for simplicity, always copy ids to host, because it is small
        // take into account that ids is not contiguous!

        GGML_ASSERT(ids->ne[1] == src1->ne[2]);

        m_ids.resize(ggml_nbytes(ids));
        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

        auto & e = m_stats[wname];

        if (e.counts.size() == 1 && n_as > 1) {
            // broadcast, when loading an old imatrix
            e.counts.resize(n_as, e.counts[0]);
        }
        if (e.values.empty()) {
            e.values.resize(src1->ne[0]*n_as, 0);
            e.counts.resize(n_as, 0);
        }
        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as));
            exit(1); //GGML_ABORT("fatal error");
        }
        else if (e.counts.size() != (size_t)n_as) {
            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as);
            exit(1); //GGML_ABORT("fatal error");
        }
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
        // loop over all possible experts, regardless if they are used or not in the batch
        for (int64_t ex = 0; ex < n_as; ++ex) {
            size_t e_start = ex*src1->ne[0];

            for (int64_t idx = 0; idx < n_ids; ++idx) {
                for (int64_t row = 0; row < src1->ne[2]; ++row) {
                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);

                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check

                    if (excur != ex) continue;

                    const int64_t i11 = idx % src1->ne[1];
                    const int64_t i12 = row;
                    const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);

                    e.counts[ex]++;

                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[e_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[e_start + j])) {
                            LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
                            exit(1);
                        }
                    }
                }
            }
            const int32_t n_chunk = e.counts[ex] / chunk_size;
            if (n_chunk > m_last_chunk) {
                const int32_t chunk_step = n_chunk - m_last_chunk;
                m_last_chunk = n_chunk;
                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                    save_imatrix();
                }
                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
                    save_imatrix(m_last_chunk);
                }
            }
        }
    } else {
        auto & e = m_stats[wname];
        const int64_t n_mat = src1->ne[2] * src1->ne[3];

        if (e.values.empty()) {
            e.values.resize(src1->ne[0] * n_mat, 0);
            e.counts.resize(n_mat, 0);
        }
        else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
            exit(1); //GGML_ABORT("fatal error");
        }
        else if (e.counts.size() != (size_t)n_mat) {
            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
            exit(1); //GGML_ABORT("fatal error");
        }
        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
                const int64_t mat_id = i3 * src1->ne[2] + i2;
                const int64_t mat_start = mat_id * src1->ne[0];

                for (int64_t row = 0; row < src1->ne[1]; ++row) {
                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
                    e.counts[mat_id]++;
                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
                        e.values[mat_start + j] += x[j] * x[j];
                        if (!std::isfinite((float)e.values[j])) {
                            LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
                            exit(1);
                        }
                    }
                }
                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
                if (n_chunk > m_last_chunk) {
                    const int32_t chunk_step = n_chunk - m_last_chunk;
                    m_last_chunk = n_chunk;
                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
                        save_imatrix();
                    }
                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
                        save_imatrix(m_last_chunk);
                    }
                }
            }
        }
    }

    return true;
}

void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
    auto fname = m_params.out_file;

    if (ncall > 0) {
        fname += ".at_";
        fname += std::to_string(ncall);
    }

    // warn when writing imatrix entries that do not have full data
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data

    int n_entries = 0;
    std::vector<std::string> to_store;

    bool is_first = true; // for printing
    for (const auto & kv : m_stats) {
        const int n_all = kv.second.counts.size();

        if (n_all == 0) {
            continue;
        }

        int n_zeros = 0;
        for (const int c : kv.second.counts) {
            if (c == 0) {
                n_zeros++;
            }
        }

        if (n_zeros != 0 && is_first) {
            LOG_INF("\n");
            is_first = false;
        }

        if (n_zeros == n_all) {
            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
            continue;
        }

        if (n_zeros > 0) {
            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
        }

        n_entries++;
        to_store.push_back(kv.first);
    }

    if (to_store.size() < m_stats.size()) {
        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
    }

    // deterministic tensor name order
    std::sort(to_store.begin(), to_store.end());

    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;

    std::ofstream out(fname, std::ios::binary);
    out.write((const char *) &n_entries, sizeof(n_entries));
    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
        const int32_t len = name.size();
        out.write((const char *) &len, sizeof(len));
        out.write(name.c_str(), len);
        // ceiling division to avoid accidental zeros
        const int32_t ncall = (*std::max_element(stat.counts.begin(), stat.counts.end()) + (chunk_size - 1)) / chunk_size;
        out.write((const char *) &ncall, sizeof(ncall));
        const int32_t nval = stat.values.size();
        const int32_t nmat = stat.counts.size();
        out.write((const char *) &nval, sizeof(nval));
        if (nval > 0 && nmat > 0) {
            std::vector<float> tmp(nval);
            for (int32_t i = 0; i < nval; i++) {
                float count = static_cast<float>(stat.counts[i / (nval / nmat)]);
                float value = stat.values[i];
                if (count == 0.0f) {
                    // store 1 for partial data
                    value = 1.0f;
                    count = 1.0f;
                }
                tmp[i] = (value / count) * static_cast<float>(ncall);
            }
            out.write((const char *) tmp.data(), nval * sizeof(float));
        }
    }

    // Write the number of call the matrix was computed with
    out.write((const char *) &m_last_chunk, sizeof(m_last_chunk));

    // Write the input filename at the end of the file to later on specify it in quantize
    {
        const char * dataset_file = m_params.prompt_file.c_str();
        int32_t len = m_params.prompt_file.size();
        // When there is no prompt but there were other imatrix files loaded, use the last dataset
        if (m_params.prompt_file.empty() && !m_datasets.empty()) {
            const std::string & dataset_str = m_datasets[m_datasets.size() - 1];
            dataset_file = dataset_str.c_str();
            len = dataset_str.size();
        }
        out.write((const char *) &len, sizeof(len));
        out.write(dataset_file, len);
    }

    LOGV(1, "\n");
    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
}

void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
    auto fname = m_params.out_file;

    // TODO: use the new format in more cases
    if (!string_ends_with(fname, ".gguf")) {
        LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
        this->save_imatrix_legacy(n_chunk);
        return;
    }

    if (n_chunk > 0) {
        fname += ".at_";
        fname += std::to_string(n_chunk);
    }

    // write imatrix entries even if they don't have full data. (can be corrected when reading)
    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data

    std::vector<std::string> to_store;
    size_t data_size = 0;

    bool is_first = true; // for printing
    for (const auto & kv : m_stats) {
        const int n_all = kv.second.counts.size();

        int n_zeros = 0;
        for (const auto c : kv.second.counts) {
            if (c == 0) {
                n_zeros++;
            }
        }

        if (n_zeros != 0 && is_first) {
            LOG_INF("\n");
            is_first = false;
        }

        if (n_zeros > 0) {
            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
        }

        to_store.push_back(kv.first);
        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
    }

    // deterministic tensor name order
    std::sort(to_store.begin(), to_store.end());

    struct ggml_init_params params = {
        /* .mem_size   = */ data_size,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct gguf_context * ctx_gguf = gguf_init_empty();

    {
        std::vector<const char *> datasets;
        datasets.reserve(m_datasets.size() + 1);
        for (size_t i = 0; i < m_datasets.size(); ++i) {
            datasets.push_back(m_datasets[i].c_str());
        }
        if (!m_params.prompt_file.empty()) {
            datasets.push_back(m_params.prompt_file.c_str());
        }

        gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
        // Write the dataset paths
        gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size());
        // Write the number of chunks the matrix was computed with
        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
    }

    for (const auto & name : to_store) {
        const auto & stat = m_stats.at(name);
        const int32_t nval = (int32_t) stat.values.size();
        const int32_t nmat = (int32_t) stat.counts.size();
        if (nval > 0 && nmat > 0) {
            struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
            struct ggml_tensor * counts  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
            ggml_format_name(in_sum2, "%s.in_sum2", name.c_str());
            ggml_format_name(counts, "%s.counts", name.c_str());

            for (int32_t j = 0; j < nval; ++j) {
                ((float *) in_sum2->data)[j] = (float) stat.values[j];
            }
            for (int32_t j = 0; j < nmat; ++j) {
                ((float *) counts->data)[j] = (float) stat.counts[j];
            }

            gguf_add_tensor(ctx_gguf, in_sum2);
            gguf_add_tensor(ctx_gguf, counts);
        }
    }

    gguf_write_to_file(ctx_gguf, fname.c_str(), false);

    LOGV(1, "\n");
    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());

    gguf_free(ctx_gguf);
    ggml_free(ctx);
}

bool IMatrixCollector::load_imatrix_legacy(const char * fname) {
    std::ifstream in(fname, std::ios::binary);
    if (!in) {
        LOG_ERR("%s: failed to open %s\n", __func__, fname);
        return false;
    }
    int n_entries;
    in.read((char *) &n_entries, sizeof(n_entries));
    if (in.fail() || n_entries < 1) {
        LOG_ERR("%s: no data in file %s\n", __func__, fname);
        return false;
    }
    // Guess the chunk size because it's not stored in the file
    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;

    for (int i = 0; i < n_entries; ++i) {
        int32_t len = 0;
        in.read((char *) &len, sizeof(len));
        std::vector<char> name_as_vec(len + 1);
        in.read((char *) name_as_vec.data(), len);
        if (in.fail()) {
            LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname);
            return false;
        }
        name_as_vec[len] = 0;
        std::string name{ name_as_vec.data() };
        auto & e = m_stats[std::move(name)];
        int32_t ncall = 0;
        in.read((char *) &ncall, sizeof(ncall));
        int32_t nval = 0;
        in.read((char *) &nval, sizeof(nval));
        if (in.fail() || nval < 1) {
            LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
            m_stats = {};
            return false;
        }

        if (e.values.empty()) {
            e.values.resize(nval, 0.0f);
            e.counts.resize(1, 0);
        }

        std::vector<float> tmp(nval);
        in.read((char *) tmp.data(), nval * sizeof(float));
        if (in.fail()) {
            LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
            m_stats = {};
            return false;
        }

        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
        for (int i = 0; i < nval; i++) {
            e.values[i] += tmp[i] * chunk_size;
        }
        // The legacy format doesn't distinguish the counts for different experts
        for (size_t j = 0; j < e.counts.size(); ++j) {
            e.counts[j] += ncall * chunk_size;
        }
    }

    {
        // TODO: extract into its own method; this is also used by the GGUF-based format
        // Calculate the last chunk count
        int64_t max_count = 0;
        for (const auto & stats : m_stats) {
            for (int64_t count : stats.second.counts) {
                if (count > max_count) {
                    max_count = count;
                }
            }
        }
        m_last_chunk = max_count / (chunk_size);
    }

    {
        // Read the number of calls the matrix was computed with
        int32_t n_calls;
        in.read((char *) &n_calls, sizeof(n_calls));
        // ignore it because it's not important
    }

    // Read the dataset path to include it when writing to GGUF
    if (!in.fail()){
        int32_t len = 0;
        in.read((char *) &len, sizeof(len));
        if (!in.fail()) {
            std::vector<char> dataset;
            dataset.resize(len + 1, 0);
            in.read(dataset.data(), len);
            if (!in.fail()) {
                m_datasets.push_back(dataset.data());
            }
        }
    }

    return true;
}

// Using GGUF as the file format, for greater extensibility
bool IMatrixCollector::load_imatrix(const char * file_name) {
    struct ggml_context * ctx = nullptr;
    struct gguf_init_params meta_gguf_params = {
        /* .no_alloc = */ false, // the data is needed
        /* .ctx      = */ &ctx,
    };
    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
    if (!ctx_gguf) {
        return this->load_imatrix_legacy(file_name);
    }
    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
    if (n_entries < 1) {
        LOG_ERR("%s: no data in file %s\n", __func__, file_name);
        gguf_free(ctx_gguf);
        ggml_free(ctx);
        return false;
    }

    const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
    if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) {
        const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key);
        m_datasets.reserve(m_datasets.size() + n);
        for (int64_t i = 0; i < n; ++i) {
            m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i));
        }
    }

    const std::string in_sum2_suffix{ ".in_sum2" };
    const std::string counts_suffix{ ".counts" };

    // Could re-use m_stats instead, but this allows
    // checking for completeness of *each* loaded imatrix file
    // and also makes it easier to re-use a similar implementation in quantize.cpp
    // Using an ordered map to get a deterministic iteration order.
    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;

    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
        std::string name = cur->name;

        if (name.empty()) { continue; }

        if (string_remove_suffix(name, in_sum2_suffix)) {
            // in_sum2
            sums_counts_for[std::move(name)].first = cur;
        } else if (string_remove_suffix(name, counts_suffix)) {
            // counts
            sums_counts_for[std::move(name)].second = cur;
        } else {
            // ignore other tensors
        }
    }

    for (const auto & sc : sums_counts_for) {
        const std::string &        name    = sc.first;
        const struct ggml_tensor * in_sum2 = sc.second.first;
        const struct ggml_tensor * counts  = sc.second.second;

        if (!in_sum2 || !counts) {
            LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        auto & e = m_stats[name];

        int64_t nval = ggml_nelements(in_sum2);
        if (e.values.empty()) {
            e.values.resize(nval, 0.0f);
        } else if ((size_t) nval != e.values.size()) {
            LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        int64_t ncounts = ggml_nelements(counts);
        if (e.counts.empty()) {
            e.counts.resize(ncounts, 0);
        } else if (e.counts.size() == 1 && ncounts > 1) {
            // broadcast, when loading an old imatrix
            e.counts.resize(ncounts, e.counts[0]);
        } else if ((size_t) ncounts != e.counts.size()) {
            LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
            gguf_free(ctx_gguf);
            ggml_free(ctx);
            return false;
        }

        // Recreate the state as expected by save_imatrix()
        for (int64_t j = 0; j < nval; j++) {
            e.values[j] += ((const float *) in_sum2->data)[j];
        }
        for (int64_t j = 0; j < ncounts; j++) {
            e.counts[j] += std::lround(((const float *) counts->data)[j]);
        }
    }

    // TODO: extract into its own method; this is also used by the legacy format
    // Calculate the last chunk count
    int64_t max_count = 0;
    for (const auto & stats : m_stats) {
        for (int64_t count : stats.second.counts) {
            if (count > max_count) {
                max_count = count;
            }
        }
    }
    m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel);

    gguf_free(ctx_gguf);
    ggml_free(ctx);
    return true;
}

static IMatrixCollector g_collector;

static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
    return g_collector.collect_imatrix(t, ask, user_data);
}

struct results_log_softmax {
    double log_softmax;
    float  logit;
    float  prob;
};

static std::vector<float> softmax(const std::vector<float> & logits) {
    std::vector<float> probs(logits.size());
    float max_logit = logits[0];
    for (float v : logits) {
        max_logit = std::max(max_logit, v);
    }
    double sum_exp = 0.0;
    for (size_t i = 0; i < logits.size(); i++) {
        // Subtract the maximum logit value from the current logit value for numerical stability
        const float logit = logits[i] - max_logit;
        const float exp_logit = expf(logit);
        sum_exp += exp_logit;
        probs[i] = exp_logit;
    }
    for (size_t i = 0; i < probs.size(); i++) {
        probs[i] /= sum_exp;
    }
    return probs;
}

static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
    float max_logit = logits[0];
    for (int i = 1; i < n_vocab; ++i) {
        max_logit = std::max(max_logit, logits[i]);
    }
    double sum_exp = 0.0;
    for (int i = 0; i < n_vocab; ++i) {
        sum_exp += expf(logits[i] - max_logit);
    }
    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
}

static void process_logits(
    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
    double & nll, double & nll2, float * logit_history, float * prob_history) {
    std::mutex mutex;
    int counter = 0;
    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
        double local_nll  = 0;
        double local_nll2 = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            int i = counter++;
            if (i >= n_token) {
                nll += local_nll; nll2 += local_nll2;
                break;
            }
            lock.unlock();
            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
            const double v = -results.log_softmax;
            local_nll += v;
            local_nll2 += v*v;

            logit_history[i] = results.logit;
            prob_history[i]  = results.prob;
        }
    };
    for (auto & w : workers) {
        w = std::thread(compute);
    }
    compute();
    for (auto & w : workers) {
        w.join();
    }
}

static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
    const llama_model * model = llama_get_model(ctx);
    const llama_vocab * vocab = llama_model_get_vocab(model);

    const bool add_bos = llama_vocab_get_add_bos(vocab);

    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));

    auto tim1 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenizing the input ..\n", __func__);

    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true, params.parse_special);

    auto tim2 = std::chrono::high_resolution_clock::now();
    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());

    if (params.i_chunk > 0) {
        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
            return false;
        }
        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
    }

    if (int(tokens.size()) < 2*n_ctx) {
        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
        return false;
    }

    std::vector<float> logit_history;
    std::vector<float> prob_history;

    if (params.compute_ppl) {
        logit_history.resize(tokens.size());
        prob_history.resize(tokens.size());
    }

    const int n_chunk_max = tokens.size() / n_ctx;

    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
    const int n_vocab = llama_vocab_n_tokens(vocab);
    const int n_batch = params.n_batch;

    int count = 0;
    double nll = 0.0;
    double nll2 = 0.0;

    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
    const int n_seq = std::max(1, n_batch / n_ctx);

    GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
    GGML_ASSERT(params.n_ctx == n_seq * n_ctx);

    llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);

    std::vector<float> logits;
    if (params.compute_ppl && num_batches > 1) {
        logits.reserve((size_t)n_ctx * n_vocab);
    }

    LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);

    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);

    for (int i = 0; i < n_chunk; i += n_seq) {
        const int start =     i * n_ctx;
        const int end   = start + n_ctx;

        const int n_seq_batch = std::min(n_seq, n_chunk - i);

        const auto t_start = std::chrono::high_resolution_clock::now();

        // clear the KV cache
        llama_memory_clear(llama_get_memory(ctx), true);

        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

            // clear the batch
            common_batch_clear(batch);

            for (int seq = 0; seq < n_seq_batch; seq++) {
                int seq_start = batch_start + seq*n_ctx;

                // save original token and restore it after eval
                const auto token_org = tokens[seq_start];

                // add BOS token for the first batch of each chunk
                if (add_bos && j == 0) {
                    tokens[seq_start] = llama_vocab_bos(vocab);
                }
                for (int k = 0; k < batch_size; ++k) {
                    // NOTE: specifying all logits to get activations for the output.weight tensor
                    //       and also for the perplexity calculation.
                    // TODO: only get outputs when (params.process_output || params.compute_ppl)
                    //       (not possible when this skips FFN computation of the last layer)
                    common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true);
                }

                // restore the original token in case it was set to BOS
                tokens[seq_start] = token_org;
            }

            if (llama_decode(ctx, batch)) {
                LOG_ERR("%s : failed to eval\n", __func__);
                llama_batch_free(batch);
                return false;
            }

            if (params.compute_ppl && num_batches > 1) {
                const auto * batch_logits = llama_get_logits(ctx);
                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
            }
        }


        if (i == 0) {
            llama_synchronize(ctx);
            const auto t_end = std::chrono::high_resolution_clock::now();
            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
            int total_seconds = (int)(t_total * n_chunk / n_seq);
            if (total_seconds >= 60*60) {
                LOG("%d hours ", total_seconds / (60*60));
                total_seconds = total_seconds % (60*60);
            }
            LOG("%.2f minutes\n", total_seconds / 60.0);
        }

        if (params.compute_ppl) {
            const int first = n_ctx/2;
            for (int seq = 0; seq < n_seq_batch; seq++) {
                const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);

                llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;

                process_logits(n_vocab, all_logits + first*n_vocab,
                        tokens_data, n_ctx - 1 - first,
                        workers, nll, nll2,
                        logit_history.data() + start + seq*n_ctx + first,
                        prob_history.data()  + start + seq*n_ctx + first);

                count += n_ctx - first - 1;

                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
            }
            fflush(stdout);

            logits.clear();
        }
    }

    LOG("\n");

    if (params.compute_ppl) {
        nll2 /= count;
        nll /= count;
        const double ppl = exp(nll);
        nll2 -= nll * nll;
        if (nll2 > 0) {
            nll2 = sqrt(nll2/(count-1));
            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
        } else {
            LOG("Unexpected negative standard deviation of log(prob)\n");
        }
    }

    llama_batch_free(batch);

    return true;
}

static bool show_statistics(const common_params & params) {
    std::vector<tensor_statistics> ts;
    if (params.in_files.empty() || params.in_files.size() > 1) {
        LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n");
        return false;
    }
    if (g_collector.load_imatrix(params.in_files[0].c_str())) {
        for (const auto & [name, stats] :g_collector.get_mstats()) {
            compute_statistics(ts, name, stats);
        }
    } else {
        LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str());
        return false;
    }
    if (!ts.empty()) {
        compute_cossim(ts);
    } else {
        LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str());
        return false;
    }

    struct tensor_comparer {
        bool operator()(const tensor_statistics & a, const tensor_statistics & b) const {
            std::string layer, name_a, name_b;
            ;
            process_tensor_name(a.tensor, layer, name_a);
            process_tensor_name(b.tensor, layer, name_b);
            return name_a < name_b || (name_a == name_b && a.total_sqract > b.total_sqract);
        }
    };
    std::sort(ts.begin(), ts.end(), tensor_comparer());

    struct weighted_stats {
        float weighted_bias   = 0.0f;
        float weighted_zd     = 0.0f;
        float weighted_cossim = 0.0f;
        int   total_elements  = 0;
    };
    std::map<int, weighted_stats> ws;

    LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast<int>(ts.size()));
    LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", "       Tensor", "          Σ(Act²)",
            "  Min", "            Max", "           μ", "   σ", " % Active", "N", "   Entropy", "E (norm)", "ZD",
            "  CosSim");
    LOG_INF(
        "=============================================================================================================="
        "===========================================================\n");
    for (const auto & tstat : ts) {
        std::string layer, name;
        process_tensor_name(tstat.tensor, layer, name);

        int blk;
        try {
            blk = std::stoi(layer);
        } catch (const std::exception & e) {
            blk = -1;  // not a block layer
        }

        LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
                layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract,
                tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy,
                100.0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim);

        const float weighted_bias   = tstat.elements * tstat.total_sqract;
        const float weighted_zd     = tstat.elements * tstat.zd;
        const float weighted_cossim = tstat.elements * tstat.cossim;

        if (ws.find(blk) != ws.end()) {
            ws[blk].weighted_bias += weighted_bias;
            ws[blk].weighted_zd += weighted_zd;
            ws[blk].weighted_cossim += weighted_cossim;
            ws[blk].total_elements += tstat.elements;
        } else {
            weighted_stats temp_ws;
            temp_ws.weighted_bias   = weighted_bias;
            temp_ws.weighted_zd     = weighted_zd;
            temp_ws.weighted_cossim = weighted_cossim;
            temp_ws.total_elements  = tstat.elements;
            ws[blk]                 = temp_ws;
        }
    }

    const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; });
    LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers);
    LOG_INF("\n%s\t%s\t%s\t%s\n", "  Layer", "     μΣ(Act²)", "      μZD", "μCosSim");
    LOG_INF("================================================\n");
    for (const auto & [first, second] : ws) {
        const auto & layer = first;
        const auto & stats = second;

        if (stats.total_elements == 0) {
            continue;
        }

        if (layer >= 0) {
            const float bias   = stats.weighted_bias / stats.total_elements;
            const float zd     = stats.weighted_zd / stats.total_elements;
            const float cossim = stats.weighted_cossim / stats.total_elements;

            LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim);
        }
    }
    LOG_INF("\n");

    return true;
}

int main(int argc, char ** argv) {
    common_params params;

    params.out_file = "imatrix.gguf";

    params.n_ctx = 512;
    params.escape = false;

    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
        return 1;
    }

    if (params.show_statistics) {
        if (!show_statistics(params)) {
            return 1;
        }
        return 0;
    }

    common_init();

    const int32_t n_ctx = params.n_ctx;

    if (n_ctx <= 0) {
        LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
        return 1;
    }

    {
        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
        const int32_t n_kv = n_seq * n_ctx;

        params.n_parallel = n_seq;
        params.n_ctx      = n_kv;

        params.n_batch = std::min(params.n_batch, n_kv);
    }

    g_collector.set_params(params);

    for (const auto & in_file : params.in_files) {
        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
        if (!g_collector.load_imatrix(in_file.c_str())) {
            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
            return 1;
        }
    }

    if (params.prompt.empty()) {
        LOG_INF("No prompt provided; combining precomputed matrices only.\n");

        if (params.in_files.empty()) {
            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
            return 1;
        }

        if (params.in_files.size() == 1) {
            LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str());
        } else if (params.in_files.size() > 1) {
            LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
        }

        g_collector.save_imatrix();

        return 0;
    }

    llama_backend_init();
    llama_numa_init(params.numa);

    // pass the callback to the backend scheduler
    // it will be executed for each node during the graph computation
    params.cb_eval = ik_collect_imatrix;
    params.cb_eval_user_data = NULL;
    params.warmup = false;

    // init
    common_init_result llama_init = common_init_from_params(params);

    llama_model * model = llama_init.model.get();
    llama_context * ctx = llama_init.context.get();

    if (model == nullptr || ctx == nullptr) {
        LOG_ERR("%s : failed to init\n", __func__);
        return 1;
    }

    const int n_ctx_train = llama_model_n_ctx_train(model);
    if (params.n_ctx > n_ctx_train) {
        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
                __func__, n_ctx_train, params.n_ctx);
    }

    // print system information
    {
        LOG_INF("\n");
        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
    }

    if (!compute_imatrix(ctx, params, n_ctx)) {
        return 1;
    }

    g_collector.save_imatrix();

    LOG("\n");
    llama_perf_context_print(ctx);

    llama_backend_free();

    return 0;
}
-												common : move arg parser code to `arg.cpp` (#9388)

* common : move arg parser to arg.cpp

* better categorize args

* add cmake

* missing climits

* missing cstdarg

* common : more explicit includes

* fix build

* refactor gpt_params_parse

* update server readme

* fix test

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-09-09 23:36:09 +02:00
+								#include "arg.h"
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								#include "common.h"
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								#include "log.h"
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								#include "llama.h"
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								#include "gguf.h"
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								#include <algorithm>
-												Fix: Compile failure due to Microsoft STL breaking change (#11836)


											
										
										
											2025-02-12 20:36:11 +00:00
+								#include <chrono>
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								#include <cmath>
 								#include <cstdio>
 								#include <cstring>
 								#include <ctime>
 								#include <thread>
 								#include <mutex>
 								#include <vector>
 								#include <fstream>
 								#include <unordered_map>
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								#include <map>
-												imatrix: add option to display importance score statistics for a given imatrix file (#12718)

* Add --show-statistics option

* Add --show-statistics logic

* Add tensor name parsing

* Tidy output format

* Fix typo in title

* Improve tensor influence ranking

* Add better statistics

* Change statistics' sort order

* Add Cosine Similarity

* Add header search path

* Change header search path to private

* Add weighted statistics per layer

* Update report title

* Refactor compute_statistics out of main

* Refactor compute_cossim out of load_imatrix

* Refactor compute_statistics out of load_imatrix

* Move imatrix statistics calculation into its own functions

* Add checks and validations

* Remove unnecessary include directory

* Rename labels

* Add m_stats getter and refactor compute_statistics out of load_imatrix

* Refactor variable names

* Minor cosmetic change

* Retrigger checks (empty commit)

* Rerun checks (empty commit)

* Fix unnecessary type promotion

Co-authored-by: compilade <git@compilade.net>

* Reverting change to improve code readability

* Rerun checks (empty commit)

* Rerun checks (empty commit)

* Rerun checks - third time's the Charm 🤞 (empty commit)

* Minor cosmetic change

* Update README

* Fix typo

* Update README

* Rerun checks (empty commit)

* Re-implement changes on top of #9400

* Update README.md

* Update README

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

* Remove duplicate option in print_usage()

* Update README.md

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Remove input check

* Remove commented out code

---------

Co-authored-by: compilade <git@compilade.net>
											
										
										
											2025-07-22 13:33:37 +01:00
+								#include <regex>
 								#include <numeric>
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
 								#if defined(_MSC_VER)
 								#pragma warning(disable: 4244 4267) // possible loss of data
 								#endif
-												common : refactor arg parser (#9308)

* (wip) argparser v3

* migrated

* add test

* handle env

* fix linux build

* add export-docs example

* fix build (2)

* skip build test-arg-parser on windows

* update server docs

* bring back missing --alias

* bring back --n-predict

* clarify test-arg-parser

* small correction

* add comments

* fix args with 2 values

* refine example-specific args

* no more lamba capture

Co-authored-by: slaren@users.noreply.github.com

* params.sparams

* optimize more

* export-docs --> gen-docs
											
										
										
											2024-09-07 20:43:51 +02:00
+								static void print_usage(int, char ** argv) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								    LOG("\nexample usage:\n");
 								    LOG("\n    %s \\\n"
-												imatrix: add option to display importance score statistics for a given imatrix file (#12718)

* Add --show-statistics option

* Add --show-statistics logic

* Add tensor name parsing

* Tidy output format

* Fix typo in title

* Improve tensor influence ranking

* Add better statistics

* Change statistics' sort order

* Add Cosine Similarity

* Add header search path

* Change header search path to private

* Add weighted statistics per layer

* Update report title

* Refactor compute_statistics out of main

* Refactor compute_cossim out of load_imatrix

* Refactor compute_statistics out of load_imatrix

* Move imatrix statistics calculation into its own functions

* Add checks and validations

* Remove unnecessary include directory

* Rename labels

* Add m_stats getter and refactor compute_statistics out of load_imatrix

* Refactor variable names

* Minor cosmetic change

* Retrigger checks (empty commit)

* Rerun checks (empty commit)

* Fix unnecessary type promotion

Co-authored-by: compilade <git@compilade.net>

* Reverting change to improve code readability

* Rerun checks (empty commit)

* Rerun checks (empty commit)

* Rerun checks - third time's the Charm 🤞 (empty commit)

* Minor cosmetic change

* Update README

* Fix typo

* Update README

* Rerun checks (empty commit)

* Re-implement changes on top of #9400

* Update README.md

* Update README

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

* Remove duplicate option in print_usage()

* Update README.md

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Remove input check

* Remove commented out code

---------

Co-authored-by: compilade <git@compilade.net>
											
										
										
											2025-07-22 13:33:37 +01:00
+								            "       -m model.gguf -f some-text.txt [-o imatrix.gguf] [--no-ppl] \\\n"
 								            "       [--process-output] [--chunk 123] [--save-frequency 0] [--output-frequency 10] \\\n"
 								            "       [--in-file imatrix-prev-0.gguf --in-file imatrix-prev-1.gguf ...] [--parse-special] \\\n"
 								            "       [--show-statistics] [...]\n" , argv[0]);
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								    LOG("\n");
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								}
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								static const char * const LLM_KV_IMATRIX_DATASETS    = "imatrix.datasets";
 								static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count";
 								static const char * const LLM_KV_IMATRIX_CHUNK_SIZE  = "imatrix.chunk_size";
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								struct Stats {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    std::vector<float>   values;
 								    std::vector<int64_t> counts;
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								};
-												imatrix: add option to display importance score statistics for a given imatrix file (#12718)

* Add --show-statistics option

* Add --show-statistics logic

* Add tensor name parsing

* Tidy output format

* Fix typo in title

* Improve tensor influence ranking

* Add better statistics

* Change statistics' sort order

* Add Cosine Similarity

* Add header search path

* Change header search path to private

* Add weighted statistics per layer

* Update report title

* Refactor compute_statistics out of main

* Refactor compute_cossim out of load_imatrix

* Refactor compute_statistics out of load_imatrix

* Move imatrix statistics calculation into its own functions

* Add checks and validations

* Remove unnecessary include directory

* Rename labels

* Add m_stats getter and refactor compute_statistics out of load_imatrix

* Refactor variable names

* Minor cosmetic change

* Retrigger checks (empty commit)

* Rerun checks (empty commit)

* Fix unnecessary type promotion

Co-authored-by: compilade <git@compilade.net>

* Reverting change to improve code readability

* Rerun checks (empty commit)

* Rerun checks (empty commit)

* Rerun checks - third time's the Charm 🤞 (empty commit)

* Minor cosmetic change

* Update README

* Fix typo

* Update README

* Rerun checks (empty commit)

* Re-implement changes on top of #9400

* Update README.md

* Update README

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

* Remove duplicate option in print_usage()

* Update README.md

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Remove input check

* Remove commented out code

---------

Co-authored-by: compilade <git@compilade.net>
											
										
										
											2025-07-22 13:33:37 +01:00
+								struct tensor_statistics {
 								    std::string tensor;
 								    Stats stats;
 								    float total_sqract = 0.0f;
 								    float mean_sqract  = 0.0f;
 								    float max_sqract   = 0.0f;
 								    float min_sqract   = 0.0f;
 								    int elements       = 0;
 								    float stddev       = 0.0f;
 								    float active       = 0.0f;
 								    float entropy      = 0.0f;
 								    float zd           = 0.0f;
 								    float cossim       = 0.0f;
 								};
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								class IMatrixCollector {
 								public:
 								    IMatrixCollector() = default;
-												common : use common_ prefix for common library functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-10-10 22:57:42 +02:00
+								    void set_params(common_params params) { m_params = std::move(params); }
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								    bool collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data);
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    void save_imatrix_legacy(int32_t ncall = -1) const;
 								    void save_imatrix(int32_t n_chunk = -1) const;
 								    bool load_imatrix_legacy(const char * fname);
 								    bool load_imatrix(const char * file_name);
-												imatrix: add option to display importance score statistics for a given imatrix file (#12718)

* Add --show-statistics option

* Add --show-statistics logic

* Add tensor name parsing

* Tidy output format

* Fix typo in title

* Improve tensor influence ranking

* Add better statistics

* Change statistics' sort order

* Add Cosine Similarity

* Add header search path

* Change header search path to private

* Add weighted statistics per layer

* Update report title

* Refactor compute_statistics out of main

* Refactor compute_cossim out of load_imatrix

* Refactor compute_statistics out of load_imatrix

* Move imatrix statistics calculation into its own functions

* Add checks and validations

* Remove unnecessary include directory

* Rename labels

* Add m_stats getter and refactor compute_statistics out of load_imatrix

* Refactor variable names

* Minor cosmetic change

* Retrigger checks (empty commit)

* Rerun checks (empty commit)

* Fix unnecessary type promotion

Co-authored-by: compilade <git@compilade.net>

* Reverting change to improve code readability

* Rerun checks (empty commit)

* Rerun checks (empty commit)

* Rerun checks - third time's the Charm 🤞 (empty commit)

* Minor cosmetic change

* Update README

* Fix typo

* Update README

* Rerun checks (empty commit)

* Re-implement changes on top of #9400

* Update README.md

* Update README

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

* Remove duplicate option in print_usage()

* Update README.md

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Remove input check

* Remove commented out code

---------

Co-authored-by: compilade <git@compilade.net>
											
										
										
											2025-07-22 13:33:37 +01:00
+								    const std::unordered_map<std::string, Stats> & get_mstats() const { return m_stats; }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								private:
 								    std::unordered_map<std::string, Stats> m_stats;
-												common : use common_ prefix for common library functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-10-10 22:57:42 +02:00
+								    common_params                          m_params;
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    std::mutex                             m_mutex;
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    std::vector<std::string>               m_datasets;
 								    int32_t                                m_last_chunk = 0;
-												imatrix: fix oob writes if src1 is not contiguous (#13286)


											
										
										
											2025-05-04 00:50:37 +02:00
+								    std::vector<char>                      m_src1_data;
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								    std::vector<char>                      m_ids; // the expert ids from ggml_mul_mat_id
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								};
-												imatrix : fix wname for mul_mat_id ops (#6271)

* imatrix : fix wname for mul_mat_id ops

* also filter tensor names in mul_mat_id ops

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-03-24 16:18:45 +02:00
+								// remove any prefix and suffixes from the name
 								// CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight
 								static std::string filter_tensor_name(const char * name) {
 								    std::string wname;
 								    const char * p = strchr(name, '#');
 								    if (p != NULL) {
 								        p = p + 1;
 								        const char * q = strchr(p, '#');
 								        if (q != NULL) {
 								            wname = std::string(p, q - p);
 								        } else {
 								            wname = p;
 								        }
 								    } else {
 								        wname = name;
 								    }
 								    return wname;
 								}
-												imatrix: add option to display importance score statistics for a given imatrix file (#12718)

* Add --show-statistics option

* Add --show-statistics logic

* Add tensor name parsing

* Tidy output format

* Fix typo in title

* Improve tensor influence ranking

* Add better statistics

* Change statistics' sort order

* Add Cosine Similarity

* Add header search path

* Change header search path to private

* Add weighted statistics per layer

* Update report title

* Refactor compute_statistics out of main

* Refactor compute_cossim out of load_imatrix

* Refactor compute_statistics out of load_imatrix

* Move imatrix statistics calculation into its own functions

* Add checks and validations

* Remove unnecessary include directory

* Rename labels

* Add m_stats getter and refactor compute_statistics out of load_imatrix

* Refactor variable names

* Minor cosmetic change

* Retrigger checks (empty commit)

* Rerun checks (empty commit)

* Fix unnecessary type promotion

Co-authored-by: compilade <git@compilade.net>

* Reverting change to improve code readability

* Rerun checks (empty commit)

* Rerun checks (empty commit)

* Rerun checks - third time's the Charm 🤞 (empty commit)

* Minor cosmetic change

* Update README

* Fix typo

* Update README

* Rerun checks (empty commit)

* Re-implement changes on top of #9400

* Update README.md

* Update README

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

* Remove duplicate option in print_usage()

* Update README.md

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Remove input check

* Remove commented out code

---------

Co-authored-by: compilade <git@compilade.net>
											
										
										
											2025-07-22 13:33:37 +01:00
+								static void process_tensor_name(const std::string & input, std::string & layer, std::string & tensor) {
 								    std::vector<std::string> name;
 								    std::istringstream stream(input);
 								    std::string item;
 								    while (std::getline(stream, item, '.')) {
 								        name.push_back(item);
 								    }
 								    for (size_t i = 0; i < name.size(); ++i) {
 								        if (name[i] == "blk" && i + 1 < name.size()) {
 								            layer = name[i + 1];
 								            break;
 								        }
 								    }
 								    for (size_t i = 0; i < name.size(); ++i) {
 								        if (name[i] == "weight" && i > 0) {
 								            tensor = name[i - 1];
 								            break;
 								        }
 								    }
 								    if (tensor.empty()) {
 								        tensor = input;
 								    }
 								    if (layer.empty()) {
 								        layer = "-";
 								    }
 								}
 								static void compute_statistics(std::vector<tensor_statistics> & tstats, const std::string & name, const Stats & e) {
 								    if (e.values.size() % e.counts.size() != 0) {
 								        LOG_ERR("%s: activation size mismatch for tensor %s (%zu vs %zu)\n", __func__, name.c_str(), e.counts.size(), e.values.size());
 								        return;
 								    }
 								    if (e.counts.empty()) {
 								        LOG_ERR("%s: there are no activations for tensor %s. The imatrix may be suboptimal\n", __func__, name.c_str());
 								        return;
 								    }
 								    const int n_mat = e.counts.size();
 								    const int row_size = e.values.size() / n_mat;
 								    std::vector<float> activations;
 								    activations.reserve(e.values.size());
 								    for (int i = 0; i < n_mat; ++i) {
 								        for (int j = 0; j < row_size; ++j) {
 								            activations.push_back(e.values[i*row_size + j] / e.counts[i]);
 								        }
 								    }
 								    const float act_total     = std::accumulate(activations.begin(), activations.end(), 0.0f);
 								    const float act_max       = *std::max_element(activations.begin(), activations.end());
 								    const float act_min       = *std::min_element(activations.begin(), activations.end());
 								    const float act_mean      = act_total / activations.size();
 								    const float act_sqr_total = std::inner_product(activations.begin(), activations.end(), activations.begin(), 0.0f);
 								    const float act_var       = (act_sqr_total / activations.size()) - (act_mean * act_mean);
 								    const float act_dev       = std::sqrt(std::max(0.0f, act_var));
 								    float threshold           = 1e-5f;
 								    const int inactive_count  = std::count_if(activations.begin(), activations.end(),
 								                                               [threshold](const float v) { return fabsf(v) <= threshold; });
 								    const float active_ratio  = 1 - static_cast<float>(inactive_count) / activations.size();
 								    float entropy = 0;
 								    if (act_total > 0) {
 								        for (const auto act : activations) {
 								            if (const float p = act / act_total; p > 0) {
 								                entropy -= p * std::log2(p);
 								            }
 								        }
 								    }
 								    int z_score = 0;
 								    if (act_dev > 0.0f) {
 								        for (const auto act : activations) {
 								            if (const float p = (act - act_mean) / act_dev; p > 1) {
 								                z_score++;
 								            }
 								        }
 								    }
 								    auto & ts = tstats.emplace_back();
 								    ts.tensor     = name;
 								    ts.stats      = e;
 								    ts.total_sqract = act_total;
 								    ts.mean_sqract  = act_mean;
 								    ts.max_sqract   = act_max;
 								    ts.min_sqract   = act_min;
 								    ts.elements   = static_cast<int>(activations.size());
 								    ts.stddev     = act_dev;
 								    ts.active     = active_ratio;
 								    ts.entropy    = entropy;
 								    ts.zd         = static_cast<float>(z_score) / ts.elements;
 								}
 								static void compute_cossim(std::vector<tensor_statistics> & tstats) {
 								    static const std::regex pattern(R"(blk\.(\d+)\.)");
 								    for (auto & ts : tstats) {
 								        if (std::smatch match; std::regex_search(ts.tensor, match, pattern)) {
 								            const int blk = std::stoi(match[1]);
 								            std::string tname(ts.tensor);
 								            tname.replace(match.position(1), match.length(1), std::to_string(blk-1));
 								            auto prev = std::find_if(tstats.begin(), tstats.end(),
 								                [tname](const tensor_statistics & t) { return t.tensor == tname; });
 								            if (prev != tstats.end()) {
 								                const float dp = std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
 								                    prev->stats.values.begin(), 0.0f);
 								                const float curr_mag = std::sqrt(std::inner_product(ts.stats.values.begin(), ts.stats.values.end(),
 								                    ts.stats.values.begin(), 0.0f));
 								                const float prev_mag = std::sqrt(std::inner_product(prev->stats.values.begin(), prev->stats.values.end(),
 								                    prev->stats.values.begin(), 0.0f));
 								                const float cs = dp / (curr_mag * prev_mag);
 								                ts.cossim = cs;
 								            }
 								        } else {
 								            ts.cossim = 0;
 								        }
 								    }
 								}
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
 								    GGML_UNUSED(user_data);
 								    const struct ggml_tensor * src0 = t->src[0];
 								    const struct ggml_tensor * src1 = t->src[1];
-												imatrix : fix wname for mul_mat_id ops (#6271)

* imatrix : fix wname for mul_mat_id ops

* also filter tensor names in mul_mat_id ops

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-03-24 16:18:45 +02:00
+								    std::string wname = filter_tensor_name(src0->name);
-												backend : offload large batches to GPU (#6083)

* backend : offload large batches to GPU

* fix hip

* code cleanup

* fix CUDA split buffers

* Update ggml-backend-impl.h

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>

* cuda : fix memset without set_device

* imatrix : remove sched affix from weight names

* sched : add a new split if the current one has too many inputs
reduce max inputs per split
more cleanup

* update backends

ggml-ci

---------

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
											
										
										
											2024-03-18 11:03:04 +01:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								    // when ask is true, the scheduler wants to know if we are interested in data from this tensor
 								    // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection
 								    if (ask) {
 								        if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications
 								        if (t->op != GGML_OP_MUL_MAT) return false;
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								        // why are small batches ignored (<16 tokens)?
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								        if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false;
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								        if (!(wname.substr(0, 4) == "blk." || (m_params.process_output && wname == "output.weight"))) return false;
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								        return true;
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    }
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
 								    std::lock_guard<std::mutex> lock(m_mutex);
 								    // copy the data from the GPU memory if needed
 								    const bool is_host = ggml_backend_buffer_is_host(src1->buffer);
 								    if (!is_host) {
-												imatrix: fix oob writes if src1 is not contiguous (#13286)


											
										
										
											2025-05-04 00:50:37 +02:00
+								        const size_t src1_nbytes = ggml_nbytes(src1);
 								        m_src1_data.resize(src1_nbytes);
 								        ggml_backend_tensor_get(src1, m_src1_data.data(), 0, src1_nbytes);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    }
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
-												imatrix: fix oob writes if src1 is not contiguous (#13286)


											
										
										
											2025-05-04 00:50:37 +02:00
+								    const char * data = is_host ? (const char *) src1->data : m_src1_data.data();
 								    GGML_ASSERT(src1->nb[0] == ggml_element_size(src1));
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    // TODO: 4d? (is that even used in practice?)
 								    // the extra dimension would need to be stored somewhere to be reflected in the imatrix file
 								    if (ggml_nrows(src1) != src1->ne[1] * src1->ne[2]) {
 								        LOG_ERR("%s: tensor has more than 3 dimensions: %s", __func__, wname.c_str());
 								        GGML_ASSERT(false);
 								    }
-												ggml : mul_mat_id use the same tensor for all the experts (#6387)

* ggml : update mul_mat_id to use the same tensor for all the experts

* update cuda

* minor

* update metal

* update test-backend-ops

* fix cuda

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* update convert.py

* update convert-hf-to-gguf.py

* update convert.py for mixtral hf models

* Update convert-hf-to-gguf.py

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* cuda : support non-pow-2 number of experts

* allow quantize to work for split and merged experts models in the same way

* cleanup + disable mmap automatically with split tensors models

* update imatrix

* test-backend-ops : test qwen argsort

* update grok model loading

* llama : add merged experts tensors to the grok tensor map

* minor

* gguf : bump version

* fix quantizing of merged experts

* convert-hf-to-gguf.py : update grok (untested)

* make linter happy

* cuda/argsort : use shared memory instead of pool memory

* convert : fix grok tensor names

* metal : add support for non-pow-2 argsort

* llama : more loader cleanup, better error checking

* cuda : fix warning

* llama : still use mmap for loading old models, but copy the data to a host buffer

* add review note

* llama : remove ffn tensor counting + add sanity check

ggml-ci

* convert : fix handling of n_experts == None

ggml-ci

* imatrix : fix ncall counters

* llama : produce error if imatrix size does not match

* quantize : terminate on errors + trace logs

ggml-ci

* metal : pad shared memory to 16 bytes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-03 15:07:05 +02:00
+								    // this has been adapted to the new format of storing merged experts in a single 3d tensor
-												repo : update links to new url (#11886)

* repo : update links to new url

ggml-ci

* cont : more urls

ggml-ci
											
										
										
											2025-02-15 16:40:57 +02:00
+								    // ref: https://github.com/ggml-org/llama.cpp/pull/6387
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								    if (t->op == GGML_OP_MUL_MAT_ID) {
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								        //   ids  -> [n_experts_used, n_tokens]
 								        //   src1 -> [cols, n_expert_used, n_tokens]
-												ggml : mul_mat_id use the same tensor for all the experts (#6387)

* ggml : update mul_mat_id to use the same tensor for all the experts

* update cuda

* minor

* update metal

* update test-backend-ops

* fix cuda

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* update convert.py

* update convert-hf-to-gguf.py

* update convert.py for mixtral hf models

* Update convert-hf-to-gguf.py

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* cuda : support non-pow-2 number of experts

* allow quantize to work for split and merged experts models in the same way

* cleanup + disable mmap automatically with split tensors models

* update imatrix

* test-backend-ops : test qwen argsort

* update grok model loading

* llama : add merged experts tensors to the grok tensor map

* minor

* gguf : bump version

* fix quantizing of merged experts

* convert-hf-to-gguf.py : update grok (untested)

* make linter happy

* cuda/argsort : use shared memory instead of pool memory

* convert : fix grok tensor names

* metal : add support for non-pow-2 argsort

* llama : more loader cleanup, better error checking

* cuda : fix warning

* llama : still use mmap for loading old models, but copy the data to a host buffer

* add review note

* llama : remove ffn tensor counting + add sanity check

ggml-ci

* convert : fix handling of n_experts == None

ggml-ci

* imatrix : fix ncall counters

* llama : produce error if imatrix size does not match

* quantize : terminate on errors + trace logs

ggml-ci

* metal : pad shared memory to 16 bytes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-03 15:07:05 +02:00
+								        const ggml_tensor * ids = t->src[2];
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        const int64_t n_as = src0->ne[2];
 								        const int64_t n_ids = ids->ne[0];
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
-												ggml : mul_mat_id use the same tensor for all the experts (#6387)

* ggml : update mul_mat_id to use the same tensor for all the experts

* update cuda

* minor

* update metal

* update test-backend-ops

* fix cuda

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* update convert.py

* update convert-hf-to-gguf.py

* update convert.py for mixtral hf models

* Update convert-hf-to-gguf.py

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* cuda : support non-pow-2 number of experts

* allow quantize to work for split and merged experts models in the same way

* cleanup + disable mmap automatically with split tensors models

* update imatrix

* test-backend-ops : test qwen argsort

* update grok model loading

* llama : add merged experts tensors to the grok tensor map

* minor

* gguf : bump version

* fix quantizing of merged experts

* convert-hf-to-gguf.py : update grok (untested)

* make linter happy

* cuda/argsort : use shared memory instead of pool memory

* convert : fix grok tensor names

* metal : add support for non-pow-2 argsort

* llama : more loader cleanup, better error checking

* cuda : fix warning

* llama : still use mmap for loading old models, but copy the data to a host buffer

* add review note

* llama : remove ffn tensor counting + add sanity check

ggml-ci

* convert : fix handling of n_experts == None

ggml-ci

* imatrix : fix ncall counters

* llama : produce error if imatrix size does not match

* quantize : terminate on errors + trace logs

ggml-ci

* metal : pad shared memory to 16 bytes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-03 15:07:05 +02:00
+								        // the top-k selected expert ids are stored in the ids tensor
 								        // for simplicity, always copy ids to host, because it is small
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								        // take into account that ids is not contiguous!
 								        GGML_ASSERT(ids->ne[1] == src1->ne[2]);
 								        m_ids.resize(ggml_nbytes(ids));
-												ggml : mul_mat_id use the same tensor for all the experts (#6387)

* ggml : update mul_mat_id to use the same tensor for all the experts

* update cuda

* minor

* update metal

* update test-backend-ops

* fix cuda

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* update convert.py

* update convert-hf-to-gguf.py

* update convert.py for mixtral hf models

* Update convert-hf-to-gguf.py

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* cuda : support non-pow-2 number of experts

* allow quantize to work for split and merged experts models in the same way

* cleanup + disable mmap automatically with split tensors models

* update imatrix

* test-backend-ops : test qwen argsort

* update grok model loading

* llama : add merged experts tensors to the grok tensor map

* minor

* gguf : bump version

* fix quantizing of merged experts

* convert-hf-to-gguf.py : update grok (untested)

* make linter happy

* cuda/argsort : use shared memory instead of pool memory

* convert : fix grok tensor names

* metal : add support for non-pow-2 argsort

* llama : more loader cleanup, better error checking

* cuda : fix warning

* llama : still use mmap for loading old models, but copy the data to a host buffer

* add review note

* llama : remove ffn tensor counting + add sanity check

ggml-ci

* convert : fix handling of n_experts == None

ggml-ci

* imatrix : fix ncall counters

* llama : produce error if imatrix size does not match

* quantize : terminate on errors + trace logs

ggml-ci

* metal : pad shared memory to 16 bytes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-03 15:07:05 +02:00
+								        ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));
 								        auto & e = m_stats[wname];
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        if (e.counts.size() == 1 && n_as > 1) {
 								            // broadcast, when loading an old imatrix
 								            e.counts.resize(n_as, e.counts[0]);
 								        }
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								        if (e.values.empty()) {
 								            e.values.resize(src1->ne[0]*n_as, 0);
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            e.counts.resize(n_as, 0);
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								        }
 								        else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0]*n_as));
 								            exit(1); //GGML_ABORT("fatal error");
 								        }
 								        else if (e.counts.size() != (size_t)n_as) {
 								            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_as);
-												ggml : reduce hash table reset cost (#8698)

* ggml : reduce hash table reset cost

* fix unreachable code warnings after GGML_ASSERT(false)

* GGML_ASSERT(false) -> GGML_ABORT("fatal error")

* GGML_ABORT use format string

											
										
										
											2024-07-27 04:41:55 +02:00
+								            exit(1); //GGML_ABORT("fatal error");
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								        }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[2], (int)src1->type);
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								        // loop over all possible experts, regardless if they are used or not in the batch
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        for (int64_t ex = 0; ex < n_as; ++ex) {
-												ggml : mul_mat_id use the same tensor for all the experts (#6387)

* ggml : update mul_mat_id to use the same tensor for all the experts

* update cuda

* minor

* update metal

* update test-backend-ops

* fix cuda

* Update ggml-metal.m

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* update convert.py

* update convert-hf-to-gguf.py

* update convert.py for mixtral hf models

* Update convert-hf-to-gguf.py

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* cuda : support non-pow-2 number of experts

* allow quantize to work for split and merged experts models in the same way

* cleanup + disable mmap automatically with split tensors models

* update imatrix

* test-backend-ops : test qwen argsort

* update grok model loading

* llama : add merged experts tensors to the grok tensor map

* minor

* gguf : bump version

* fix quantizing of merged experts

* convert-hf-to-gguf.py : update grok (untested)

* make linter happy

* cuda/argsort : use shared memory instead of pool memory

* convert : fix grok tensor names

* metal : add support for non-pow-2 argsort

* llama : more loader cleanup, better error checking

* cuda : fix warning

* llama : still use mmap for loading old models, but copy the data to a host buffer

* add review note

* llama : remove ffn tensor counting + add sanity check

ggml-ci

* convert : fix handling of n_experts == None

ggml-ci

* imatrix : fix ncall counters

* llama : produce error if imatrix size does not match

* quantize : terminate on errors + trace logs

ggml-ci

* metal : pad shared memory to 16 bytes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-03 15:07:05 +02:00
+								            size_t e_start = ex*src1->ne[0];
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            for (int64_t idx = 0; idx < n_ids; ++idx) {
 								                for (int64_t row = 0; row < src1->ne[2]; ++row) {
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								                    const int excur = *(const int32_t *) (m_ids.data() + row*ids->nb[1] + idx*ids->nb[0]);
 								                    GGML_ASSERT(excur >= 0 && excur < n_as); // sanity check
 								                    if (excur != ex) continue;
 								                    const int64_t i11 = idx % src1->ne[1];
 								                    const int64_t i12 = row;
-												imatrix: fix oob writes if src1 is not contiguous (#13286)


											
										
										
											2025-05-04 00:50:37 +02:00
+								                    const float * x = (const float *)(data + i11*src1->nb[1] + i12*src1->nb[2]);
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								                    e.counts[ex]++;
 								                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
 								                        e.values[e_start + j] += x[j] * x[j];
 								                        if (!std::isfinite((float)e.values[e_start + j])) {
 								                            LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str());
-												check for nans in imatrix and quantize (#7807)

* imatrix : detect nan/inf values

* quantize : check imatrix for nan/inf values
											
										
										
											2024-06-07 08:01:29 +02:00
+								                            exit(1);
 								                        }
-												ggml : group all experts in a single ggml_mul_mat_id (#6505)

* ggml : group all experts in a single ggml_mul_mat_id
cuda : improve mmid row copy

* cuda : fix bin bcast with non-cont src0

* test-backend-ops : only run all mul mat tests for base types

* llama : disable moe offloading with SYCL

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-18 15:18:48 +02:00
+								                    }
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								                }
 								            }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            const int32_t n_chunk = e.counts[ex] / chunk_size;
 								            if (n_chunk > m_last_chunk) {
 								                const int32_t chunk_step = n_chunk - m_last_chunk;
 								                m_last_chunk = n_chunk;
 								                if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								                    save_imatrix();
 								                }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								                if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
 								                    save_imatrix(m_last_chunk);
-												imatrix : keep intermediate imatrix results (#5077)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-22 14:18:43 +02:00
+								                }
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								            }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								        }
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								    } else {
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								        auto & e = m_stats[wname];
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        const int64_t n_mat = src1->ne[2] * src1->ne[3];
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								        if (e.values.empty()) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            e.values.resize(src1->ne[0] * n_mat, 0);
 								            e.counts.resize(n_mat, 0);
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								        }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        else if (e.values.size() != (size_t)(src1->ne[0] * n_mat)) {
 								            LOG_ERR("%s: inconsistent size for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.values.size(), (int)(src1->ne[0] * n_mat));
-												ggml : reduce hash table reset cost (#8698)

* ggml : reduce hash table reset cost

* fix unreachable code warnings after GGML_ASSERT(false)

* GGML_ASSERT(false) -> GGML_ABORT("fatal error")

* GGML_ABORT use format string

											
										
										
											2024-07-27 04:41:55 +02:00
+								            exit(1); //GGML_ABORT("fatal error");
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								        }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        else if (e.counts.size() != (size_t)n_mat) {
 								            LOG_ERR("%s: inconsistent expert count for %s (%d vs %d)\n", __func__, wname.c_str(), (int)e.counts.size(), (int)n_mat);
 								            exit(1); //GGML_ABORT("fatal error");
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								        }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        LOG_DBGV(2, "%s[%d]: %32s, %s, %5d x %5d x %5d, %d\n", __func__, m_last_chunk, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->ne[2], (int)src1->type);
 								        for (int64_t i3 = 0; i3 < src1->ne[3]; ++i3) {
 								            for (int64_t i2 = 0; i2 < src1->ne[2]; ++i2) {
 								                const int64_t mat_id = i3 * src1->ne[2] + i2;
 								                const int64_t mat_start = mat_id * src1->ne[0];
 								                for (int64_t row = 0; row < src1->ne[1]; ++row) {
 								                    const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]);
 								                    e.counts[mat_id]++;
 								                    for (int64_t j = 0; j < src1->ne[0]; ++j) {
 								                        e.values[mat_start + j] += x[j] * x[j];
 								                        if (!std::isfinite((float)e.values[j])) {
 								                            LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str());
 								                            exit(1);
 								                        }
 								                    }
 								                }
 								                const int32_t n_chunk = e.counts[mat_id] / chunk_size;
 								                if (n_chunk > m_last_chunk) {
 								                    const int32_t chunk_step = n_chunk - m_last_chunk;
 								                    m_last_chunk = n_chunk;
 								                    if ((m_last_chunk % m_params.n_out_freq) / chunk_step == 0) {
 								                        save_imatrix();
 								                    }
 								                    if (m_params.n_save_freq > 0 && (m_last_chunk % m_params.n_save_freq) / chunk_step == 0) {
 								                        save_imatrix(m_last_chunk);
 								                    }
 								                }
-												imatrix : keep intermediate imatrix results (#5077)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-22 14:18:43 +02:00
+								            }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								        }
 								    }
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
 								    return true;
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								}
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								void IMatrixCollector::save_imatrix_legacy(int32_t ncall) const {
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    auto fname = m_params.out_file;
-												imatrix : keep intermediate imatrix results (#5077)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-22 14:18:43 +02:00
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    if (ncall > 0) {
 								        fname += ".at_";
 								        fname += std::to_string(ncall);
 								    }
-												imatrix : keep intermediate imatrix results (#5077)

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-22 14:18:43 +02:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    // warn when writing imatrix entries that do not have full data
-												imatrix : handle partial entries (#7833)


											
										
										
											2024-06-09 20:19:35 +03:00
+								    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
 								    int n_entries = 0;
 								    std::vector<std::string> to_store;
 								    bool is_first = true; // for printing
 								    for (const auto & kv : m_stats) {
 								        const int n_all = kv.second.counts.size();
 								        if (n_all == 0) {
 								            continue;
 								        }
 								        int n_zeros = 0;
 								        for (const int c : kv.second.counts) {
 								            if (c == 0) {
 								                n_zeros++;
 								            }
 								        }
 								        if (n_zeros != 0 && is_first) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								            LOG_INF("\n");
-												imatrix : handle partial entries (#7833)


											
										
										
											2024-06-09 20:19:35 +03:00
+								            is_first = false;
 								        }
 								        if (n_zeros == n_all) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								            LOG_WRN("%s: entry '%40s' has no data - skipping\n", __func__, kv.first.c_str());
-												imatrix : handle partial entries (#7833)


											
										
										
											2024-06-09 20:19:35 +03:00
+								            continue;
 								        }
 								        if (n_zeros > 0) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
-												imatrix : handle partial entries (#7833)


											
										
										
											2024-06-09 20:19:35 +03:00
+								        }
 								        n_entries++;
 								        to_store.push_back(kv.first);
 								    }
 								    if (to_store.size() < m_stats.size()) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								        LOG_WRN("%s: storing only %zu out of %zu entries\n", __func__, to_store.size(), m_stats.size());
-												imatrix : handle partial entries (#7833)


											
										
										
											2024-06-09 20:19:35 +03:00
+								    }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    // deterministic tensor name order
 								    std::sort(to_store.begin(), to_store.end());
 								    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    std::ofstream out(fname, std::ios::binary);
-												quantize: add imatrix and dataset metadata in GGUF (#6658)

* imatrix: save the dataset file used in the output file

* llama: support kv overrides type string string

* common: factorize KV Overrides parsing between common and server

* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common
#6656

* llama: remove kv override str_value initialization as it does not compile on some toolchain

* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`

* quantize: add imatrix filename in KV

* llama: add llama_model_kv_override_free

* common: add llama_model_kv_override_free
common: free kv override if used after model loading

* llama: finally move the string KV override value to the stack

* llama : minor

* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.

Co-authored-by: slaren <slarengh@gmail.com>

* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-04-26 20:06:33 +02:00
+								    out.write((const char *) &n_entries, sizeof(n_entries));
-												imatrix : handle partial entries (#7833)


											
										
										
											2024-06-09 20:19:35 +03:00
+								    for (const auto & name : to_store) {
 								        const auto & stat = m_stats.at(name);
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        const int32_t len = name.size();
-												quantize: add imatrix and dataset metadata in GGUF (#6658)

* imatrix: save the dataset file used in the output file

* llama: support kv overrides type string string

* common: factorize KV Overrides parsing between common and server

* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common
#6656

* llama: remove kv override str_value initialization as it does not compile on some toolchain

* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`

* quantize: add imatrix filename in KV

* llama: add llama_model_kv_override_free

* common: add llama_model_kv_override_free
common: free kv override if used after model loading

* llama: finally move the string KV override value to the stack

* llama : minor

* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.

Co-authored-by: slaren <slarengh@gmail.com>

* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-04-26 20:06:33 +02:00
+								        out.write((const char *) &len, sizeof(len));
-												imatrix : handle partial entries (#7833)


											
										
										
											2024-06-09 20:19:35 +03:00
+								        out.write(name.c_str(), len);
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        // ceiling division to avoid accidental zeros
 								        const int32_t ncall = (*std::max_element(stat.counts.begin(), stat.counts.end()) + (chunk_size - 1)) / chunk_size;
 								        out.write((const char *) &ncall, sizeof(ncall));
 								        const int32_t nval = stat.values.size();
 								        const int32_t nmat = stat.counts.size();
-												quantize: add imatrix and dataset metadata in GGUF (#6658)

* imatrix: save the dataset file used in the output file

* llama: support kv overrides type string string

* common: factorize KV Overrides parsing between common and server

* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common
#6656

* llama: remove kv override str_value initialization as it does not compile on some toolchain

* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`

* quantize: add imatrix filename in KV

* llama: add llama_model_kv_override_free

* common: add llama_model_kv_override_free
common: free kv override if used after model loading

* llama: finally move the string KV override value to the stack

* llama : minor

* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.

Co-authored-by: slaren <slarengh@gmail.com>

* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-04-26 20:06:33 +02:00
+								        out.write((const char *) &nval, sizeof(nval));
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        if (nval > 0 && nmat > 0) {
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
+								            std::vector<float> tmp(nval);
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            for (int32_t i = 0; i < nval; i++) {
 								                float count = static_cast<float>(stat.counts[i / (nval / nmat)]);
 								                float value = stat.values[i];
 								                if (count == 0.0f) {
 								                    // store 1 for partial data
 								                    value = 1.0f;
 								                    count = 1.0f;
 								                }
 								                tmp[i] = (value / count) * static_cast<float>(ncall);
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
+								            }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            out.write((const char *) tmp.data(), nval * sizeof(float));
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
+								        }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    }
-												quantize: add imatrix and dataset metadata in GGUF (#6658)

* imatrix: save the dataset file used in the output file

* llama: support kv overrides type string string

* common: factorize KV Overrides parsing between common and server

* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common
#6656

* llama: remove kv override str_value initialization as it does not compile on some toolchain

* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`

* quantize: add imatrix filename in KV

* llama: add llama_model_kv_override_free

* common: add llama_model_kv_override_free
common: free kv override if used after model loading

* llama: finally move the string KV override value to the stack

* llama : minor

* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.

Co-authored-by: slaren <slarengh@gmail.com>

* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-04-26 20:06:33 +02:00
 								    // Write the number of call the matrix was computed with
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    out.write((const char *) &m_last_chunk, sizeof(m_last_chunk));
-												quantize: add imatrix and dataset metadata in GGUF (#6658)

* imatrix: save the dataset file used in the output file

* llama: support kv overrides type string string

* common: factorize KV Overrides parsing between common and server

* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common
#6656

* llama: remove kv override str_value initialization as it does not compile on some toolchain

* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`

* quantize: add imatrix filename in KV

* llama: add llama_model_kv_override_free

* common: add llama_model_kv_override_free
common: free kv override if used after model loading

* llama: finally move the string KV override value to the stack

* llama : minor

* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.

Co-authored-by: slaren <slarengh@gmail.com>

* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-04-26 20:06:33 +02:00
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    // Write the input filename at the end of the file to later on specify it in quantize
 								    {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        const char * dataset_file = m_params.prompt_file.c_str();
 								        int32_t len = m_params.prompt_file.size();
 								        // When there is no prompt but there were other imatrix files loaded, use the last dataset
 								        if (m_params.prompt_file.empty() && !m_datasets.empty()) {
 								            const std::string & dataset_str = m_datasets[m_datasets.size() - 1];
 								            dataset_file = dataset_str.c_str();
 								            len = dataset_str.size();
 								        }
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								        out.write((const char *) &len, sizeof(len));
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        out.write(dataset_file, len);
 								    }
 								    LOGV(1, "\n");
 								    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
 								}
 								void IMatrixCollector::save_imatrix(int32_t n_chunk) const {
 								    auto fname = m_params.out_file;
 								    // TODO: use the new format in more cases
 								    if (!string_ends_with(fname, ".gguf")) {
 								        LOG_WRN("\n%s: saving to legacy imatrix format because output suffix is not .gguf\n", __func__);
 								        this->save_imatrix_legacy(n_chunk);
 								        return;
 								    }
 								    if (n_chunk > 0) {
 								        fname += ".at_";
 								        fname += std::to_string(n_chunk);
 								    }
 								    // write imatrix entries even if they don't have full data. (can be corrected when reading)
 								    // this can happen with MoE models where some of the experts end up not being exercised by the provided training data
 								    std::vector<std::string> to_store;
 								    size_t data_size = 0;
 								    bool is_first = true; // for printing
 								    for (const auto & kv : m_stats) {
 								        const int n_all = kv.second.counts.size();
 								        int n_zeros = 0;
 								        for (const auto c : kv.second.counts) {
 								            if (c == 0) {
 								                n_zeros++;
 								            }
 								        }
 								        if (n_zeros != 0 && is_first) {
 								            LOG_INF("\n");
 								            is_first = false;
 								        }
 								        if (n_zeros > 0) {
 								            LOG_WRN("%s: entry '%40s' has partial data (%.2f%%)\n", __func__, kv.first.c_str(), 100.0f * (n_all - n_zeros) / n_all);
 								        }
 								        to_store.push_back(kv.first);
 								        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN);
 								        data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN);
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    }
-												quantize: add imatrix and dataset metadata in GGUF (#6658)

* imatrix: save the dataset file used in the output file

* llama: support kv overrides type string string

* common: factorize KV Overrides parsing between common and server

* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common
#6656

* llama: remove kv override str_value initialization as it does not compile on some toolchain

* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`

* quantize: add imatrix filename in KV

* llama: add llama_model_kv_override_free

* common: add llama_model_kv_override_free
common: free kv override if used after model loading

* llama: finally move the string KV override value to the stack

* llama : minor

* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.

Co-authored-by: slaren <slarengh@gmail.com>

* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-04-26 20:06:33 +02:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    // deterministic tensor name order
 								    std::sort(to_store.begin(), to_store.end());
 								    struct ggml_init_params params = {
 								        /* .mem_size   = */ data_size,
 								        /* .mem_buffer = */ NULL,
 								        /* .no_alloc   = */ false,
 								    };
 								    struct ggml_context * ctx = ggml_init(params);
 								    struct gguf_context * ctx_gguf = gguf_init_empty();
 								    {
 								        std::vector<const char *> datasets;
 								        datasets.reserve(m_datasets.size() + 1);
 								        for (size_t i = 0; i < m_datasets.size(); ++i) {
 								            datasets.push_back(m_datasets[i].c_str());
 								        }
 								        if (!m_params.prompt_file.empty()) {
 								            datasets.push_back(m_params.prompt_file.c_str());
 								        }
 								        gguf_set_val_str(ctx_gguf, "general.type", "imatrix");
 								        // Write the dataset paths
 								        gguf_set_arr_str(ctx_gguf, LLM_KV_IMATRIX_DATASETS, datasets.data(), datasets.size());
 								        // Write the number of chunks the matrix was computed with
 								        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_COUNT, m_last_chunk);
 								        gguf_set_val_u32(ctx_gguf, LLM_KV_IMATRIX_CHUNK_SIZE, m_params.n_ctx / m_params.n_parallel);
 								    }
 								    for (const auto & name : to_store) {
 								        const auto & stat = m_stats.at(name);
 								        const int32_t nval = (int32_t) stat.values.size();
 								        const int32_t nmat = (int32_t) stat.counts.size();
 								        if (nval > 0 && nmat > 0) {
 								            struct ggml_tensor * in_sum2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nval / nmat, nmat);
 								            struct ggml_tensor * counts  = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, nmat);
 								            ggml_format_name(in_sum2, "%s.in_sum2", name.c_str());
 								            ggml_format_name(counts, "%s.counts", name.c_str());
 								            for (int32_t j = 0; j < nval; ++j) {
 								                ((float *) in_sum2->data)[j] = (float) stat.values[j];
 								            }
 								            for (int32_t j = 0; j < nmat; ++j) {
 								                ((float *) counts->data)[j] = (float) stat.counts[j];
 								            }
 								            gguf_add_tensor(ctx_gguf, in_sum2);
 								            gguf_add_tensor(ctx_gguf, counts);
 								        }
 								    }
 								    gguf_write_to_file(ctx_gguf, fname.c_str(), false);
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								    LOGV(1, "\n");
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    LOG_DBGV(1, "%s: stored collected data after %d chunks in %s\n", __func__, m_last_chunk, fname.c_str());
 								    gguf_free(ctx_gguf);
 								    ggml_free(ctx);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								}
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								bool IMatrixCollector::load_imatrix_legacy(const char * fname) {
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    std::ifstream in(fname, std::ios::binary);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								    if (!in) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        LOG_ERR("%s: failed to open %s\n", __func__, fname);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								        return false;
 								    }
 								    int n_entries;
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    in.read((char *) &n_entries, sizeof(n_entries));
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								    if (in.fail() || n_entries < 1) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								        LOG_ERR("%s: no data in file %s\n", __func__, fname);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								        return false;
 								    }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    // Guess the chunk size because it's not stored in the file
 								    const int32_t chunk_size = m_params.n_ctx / m_params.n_parallel;
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								    for (int i = 0; i < n_entries; ++i) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        int32_t len = 0;
 								        in.read((char *) &len, sizeof(len));
 								        std::vector<char> name_as_vec(len + 1);
 								        in.read((char *) name_as_vec.data(), len);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								        if (in.fail()) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            LOG_ERR("%s: failed reading name for entry %d from %s\n", __func__, i + 1, fname);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								            return false;
 								        }
 								        name_as_vec[len] = 0;
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        std::string name{ name_as_vec.data() };
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								        auto & e = m_stats[std::move(name)];
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        int32_t ncall = 0;
 								        in.read((char *) &ncall, sizeof(ncall));
 								        int32_t nval = 0;
 								        in.read((char *) &nval, sizeof(nval));
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								        if (in.fail() || nval < 1) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            LOG_ERR("%s: failed reading number of values for entry %d\n", __func__, i);
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								            m_stats = {};
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								            return false;
 								        }
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
 								        if (e.values.empty()) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            e.values.resize(nval, 0.0f);
 								            e.counts.resize(1, 0);
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
+								        }
 								        std::vector<float> tmp(nval);
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        in.read((char *) tmp.data(), nval * sizeof(float));
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								        if (in.fail()) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            LOG_ERR("%s: failed reading data for entry %d\n", __func__, i);
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								            m_stats = {};
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								            return false;
 								        }
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        // Recreate the state as expected by save_imatrix(), and correct for weighted sum.
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
+								        for (int i = 0; i < nval; i++) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            e.values[i] += tmp[i] * chunk_size;
 								        }
 								        // The legacy format doesn't distinguish the counts for different experts
 								        for (size_t j = 0; j < e.counts.size(); ++j) {
 								            e.counts[j] += ncall * chunk_size;
 								        }
 								    }
 								    {
 								        // TODO: extract into its own method; this is also used by the GGUF-based format
 								        // Calculate the last chunk count
 								        int64_t max_count = 0;
 								        for (const auto & stats : m_stats) {
 								            for (int64_t count : stats.second.counts) {
 								                if (count > max_count) {
 								                    max_count = count;
 								                }
 								            }
 								        }
 								        m_last_chunk = max_count / (chunk_size);
 								    }
 								    {
 								        // Read the number of calls the matrix was computed with
 								        int32_t n_calls;
 								        in.read((char *) &n_calls, sizeof(n_calls));
 								        // ignore it because it's not important
 								    }
 								    // Read the dataset path to include it when writing to GGUF
 								    if (!in.fail()){
 								        int32_t len = 0;
 								        in.read((char *) &len, sizeof(len));
 								        if (!in.fail()) {
 								            std::vector<char> dataset;
 								            dataset.resize(len + 1, 0);
 								            in.read(dataset.data(), len);
 								            if (!in.fail()) {
 								                m_datasets.push_back(dataset.data());
 								            }
 								        }
 								    }
 								    return true;
 								}
 								// Using GGUF as the file format, for greater extensibility
 								bool IMatrixCollector::load_imatrix(const char * file_name) {
 								    struct ggml_context * ctx = nullptr;
 								    struct gguf_init_params meta_gguf_params = {
 								        /* .no_alloc = */ false, // the data is needed
 								        /* .ctx      = */ &ctx,
 								    };
 								    struct gguf_context * ctx_gguf = gguf_init_from_file(file_name, meta_gguf_params);
 								    if (!ctx_gguf) {
 								        return this->load_imatrix_legacy(file_name);
 								    }
 								    const int32_t n_entries = gguf_get_n_tensors(ctx_gguf);
 								    if (n_entries < 1) {
 								        LOG_ERR("%s: no data in file %s\n", __func__, file_name);
 								        gguf_free(ctx_gguf);
 								        ggml_free(ctx);
 								        return false;
 								    }
 								    const int64_t datasets_key = gguf_find_key(ctx_gguf, LLM_KV_IMATRIX_DATASETS);
 								    if (datasets_key != -1 && gguf_get_arr_type(ctx_gguf, datasets_key) == GGUF_TYPE_STRING) {
 								        const int64_t n = gguf_get_arr_n(ctx_gguf, datasets_key);
 								        m_datasets.reserve(m_datasets.size() + n);
 								        for (int64_t i = 0; i < n; ++i) {
 								            m_datasets.push_back(gguf_get_arr_str(ctx_gguf, datasets_key, i));
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
+								        }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    }
 								    const std::string in_sum2_suffix{ ".in_sum2" };
 								    const std::string counts_suffix{ ".counts" };
 								    // Could re-use m_stats instead, but this allows
 								    // checking for completeness of *each* loaded imatrix file
 								    // and also makes it easier to re-use a similar implementation in quantize.cpp
 								    // Using an ordered map to get a deterministic iteration order.
 								    std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for;
 								    for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) {
 								        std::string name = cur->name;
-												Fixed save_imatrix to match old behaviour for MoE (#7099)

* Fixed save_imatrix to match old behaviour for MoE

This fix is simple and clear, but unnecessarily doubles the memory overhead..

* Fixed missing idx variable

* Unconditionally increment ncall

Co-authored-by: slaren <slarengh@gmail.com>

* Fixed 2 bugs in save_imatrix()

- Fixed segfault bug because the counts vector needed to be created.
- Fixed pre-existing bug didn't actually add to the counts for "--combine" option.

* ncall needs summing too

* Trailing whitespace

---------

Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-05-08 01:24:16 +01:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        if (name.empty()) { continue; }
 								        if (string_remove_suffix(name, in_sum2_suffix)) {
 								            // in_sum2
 								            sums_counts_for[std::move(name)].first = cur;
 								        } else if (string_remove_suffix(name, counts_suffix)) {
 								            // counts
 								            sums_counts_for[std::move(name)].second = cur;
 								        } else {
 								            // ignore other tensors
 								        }
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								    }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
 								    for (const auto & sc : sums_counts_for) {
 								        const std::string &        name    = sc.first;
 								        const struct ggml_tensor * in_sum2 = sc.second.first;
 								        const struct ggml_tensor * counts  = sc.second.second;
 								        if (!in_sum2 || !counts) {
 								            LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str());
 								            gguf_free(ctx_gguf);
 								            ggml_free(ctx);
 								            return false;
 								        }
 								        auto & e = m_stats[name];
 								        int64_t nval = ggml_nelements(in_sum2);
 								        if (e.values.empty()) {
 								            e.values.resize(nval, 0.0f);
 								        } else if ((size_t) nval != e.values.size()) {
 								            LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size());
 								            gguf_free(ctx_gguf);
 								            ggml_free(ctx);
 								            return false;
 								        }
 								        int64_t ncounts = ggml_nelements(counts);
 								        if (e.counts.empty()) {
 								            e.counts.resize(ncounts, 0);
 								        } else if (e.counts.size() == 1 && ncounts > 1) {
 								            // broadcast, when loading an old imatrix
 								            e.counts.resize(ncounts, e.counts[0]);
 								        } else if ((size_t) ncounts != e.counts.size()) {
 								            LOG_ERR("%s: mismatched counts size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) ncounts, e.counts.size());
 								            gguf_free(ctx_gguf);
 								            ggml_free(ctx);
 								            return false;
 								        }
 								        // Recreate the state as expected by save_imatrix()
 								        for (int64_t j = 0; j < nval; j++) {
 								            e.values[j] += ((const float *) in_sum2->data)[j];
 								        }
 								        for (int64_t j = 0; j < ncounts; j++) {
 								            e.counts[j] += std::lround(((const float *) counts->data)[j]);
 								        }
 								    }
 								    // TODO: extract into its own method; this is also used by the legacy format
 								    // Calculate the last chunk count
 								    int64_t max_count = 0;
 								    for (const auto & stats : m_stats) {
 								        for (int64_t count : stats.second.counts) {
 								            if (count > max_count) {
 								                max_count = count;
 								            }
 								        }
 								    }
 								    m_last_chunk = max_count / (m_params.n_ctx / m_params.n_parallel);
 								    gguf_free(ctx_gguf);
 								    ggml_free(ctx);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								    return true;
 								}
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								static IMatrixCollector g_collector;
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								static bool ik_collect_imatrix(struct ggml_tensor * t, bool ask, void * user_data) {
 								    return g_collector.collect_imatrix(t, ask, user_data);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								}
 								struct results_log_softmax {
 								    double log_softmax;
 								    float  logit;
 								    float  prob;
 								};
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								static std::vector<float> softmax(const std::vector<float> & logits) {
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    std::vector<float> probs(logits.size());
 								    float max_logit = logits[0];
 								    for (float v : logits) {
 								        max_logit = std::max(max_logit, v);
 								    }
 								    double sum_exp = 0.0;
 								    for (size_t i = 0; i < logits.size(); i++) {
 								        // Subtract the maximum logit value from the current logit value for numerical stability
 								        const float logit = logits[i] - max_logit;
 								        const float exp_logit = expf(logit);
 								        sum_exp += exp_logit;
 								        probs[i] = exp_logit;
 								    }
 								    for (size_t i = 0; i < probs.size(); i++) {
 								        probs[i] /= sum_exp;
 								    }
 								    return probs;
 								}
 								static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
 								    float max_logit = logits[0];
 								    for (int i = 1; i < n_vocab; ++i) {
 								        max_logit = std::max(max_logit, logits[i]);
 								    }
 								    double sum_exp = 0.0;
 								    for (int i = 0; i < n_vocab; ++i) {
 								        sum_exp += expf(logits[i] - max_logit);
 								    }
 								    return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
 								}
 								static void process_logits(
 								    int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    double & nll, double & nll2, float * logit_history, float * prob_history) {
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    std::mutex mutex;
 								    int counter = 0;
 								    auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
 								        double local_nll  = 0;
 								        double local_nll2 = 0;
 								        while (true) {
 								            std::unique_lock<std::mutex> lock(mutex);
 								            int i = counter++;
 								            if (i >= n_token) {
 								                nll += local_nll; nll2 += local_nll2;
 								                break;
 								            }
 								            lock.unlock();
 								            const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]);
 								            const double v = -results.log_softmax;
 								            local_nll += v;
 								            local_nll2 += v*v;
 								            logit_history[i] = results.logit;
 								            prob_history[i]  = results.prob;
 								        }
 								    };
 								    for (auto & w : workers) {
 								        w = std::thread(compute);
 								    }
 								    compute();
 								    for (auto & w : workers) {
 								        w.join();
 								    }
 								}
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								static bool compute_imatrix(llama_context * ctx, const common_params & params, const int32_t n_ctx) {
-												llama : add `llama_vocab`, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110)

* llama : add struct llama_vocab to the API (#11156)

ggml-ci

* hparams : move vocab params to llama_vocab (#11159)

ggml-ci

* vocab : more pimpl (#11165)

ggml-ci

* vocab : minor tokenization optimizations (#11160)

ggml-ci

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* lora : update API names (#11167)

ggml-ci

* llama : update API names to use correct prefix (#11174)

* llama : update API names to use correct prefix

ggml-ci

* cont

ggml-ci

* cont

ggml-ci

* minor [no ci]

* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174)

ggml-ci

* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174)

ggml-ci

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
											
										
										
											2025-01-12 11:32:42 +02:00
+								    const llama_model * model = llama_get_model(ctx);
 								    const llama_vocab * vocab = llama_model_get_vocab(model);
 								    const bool add_bos = llama_vocab_get_add_bos(vocab);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												llama : add `llama_vocab`, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110)

* llama : add struct llama_vocab to the API (#11156)

ggml-ci

* hparams : move vocab params to llama_vocab (#11159)

ggml-ci

* vocab : more pimpl (#11165)

ggml-ci

* vocab : minor tokenization optimizations (#11160)

ggml-ci

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* lora : update API names (#11167)

ggml-ci

* llama : update API names to use correct prefix (#11174)

* llama : update API names to use correct prefix

ggml-ci

* cont

ggml-ci

* cont

ggml-ci

* minor [no ci]

* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174)

ggml-ci

* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174)

ggml-ci

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
											
										
										
											2025-01-12 11:32:42 +02:00
+								    GGML_ASSERT(!llama_vocab_get_add_eos(vocab));
-												llama : refactor `src/llama.cpp` (#10902)

* llama : scatter llama.cpp into multiple modules (wip)

* llama : control-vector -> adapter

* llama : arch

* llama : mmap

ggml-ci

* ci : remove BUILD_SHARED_LIBS=OFF

ggml-ci

* llama : arch (cont)

ggml-ci

* llama : chat

ggml-ci

* llama : model

ggml-ci

* llama : hparams

ggml-ci

* llama : adapter

ggml-ci

* examples : fix

ggml-ci

* rebase

ggml-ci

* minor

* llama : kv cache

ggml-ci

* llama : impl

ggml-ci

* llama : batch

ggml-ci

* cont

ggml-ci

* llama : context

ggml-ci

* minor

* llama : context (cont)

ggml-ci

* llama : model loader

ggml-ci

* common : update lora

ggml-ci

* llama : quant

ggml-ci

* llama : quant (cont)

ggml-ci

* minor [no ci]
											
										
										
											2025-01-03 10:18:53 +02:00
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    auto tim1 = std::chrono::high_resolution_clock::now();
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								    LOG_INF("%s: tokenizing the input ..\n", __func__);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : Add --parse-special for enabling parsing of special tokens in imatrix calculation (#13389)

* Add --parse-special for enabling parsing of special tokens in imatrix calculation

* whitespace
											
										
										
											2025-05-09 05:53:58 -04:00
+								    std::vector<llama_token> tokens = common_tokenize(ctx, params.prompt, true, params.parse_special);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
 								    auto tim2 = std::chrono::high_resolution_clock::now();
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								    LOG_INF("%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast<std::chrono::microseconds>(tim2-tim1).count());
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    if (params.i_chunk > 0) {
 								        if (size_t((params.i_chunk + 2)*n_ctx) >= tokens.size()) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								            LOG_ERR("%s: there will be not enough tokens left after removing %d chunks\n", __func__, params.i_chunk);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								            return false;
 								        }
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								        LOG_INF("%s: removing initial %d chunks (%d tokens)\n", __func__, params.i_chunk, params.i_chunk*n_ctx);
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								        tokens.erase(tokens.begin(), tokens.begin() + params.i_chunk*n_ctx);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								    }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    if (int(tokens.size()) < 2*n_ctx) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								        LOG_ERR("%s: you need at least %d tokens for a context of %d tokens\n", __func__, 2*n_ctx, n_ctx);
 								        LOG_ERR("%s: the data file you provided tokenizes to only %zu tokens\n", __func__, tokens.size());
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								        return false;
 								    }
 								    std::vector<float> logit_history;
 								    std::vector<float> prob_history;
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    if (params.compute_ppl) {
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								        logit_history.resize(tokens.size());
 								        prob_history.resize(tokens.size());
 								    }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
 								    const int n_chunk_max = tokens.size() / n_ctx;
 								    const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max);
-												llama : add `llama_vocab`, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110)

* llama : add struct llama_vocab to the API (#11156)

ggml-ci

* hparams : move vocab params to llama_vocab (#11159)

ggml-ci

* vocab : more pimpl (#11165)

ggml-ci

* vocab : minor tokenization optimizations (#11160)

ggml-ci

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* lora : update API names (#11167)

ggml-ci

* llama : update API names to use correct prefix (#11174)

* llama : update API names to use correct prefix

ggml-ci

* cont

ggml-ci

* cont

ggml-ci

* minor [no ci]

* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174)

ggml-ci

* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174)

ggml-ci

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
											
										
										
											2025-01-12 11:32:42 +02:00
+								    const int n_vocab = llama_vocab_n_tokens(vocab);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    const int n_batch = params.n_batch;
 								    int count = 0;
 								    double nll = 0.0;
 								    double nll2 = 0.0;
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    const int num_batches = (n_ctx + n_batch - 1) / n_batch;
 								    const int n_seq = std::max(1, n_batch / n_ctx);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    GGML_ASSERT(n_batch < n_ctx || n_batch % n_ctx == 0);
 								    GGML_ASSERT(params.n_ctx == n_seq * n_ctx);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    llama_batch batch = llama_batch_init(std::min(n_batch, n_ctx*n_seq), 0, 1);
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
 								    std::vector<float> logits;
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    if (params.compute_ppl && num_batches > 1) {
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								        logits.reserve((size_t)n_ctx * n_vocab);
 								    }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    LOG_INF("%s: computing over %d chunks, n_ctx=%d, batch_size=%d, n_seq=%d\n", __func__, n_chunk, n_ctx, n_batch, n_seq);
 								    std::vector<std::thread> workers(std::thread::hardware_concurrency() - 1);
 								    for (int i = 0; i < n_chunk; i += n_seq) {
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								        const int start =     i * n_ctx;
 								        const int end   = start + n_ctx;
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								        const int n_seq_batch = std::min(n_seq, n_chunk - i);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
 								        const auto t_start = std::chrono::high_resolution_clock::now();
 								        // clear the KV cache
-												llama : deprecate llama_kv_self_ API (#14030)

* llama : deprecate llama_kv_self_ API

ggml-ci

* llama : allow llama_memory_(nullptr)

ggml-ci

* memory : add flag for optional data clear in llama_memory_clear

ggml-ci
											
										
										
											2025-06-06 14:11:15 +03:00
+								        llama_memory_clear(llama_get_memory(ctx), true);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
 								        for (int j = 0; j < num_batches; ++j) {
 								            const int batch_start = start + j * n_batch;
 								            const int batch_size  = std::min(end - batch_start, n_batch);
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            // clear the batch
 								            common_batch_clear(batch);
 								            for (int seq = 0; seq < n_seq_batch; seq++) {
 								                int seq_start = batch_start + seq*n_ctx;
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								                // save original token and restore it after eval
 								                const auto token_org = tokens[seq_start];
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								                // add BOS token for the first batch of each chunk
 								                if (add_bos && j == 0) {
 								                    tokens[seq_start] = llama_vocab_bos(vocab);
 								                }
 								                for (int k = 0; k < batch_size; ++k) {
 								                    // NOTE: specifying all logits to get activations for the output.weight tensor
 								                    //       and also for the perplexity calculation.
 								                    // TODO: only get outputs when (params.process_output || params.compute_ppl)
 								                    //       (not possible when this skips FFN computation of the last layer)
 								                    common_batch_add(batch, tokens[seq_start + k], j*n_batch + k, { seq }, true);
 								                }
 								                // restore the original token in case it was set to BOS
 								                tokens[seq_start] = token_org;
-												llama : remove all_pos_0, all_pos_1, all_seq_id from llama_batch (#9745)

* refactor llama_batch_get_one

* adapt all examples

* fix simple.cpp

* fix llama_bench

* fix

* fix context shifting

* free batch before return

* use common_batch_add, reuse llama_batch in loop

* null terminated seq_id list

* fix save-load-state example

* fix perplexity

* correct token pos in llama_batch_allocr
											
										
										
											2024-10-18 23:18:01 +02:00
+								            }
 								            if (llama_decode(ctx, batch)) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								                LOG_ERR("%s : failed to eval\n", __func__);
-												llama : remove all_pos_0, all_pos_1, all_seq_id from llama_batch (#9745)

* refactor llama_batch_get_one

* adapt all examples

* fix simple.cpp

* fix llama_bench

* fix

* fix context shifting

* free batch before return

* use common_batch_add, reuse llama_batch in loop

* null terminated seq_id list

* fix save-load-state example

* fix perplexity

* correct token pos in llama_batch_allocr
											
										
										
											2024-10-18 23:18:01 +02:00
+								                llama_batch_free(batch);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								                return false;
 								            }
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								            if (params.compute_ppl && num_batches > 1) {
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								                const auto * batch_logits = llama_get_logits(ctx);
 								                logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab);
 								            }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								        }
 								        if (i == 0) {
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            llama_synchronize(ctx);
 								            const auto t_end = std::chrono::high_resolution_clock::now();
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								            const float t_total = std::chrono::duration<float>(t_end - t_start).count();
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								            LOG_INF("%s: %.2f seconds per pass - ETA ", __func__, t_total);
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            int total_seconds = (int)(t_total * n_chunk / n_seq);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								            if (total_seconds >= 60*60) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								                LOG("%d hours ", total_seconds / (60*60));
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								                total_seconds = total_seconds % (60*60);
 								            }
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								            LOG("%.2f minutes\n", total_seconds / 60.0);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								        }
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								        if (params.compute_ppl) {
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								            const int first = n_ctx/2;
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								            for (int seq = 0; seq < n_seq_batch; seq++) {
 								                const float * all_logits = num_batches > 1 ? logits.data() : llama_get_logits_ith(ctx, seq*n_ctx);
 								                llama_token * tokens_data = tokens.data() + start + seq*n_ctx + first;
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								                process_logits(n_vocab, all_logits + first*n_vocab,
 								                        tokens_data, n_ctx - 1 - first,
 								                        workers, nll, nll2,
 								                        logit_history.data() + start + seq*n_ctx + first,
 								                        prob_history.data()  + start + seq*n_ctx + first);
 								                count += n_ctx - first - 1;
 								                LOG("[%d]%.4lf,", i + seq + 1, std::exp(nll / count));
 								            }
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								            fflush(stdout);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								            logits.clear();
 								        }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								    LOG("\n");
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    if (params.compute_ppl) {
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								        nll2 /= count;
 								        nll /= count;
 								        const double ppl = exp(nll);
 								        nll2 -= nll * nll;
 								        if (nll2 > 0) {
 								            nll2 = sqrt(nll2/(count-1));
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								            LOG("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl);
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								        } else {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								            LOG("Unexpected negative standard deviation of log(prob)\n");
-												Slightly faster imatrix (#5050)

* imatrix: speedup by avoiding unnecessary allocations and copies

* imatrix: add --no-ppl option to skip PPL calculations altogether

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-01-21 08:01:20 +02:00
+								        }
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    llama_batch_free(batch);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    return true;
 								}
-												imatrix: add option to display importance score statistics for a given imatrix file (#12718)

* Add --show-statistics option

* Add --show-statistics logic

* Add tensor name parsing

* Tidy output format

* Fix typo in title

* Improve tensor influence ranking

* Add better statistics

* Change statistics' sort order

* Add Cosine Similarity

* Add header search path

* Change header search path to private

* Add weighted statistics per layer

* Update report title

* Refactor compute_statistics out of main

* Refactor compute_cossim out of load_imatrix

* Refactor compute_statistics out of load_imatrix

* Move imatrix statistics calculation into its own functions

* Add checks and validations

* Remove unnecessary include directory

* Rename labels

* Add m_stats getter and refactor compute_statistics out of load_imatrix

* Refactor variable names

* Minor cosmetic change

* Retrigger checks (empty commit)

* Rerun checks (empty commit)

* Fix unnecessary type promotion

Co-authored-by: compilade <git@compilade.net>

* Reverting change to improve code readability

* Rerun checks (empty commit)

* Rerun checks (empty commit)

* Rerun checks - third time's the Charm 🤞 (empty commit)

* Minor cosmetic change

* Update README

* Fix typo

* Update README

* Rerun checks (empty commit)

* Re-implement changes on top of #9400

* Update README.md

* Update README

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

* Remove duplicate option in print_usage()

* Update README.md

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Remove input check

* Remove commented out code

---------

Co-authored-by: compilade <git@compilade.net>
											
										
										
											2025-07-22 13:33:37 +01:00
+								static bool show_statistics(const common_params & params) {
 								    std::vector<tensor_statistics> ts;
 								    if (params.in_files.empty() || params.in_files.size() > 1) {
 								        LOG_ERR("\nError: a single imatrix file is required to compute tensor statistics\n\n");
 								        return false;
 								    }
 								    if (g_collector.load_imatrix(params.in_files[0].c_str())) {
 								        for (const auto & [name, stats] :g_collector.get_mstats()) {
 								            compute_statistics(ts, name, stats);
 								        }
 								    } else {
 								        LOG_ERR("\nError: %s is not a valid imatrix file\n\n", params.in_files[0].c_str());
 								        return false;
 								    }
 								    if (!ts.empty()) {
 								        compute_cossim(ts);
 								    } else {
 								        LOG_ERR("Error: cannot compute statistics for %s\n\n", params.in_files[0].c_str());
 								        return false;
 								    }
 								    struct tensor_comparer {
 								        bool operator()(const tensor_statistics & a, const tensor_statistics & b) const {
 								            std::string layer, name_a, name_b;
 								            ;
 								            process_tensor_name(a.tensor, layer, name_a);
 								            process_tensor_name(b.tensor, layer, name_b);
 								            return name_a < name_b || (name_a == name_b && a.total_sqract > b.total_sqract);
 								        }
 								    };
 								    std::sort(ts.begin(), ts.end(), tensor_comparer());
 								    struct weighted_stats {
 								        float weighted_bias   = 0.0f;
 								        float weighted_zd     = 0.0f;
 								        float weighted_cossim = 0.0f;
 								        int   total_elements  = 0;
 								    };
 								    std::map<int, weighted_stats> ws;
 								    LOG_INF("\nComputing statistics for %s (%d tensors)\n", params.in_files[0].c_str(), static_cast<int>(ts.size()));
 								    LOG_INF("\n%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", " Layer", "       Tensor", "          Σ(Act²)",
 								            "  Min", "            Max", "           μ", "   σ", " % Active", "N", "   Entropy", "E (norm)", "ZD",
 								            "  CosSim");
 								    LOG_INF(
 								        "=============================================================================================================="
 								        "===========================================================\n");
 								    for (const auto & tstat : ts) {
 								        std::string layer, name;
 								        process_tensor_name(tstat.tensor, layer, name);
 								        int blk;
 								        try {
 								            blk = std::stoi(layer);
 								        } catch (const std::exception & e) {
 								            blk = -1;  // not a block layer
 								        }
 								        LOG_INF("%5s\t%-20s\t%10.2f\t%8.4f\t%11.4f\t%6.2f\t%6.2f\t%8.2f%%\t%6d\t%10.4f\t%6.2f%%\t%10.2f%%\t%8.4f\n",
 								                layer.c_str(), name.c_str(), tstat.total_sqract, tstat.min_sqract, tstat.max_sqract, tstat.mean_sqract,
 								                tstat.stddev, tstat.active * 100.0f, tstat.elements, tstat.entropy,
 .0f * (tstat.entropy / std::log2(tstat.elements)), 100.0f * tstat.zd, tstat.cossim);
 								        const float weighted_bias   = tstat.elements * tstat.total_sqract;
 								        const float weighted_zd     = tstat.elements * tstat.zd;
 								        const float weighted_cossim = tstat.elements * tstat.cossim;
 								        if (ws.find(blk) != ws.end()) {
 								            ws[blk].weighted_bias += weighted_bias;
 								            ws[blk].weighted_zd += weighted_zd;
 								            ws[blk].weighted_cossim += weighted_cossim;
 								            ws[blk].total_elements += tstat.elements;
 								        } else {
 								            weighted_stats temp_ws;
 								            temp_ws.weighted_bias   = weighted_bias;
 								            temp_ws.weighted_zd     = weighted_zd;
 								            temp_ws.weighted_cossim = weighted_cossim;
 								            temp_ws.total_elements  = tstat.elements;
 								            ws[blk]                 = temp_ws;
 								        }
 								    }
 								    const int layers = std::count_if(ws.begin(), ws.end(), [](const auto & kv) { return kv.first >= 0; });
 								    LOG_INF("\nComputing weighted average statistics per layer (%d layers)\n", layers);
 								    LOG_INF("\n%s\t%s\t%s\t%s\n", "  Layer", "     μΣ(Act²)", "      μZD", "μCosSim");
 								    LOG_INF("================================================\n");
 								    for (const auto & [first, second] : ws) {
 								        const auto & layer = first;
 								        const auto & stats = second;
 								        if (stats.total_elements == 0) {
 								            continue;
 								        }
 								        if (layer >= 0) {
 								            const float bias   = stats.weighted_bias / stats.total_elements;
 								            const float zd     = stats.weighted_zd / stats.total_elements;
 								            const float cossim = stats.weighted_cossim / stats.total_elements;
 								            LOG_INF("%5d\t%14.2f\t%10.4f%%\t%6.4f\n", layer, bias, 100.0f * zd, cossim);
 								        }
 								    }
 								    LOG_INF("\n");
 								    return true;
 								}
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								int main(int argc, char ** argv) {
-												common : use common_ prefix for common library functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-10-10 22:57:42 +02:00
+								    common_params params;
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    params.out_file = "imatrix.gguf";
-												common : refactor '-o' option (#12278)

As discussed in PR 'llama-tts : add -o option' (#12042):

* common_params : 'out_file' string is the only output file name parameter left in common_params. It's intended to be used in all example programs implementing an '-o' option.

* cvector-generator, export-lora, imatrix : default output filenames moved from 'common_params' to the 'main()' of each example program.
											
										
										
											2025-03-10 12:34:13 +01:00
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    params.n_ctx = 512;
-												imatrix : disable prompt escape by default (#9543)


											
										
										
											2024-09-19 09:58:14 +02:00
+								    params.escape = false;
-												common : refactor cli arg parsing (#7675)

* common : gpt_params_parse do not print usage

* common : rework usage print (wip)

* common : valign

* common : rework print_usage

* infill : remove cfg support

* common : reorder args

* server : deduplicate parameters

ggml-ci

* common : add missing header

ggml-ci

* common : remote --random-prompt usages

ggml-ci

* examples : migrate to gpt_params

ggml-ci

* batched-bench : migrate to gpt_params

* retrieval : migrate to gpt_params

* common : change defaults for escape and n_ctx

* common : remove chatml and instruct params

ggml-ci

* common : passkey use gpt_params
											
										
										
											2024-06-04 21:23:39 +03:00
-												common : use common_ prefix for common library functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-10-10 22:57:42 +02:00
+								    if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_IMATRIX, print_usage)) {
-												quantize: add imatrix and dataset metadata in GGUF (#6658)

* imatrix: save the dataset file used in the output file

* llama: support kv overrides type string string

* common: factorize KV Overrides parsing between common and server

* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common
#6656

* llama: remove kv override str_value initialization as it does not compile on some toolchain

* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`

* quantize: add imatrix filename in KV

* llama: add llama_model_kv_override_free

* common: add llama_model_kv_override_free
common: free kv override if used after model loading

* llama: finally move the string KV override value to the stack

* llama : minor

* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.

Co-authored-by: slaren <slarengh@gmail.com>

* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-04-26 20:06:33 +02:00
+								        return 1;
 								    }
-												imatrix: add option to display importance score statistics for a given imatrix file (#12718)

* Add --show-statistics option

* Add --show-statistics logic

* Add tensor name parsing

* Tidy output format

* Fix typo in title

* Improve tensor influence ranking

* Add better statistics

* Change statistics' sort order

* Add Cosine Similarity

* Add header search path

* Change header search path to private

* Add weighted statistics per layer

* Update report title

* Refactor compute_statistics out of main

* Refactor compute_cossim out of load_imatrix

* Refactor compute_statistics out of load_imatrix

* Move imatrix statistics calculation into its own functions

* Add checks and validations

* Remove unnecessary include directory

* Rename labels

* Add m_stats getter and refactor compute_statistics out of load_imatrix

* Refactor variable names

* Minor cosmetic change

* Retrigger checks (empty commit)

* Rerun checks (empty commit)

* Fix unnecessary type promotion

Co-authored-by: compilade <git@compilade.net>

* Reverting change to improve code readability

* Rerun checks (empty commit)

* Rerun checks (empty commit)

* Rerun checks - third time's the Charm 🤞 (empty commit)

* Minor cosmetic change

* Update README

* Fix typo

* Update README

* Rerun checks (empty commit)

* Re-implement changes on top of #9400

* Update README.md

* Update README

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

* Remove duplicate option in print_usage()

* Update README.md

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Update README.md

Co-authored-by: compilade <git@compilade.net>

* Remove input check

* Remove commented out code

---------

Co-authored-by: compilade <git@compilade.net>
											
										
										
											2025-07-22 13:33:37 +01:00
+								    if (params.show_statistics) {
 								        if (!show_statistics(params)) {
 								            return 1;
 								        }
 								        return 0;
 								    }
-												common : use common_ prefix for common library functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-10-10 22:57:42 +02:00
+								    common_init();
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    const int32_t n_ctx = params.n_ctx;
 								    if (n_ctx <= 0) {
 								        LOG_ERR("%s: imatrix tool requires '--ctx-size' > 0\n", __func__);
 								        return 1;
 								    }
 								    {
 								        const int32_t n_seq = std::max(1, params.n_batch / n_ctx);
 								        const int32_t n_kv = n_seq * n_ctx;
 								        params.n_parallel = n_seq;
 								        params.n_ctx      = n_kv;
 								        params.n_batch = std::min(params.n_batch, n_kv);
 								    }
-												quantize: add imatrix and dataset metadata in GGUF (#6658)

* imatrix: save the dataset file used in the output file

* llama: support kv overrides type string string

* common: factorize KV Overrides parsing between common and server

* quantize: add imatrix n entries and dataset KV metadata
quantize: factorize KV Overrides parsing between common
#6656

* llama: remove kv override str_value initialization as it does not compile on some toolchain

* quantize: add imatrix m_last_call as `quantize.imatrix.chunks_count`

* quantize: add imatrix filename in KV

* llama: add llama_model_kv_override_free

* common: add llama_model_kv_override_free
common: free kv override if used after model loading

* llama: finally move the string KV override value to the stack

* llama : minor

* no need to add a NUL to the std::vector, std::string can be initialized from a pair of iterators.

Co-authored-by: slaren <slarengh@gmail.com>

* kv override: ensure string termination

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: slaren <slarengh@gmail.com>
											
										
										
											2024-04-26 20:06:33 +02:00
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    g_collector.set_params(params);
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								    for (const auto & in_file : params.in_files) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								        LOG_INF("%s : loading imatrix from '%s'\n", __func__, in_file.c_str());
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								        if (!g_collector.load_imatrix(in_file.c_str())) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								            LOG_ERR("%s : failed to load %s\n", __func__, in_file.c_str());
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								            return 1;
 								        }
 								    }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    if (params.prompt.empty()) {
 								        LOG_INF("No prompt provided; combining precomputed matrices only.\n");
 								        if (params.in_files.empty()) {
 								            LOG_ERR("Error: No prompt provided and no precomputed matrices (--in-file) to combine.\n");
 								            return 1;
 								        }
 								        if (params.in_files.size() == 1) {
 								            LOG_INF("%s : saving imatrix to '%s'\n", __func__, params.out_file.c_str());
 								        } else if (params.in_files.size() > 1) {
 								            LOG_INF("%s : saving combined imatrix to '%s'\n", __func__, params.out_file.c_str());
 								        }
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
+								        g_collector.save_imatrix();
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
 								        return 0;
-												Adding some imatrix tools (#5302)

* imatrix: adding --combine and --continue-from

* imatrix: be able to start from a specific chunk

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
											
										
										
											2024-02-04 10:39:58 +02:00
+								    }
-												ggml : add numa options (#5377)

* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h

* Reverted Makefile

* Fixed include

* Removed sched.h from ggml.h, moved ggml_get_numa_affinity into ggml.c, removed trailing whitespace and fixed up a few inconsistent variables

* removed trailing whitespace

* Added numa options to allow finer grained control as well as plumbing for a new mirror mode that will require numa.h

* Reverting Makefile

* Fixed a number of issues with the move from BOOL to ggml_numa_strategies. Added a note about mirror mode note being implemented yet

* Removing MIRROR_MODE code for this PR

* Removing last bit of MIRROR_MODE code for this PR

* Removing unneeded branch in server.cpp example and moving get_numa_affinity and making it static

* Fixed lingering init_llama_backend() bool calls in tests and examples

* Remote enum llama_numa_strategies

* Revert bad merge with dynatemp flags

* add missing enum ggml_numa_strategies declaration and revert sync problem with master

* add missing enum ggml_numa_strategies declaration

* fixed ggml_init_numa variable

* Update ggml.h

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* Update READMEs with info about numa flags, change INTERLEAVE strategy name to DISTRIBUTE everywhere, implement the improved distribution strategy from @rankaiyx, fix a spelling mistake and un-merge some bad merges

* split numa init out from llama_backend_init and created llama_numa_init. Updated all code paths and samples

* Fix up some boolean vs enum comparisons

* Added #ifdefs for non-Linux OS that don't have cpu_set_t datatype

* Update ggml.h

Align enum values

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml.c

Remove whitespace

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update ggml.c

align paremeters

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update examples/server/server.cpp

remove whitespace and align brace

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update common/common.cpp

Remove whitespace and align brace

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* unified ggml_numa_strategy enum and fixed text alignment in server.cpp example

* Update ggml.c

simplified return for platforms without NUMA support

Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>

* removed redundant else from cli argument processing of --numa

* whitespace

---------

Co-authored-by: root <root@nenya.lothlorien.ca>
Co-authored-by: Jared Van Bortel <cebtenzzre@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
											
										
										
											2024-02-16 01:31:07 -08:00
+								    llama_backend_init();
 								    llama_numa_init(params.numa);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								    // pass the callback to the backend scheduler
 								    // it will be executed for each node during the graph computation
-												eval-callback: Example how to use eval callback for debugging (#6576)

* gguf-debug: Example how to use ggml callback for debugging

* gguf-debug: no mutex, verify type, fix stride.

* llama: cv eval: move cb eval field in common gpt_params

* ggml_debug: use common gpt_params to pass cb eval.
Fix get tensor SIGV random.

* ggml_debug: ci: add tests

* ggml_debug: EOL in CMakeLists.txt

* ggml_debug: Remove unused param n_batch, no batching here

* ggml_debug: fix trailing spaces

* ggml_debug: fix trailing spaces

* common: fix cb_eval and user data not initialized

* ci: build revert label

* ggml_debug: add main test label

* doc: add a model: add a link to ggml-debug

* ggml-debug: add to make toolchain

* ggml-debug: tests add the main label

* ggml-debug: ci add test curl label

* common: allow the warmup to be disabled in llama_init_from_gpt_params

* ci: add curl test

* ggml-debug: better tensor type support

* gitignore : ggml-debug

* ggml-debug: printing also the sum of each tensor

* ggml-debug: remove block size

* eval-callback: renamed from ggml-debug

* eval-callback: fix make toolchain

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-11 14:51:07 +02:00
+								    params.cb_eval = ik_collect_imatrix;
 								    params.cb_eval_user_data = NULL;
 								    params.warmup = false;
 								    // init
-												common : use common_ prefix for common library functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-10-10 22:57:42 +02:00
+								    common_init_result llama_init = common_init_from_params(params);
-												imatrix : migrate to gpt_params (#7771)

* imatrix : migrate to gpt_params

ggml-ci

* imatrix : add --save-frequency cli arg

* common : fix --no-ppl
											
										
										
											2024-06-06 16:30:58 +03:00
-												llama : refactor `src/llama.cpp` (#10902)

* llama : scatter llama.cpp into multiple modules (wip)

* llama : control-vector -> adapter

* llama : arch

* llama : mmap

ggml-ci

* ci : remove BUILD_SHARED_LIBS=OFF

ggml-ci

* llama : arch (cont)

ggml-ci

* llama : chat

ggml-ci

* llama : model

ggml-ci

* llama : hparams

ggml-ci

* llama : adapter

ggml-ci

* examples : fix

ggml-ci

* rebase

ggml-ci

* minor

* llama : kv cache

ggml-ci

* llama : impl

ggml-ci

* llama : batch

ggml-ci

* cont

ggml-ci

* llama : context

ggml-ci

* minor

* llama : context (cont)

ggml-ci

* llama : model loader

ggml-ci

* common : update lora

ggml-ci

* llama : quant

ggml-ci

* llama : quant (cont)

ggml-ci

* minor [no ci]
											
										
										
											2025-01-03 10:18:53 +02:00
+								    llama_model * model = llama_init.model.get();
 								    llama_context * ctx = llama_init.context.get();
-												eval-callback: Example how to use eval callback for debugging (#6576)

* gguf-debug: Example how to use ggml callback for debugging

* gguf-debug: no mutex, verify type, fix stride.

* llama: cv eval: move cb eval field in common gpt_params

* ggml_debug: use common gpt_params to pass cb eval.
Fix get tensor SIGV random.

* ggml_debug: ci: add tests

* ggml_debug: EOL in CMakeLists.txt

* ggml_debug: Remove unused param n_batch, no batching here

* ggml_debug: fix trailing spaces

* ggml_debug: fix trailing spaces

* common: fix cb_eval and user data not initialized

* ci: build revert label

* ggml_debug: add main test label

* doc: add a model: add a link to ggml-debug

* ggml-debug: add to make toolchain

* ggml-debug: tests add the main label

* ggml-debug: ci add test curl label

* common: allow the warmup to be disabled in llama_init_from_gpt_params

* ci: add curl test

* ggml-debug: better tensor type support

* gitignore : ggml-debug

* ggml-debug: printing also the sum of each tensor

* ggml-debug: remove block size

* eval-callback: renamed from ggml-debug

* eval-callback: fix make toolchain

---------

Co-authored-by: slaren <slarengh@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-04-11 14:51:07 +02:00
+								    if (model == nullptr || ctx == nullptr) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								        LOG_ERR("%s : failed to init\n", __func__);
-												imatrix : offload to GPU support (#4957)

* backend : add eval callback

ggml-ci

* backend : group nodes in a single compute when user don't need them

* backend : clean-up the implementation

ggml-ci

* simple : do not perform tensor data copy if not needed

* simple : fix

* imatrix : offload to GPU support

* imatrix : fix ggml_mul_mat_id hanlding

ggml-ci

* ci : add imatrix test

ggml-ci

* ci : rearrange output

ggml-ci
											
										
										
											2024-01-17 18:46:30 +02:00
+								        return 1;
 								    }
-												llama : add `llama_vocab`, functions -> methods, naming (#11110)

* llama : functions -> methods (#11110)

* llama : add struct llama_vocab to the API (#11156)

ggml-ci

* hparams : move vocab params to llama_vocab (#11159)

ggml-ci

* vocab : more pimpl (#11165)

ggml-ci

* vocab : minor tokenization optimizations (#11160)

ggml-ci

Co-authored-by: Diego Devesa <slarengh@gmail.com>

* lora : update API names (#11167)

ggml-ci

* llama : update API names to use correct prefix (#11174)

* llama : update API names to use correct prefix

ggml-ci

* cont

ggml-ci

* cont

ggml-ci

* minor [no ci]

* vocab : llama_vocab_add_[be]os -> llama_vocab_get_add_[be]os (#11174)

ggml-ci

* vocab : llama_vocab_n_vocab -> llama_vocab_n_tokens (#11174)

ggml-ci

---------

Co-authored-by: Diego Devesa <slarengh@gmail.com>
											
										
										
											2025-01-12 11:32:42 +02:00
+								    const int n_ctx_train = llama_model_n_ctx_train(model);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    if (params.n_ctx > n_ctx_train) {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								        LOG_WRN("%s: model was trained on only %d context tokens (%d specified)\n",
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								                __func__, n_ctx_train, params.n_ctx);
 								    }
 								    // print system information
 								    {
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								        LOG_INF("\n");
-												common : use common_ prefix for common library functions (#9805)

* common : use common_ prefix for common library functions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-10-10 22:57:42 +02:00
+								        LOG_INF("%s\n", common_params_get_system_info(params).c_str());
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    }
-												imatrix : use GGUF to store importance matrices (#9400)

* imatrix : allow processing multiple chunks per batch

* perplexity : simplify filling the batch

* imatrix : fix segfault when using a single chunk per batch

* imatrix : use GGUF to store imatrix data

* imatrix : fix conversion problems

* imatrix : use FMA and sort tensor names

* py : add requirements for legacy imatrix convert script

* perplexity : revert changes

* py : include imatrix converter requirements in toplevel requirements

* imatrix : avoid using designated initializers in C++

* imatrix : remove unused n_entries

* imatrix : allow loading mis-ordered tensors

Sums and counts tensors no longer need to be consecutive.

* imatrix : more sanity checks when loading multiple imatrix files

* imatrix : use ggml_format_name instead of std::string concatenation

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>

* quantize : use unused imatrix chunk_size with LLAMA_TRACE

* common : use GGUF for imatrix output by default

* imatrix : two-way conversion between old format and GGUF

* convert : remove imatrix to gguf python script

* imatrix : use the function name in more error messages

* imatrix : don't use FMA explicitly

This should make comparisons between the formats easier
because this matches the behavior of the previous version.

* imatrix : avoid returning from void function save_imatrix

* imatrix : support 3d tensors with MUL_MAT

* quantize : fix dataset name loading from gguf imatrix

* common : move string_remove_suffix from quantize and imatrix

Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>

* imatrix : add warning when legacy format is written

* imatrix : warn when writing partial data, to help guess dataset coverage

Also make the legacy format store partial data
by using neutral values for missing data.
This matches what is done at read-time for the new format,
and so should get the same quality in case the old format is still used.

* imatrix : avoid loading model to convert or combine imatrix

* imatrix : avoid using imatrix.dat in README

---------

Co-authored-by: Xuan Son Nguyen <son@huggingface.co>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
											
										
										
											2025-07-19 12:51:22 -04:00
+								    if (!compute_imatrix(ctx, params, n_ctx)) {
 								        return 1;
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
+								    }
 								    g_collector.save_imatrix();
-												common : reimplement logging (#9418)

https://github.com/ggerganov/llama.cpp/pull/9418
											
										
										
											2024-09-15 20:46:12 +03:00
+								    LOG("\n");
-												llama : llama_perf + option to disable timings during decode (#9355)

* llama : llama_perf + option to disable timings during decode

ggml-ci

* common : add llama_arg

* Update src/llama.cpp

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>

* perf : separate functions in the API

ggml-ci

* perf : safer pointer handling + naming update

ggml-ci

* minor : better local var name

* perf : abort on invalid sampler pointer

ggml-ci

---------

Co-authored-by: Xuan Son Nguyen <thichthat@gmail.com>
											
										
										
											2024-09-13 09:53:38 +03:00
+								    llama_perf_context_print(ctx);
-												Importance Matrix calculation (#4861)

* imatrix: 1st version

* imatrix: WIP

* Cleanup

* Update examples/imatrix/imatrix.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

---------

Co-authored-by: Iwan Kawrakow <iwan.kawrakow@gmail.com>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
											
										
										
											2024-01-12 06:59:57 +01:00
 								    llama_backend_free();
 								    return 0;
 								}