mtmd : remove libllava, remove clip-quantize-cli (⚠️ breaking change) (#13460)
* mtmd : remove libllava, remove clip-quantize-cli
* rm clip_model_quantize
@@ -3586,141 +3586,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
    return true;
}

bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
    assert(itype < GGML_TYPE_COUNT);
    ggml_type type = static_cast<ggml_type>(itype);

    auto * ctx_clip = clip_init(fname_inp, clip_context_params{
        /* use_gpu */   false,
        /* verbosity */ GGML_LOG_LEVEL_ERROR,
    });

    const auto & ctx_src  = ctx_clip->ctx_gguf.get();
    const auto & ctx_data = ctx_clip->ctx_data.get();

    auto * ctx_out = gguf_init_empty();
    gguf_set_kv(ctx_out, ctx_src);
    gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
    gguf_set_val_u32(ctx_out, "general.file_type", itype);

    auto fout = std::ofstream(fname_out, std::ios::binary);

    const int n_tensors = gguf_get_n_tensors(ctx_src);

    for (int i = 0; i < n_tensors; ++i) {
        const char * name = gguf_get_tensor_name(ctx_src, i);
        ggml_tensor * cur = ggml_get_tensor(ctx_data, name);
        gguf_add_tensor(ctx_out, cur);
    }

    const size_t meta_size = gguf_get_meta_size(ctx_out);
    for (size_t i = 0; i < meta_size; ++i) {
        fout.put(0);
    }

    // regexes of tensor names to be quantized
    const std::vector<std::string> k_names = {
        ".*weight",
    };

    std::vector<uint8_t> work(512);
    std::vector<float> conv_buf(512);
    size_t total_size_org = 0;
    size_t total_size_new = 0;

    for (int i = 0; i < n_tensors; ++i) {
        const std::string name = gguf_get_tensor_name(ctx_src, i);
        ggml_tensor * cur = ggml_get_tensor(ctx_data, name.c_str());

        enum ggml_type new_type;
        void * new_data;
        size_t new_size;

        bool quantize = false;
        for (const auto & s : k_names) {
            if (std::regex_match(name, std::regex(s))) {
                quantize = true;
                break;
            }
        }

        // quantize only 2D tensors and bigger than block size
        quantize &= (ggml_n_dims(cur) == 2) && cur->ne[0] > ggml_blck_size(type);

        if (quantize) {
            new_type = type;
            if (new_type >= GGML_TYPE_Q2_K && name.find("embd") != std::string::npos) {
                new_type = GGML_TYPE_Q8_0; // ggml_get_rows needs non K type
                // LOG_ERR("%s: quantizing %s to %s\n", __func__, name.c_str(), ggml_type_name(new_type));
            }
            const size_t n_elms = ggml_nelements(cur);
            float * f32_data;

            switch (cur->type) {
                case GGML_TYPE_F32:
                    f32_data = (float *)cur->data;
                    break;
                case GGML_TYPE_F16:
                    if (conv_buf.size() < n_elms) {
                        conv_buf.resize(n_elms);
                    }
                    for (size_t j = 0; j < n_elms; ++j) {
                        conv_buf[j] = ggml_fp16_to_fp32(((ggml_fp16_t *)cur->data)[j]);
                    }
                    f32_data = (float *)conv_buf.data();
                    break;
                default:
                    LOG_ERR("%s: Please use an input file in f32 or f16\n", __func__);
                    gguf_free(ctx_out);
                    return false;
            }

            if (work.size() < n_elms * 4) {
                work.resize(n_elms * 4);
            }
            new_data = work.data();

            new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, n_elms/cur->ne[0], cur->ne[0], nullptr);
        } else {
            new_type = cur->type;
            new_data = cur->data;
            new_size = ggml_nbytes(cur);
        }
        const size_t orig_size = ggml_nbytes(cur);
        total_size_org += orig_size;
        total_size_new += new_size;
        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
        GGML_ASSERT(gguf_get_tensor_size(ctx_out, gguf_find_tensor(ctx_out, name.c_str())) == new_size);
        gguf_set_tensor_data(ctx_out, name.c_str(), new_data);
        fout.write((const char *)new_data, new_size);
        size_t pad = GGML_PAD(new_size, gguf_get_alignment(ctx_out)) - new_size;
        for (size_t j = 0; j < pad; ++j) {
            fout.put(0);
        }

        LOG_INF("%s: n_dims = %d | quantize=%d | size = %f MB -> %f MB\n", name.c_str(), ggml_n_dims(cur), quantize,
                orig_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
    }

    // go back to beginning of file and write the updated metadata
    fout.seekp(0, std::ios::beg);
    std::vector<uint8_t> meta(meta_size);
    gguf_get_meta_data(ctx_out, meta.data());
    fout.write((const char *)meta.data(), meta_size);

    fout.close();

    clip_free(ctx_clip);
    gguf_free(ctx_out);

    {
        LOG_INF("%s: original size  = %8.2f MB\n", __func__, total_size_org / 1024.0 / 1024.0);
        LOG_INF("%s: quantized size = %8.2f MB\n", __func__, total_size_new / 1024.0 / 1024.0);
    }

    return true;
}

int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
    switch (ctx->proj_type) {
        case PROJECTOR_TYPE_LDP:
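For reference, the zero bytes written after each tensor in the removed function keep the GGUF data section aligned. Below is a minimal, dependency-free C++ sketch of that rounding step only; the alignment of 32 and the byte count are illustrative placeholders, whereas in the removed code they come from gguf_get_alignment() and the size returned by ggml_quantize_chunk().

#include <cstdio>
#include <cstddef>

// Round `size` up to the next multiple of `alignment`; equivalent in effect
// to the GGML_PAD(new_size, alignment) call in the loop above.
static size_t pad_to(size_t size, size_t alignment) {
    return (size + alignment - 1) / alignment * alignment;
}

int main() {
    const size_t alignment = 32;   // illustrative; the real value comes from gguf_get_alignment()
    const size_t new_size  = 1000; // illustrative; bytes of quantized tensor data just written
    const size_t padded    = pad_to(new_size, alignment);

    // clip_model_quantize wrote (padded - new_size) zero bytes after each tensor
    // so that the next tensor starts at an aligned offset in the output file.
    printf("tensor data: %zu bytes, zero padding: %zu bytes\n", new_size, padded - new_size);
    return 0;
}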