ggml : generalize quantize_fns for simpler FP16 handling (#1237)

* Generalize quantize_fns for simpler FP16 handling
* Remove call to ggml_cuda_mul_mat_get_wsize
* ci : disable FMA for mac os actions

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2025-06-26 19:55:04 +00:00 · 2023-07-05 16:13:06 +00:00
parent 8567c76b53
commit 1b107b8550
9 changed files with 172 additions and 548 deletions
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@ -40,26 +40,26 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
 }

 // Total quantization error on test data
-float total_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
    std::vector<uint8_t> tmp_q(2*test_size);
    std::vector<float> tmp_out(test_size);

-    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
-    qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+    qfns.from_float(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);
    return array_rmse(test_data, tmp_out.data(), test_size);
 }

 // Total quantization error on test data
-float reference_quantization_error(quantize_fns_t & qfns, size_t test_size, const float * test_data) {
+float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
    std::vector<uint8_t> tmp_q(2*test_size);
    std::vector<float> tmp_out(test_size);
    std::vector<float> tmp_out_ref(test_size);

-    qfns.quantize_row_q(test_data, tmp_q.data(), test_size);
-    qfns.dequantize_row_q(tmp_q.data(), tmp_out.data(), test_size);
+    qfns.from_float(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out.data(), test_size);

-    qfns.quantize_row_q_reference(test_data, tmp_q.data(), test_size);
-    qfns.dequantize_row_q(tmp_q.data(), tmp_out_ref.data(), test_size);
+    qfns.from_float_reference(test_data, tmp_q.data(), test_size);
+    qfns.to_float(tmp_q.data(), tmp_out_ref.data(), test_size);

    return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
 }
@ -73,15 +73,17 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
 }

 // Total dot product error
-float dot_product_error(quantize_fns_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
+float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
    std::vector<uint8_t> tmp_q1(2*test_size);
    std::vector<uint8_t> tmp_q2(2*test_size);

-    qfns.quantize_row_q    (test_data1, tmp_q1.data(), test_size);
-    qfns.quantize_row_q_dot(test_data2, tmp_q2.data(), test_size);
+    auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+
+    qfns.from_float(test_data1, tmp_q1.data(), test_size);
+    vdot.from_float(test_data2, tmp_q2.data(), test_size);

    float result = INFINITY;
-    qfns.vec_dot_q(test_size, &result, tmp_q1.data(), tmp_q2.data());
+    qfns.vec_dot(test_size, &result, tmp_q1.data(), tmp_q2.data());

    const float dot_ref = dot_product(test_data1, test_data2, test_size);

@ -123,9 +125,9 @@ int main(int argc, char * argv[]) {

    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        ggml_type type = (ggml_type) i;
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);

-        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+        if (qfns.from_float && qfns.to_float) {
            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
            const float max_quantization_error =
                type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@ -123,9 +123,9 @@ void usage(char * argv[]) {
    printf("  --type TYPE           set test type as");
    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        ggml_type type = (ggml_type) i;
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(type);
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
        if (ggml_type_name(type) != NULL) {
-            if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+            if (qfns.from_float && qfns.to_float) {
                printf(" %s", ggml_type_name(type));
            }
        }
@ -271,12 +271,12 @@ int main(int argc, char * argv[]) {

    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        ggml_type type = (ggml_type) i;
-        quantize_fns_t qfns = ggml_internal_get_quantize_fn(i);
+        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
        if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
            continue;
        }

-        if (qfns.quantize_row_q && qfns.dequantize_row_q) {
+        if (qfns.from_float && qfns.to_float) {
            printf("%s\n", ggml_type_name(type));

            if (params.op_quantize_row_q_reference) {
@ -284,7 +284,7 @@ int main(int argc, char * argv[]) {
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void ) {
-                        qfns.quantize_row_q_reference(test_data1, test_q1, size);
+                        qfns.from_float_reference(test_data1, test_q1, size);
                        return test_q1[0];
                    };
                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@ -298,7 +298,7 @@ int main(int argc, char * argv[]) {
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void ) {
-                        qfns.quantize_row_q(test_data1, test_q1, size);
+                        qfns.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@ -309,11 +309,11 @@ int main(int argc, char * argv[]) {

            if (params.op_dequantize_row_q) {
                printf("  dequantize_row_q\n");
-                qfns.quantize_row_q(test_data1, test_q1, largest);
+                qfns.from_float(test_data1, test_q1, largest);
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void ) {
-                        qfns.dequantize_row_q(test_q1, test_out, size);
+                        qfns.to_float(test_q1, test_out, size);
                        return test_out[0];
                    };
                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@ -327,7 +327,8 @@ int main(int argc, char * argv[]) {
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void ) {
-                        qfns.quantize_row_q_dot(test_data1, test_q1, size);
+                        auto vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
+                        vdot.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);
@ -338,13 +339,13 @@ int main(int argc, char * argv[]) {

            if (params.op_vec_dot_q) {
                printf("  vec_dot_q\n");
-                qfns.quantize_row_q(test_data1, test_q1, largest);
-                qfns.quantize_row_q(test_data2, test_q2, largest);
+                qfns.from_float(test_data1, test_q1, largest);
+                qfns.from_float(test_data2, test_q2, largest);
                for (size_t size : params.test_sizes) {
                    printf("    %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void ) {
                        float result;
-                        qfns.vec_dot_q(size, &result, test_q1, test_q2);
+                        qfns.vec_dot(size, &result, test_q1, test_q2);
                        return result;
                    };
                    size_t quantized_size = size / ggml_blck_size(type) * ggml_type_size(type);