ggml: refactor fp32->fp16 and fp16->fp32 simd to ggml-cpu

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-06-27 20:05:20 +00:00 · 2025-06-24 15:05:46 +08:00
parent 5834dee1fc
commit bd288e8fa5
2 changed files with 192 additions and 166 deletions
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@ -248,6 +248,43 @@
    #define GGML_F16_VEC_REDUCE         GGML_F32Cx4_REDUCE
 #endif
 #if !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
 #ifdef GGML_FP16_TO_FP32
 #undef GGML_FP16_TO_FP32
 #endif
 #ifdef GGML_FP32_TO_FP16
 #undef GGML_FP32_TO_FP16
 #endif
 #ifdef GGML_COMPUTE_FP16_TO_FP32
 #undef GGML_COMPUTE_FP16_TO_FP32
 #endif
 #ifdef GGML_COMPUTE_FP32_TO_FP16
 #undef GGML_COMPUTE_FP32_TO_FP16
 #endif
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    __fp16 tmp;
    memcpy(&tmp, &h, sizeof(ggml_fp16_t));
    return (float)tmp;
 }
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    ggml_fp16_t res;
    __fp16 tmp = f;
    memcpy(&res, &tmp, sizeof(ggml_fp16_t));
    return res;
 }
 #endif  // !__CUDACC__ && __CUDACC_VER_MAJOR__ <= 11 && !__MUSACC__
 #elif defined(__AVX512F__)
 #define GGML_SIMD
@ -410,6 +447,23 @@ do {                                                              \
 // the  _mm256_cvt intrinsics require F16C
 #define GGML_F32Cx8_LOAD(x)     _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(x)))
 #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0))
 #ifdef GGML_COMPUTE_FP16_TO_FP32
 #undef GGML_COMPUTE_FP16_TO_FP32
 #endif
 #ifdef GGML_COMPUTE_FP32_TO_FP16
 #undef GGML_COMPUTE_FP32_TO_FP16
 #endif
 #ifdef _MSC_VER
    #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
    #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
 #else
    #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
 #endif
 #else
 static inline __m256 __avx_f32cx8_load(const ggml_fp16_t * x) {
    float tmp[8];
@ -519,6 +573,53 @@ static inline unsigned char ggml_endian_byte(int i) {
                                   r[i - GGML_ENDIAN_BYTE(0)]), \
            0, p - GGML_F16_EPR)
 #ifdef GGML_FP16_TO_FP32
 #undef GGML_FP16_TO_FP32
 #endif
 #ifdef GGML_FP32_TO_FP16
 #undef GGML_FP32_TO_FP16
 #endif
 #ifdef GGML_COMPUTE_FP16_TO_FP32
 #undef GGML_COMPUTE_FP16_TO_FP32
 #endif
 #ifdef GGML_COMPUTE_FP32_TO_FP16
 #undef GGML_COMPUTE_FP32_TO_FP16
 #endif
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 /* the inline asm below is about 12% faster than the lookup method */
 #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    float f;
    double d;
    __asm__(
        "mtfprd %0,%2\n"
        "xscvhpdp %0,%0\n"
        "frsp %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=f"(f):
        /* in */   "r"(h));
    return f;
 }
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    double d;
    ggml_fp16_t r;
    __asm__( /* xscvdphp can work on double or single precision */
        "xscvdphp %0,%2\n"
        "mffprd %1,%0\n" :
        /* temp */ "=d"(d),
        /* out */  "=r"(r):
        /* in */   "f"(f));
    return r;
 }
 #elif defined(__wasm_simd128__)
 #define GGML_SIMD
@ -1052,6 +1153,35 @@ static inline ggml_fp16_t nnpa_compute_fp32_to_fp16(float f) {
 #endif  // __NNPA__
 #elif defined(__riscv) && defined(__riscv_zfhmin)
 static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    float f;
    __asm__(
        "fmv.h.x %[f], %[h]\n\t"
        "fcvt.s.h %[f], %[f]"
        : [f] "=&f" (f)
        : [h] "r" (h)
    );
    return f;
 }
 static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
    ggml_fp16_t res;
    __asm__(
        "fcvt.h.s %[f], %[f]\n\t"
        "fmv.x.h %[h], %[f]"
        : [h] "=&r" (res)
        : [f] "f" (f)
    );
    return res;
 }
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 // GGML_F32_ARR / GGML_F16_ARR
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@ -317,140 +317,38 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 GGML_API void * ggml_aligned_malloc(size_t size);
 GGML_API void ggml_aligned_free(void * ptr, size_t size);
-// FP16 to FP32 conversion
+// FP16 <-> FP32
 // ref: https://github.com/Maratyszcza/FP16
-// 16-bit float
+static inline float fp32_from_bits(uint32_t w) {
 // on Arm, we use __fp16
 // on x86, we use uint16_t
 //
 // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
 // for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
 //
 #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
    #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
        return (float)tmp;
    }
    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __fp16 tmp = f;
        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
        return res;
    }
 #elif defined(__F16C__)
    #ifdef _MSC_VER
        #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
        #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
    #else
        #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
        #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
    #endif
 #elif defined(__POWER9_VECTOR__)
    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
    /* the inline asm below is about 12% faster than the lookup method */
    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        double d;
        __asm__(
            "mtfprd %0,%2\n"
            "xscvhpdp %0,%0\n"
            "frsp %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=f"(f):
            /* in */   "r"(h));
        return f;
    }
    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
        double d;
        ggml_fp16_t r;
        __asm__( /* xscvdphp can work on double or single precision */
            "xscvdphp %0,%2\n"
            "mffprd %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=r"(r):
            /* in */   "f"(f));
        return r;
    }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        __asm__(
            "fmv.h.x %[f], %[h]\n\t"
            "fcvt.s.h %[f], %[f]"
            : [f] "=&f" (f)
            : [h] "r" (h)
        );
        return f;
    }
    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __asm__(
            "fcvt.h.s %[f], %[f]\n\t"
            "fmv.x.h %[h], %[f]"
            : [h] "=&r" (res)
            : [f] "f" (f)
        );
        return res;
    }
    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #else
    // FP16 <-> FP32
    // ref: https://github.com/Maratyszcza/FP16
    static inline float fp32_from_bits(uint32_t w) {
    union {
        uint32_t as_bits;
        float as_value;
    } fp32;
    fp32.as_bits = w;
    return fp32.as_value;
-    }
+}
-    static inline uint32_t fp32_to_bits(float f) {
+static inline uint32_t fp32_to_bits(float f) {
    union {
        float as_value;
        uint32_t as_bits;
    } fp32;
    fp32.as_value = f;
    return fp32.as_bits;
-    }
+}
-    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
+static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
    const uint32_t w = (uint32_t) h << 16;
    const uint32_t sign = w & UINT32_C(0x80000000);
    const uint32_t two_w = w + w;
    const uint32_t exp_offset = UINT32_C(0xE0) << 23;
-    #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
    const float exp_scale = 0x1.0p-112f;
-    #else
+#else
    const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
-    #endif
+#endif
    const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
    const uint32_t magic_mask = UINT32_C(126) << 23;
@ -461,16 +359,16 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
    const uint32_t result = sign |
        (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
    return fp32_from_bits(result);
-    }
+}
-    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
+static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
-    #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
+#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L)
    const float scale_to_inf = 0x1.0p+112f;
    const float scale_to_zero = 0x1.0p-110f;
-    #else
+#else
    const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
    const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
-    #endif
+#endif
    float base = (fabsf(f) * scale_to_inf) * scale_to_zero;
    const uint32_t w = fp32_to_bits(f);
@ -487,12 +385,10 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
    const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
    const uint32_t nonsign = exp_bits + mantissa_bits;
    return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
-    }
+}
-    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
+#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
-    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
+#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 #endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in ggml_init()