ggml: refactor fp16<->fp32 simd to ggml-cpu

Signed-off-by: Aaron Teo <aaron.teo1@ibm.com>
2025-06-28 04:15:21 +00:00 · 2025-06-24 20:42:15 +08:00
parent 8a5e011cb5
commit 17b032fab8
10 changed files with 256 additions and 183 deletions
--- a/ggml/src/ggml-cpu/arch/arm/quants.c
+++ b/ggml/src/ggml-cpu/arch/arm/quants.c
@ -6,6 +6,7 @@
 #include "../../quants.h"
 #include "../../ggml-cpu-impl.h"
 #include "../../simd-mappings.h"
 #include <math.h>
 #include <string.h>
--- a/ggml/src/ggml-cpu/arch/arm/repack.cpp
+++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp
@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
 #include "simd-mappings.h"
 #include "traits.h"
 #include <cmath>
--- a/ggml/src/ggml-cpu/common.h
+++ b/ggml/src/ggml-cpu/common.h
@ -4,6 +4,7 @@
 #include "traits.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-impl.h"
 #include "simd-mappings.h"
 #ifdef __cplusplus
--- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp
+++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp
@ -52,6 +52,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu-impl.h"
 #include "ggml-quants.h"
 #include "simd-mappings.h"
 #include <array>
 #include <type_traits>
--- a/ggml/src/ggml-cpu/quants.c
+++ b/ggml/src/ggml-cpu/quants.c
@ -2,6 +2,7 @@
 #include "ggml-common.h"
 #include "ggml-cpu-impl.h"
 #include "simd-mappings.h"
 #include "ggml-quants.h"
 #include "quants.h"
--- a/ggml/src/ggml-cpu/repack.cpp
+++ b/ggml/src/ggml-cpu/repack.cpp
@ -6,6 +6,7 @@
 #include "ggml-impl.h"
 #include "ggml-cpu.h"
 #include "ggml-cpu-impl.h"
 #include "simd-mappings.h"
 #include "traits.h"
 #include "arch-fallback.h"
--- a/ggml/src/ggml-cpu/simd-mappings.h
+++ b/ggml/src/ggml-cpu/simd-mappings.h
@ -2,10 +2,196 @@
 #include "ggml-cpu-impl.h"
 #ifdef __ARM_FEATURE_SVE
 #include <arm_sve.h>
 #endif // __ARM_FEATURE_SVE
 #if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__)
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
 //   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
 //
 #include <arm_neon.h>
 #endif
 #if defined(__F16C__)
 #include <immintrin.h>
 #endif
 //
 // simd mappings
 //
 // FP16 to FP32 conversion
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
 //
 // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
 // for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
 //
 #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
    #ifdef GGML_FP16_TO_FP32
    #undef GGML_FP16_TO_FP32
    #endif
    #ifdef GGML_FP32_TO_FP16
    #undef GGML_FP32_TO_FP16
    #endif
    #ifdef GGML_COMPUTE_FP16_TO_FP32
    #undef GGML_COMPUTE_FP16_TO_FP32
    #endif
    #ifdef GGML_COMPUTE_FP32_TO_FP16
    #undef GGML_COMPUTE_FP32_TO_FP16
    #endif
    #define GGML_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x)
    #define GGML_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x)
    static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
        return (float)tmp;
    }
    static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __fp16 tmp = f;
        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
        return res;
    }
 #elif defined(__F16C__)
    #ifdef GGML_COMPUTE_FP16_TO_FP32
    #undef GGML_COMPUTE_FP16_TO_FP32
    #endif
    #ifdef GGML_COMPUTE_FP32_TO_FP16
    #undef GGML_COMPUTE_FP32_TO_FP16
    #endif
    #ifdef _MSC_VER
        #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
        #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
    #else
        #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
        #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
    #endif
 #elif defined(__POWER9_VECTOR__)
    #ifdef GGML_FP16_TO_FP32
    #undef GGML_FP16_TO_FP32
    #endif
    #ifdef GGML_FP32_TO_FP16
    #undef GGML_FP32_TO_FP16
    #endif
    #ifdef GGML_COMPUTE_FP16_TO_FP32
    #undef GGML_COMPUTE_FP16_TO_FP32
    #endif
    #ifdef GGML_COMPUTE_FP32_TO_FP16
    #undef GGML_COMPUTE_FP32_TO_FP16
    #endif
    #define GGML_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x)
    /* the inline asm below is about 12% faster than the lookup method */
    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
    static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        double d;
        __asm__(
            "mtfprd %0,%2\n"
            "xscvhpdp %0,%0\n"
            "frsp %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=f"(f):
            /* in */   "r"(h));
        return f;
    }
    static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) {
        double d;
        ggml_fp16_t r;
        __asm__( /* xscvdphp can work on double or single precision */
            "xscvdphp %0,%2\n"
            "mffprd %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=r"(r):
            /* in */   "f"(f));
        return r;
    }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
    #ifdef GGML_FP16_TO_FP32
    #undef GGML_FP16_TO_FP32
    #endif
    #ifdef GGML_FP32_TO_FP16
    #undef GGML_FP32_TO_FP16
    #endif
    #ifdef GGML_COMPUTE_FP16_TO_FP32
    #undef GGML_COMPUTE_FP16_TO_FP32
    #endif
    #ifdef GGML_COMPUTE_FP32_TO_FP16
    #undef GGML_COMPUTE_FP32_TO_FP16
    #endif
    static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        __asm__(
            "fmv.h.x %[f], %[h]\n\t"
            "fcvt.s.h %[f], %[f]"
            : [f] "=&f" (f)
            : [h] "r" (h)
        );
        return f;
    }
    static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __asm__(
            "fcvt.h.s %[f], %[f]\n\t"
            "fmv.x.h %[h], %[f]"
            : [h] "=&r" (res)
            : [f] "f" (f)
        );
        return res;
    }
    #define GGML_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x)
    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32)
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
 }
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
 #endif
 #if !defined(GGML_FP32_TO_FP16)
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@ -317,108 +317,6 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
 GGML_API void * ggml_aligned_malloc(size_t size);
 GGML_API void ggml_aligned_free(void * ptr, size_t size);
 // FP16 to FP32 conversion
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
 //
 // for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616
 // for     MUSA compilers        , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843
 //
 #if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
    #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
        __fp16 tmp;
        memcpy(&tmp, &h, sizeof(ggml_fp16_t));
        return (float)tmp;
    }
    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __fp16 tmp = f;
        memcpy(&res, &tmp, sizeof(ggml_fp16_t));
        return res;
    }
 #elif defined(__F16C__)
    #ifdef _MSC_VER
        #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x)))
        #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0)
    #else
        #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x)
        #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0)
    #endif
 #elif defined(__POWER9_VECTOR__)
    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
    /* the inline asm below is about 12% faster than the lookup method */
    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        double d;
        __asm__(
            "mtfprd %0,%2\n"
            "xscvhpdp %0,%0\n"
            "frsp %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=f"(f):
            /* in */   "r"(h));
        return f;
    }
    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
        double d;
        ggml_fp16_t r;
        __asm__( /* xscvdphp can work on double or single precision */
            "xscvdphp %0,%2\n"
            "mffprd %1,%0\n" :
            /* temp */ "=d"(d),
            /* out */  "=r"(r):
            /* in */   "f"(f));
        return r;
    }
 #elif defined(__riscv) && defined(__riscv_zfhmin)
    static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) {
        float f;
        __asm__(
            "fmv.h.x %[f], %[h]\n\t"
            "fcvt.s.h %[f], %[f]"
            : [f] "=&f" (f)
            : [h] "r" (h)
        );
        return f;
    }
    static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) {
        ggml_fp16_t res;
        __asm__(
            "fcvt.h.s %[f], %[f]\n\t"
            "fmv.x.h %[h], %[f]"
            : [h] "=&r" (res)
            : [f] "f" (f)
        );
        return res;
    }
    #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
    #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
    #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x)
    #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #else
 // FP16 <-> FP32
 // ref: https://github.com/Maratyszcza/FP16
@ -492,29 +390,10 @@ GGML_API void ggml_aligned_free(void * ptr, size_t size);
 #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x)
 #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x)
 #endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__)
 // precomputed f32 table for f16 (256 KB)
 // defined in ggml.c, initialized in ggml_init()
 GGML_API float ggml_table_f32_f16[1 << 16];
 // On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32,
 // so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON.
 // This is also true for POWER9.
 #if !defined(GGML_FP16_TO_FP32)
 inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
    uint16_t s;
    memcpy(&s, &f, sizeof(uint16_t));
    return ggml_table_f32_f16[s];
 }
 #define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x)
 #endif
 #if !defined(GGML_FP32_TO_FP16)
 #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 /**
 * Converts brain16 to float32.
 *
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@ -4,6 +4,7 @@
 #include "ggml-quants.h"
 #include "ggml-impl.h"
 #include "ggml-cpu/ggml-cpu-impl.h"
 #include "ggml-cpu/simd-mappings.h"
 #include "ggml-cpu.h"
 #include <math.h>
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@ -2,6 +2,7 @@
 #define _USE_MATH_DEFINES // For M_PI on MSVC
 #include "ggml-backend.h"
 #include "ggml-cpu/simd-mappings.h"
 #include "ggml-impl.h"
 #include "ggml-threading.h"
 #include "ggml-cpu.h"