From 17b032fab81dcca95618f45bbf59af6ee5c79e79 Mon Sep 17 00:00:00 2001 From: Aaron Teo Date: Tue, 24 Jun 2025 20:42:15 +0800 Subject: [PATCH] ggml: refactor fp16<->fp32 simd to ggml-cpu Signed-off-by: Aaron Teo --- ggml/src/ggml-cpu/arch/arm/quants.c | 1 + ggml/src/ggml-cpu/arch/arm/repack.cpp | 1 + ggml/src/ggml-cpu/common.h | 1 + ggml/src/ggml-cpu/llamafile/sgemm.cpp | 1 + ggml/src/ggml-cpu/quants.c | 1 + ggml/src/ggml-cpu/repack.cpp | 1 + ggml/src/ggml-cpu/simd-mappings.h | 186 +++++++++++++++++++ ggml/src/ggml-impl.h | 245 +++++++------------------- ggml/src/ggml-quants.c | 1 + ggml/src/ggml.c | 1 + 10 files changed, 256 insertions(+), 183 deletions(-) diff --git a/ggml/src/ggml-cpu/arch/arm/quants.c b/ggml/src/ggml-cpu/arch/arm/quants.c index b0909dac0..8a615f006 100644 --- a/ggml/src/ggml-cpu/arch/arm/quants.c +++ b/ggml/src/ggml-cpu/arch/arm/quants.c @@ -6,6 +6,7 @@ #include "../../quants.h" #include "../../ggml-cpu-impl.h" +#include "../../simd-mappings.h" #include #include diff --git a/ggml/src/ggml-cpu/arch/arm/repack.cpp b/ggml/src/ggml-cpu/arch/arm/repack.cpp index 39a0dd301..b22a9bbb9 100644 --- a/ggml/src/ggml-cpu/arch/arm/repack.cpp +++ b/ggml/src/ggml-cpu/arch/arm/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include diff --git a/ggml/src/ggml-cpu/common.h b/ggml/src/ggml-cpu/common.h index 5624176cc..60e282a4a 100644 --- a/ggml/src/ggml-cpu/common.h +++ b/ggml/src/ggml-cpu/common.h @@ -4,6 +4,7 @@ #include "traits.h" #include "ggml-cpu-impl.h" #include "ggml-impl.h" +#include "simd-mappings.h" #ifdef __cplusplus diff --git a/ggml/src/ggml-cpu/llamafile/sgemm.cpp b/ggml/src/ggml-cpu/llamafile/sgemm.cpp index 7ed3874af..6f40ce41c 100644 --- a/ggml/src/ggml-cpu/llamafile/sgemm.cpp +++ b/ggml/src/ggml-cpu/llamafile/sgemm.cpp @@ -52,6 +52,7 @@ #include "ggml-impl.h" #include "ggml-cpu-impl.h" #include "ggml-quants.h" +#include "simd-mappings.h" #include #include diff --git a/ggml/src/ggml-cpu/quants.c b/ggml/src/ggml-cpu/quants.c index d2e705f28..4bbf93860 100644 --- a/ggml/src/ggml-cpu/quants.c +++ b/ggml/src/ggml-cpu/quants.c @@ -2,6 +2,7 @@ #include "ggml-common.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "ggml-quants.h" #include "quants.h" diff --git a/ggml/src/ggml-cpu/repack.cpp b/ggml/src/ggml-cpu/repack.cpp index 692c53e01..b456405c9 100644 --- a/ggml/src/ggml-cpu/repack.cpp +++ b/ggml/src/ggml-cpu/repack.cpp @@ -6,6 +6,7 @@ #include "ggml-impl.h" #include "ggml-cpu.h" #include "ggml-cpu-impl.h" +#include "simd-mappings.h" #include "traits.h" #include "arch-fallback.h" diff --git a/ggml/src/ggml-cpu/simd-mappings.h b/ggml/src/ggml-cpu/simd-mappings.h index 88d9bbac1..a0a73e545 100644 --- a/ggml/src/ggml-cpu/simd-mappings.h +++ b/ggml/src/ggml-cpu/simd-mappings.h @@ -2,10 +2,196 @@ #include "ggml-cpu-impl.h" +#ifdef __ARM_FEATURE_SVE +#include +#endif // __ARM_FEATURE_SVE + +#if defined(__ARM_NEON) && !defined(__CUDACC__) && !defined(__MUSACC__) +// if YCM cannot find , make a symbolic link to it, for example: +// +// $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/ +// +#include +#endif + +#if defined(__F16C__) +#include +#endif + // // simd mappings // +// FP16 to FP32 conversion + +// 16-bit float +// on Arm, we use __fp16 +// on x86, we use uint16_t +// +// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 +// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 +// +#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) + #ifdef GGML_FP16_TO_FP32 + #undef GGML_FP16_TO_FP32 + #endif + + #ifdef GGML_FP32_TO_FP16 + #undef GGML_FP32_TO_FP16 + #endif + + #ifdef GGML_COMPUTE_FP16_TO_FP32 + #undef GGML_COMPUTE_FP16_TO_FP32 + #endif + + #ifdef GGML_COMPUTE_FP32_TO_FP16 + #undef GGML_COMPUTE_FP32_TO_FP16 + #endif + + #define GGML_COMPUTE_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) neon_compute_fp32_to_fp16(x) + + #define GGML_FP16_TO_FP32(x) neon_compute_fp16_to_fp32(x) + + static inline float neon_compute_fp16_to_fp32(ggml_fp16_t h) { + __fp16 tmp; + memcpy(&tmp, &h, sizeof(ggml_fp16_t)); + return (float)tmp; + } + + static inline ggml_fp16_t neon_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __fp16 tmp = f; + memcpy(&res, &tmp, sizeof(ggml_fp16_t)); + return res; + } +#elif defined(__F16C__) + #ifdef GGML_COMPUTE_FP16_TO_FP32 + #undef GGML_COMPUTE_FP16_TO_FP32 + #endif + + #ifdef GGML_COMPUTE_FP32_TO_FP16 + #undef GGML_COMPUTE_FP32_TO_FP16 + #endif + + #ifdef _MSC_VER + #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) + #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) + #else + #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) + #endif +#elif defined(__POWER9_VECTOR__) + #ifdef GGML_FP16_TO_FP32 + #undef GGML_FP16_TO_FP32 + #endif + + #ifdef GGML_FP32_TO_FP16 + #undef GGML_FP32_TO_FP16 + #endif + + #ifdef GGML_COMPUTE_FP16_TO_FP32 + #undef GGML_COMPUTE_FP16_TO_FP32 + #endif + + #ifdef GGML_COMPUTE_FP32_TO_FP16 + #undef GGML_COMPUTE_FP32_TO_FP16 + #endif + + #define GGML_COMPUTE_FP16_TO_FP32(x) power_compute_fp16_to_fp32(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) power_compute_fp32_to_fp16(x) + /* the inline asm below is about 12% faster than the lookup method */ + #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) + #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) + + static inline float power_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + double d; + __asm__( + "mtfprd %0,%2\n" + "xscvhpdp %0,%0\n" + "frsp %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=f"(f): + /* in */ "r"(h)); + return f; + } + + static inline ggml_fp16_t power_compute_fp32_to_fp16(float f) { + double d; + ggml_fp16_t r; + __asm__( /* xscvdphp can work on double or single precision */ + "xscvdphp %0,%2\n" + "mffprd %1,%0\n" : + /* temp */ "=d"(d), + /* out */ "=r"(r): + /* in */ "f"(f)); + return r; + } + +#elif defined(__riscv) && defined(__riscv_zfhmin) + #ifdef GGML_FP16_TO_FP32 + #undef GGML_FP16_TO_FP32 + #endif + + #ifdef GGML_FP32_TO_FP16 + #undef GGML_FP32_TO_FP16 + #endif + + #ifdef GGML_COMPUTE_FP16_TO_FP32 + #undef GGML_COMPUTE_FP16_TO_FP32 + #endif + + #ifdef GGML_COMPUTE_FP32_TO_FP16 + #undef GGML_COMPUTE_FP32_TO_FP16 + #endif + + static inline float riscv_compute_fp16_to_fp32(ggml_fp16_t h) { + float f; + __asm__( + "fmv.h.x %[f], %[h]\n\t" + "fcvt.s.h %[f], %[f]" + : [f] "=&f" (f) + : [h] "r" (h) + ); + return f; + } + + static inline ggml_fp16_t riscv_compute_fp32_to_fp16(float f) { + ggml_fp16_t res; + __asm__( + "fcvt.h.s %[f], %[f]\n\t" + "fmv.x.h %[h], %[f]" + : [h] "=&r" (res) + : [f] "f" (f) + ); + return res; + } + + #define GGML_COMPUTE_FP16_TO_FP32(x) riscv_compute_fp16_to_fp32(x) + #define GGML_COMPUTE_FP32_TO_FP16(x) riscv_compute_fp32_to_fp16(x) + #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) + #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + +// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, +// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. +// This is also true for POWER9. +#if !defined(GGML_FP16_TO_FP32) +inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { + uint16_t s; + memcpy(&s, &f, sizeof(uint16_t)); + return ggml_table_f32_f16[s]; +} + +#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) +#endif + +#if !defined(GGML_FP32_TO_FP16) +#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +#endif + + // we define a common set of C macros which map to specific intrinsics based on the current architecture // we then implement the fundamental computation operations below using only these macros // adding support for new architectures requires to define the corresponding SIMD macros diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h index 6dc5ce0d9..59bae840f 100644 --- a/ggml/src/ggml-impl.h +++ b/ggml/src/ggml-impl.h @@ -317,204 +317,83 @@ struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1); GGML_API void * ggml_aligned_malloc(size_t size); GGML_API void ggml_aligned_free(void * ptr, size_t size); -// FP16 to FP32 conversion +// FP16 <-> FP32 +// ref: https://github.com/Maratyszcza/FP16 -// 16-bit float -// on Arm, we use __fp16 -// on x86, we use uint16_t -// -// for old CUDA compilers (<= 11), we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/10616 -// for MUSA compilers , we use uint16_t: ref https://github.com/ggml-org/llama.cpp/pull/11843 -// -#if defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) +static inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32; + fp32.as_bits = w; + return fp32.as_value; +} - #define GGML_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +static inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32; + fp32.as_value = f; + return fp32.as_bits; +} - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - __fp16 tmp; - memcpy(&tmp, &h, sizeof(ggml_fp16_t)); - return (float)tmp; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __fp16 tmp = f; - memcpy(&res, &tmp, sizeof(ggml_fp16_t)); - return res; - } - -#elif defined(__F16C__) - - #ifdef _MSC_VER - #define GGML_COMPUTE_FP16_TO_FP32(x) _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(x))) - #define GGML_COMPUTE_FP32_TO_FP16(x) _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(x), 0), 0) - #else - #define GGML_COMPUTE_FP16_TO_FP32(x) _cvtsh_ss(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) _cvtss_sh(x, 0) - #endif - -#elif defined(__POWER9_VECTOR__) - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - /* the inline asm below is about 12% faster than the lookup method */ - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - double d; - __asm__( - "mtfprd %0,%2\n" - "xscvhpdp %0,%0\n" - "frsp %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=f"(f): - /* in */ "r"(h)); - return f; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - double d; - ggml_fp16_t r; - __asm__( /* xscvdphp can work on double or single precision */ - "xscvdphp %0,%2\n" - "mffprd %1,%0\n" : - /* temp */ "=d"(d), - /* out */ "=r"(r): - /* in */ "f"(f)); - return r; - } - -#elif defined(__riscv) && defined(__riscv_zfhmin) - - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - float f; - __asm__( - "fmv.h.x %[f], %[h]\n\t" - "fcvt.s.h %[f], %[f]" - : [f] "=&f" (f) - : [h] "r" (h) - ); - return f; - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - ggml_fp16_t res; - __asm__( - "fcvt.h.s %[f], %[f]\n\t" - "fmv.x.h %[h], %[f]" - : [h] "=&r" (res) - : [f] "f" (f) - ); - return res; - } - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - #define GGML_FP16_TO_FP32(x) GGML_COMPUTE_FP16_TO_FP32(x) - #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) +static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { + const uint32_t w = (uint32_t) h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + const uint32_t exp_offset = UINT32_C(0xE0) << 23; +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float exp_scale = 0x1.0p-112f; #else + const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); +#endif + const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - // FP16 <-> FP32 - // ref: https://github.com/Maratyszcza/FP16 + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - static inline float fp32_from_bits(uint32_t w) { - union { - uint32_t as_bits; - float as_value; - } fp32; - fp32.as_bits = w; - return fp32.as_value; + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} + +static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { +#if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) + const float scale_to_inf = 0x1.0p+112f; + const float scale_to_zero = 0x1.0p-110f; +#else + const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); + const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); +#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); } - static inline uint32_t fp32_to_bits(float f) { - union { - float as_value; - uint32_t as_bits; - } fp32; - fp32.as_value = f; - return fp32.as_bits; - } + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); +} - static inline float ggml_compute_fp16_to_fp32(ggml_fp16_t h) { - const uint32_t w = (uint32_t) h << 16; - const uint32_t sign = w & UINT32_C(0x80000000); - const uint32_t two_w = w + w; - - const uint32_t exp_offset = UINT32_C(0xE0) << 23; - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float exp_scale = 0x1.0p-112f; - #else - const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); - #endif - const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - const uint32_t magic_mask = UINT32_C(126) << 23; - const float magic_bias = 0.5f; - const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - const uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); - } - - static inline ggml_fp16_t ggml_compute_fp32_to_fp16(float f) { - #if (defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)) && (!defined(__cplusplus) || __cplusplus >= 201703L) - const float scale_to_inf = 0x1.0p+112f; - const float scale_to_zero = 0x1.0p-110f; - #else - const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); - const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); - #endif - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); - } - - #define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) - #define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) - -#endif // defined(__ARM_NEON) && !(defined(__CUDACC__) && __CUDACC_VER_MAJOR__ <= 11) && !defined(__MUSACC__) +#define GGML_COMPUTE_FP16_TO_FP32(x) ggml_compute_fp16_to_fp32(x) +#define GGML_COMPUTE_FP32_TO_FP16(x) ggml_compute_fp32_to_fp16(x) // precomputed f32 table for f16 (256 KB) // defined in ggml.c, initialized in ggml_init() GGML_API float ggml_table_f32_f16[1 << 16]; -// On ARM NEON, it's quicker to directly convert x -> x instead of calling into ggml_lookup_fp16_to_fp32, -// so we define GGML_FP16_TO_FP32 and GGML_FP32_TO_FP16 elsewhere for NEON. -// This is also true for POWER9. -#if !defined(GGML_FP16_TO_FP32) -inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { - uint16_t s; - memcpy(&s, &f, sizeof(uint16_t)); - return ggml_table_f32_f16[s]; -} - -#define GGML_FP16_TO_FP32(x) ggml_lookup_fp16_to_fp32(x) -#endif - -#if !defined(GGML_FP32_TO_FP16) -#define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x) -#endif - /** * Converts brain16 to float32. * diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c index e389a46db..5bd027159 100644 --- a/ggml/src/ggml-quants.c +++ b/ggml/src/ggml-quants.c @@ -4,6 +4,7 @@ #include "ggml-quants.h" #include "ggml-impl.h" #include "ggml-cpu/ggml-cpu-impl.h" +#include "ggml-cpu/simd-mappings.h" #include "ggml-cpu.h" #include diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index f8e7c595b..be223554e 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2,6 +2,7 @@ #define _USE_MATH_DEFINES // For M_PI on MSVC #include "ggml-backend.h" +#include "ggml-cpu/simd-mappings.h" #include "ggml-impl.h" #include "ggml-threading.h" #include "ggml-cpu.h"