Refactor: Optimize SYCL element-wise operations with unary function inlining

This commit refactors the SYCL element-wise operations to improve performance by:

- Inlining unary operations (sgn, abs, elu, gelu, silu, etc.) to reduce kernel launch overhead.
- Introducing helper functions `op_xxx` for each unary operation to encapsulate the logic.
- Replacing direct kernel calls with calls to these inlined functions.
- Using `__dpct_inline__` to encourage compiler inlining.
- Minor code cleanup and consistency improvements.

The changes aim to reduce kernel launch overhead and improve the overall efficiency of element-wise operations on SYCL devices; a minimal sketch of the resulting op/kernel/launcher layering follows below.
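
As an illustration of that layering, here is a small, self-contained sketch (using relu as the example): the element-wise math lives in an inlined `op_*` helper, a thin `unary_op_*_kernel` wrapper only does the indexing, and a launcher submits one nd_range per tensor. The sketch assumes a plain SYCL queue, USM pointers, and a fixed work-group size of 256 instead of ggml's dispatch helpers and per-op block-size constants; `relu_sycl` is a name used only in this example.

    #include <sycl/sycl.hpp>

    // Element-wise math only; the real code marks this __dpct_inline__,
    // plain `inline` keeps the sketch independent of the DPCT headers.
    template <typename T>
    inline T op_relu(T x) {
        return sycl::fmax(x, static_cast<T>(0));
    }

    // Thin kernel wrapper: grid-stride loop plus a call to the inlined op.
    template <typename T>
    static void unary_op_relu_kernel(const T * x, T * dst, int k,
                                     const sycl::nd_item<1> & item) {
        for (auto i = item.get_global_id(0); i < (size_t) k; i += item.get_global_range(0)) {
            dst[i] = op_relu(x[i]);
        }
    }

    // Launcher: one submission per tensor; x and dst are assumed to be
    // USM device pointers. Mirrors the shape of ggml_sycl_op_relu.
    template <typename T>
    static void relu_sycl(const T * x, T * dst, int k, sycl::queue & q) {
        constexpr int block_size = 256;
        const int num_blocks = (k + block_size - 1) / block_size;
        q.parallel_for(
            sycl::nd_range<1>(sycl::range<1>(num_blocks * block_size),
                              sycl::range<1>(block_size)),
            [=](sycl::nd_item<1> item) {
                unary_op_relu_kernel(x, dst, k, item);
            });
    }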
Author: Akarshan
Date:   2025-06-22 19:21:19 +05:30
parent a234e09f41
commit ab46d11de5

@@ -3,14 +3,12 @@
#include "ggml.h"
#include "element_wise.hpp"

-// --- Helper Macros for Kernel Indexing ---
#define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
    for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))

#define SYCL_LOCAL_ID_CALC(ITEM, IDX) \
    (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX))

-// --- Original Kernels (non-_sycl) - Modified to use indexing macros and cast literals ---
static void acc_f32(const float * x, const float * y, float * dst, const int ne,
                    const int ne10, const int ne11, const int ne12,
@@ -30,181 +28,279 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
    }
}
-/* Unary OP funcs */
-template<typename T>
-static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_sgn(T x) {
+    return x > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
+}

-template<typename T>
-static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::fabs(x[i]);
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_abs(T x) {
+    return sycl::fabs(x);
+}

-template<typename T>
-static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_elu(T x) {
+    return (x > static_cast<T>(0.f)) ? x : sycl::expm1(x);
+}

-template<typename T>
-static void gelu(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
-    const T GELU_COEF_A = static_cast<T>(0.044715f);
-    const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = static_cast<T>(0.5f) * x[i] *
-                 (static_cast<T>(1.0f) +
-                  sycl::tanh(SQRT_2_OVER_PI * x[i] * (static_cast<T>(1.0f) + GELU_COEF_A * x[i] * x[i])));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_gelu(T x) {
+    const T GELU_COEF_A = static_cast<T>(0.044715f);
+    const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
+    return static_cast<T>(0.5f) * x *
+           (static_cast<T>(1.0f) +
+            sycl::tanh(SQRT_2_OVER_PI * x * (static_cast<T>(1.0f) + GELU_COEF_A * x * x)));
+}

-template<typename T>
-static void silu(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_silu(T x) {
+    return x / (static_cast<T>(1.0f) + sycl::native::exp(-x));
+}

-template<typename T>
-static void gelu_quick(const T *x, T *dst, int k,
-                       const sycl::nd_item<1> &item_ct1) {
-    const T GELU_QUICK_COEF_LOCAL = static_cast<T>(-1.702f);
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x[i])));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_gelu_quick(T x) {
+    const T GELU_QUICK_COEF_LOCAL = static_cast<T>(-1.702f);
+    return x * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x)));
+}

-template<typename T>
-static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        auto x_i = x[i];
-        dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_gelu_erf(T x) {
+    const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
+    return static_cast<T>(0.5f) * x * (static_cast<T>(1.0f) + sycl::erf(x * SQRT_2_INV));
+}

-template<typename T>
-static void tanh(const T *x, T *dst, int k,
-                 const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::tanh((x[i]));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_tanh(T x) {
+    return sycl::tanh(x);
+}

-template<typename T>
-static void relu(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::fmax((x[i]), static_cast<T>(0));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_relu(T x) {
+    return sycl::fmax(x, static_cast<T>(0));
+}

-template<typename T>
-static void sigmoid(const T * x, T * dst, const int k,
-                    const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_sigmoid(T x) {
+    return static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(-x));
+}

-template<typename T>
-static void sqrt(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::sqrt(x[i]);
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_sqrt(T x) {
+    return sycl::sqrt(x);
+}

-template<typename T>
-static void sin(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::sin(x[i]);
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_sin(T x) {
+    return sycl::sin(x);
+}

-template<typename T>
-static void cos(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::cos(x[i]);
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_cos(T x) {
+    return sycl::cos(x);
+}

-template<typename T>
-static void hardsigmoid(const T * x, T * dst, const int k,
-                        const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_hardsigmoid(T x) {
+    return sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
+}

-template<typename T>
-static void hardswish(const T * x, T * dst, const int k,
-                      const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_hardswish(T x) {
+    return x * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
+}

-template<typename T>
-static void exp(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::exp(x[i]);
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_exp(T x) {
+    return sycl::exp(x);
+}

-template<typename T>
-static void log(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        T xi = x[i];
-        if (xi <= static_cast<T>(0)) {
-            dst[i] = neg_infinity<T>();
-        } else {
-            dst[i] = sycl::log(xi);
-        }
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_log(T x) {
+    if (x <= static_cast<T>(0)) {
+        return neg_infinity<T>();
+    }
+    return sycl::log(x);
+}

-template<typename T>
-static void neg(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = -x[i];
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_neg(T x) {
+    return -x;
+}

-template<typename T>
-static void step(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = (x[i] > static_cast<T>(0.0f)) ? static_cast<T>(1.0f) : static_cast<T>(0.0f);
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_step(T x) {
+    return (x > static_cast<T>(0.0f)) ? static_cast<T>(1.0f) : static_cast<T>(0.0f);
+}

-template<typename T>
-static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope,
-                       const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        T neg_slope_T = static_cast<T>(negative_slope);
-        dst[i] = sycl::fmax((x[i]), static_cast<T>(0)) +
-                 sycl::fmin((x[i]), static_cast<T>(0.0f)) * neg_slope_T;
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) {
+    T neg_slope_T = static_cast<T>(negative_slope);
+    return sycl::fmax(x, static_cast<T>(0)) +
+           sycl::fmin(x, static_cast<T>(0.0f)) * neg_slope_T;
+}

-template<typename T>
-static void sqr(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] * x[i];
-    }
-}
+template<typename T>
+static __dpct_inline__ T op_sqr(T x) {
+    return x * x;
+}

+template<typename T>
+static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) {
+    return x < static_cast<T>(min_val) ? static_cast<T>(min_val) : (x > static_cast<T>(max_val) ? static_cast<T>(max_val) : x);
+}

+template<typename T>
+static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_sgn(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_abs(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_elu(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_gelu(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_silu(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_gelu_quick(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_gelu_erf(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_tanh(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_relu(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_sigmoid(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_sqrt(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_sin(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_cos(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_hardsigmoid(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_hardswish(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_exp(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_log(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_neg(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_step(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_leaky_relu(x[i], negative_slope);
+    }
+}

+template<typename T>
+static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_sqr(x[i]);
+    }
+}

+template<typename T>
+static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_clamp(x[i], min_val, max_val);
+    }
+}
@@ -262,18 +358,10 @@ static void clamp(const T * x, T * dst, const float min, const float max, const
template<typename T>
static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
-    const T GELU_COEF_A = static_cast<T>(0.044715f);
-    const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t j0 = (i / n) * o0 + (i % n);
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        const T x_val = x[j0];
-        const T x_cubed_term = static_cast<T>(1.0f) + GELU_COEF_A * x_val * x_val;
-        const T tanh_input = SQRT_2_OVER_PI * x_val * x_cubed_term;
-        const T gelu_val = static_cast<T>(0.5f) * x_val * (static_cast<T>(1.0f) + sycl::tanh(tanh_input));
-        dst[i] = gelu_val * g[j1];
+        dst[i] = op_gelu(x[j0]) * g[j1];
    }
}
@@ -282,7 +370,7 @@ static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t j0 = (i / n) * o0 + (i % n);
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = sycl::max((x[j0]), static_cast<T>(0)) * g[j1];
+        dst[i] = op_relu(x[j0]) * g[j1];
    }
}
@@ -291,13 +379,11 @@ static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint6
    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
        const int64_t j0 = (i / n) * o0 + (i % n);
        const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = (x[j0] / (static_cast<T>(1) + sycl::native::exp(-x[j0]))) * g[j1];
+        dst[i] = op_silu(x[j0]) * g[j1];
    }
}

-// --- Generic SYCL Kernel Launchers ---
namespace ggml_sycl_detail {
-// acc_f32_sycl remains specific
static void acc_f32_sycl(const float *x, const float *y, float *dst,
                         const int n_elements, const int ne10, const int ne11,
                         const int ne12, const int nb1, const int nb2,
@@ -313,7 +399,6 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
        });
}

-// upscale_sycl remains specific
template<typename T>
static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
                         const int nb02, const int nb03, const int ne10, const int ne11,
@@ -328,7 +413,6 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
        });
}

-// pad_sycl remains specific
template<typename T>
static void pad_sycl(const T *x, T *dst, const int ne00,
                     const int ne01, const int ne02, const int ne0,
@@ -341,10 +425,8 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
        [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
}

-// Common dispatcher for 1-input, 1-output element-wise ops, handling type switching.
-// KernelInvoker is a lambda that takes (const T* src, T* dst, int k, queue_ptr stream, Args...)
template<typename KernelInvoker, typename... Args>
-inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
+static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -375,9 +457,8 @@ inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_te
    }
}

-// Dispatcher for fused GLU ops, handling specific input pointer setup and type switching.
template<typename KernelInvoker, typename... Args>
-inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
+static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -455,9 +536,8 @@ inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggm
    }
}

-// Dispatcher for upscale
template<typename KernelInvoker, typename... Args>
-inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
+static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -498,9 +578,8 @@ inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_
    }
}

-// Dispatcher for pad
template<typename KernelInvoker, typename... Args>
-inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
+static inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
    GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -537,9 +616,8 @@ inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tens
} // namespace ggml_sycl_detail
-// --- Backend Operation Functions (ggml_sycl_op_...) ---
-inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
@@ -547,12 +625,12 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
-                    sgn(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
@@ -560,12 +638,12 @@ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
-                    abs_op(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, 256);
@@ -573,12 +651,12 @@ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                  sycl::range<1>(256)),
                [=](sycl::nd_item<1> item_ct1) {
-                    elu_op(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
@@ -586,12 +664,12 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    silu(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}
-inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@@ -599,12 +677,12 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    gelu(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@@ -612,12 +690,12 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    gelu_quick(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@@ -625,12 +703,12 @@ inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    gelu_erf(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
@@ -638,12 +716,12 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    tanh(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
@@ -651,12 +729,12 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    relu(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
@@ -664,12 +742,12 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    hardsigmoid(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}
-inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
@@ -677,12 +755,12 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    hardswish(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
@@ -690,12 +768,12 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    exp(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
@@ -703,12 +781,12 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    log(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
@@ -716,12 +794,12 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    neg(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
@@ -729,12 +807,12 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    step(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
@@ -742,12 +820,12 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    sigmoid(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}
-inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
@@ -755,12 +833,12 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    sqrt(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
@@ -768,12 +846,12 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    sin(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
@@ -781,12 +859,12 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    cos(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}

-inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    float negative_slope;
    memcpy(&negative_slope, dst->op_params, sizeof(float));
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
@@ -796,12 +874,12 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    leaky_relu(src, dst_ptr, k_elements, slope, item_ct1);
+                    unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1);
                });
        }, negative_slope);
}

-inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
        [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
            const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
@@ -809,12 +887,12 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
                                  sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
                [=](sycl::nd_item<1> item_ct1) {
-                    sqr(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1);
                });
        });
}
-inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst,
        [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03,
           int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3,
@@ -823,7 +901,7 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
        });
}

-inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_pad(ctx, dst,
        [](const auto* src, auto* dst_ptr, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2,
           queue_ptr stream) {
@@ -831,7 +909,7 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
        });
}

-inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    float min_val;
    float max_val;
    memcpy(&min_val, dst->op_params, sizeof(float));
@@ -848,7 +926,7 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * ds
        }, min_val, max_val);
}

-inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
    GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
    GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -867,7 +945,7 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
    ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
}

-inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
@@ -878,7 +956,7 @@ inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * ds
        });
}

-inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
@@ -889,7 +967,7 @@ inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * ds
        });
}

-inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
    ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
        [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
            const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu