Refactor: Optimize SYCL element-wise operations with unary function inlining

This commit refactors the SYCL element-wise operations to improve performance by:

- Factoring the per-element math of unary operations (sgn, abs, elu, gelu, silu, etc.) out of the kernels so it can be inlined and reused.
- Introducing a helper function `op_xxx` for each unary operation to encapsulate its logic.
- Replacing the per-op kernel bodies with thin `unary_op_xxx_kernel` wrappers that call these helpers; the fused GLU kernels (GEGLU, REGLU, SwiGLU) reuse the same helpers instead of duplicating the math.
- Using `__dpct_inline__` to encourage compiler inlining.
- Minor code cleanup and consistency improvements.

Together, these changes remove duplicated per-element code and aim to improve the overall efficiency and consistency of element-wise operations on SYCL devices. The new pattern is sketched below.
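For illustration, the core pattern looks roughly like the following. This is a minimal, self-contained SYCL sketch of the approach rather than the ggml code itself: it uses plain `inline` and an explicit grid-stride loop in place of `__dpct_inline__` and the `SYCL_GLOBAL_ID_LOOP` macro, and the `main` driver stands in for the ggml dispatchers.

    #include <sycl/sycl.hpp>
    #include <cstdio>

    // Per-element math lives in a small inlined helper (op_xxx in this commit).
    template <typename T>
    static inline T op_relu(T x) {
        return sycl::fmax(x, static_cast<T>(0));
    }

    // The kernel body is a thin wrapper that loops over the elements and applies
    // the helper (unary_op_xxx_kernel in this commit).
    template <typename T>
    static void unary_op_relu_kernel(const T *x, T *dst, int k, const sycl::nd_item<1> &item) {
        for (auto i = item.get_global_id(0); i < (size_t) k; i += item.get_global_range(0)) {
            dst[i] = op_relu(x[i]);
        }
    }

    int main() {
        constexpr int k = 1024;
        constexpr int block = 256;
        sycl::queue q;

        float *x   = sycl::malloc_shared<float>(k, q);
        float *dst = sycl::malloc_shared<float>(k, q);
        for (int i = 0; i < k; ++i) x[i] = (i % 2 ? 1.0f : -1.0f) * i;

        const int num_blocks = (k + block - 1) / block;  // ceil_div in the real code
        q.parallel_for(
            sycl::nd_range<1>(sycl::range<1>(num_blocks * block), sycl::range<1>(block)),
            [=](sycl::nd_item<1> item) { unary_op_relu_kernel(x, dst, k, item); });
        q.wait();

        std::printf("dst[1]=%f dst[2]=%f\n", dst[1], dst[2]);  // expect 1.000000 0.000000
        sycl::free(x, q);
        sycl::free(dst, q);
        return 0;
    }

Because the math is in a standalone helper, fused kernels can call the same function (e.g. op_silu inside the SwiGLU kernel) without repeating the expression.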
Author: Akarshan
Date:   2025-06-22 19:21:19 +05:30
Parent: a234e09f41
Commit: ab46d11de5


@ -3,14 +3,12 @@
#include "ggml.h"
#include "element_wise.hpp"
// --- Helper Macros for Kernel Indexing ---
#define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))
#define SYCL_LOCAL_ID_CALC(ITEM, IDX) \
(ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX))
// --- Original Kernels (non-_sycl) - Modified to use indexing macros and cast literals ---
static void acc_f32(const float * x, const float * y, float * dst, const int ne,
const int ne10, const int ne11, const int ne12,
@ -30,181 +28,279 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
}
}
/* Unary OP funcs */
template<typename T>
static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
}
static __dpct_inline__ T op_sgn(T x) {
return x > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
}
template<typename T>
static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = sycl::fabs(x[i]);
}
static __dpct_inline__ T op_abs(T x) {
return sycl::fabs(x);
}
template<typename T>
static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
}
static __dpct_inline__ T op_elu(T x) {
return (x > static_cast<T>(0.f)) ? x : sycl::expm1(x);
}
template<typename T>
static void gelu(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static __dpct_inline__ T op_gelu(T x) {
const T GELU_COEF_A = static_cast<T>(0.044715f);
const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = static_cast<T>(0.5f) * x[i] *
(static_cast<T>(1.0f) +
sycl::tanh(SQRT_2_OVER_PI * x[i] * (static_cast<T>(1.0f) + GELU_COEF_A * x[i] * x[i])));
}
return static_cast<T>(0.5f) * x *
(static_cast<T>(1.0f) +
sycl::tanh(SQRT_2_OVER_PI * x * (static_cast<T>(1.0f) + GELU_COEF_A * x * x)));
}
template<typename T>
static void silu(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = x[i] / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
}
static __dpct_inline__ T op_silu(T x) {
return x / (static_cast<T>(1.0f) + sycl::native::exp(-x));
}
template<typename T>
static void gelu_quick(const T *x, T *dst, int k,
const sycl::nd_item<1> &item_ct1) {
static __dpct_inline__ T op_gelu_quick(T x) {
const T GELU_QUICK_COEF_LOCAL = static_cast<T>(-1.702f);
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x[i])));
}
return x * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x)));
}
template<typename T>
static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
static __dpct_inline__ T op_gelu_erf(T x) {
const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
return static_cast<T>(0.5f) * x * (static_cast<T>(1.0f) + sycl::erf(x * SQRT_2_INV));
}
template<typename T>
static __dpct_inline__ T op_tanh(T x) {
return sycl::tanh(x);
}
template<typename T>
static __dpct_inline__ T op_relu(T x) {
return sycl::fmax(x, static_cast<T>(0));
}
template<typename T>
static __dpct_inline__ T op_sigmoid(T x) {
return static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(-x));
}
template<typename T>
static __dpct_inline__ T op_sqrt(T x) {
return sycl::sqrt(x);
}
template<typename T>
static __dpct_inline__ T op_sin(T x) {
return sycl::sin(x);
}
template<typename T>
static __dpct_inline__ T op_cos(T x) {
return sycl::cos(x);
}
template<typename T>
static __dpct_inline__ T op_hardsigmoid(T x) {
return sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
}
template<typename T>
static __dpct_inline__ T op_hardswish(T x) {
return x * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
}
template<typename T>
static __dpct_inline__ T op_exp(T x) {
return sycl::exp(x);
}
template<typename T>
static __dpct_inline__ T op_log(T x) {
if (x <= static_cast<T>(0)) {
return neg_infinity<T>();
}
return sycl::log(x);
}
template<typename T>
static __dpct_inline__ T op_neg(T x) {
return -x;
}
template<typename T>
static __dpct_inline__ T op_step(T x) {
return (x > static_cast<T>(0.0f)) ? static_cast<T>(1.0f) : static_cast<T>(0.0f);
}
template<typename T>
static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) {
T neg_slope_T = static_cast<T>(negative_slope);
return sycl::fmax(x, static_cast<T>(0)) +
sycl::fmin(x, static_cast<T>(0.0f)) * neg_slope_T;
}
template<typename T>
static __dpct_inline__ T op_sqr(T x) {
return x * x;
}
template<typename T>
static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) {
return x < static_cast<T>(min_val) ? static_cast<T>(min_val) : (x > static_cast<T>(max_val) ? static_cast<T>(max_val) : x);
}
template<typename T>
static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
auto x_i = x[i];
dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
dst[i] = op_sgn(x[i]);
}
}
template<typename T>
static void tanh(const T *x, T *dst, int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = sycl::tanh((x[i]));
dst[i] = op_abs(x[i]);
}
}
template<typename T>
static void relu(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = sycl::fmax((x[i]), static_cast<T>(0));
dst[i] = op_elu(x[i]);
}
}
template<typename T>
static void sigmoid(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
dst[i] = op_gelu(x[i]);
}
}
template<typename T>
static void sqrt(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = sycl::sqrt(x[i]);
dst[i] = op_silu(x[i]);
}
}
template<typename T>
static void sin(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = sycl::sin(x[i]);
dst[i] = op_gelu_quick(x[i]);
}
}
template<typename T>
static void cos(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = sycl::cos(x[i]);
dst[i] = op_gelu_erf(x[i]);
}
}
template<typename T>
static void hardsigmoid(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
dst[i] = op_tanh(x[i]);
}
}
template<typename T>
static void hardswish(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = x[i] * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
dst[i] = op_relu(x[i]);
}
}
template<typename T>
static void exp(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = sycl::exp(x[i]);
dst[i] = op_sigmoid(x[i]);
}
}
template<typename T>
static void log(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
T xi = x[i];
if (xi <= static_cast<T>(0)) {
dst[i] = neg_infinity<T>();
} else {
dst[i] = sycl::log(xi);
}
dst[i] = op_sqrt(x[i]);
}
}
template<typename T>
static void neg(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = -x[i];
dst[i] = op_sin(x[i]);
}
}
template<typename T>
static void step(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = (x[i] > static_cast<T>(0.0f)) ? static_cast<T>(1.0f) : static_cast<T>(0.0f);
dst[i] = op_cos(x[i]);
}
}
template<typename T>
static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
T neg_slope_T = static_cast<T>(negative_slope);
dst[i] = sycl::fmax((x[i]), static_cast<T>(0)) +
sycl::fmin((x[i]), static_cast<T>(0.0f)) * neg_slope_T;
dst[i] = op_hardsigmoid(x[i]);
}
}
template<typename T>
static void sqr(const T * x, T * dst, const int k,
const sycl::nd_item<1> &item_ct1) {
static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = x[i] * x[i];
dst[i] = op_hardswish(x[i]);
}
}
template<typename T>
static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_exp(x[i]);
}
}
template<typename T>
static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_log(x[i]);
}
}
template<typename T>
static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_neg(x[i]);
}
}
template<typename T>
static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_step(x[i]);
}
}
template<typename T>
static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_leaky_relu(x[i], negative_slope);
}
}
template<typename T>
static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_sqr(x[i]);
}
}
template<typename T>
static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) {
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
dst[i] = op_clamp(x[i], min_val, max_val);
}
}
@ -262,18 +358,10 @@ static void clamp(const T * x, T * dst, const float min, const float max, const
template<typename T>
static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
const T GELU_COEF_A = static_cast<T>(0.044715f);
const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
const int64_t j0 = (i / n) * o0 + (i % n);
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
const T x_val = x[j0];
const T x_cubed_term = static_cast<T>(1.0f) + GELU_COEF_A * x_val * x_val;
const T tanh_input = SQRT_2_OVER_PI * x_val * x_cubed_term;
const T gelu_val = static_cast<T>(0.5f) * x_val * (static_cast<T>(1.0f) + sycl::tanh(tanh_input));
dst[i] = gelu_val * g[j1];
dst[i] = op_gelu(x[j0]) * g[j1];
}
}
@ -282,7 +370,7 @@ static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
const int64_t j0 = (i / n) * o0 + (i % n);
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
dst[i] = sycl::max((x[j0]), static_cast<T>(0)) * g[j1];
dst[i] = op_relu(x[j0]) * g[j1];
}
}
@ -291,13 +379,11 @@ static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint6
SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
const int64_t j0 = (i / n) * o0 + (i % n);
const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
dst[i] = (x[j0] / (static_cast<T>(1) + sycl::native::exp(-x[j0]))) * g[j1];
dst[i] = op_silu(x[j0]) * g[j1];
}
}
// --- Generic SYCL Kernel Launchers ---
namespace ggml_sycl_detail {
// acc_f32_sycl remains specific
static void acc_f32_sycl(const float *x, const float *y, float *dst,
const int n_elements, const int ne10, const int ne11,
const int ne12, const int nb1, const int nb2,
@ -313,7 +399,6 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
});
}
// upscale_sycl remains specific
template<typename T>
static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
const int nb02, const int nb03, const int ne10, const int ne11,
@ -328,7 +413,6 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
});
}
// pad_sycl remains specific
template<typename T>
static void pad_sycl(const T *x, T *dst, const int ne00,
const int ne01, const int ne02, const int ne0,
@ -341,10 +425,8 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
[=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
}
// Common dispatcher for 1-input, 1-output element-wise ops, handling type switching.
// KernelInvoker is a lambda that takes (const T* src, T* dst, int k, queue_ptr stream, Args...)
template<typename KernelInvoker, typename... Args>
inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@ -375,9 +457,8 @@ inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_te
}
}
// Dispatcher for fused GLU ops, handling specific input pointer setup and type switching.
template<typename KernelInvoker, typename... Args>
inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@ -455,9 +536,8 @@ inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggm
}
}
// Dispatcher for upscale
template<typename KernelInvoker, typename... Args>
inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@ -498,9 +578,8 @@ inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_
}
}
// Dispatcher for pad
template<typename KernelInvoker, typename... Args>
inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
static inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
#if defined (GGML_SYCL_F16)
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@ -537,9 +616,8 @@ inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tens
} // namespace ggml_sycl_detail
// --- Backend Operation Functions (ggml_sycl_op_...) ---
inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, 256);
@ -547,12 +625,12 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
sycl::range<1>(256)),
[=](sycl::nd_item<1> item_ct1) {
sgn(src, dst_ptr, k_elements, item_ct1);
unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, 256);
@ -560,12 +638,12 @@ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
sycl::range<1>(256)),
[=](sycl::nd_item<1> item_ct1) {
abs_op(src, dst_ptr, k_elements, item_ct1);
unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, 256);
@ -573,12 +651,12 @@ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
sycl::range<1>(256)),
[=](sycl::nd_item<1> item_ct1) {
elu_op(src, dst_ptr, k_elements, item_ct1);
unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
@ -586,12 +664,12 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
silu(src, dst_ptr, k_elements, item_ct1);
unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@ -599,12 +677,12 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
gelu(src, dst_ptr, k_elements, item_ct1);
unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@ -612,12 +690,12 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
gelu_quick(src, dst_ptr, k_elements, item_ct1);
unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@ -625,12 +703,12 @@ inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
gelu_erf(src, dst_ptr, k_elements, item_ct1);
unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
@ -638,12 +716,12 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
tanh(src, dst_ptr, k_elements, item_ct1);
unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
@ -651,12 +729,12 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
relu(src, dst_ptr, k_elements, item_ct1);
unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
@ -664,12 +742,12 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
hardsigmoid(src, dst_ptr, k_elements, item_ct1);
unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
@ -677,12 +755,12 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
hardswish(src, dst_ptr, k_elements, item_ct1);
unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
@ -690,12 +768,12 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
exp(src, dst_ptr, k_elements, item_ct1);
unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
@ -703,12 +781,12 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
log(src, dst_ptr, k_elements, item_ct1);
unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
@ -716,12 +794,12 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
neg(src, dst_ptr, k_elements, item_ct1);
unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
@ -729,12 +807,12 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
step(src, dst_ptr, k_elements, item_ct1);
unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
@ -742,12 +820,12 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
sigmoid(src, dst_ptr, k_elements, item_ct1);
unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
@ -755,12 +833,12 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
sqrt(src, dst_ptr, k_elements, item_ct1);
unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
@ -768,12 +846,12 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
sin(src, dst_ptr, k_elements, item_ct1);
unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
@ -781,12 +859,12 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
cos(src, dst_ptr, k_elements, item_ct1);
unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
float negative_slope;
memcpy(&negative_slope, dst->op_params, sizeof(float));
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
@ -796,12 +874,12 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
leaky_relu(src, dst_ptr, k_elements, slope, item_ct1);
unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1);
});
}, negative_slope);
}
inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
[](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
@ -809,12 +887,12 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
[=](sycl::nd_item<1> item_ct1) {
sqr(src, dst_ptr, k_elements, item_ct1);
unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1);
});
});
}
inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst,
[](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03,
int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3,
@ -823,7 +901,7 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
});
}
inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_pad(ctx, dst,
[](const auto* src, auto* dst_ptr, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2,
queue_ptr stream) {
@ -831,7 +909,7 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
});
}
inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
float min_val;
float max_val;
memcpy(&min_val, dst->op_params, sizeof(float));
@ -848,7 +926,7 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * ds
}, min_val, max_val);
}
inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -867,7 +945,7 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
}
inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
@ -878,7 +956,7 @@ inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * ds
});
}
inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
@ -889,7 +967,7 @@ inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * ds
});
}
inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
[](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu