mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-06-26 11:45:21 +00:00)
Refactor: Optimize SYCL element-wise operations with unary function inlining
This commit refactors the SYCL element-wise operations to improve performance by:

- inlining the unary operations (sgn, abs, elu, gelu, silu, etc.) so each activation is computed by a small helper rather than a full kernel body;
- introducing a helper function `op_xxx` for each unary operation to encapsulate its logic;
- replacing the direct per-op kernels with thin wrappers that call these inlined helpers;
- using `__dpct_inline__` to encourage compiler inlining;
- minor code cleanup and consistency improvements.

The changes aim to reduce kernel launch overhead and improve the overall efficiency of element-wise operations on SYCL devices.
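In outline, the pattern the diff introduces looks like this. A minimal sketch with simplified declarations; plain `inline` stands in for `__dpct_inline__`, and the loop is what `SYCL_GLOBAL_ID_LOOP` expands to:

#include <sycl/sycl.hpp>

// The element-wise math lives in a tiny inlineable helper ...
template <typename T>
inline T op_relu(T x) {
    return sycl::fmax(x, static_cast<T>(0));
}

// ... and the kernel becomes a thin wrapper that applies it per element
// with a grid-stride loop.
template <typename T>
void unary_op_relu_kernel(const T * x, T * dst, int k, const sycl::nd_item<1> & it) {
    for (auto i = it.get_global_id(0); i < (size_t) k; i += it.get_global_range(0)) {
        dst[i] = op_relu(x[i]);
    }
}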
@@ -3,14 +3,12 @@
 #include "ggml.h"
 #include "element_wise.hpp"

 // --- Helper Macros for Kernel Indexing ---
 #define SYCL_GLOBAL_ID_LOOP(K, ITEM) \
     for (auto i = ITEM.get_global_id(0); i < (size_t)K; i += ITEM.get_global_range(0))

 #define SYCL_LOCAL_ID_CALC(ITEM, IDX) \
     (ITEM.get_local_range(IDX) * ITEM.get_group(IDX) + ITEM.get_local_id(IDX))
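`SYCL_GLOBAL_ID_LOOP` is a standard grid-stride loop: each work-item starts at its global id and advances by the total global range, so any element count is covered by a fixed-size launch. A standalone sketch of the same idea, using a hypothetical `scale_by_two` example:

#include <sycl/sycl.hpp>

// Hypothetical example: apply f(x) = 2x over n elements with a fixed launch shape.
void scale_by_two(sycl::queue & q, float * data, size_t n) {
    const size_t wg         = 256;                // work-group size
    const size_t num_groups = (n + wg - 1) / wg;  // ceil_div(n, wg)
    q.parallel_for(
        sycl::nd_range<1>(sycl::range<1>(num_groups * wg), sycl::range<1>(wg)),
        [=](sycl::nd_item<1> it) {
            // grid-stride loop, as in SYCL_GLOBAL_ID_LOOP(n, it)
            for (auto i = it.get_global_id(0); i < n; i += it.get_global_range(0)) {
                data[i] *= 2.0f;
            }
        });
    q.wait();
}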

 // --- Original Kernels (non-_sycl) - Modified to use indexing macros and cast literals ---

 static void acc_f32(const float * x, const float * y, float * dst, const int ne,
                     const int ne10, const int ne11, const int ne12,
@@ -30,181 +28,279 @@ static void acc_f32(const float * x, const float * y, float * dst, const int ne,
     }
 }

 /* Unary OP funcs */
 template<typename T>
-static void sgn(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x[i] < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
-    }
+static __dpct_inline__ T op_sgn(T x) {
+    return x > static_cast<T>(0.f) ? static_cast<T>(1.f) : ((x < static_cast<T>(0.f) ? static_cast<T>(-1.f) : static_cast<T>(0.f)));
 }

 template<typename T>
-static void abs_op(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::fabs(x[i]);
-    }
+static __dpct_inline__ T op_abs(T x) {
+    return sycl::fabs(x);
 }

 template<typename T>
-static void elu_op(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = (x[i] > static_cast<T>(0.f)) ? x[i] : sycl::expm1(x[i]);
-    }
+static __dpct_inline__ T op_elu(T x) {
+    return (x > static_cast<T>(0.f)) ? x : sycl::expm1(x);
 }
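One detail worth keeping through the refactor: `op_elu` computes e^x − 1 via `sycl::expm1` rather than `sycl::exp(x) - 1`, which avoids catastrophic cancellation for small |x|. A quick host-side illustration with the std:: equivalents (approximate output values in the comments):

#include <cmath>
#include <cstdio>

// For x = 1e-10, exp(x) - 1 loses most of its significant digits to
// cancellation, while expm1(x) returns the correctly rounded result.
int main() {
    double x = 1e-10;
    std::printf("exp(x)-1 = %.17g\n", std::exp(x) - 1.0);  // ~1.000000082740371e-10
    std::printf("expm1(x) = %.17g\n", std::expm1(x));      // ~1.00000000005e-10
}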

 template<typename T>
-static void gelu(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
+static __dpct_inline__ T op_gelu(T x) {
     const T GELU_COEF_A = static_cast<T>(0.044715f);
     const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = static_cast<T>(0.5f) * x[i] *
-                 (static_cast<T>(1.0f) +
-                  sycl::tanh(SQRT_2_OVER_PI * x[i] * (static_cast<T>(1.0f) + GELU_COEF_A * x[i] * x[i])));
-    }
+    return static_cast<T>(0.5f) * x *
+           (static_cast<T>(1.0f) +
+            sycl::tanh(SQRT_2_OVER_PI * x * (static_cast<T>(1.0f) + GELU_COEF_A * x * x)));
 }

 template<typename T>
-static void silu(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
-    }
+static __dpct_inline__ T op_silu(T x) {
+    return x / (static_cast<T>(1.0f) + sycl::native::exp(-x));
 }

 template<typename T>
-static void gelu_quick(const T *x, T *dst, int k,
-                       const sycl::nd_item<1> &item_ct1) {
+static __dpct_inline__ T op_gelu_quick(T x) {
     const T GELU_QUICK_COEF_LOCAL = static_cast<T>(-1.702f);
-    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x[i])));
-    }
+    return x * (static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(GELU_QUICK_COEF_LOCAL * x)));
 }

 template<typename T>
-static void gelu_erf(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+static __dpct_inline__ T op_gelu_erf(T x) {
     const T SQRT_2_INV = static_cast<T>(0.70710678118654752440084436210484f);
+    return static_cast<T>(0.5f) * x * (static_cast<T>(1.0f) + sycl::erf(x * SQRT_2_INV));
+}
+
+template<typename T>
+static __dpct_inline__ T op_tanh(T x) {
+    return sycl::tanh(x);
+}
+
+template<typename T>
+static __dpct_inline__ T op_relu(T x) {
+    return sycl::fmax(x, static_cast<T>(0));
+}
+
+template<typename T>
+static __dpct_inline__ T op_sigmoid(T x) {
+    return static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(-x));
+}
+
+template<typename T>
+static __dpct_inline__ T op_sqrt(T x) {
+    return sycl::sqrt(x);
+}
+
+template<typename T>
+static __dpct_inline__ T op_sin(T x) {
+    return sycl::sin(x);
+}
+
+template<typename T>
+static __dpct_inline__ T op_cos(T x) {
+    return sycl::cos(x);
+}
+
+template<typename T>
+static __dpct_inline__ T op_hardsigmoid(T x) {
+    return sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
+}
+
+template<typename T>
+static __dpct_inline__ T op_hardswish(T x) {
+    return x * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
+}
+
+template<typename T>
+static __dpct_inline__ T op_exp(T x) {
+    return sycl::exp(x);
+}
+
+template<typename T>
+static __dpct_inline__ T op_log(T x) {
+    if (x <= static_cast<T>(0)) {
+        return neg_infinity<T>();
+    }
+    return sycl::log(x);
+}
+
+template<typename T>
+static __dpct_inline__ T op_neg(T x) {
+    return -x;
+}
+
+template<typename T>
+static __dpct_inline__ T op_step(T x) {
+    return (x > static_cast<T>(0.0f)) ? static_cast<T>(1.0f) : static_cast<T>(0.0f);
+}
+
+template<typename T>
+static __dpct_inline__ T op_leaky_relu(T x, float negative_slope) {
+    T neg_slope_T = static_cast<T>(negative_slope);
+    return sycl::fmax(x, static_cast<T>(0)) +
+           sycl::fmin(x, static_cast<T>(0.0f)) * neg_slope_T;
+}
+
+template<typename T>
+static __dpct_inline__ T op_sqr(T x) {
+    return x * x;
+}
+
+template<typename T>
+static __dpct_inline__ T op_clamp(T x, float min_val, float max_val) {
+    return x < static_cast<T>(min_val) ? static_cast<T>(min_val) : (x > static_cast<T>(max_val) ? static_cast<T>(max_val) : x);
+}
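A note on `__dpct_inline__`: it is a helper macro from the DPC++ Compatibility Tool headers, not a SYCL keyword. From memory it expands roughly as below (worth verifying against the dpct headers in use); the effect is to ask the compiler to always inline the `op_*` helpers into whichever kernel calls them, which is what makes the helper-per-op split free at run time:

// Approximate definition from the DPCT helper headers (sketch, not verbatim):
#if defined(_MSC_VER)
#define __dpct_inline__ __forceinline
#else
#define __dpct_inline__ __inline__ __attribute__((always_inline))
#endif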

 template<typename T>
+static void unary_op_sgn_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        auto x_i = x[i];
-        dst[i] = static_cast<T>(0.5f) * x_i * (static_cast<T>(1.0f) + sycl::erf(x_i * SQRT_2_INV));
+        dst[i] = op_sgn(x[i]);
     }
 }

 template<typename T>
-static void tanh(const T *x, T *dst, int k,
-                 const sycl::nd_item<1> &item_ct1) {
+static void unary_op_abs_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::tanh((x[i]));
+        dst[i] = op_abs(x[i]);
     }
 }

 template<typename T>
-static void relu(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
+static void unary_op_elu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::fmax((x[i]), static_cast<T>(0));
+        dst[i] = op_elu(x[i]);
     }
 }

 template<typename T>
-static void sigmoid(const T * x, T * dst, const int k,
-                    const sycl::nd_item<1> &item_ct1) {
+static void unary_op_gelu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = static_cast<T>(1.0f) / (static_cast<T>(1.0f) + sycl::native::exp(-x[i]));
+        dst[i] = op_gelu(x[i]);
     }
 }

 template<typename T>
-static void sqrt(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
+static void unary_op_silu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::sqrt(x[i]);
+        dst[i] = op_silu(x[i]);
     }
 }

 template<typename T>
-static void sin(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
+static void unary_op_gelu_quick_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::sin(x[i]);
+        dst[i] = op_gelu_quick(x[i]);
     }
 }

 template<typename T>
-static void cos(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
+static void unary_op_gelu_erf_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::cos(x[i]);
+        dst[i] = op_gelu_erf(x[i]);
     }
 }

 template<typename T>
-static void hardsigmoid(const T * x, T * dst, const int k,
-                        const sycl::nd_item<1> &item_ct1) {
+static void unary_op_tanh_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
+        dst[i] = op_tanh(x[i]);
     }
 }

 template<typename T>
-static void hardswish(const T * x, T * dst, const int k,
-                      const sycl::nd_item<1> &item_ct1) {
+static void unary_op_relu_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] * sycl::fmin(static_cast<T>(1.0f), sycl::fmax(static_cast<T>(0.0f), (x[i] + static_cast<T>(3.0f)) / static_cast<T>(6.0f)));
+        dst[i] = op_relu(x[i]);
     }
 }

 template<typename T>
-static void exp(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
+static void unary_op_sigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = sycl::exp(x[i]);
+        dst[i] = op_sigmoid(x[i]);
     }
 }

 template<typename T>
-static void log(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
+static void unary_op_sqrt_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        T xi = x[i];
-        if (xi <= static_cast<T>(0)) {
-            dst[i] = neg_infinity<T>();
-        } else {
-            dst[i] = sycl::log(xi);
-        }
+        dst[i] = op_sqrt(x[i]);
     }
 }

 template<typename T>
-static void neg(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
+static void unary_op_sin_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = -x[i];
+        dst[i] = op_sin(x[i]);
     }
 }

 template<typename T>
-static void step(const T * x, T * dst, const int k,
-                 const sycl::nd_item<1> &item_ct1) {
+static void unary_op_cos_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = (x[i] > static_cast<T>(0.0f)) ? static_cast<T>(1.0f) : static_cast<T>(0.0f);
+        dst[i] = op_cos(x[i]);
     }
 }

 template<typename T>
-static void leaky_relu(const T *x, T *dst, const int k, const float negative_slope,
-                       const sycl::nd_item<1> &item_ct1) {
+static void unary_op_hardsigmoid_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        T neg_slope_T = static_cast<T>(negative_slope);
-        dst[i] = sycl::fmax((x[i]), static_cast<T>(0)) +
-                 sycl::fmin((x[i]), static_cast<T>(0.0f)) * neg_slope_T;
+        dst[i] = op_hardsigmoid(x[i]);
     }
 }

 template<typename T>
-static void sqr(const T * x, T * dst, const int k,
-                const sycl::nd_item<1> &item_ct1) {
+static void unary_op_hardswish_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
-        dst[i] = x[i] * x[i];
+        dst[i] = op_hardswish(x[i]);
     }
 }

+template<typename T>
+static void unary_op_exp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_exp(x[i]);
+    }
+}
+
+template<typename T>
+static void unary_op_log_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_log(x[i]);
+    }
+}
+
+template<typename T>
+static void unary_op_neg_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_neg(x[i]);
+    }
+}
+
+template<typename T>
+static void unary_op_step_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_step(x[i]);
+    }
+}
+
+template<typename T>
+static void unary_op_leaky_relu_kernel(const T * x, T * dst, const int k, float negative_slope, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_leaky_relu(x[i], negative_slope);
+    }
+}
+
+template<typename T>
+static void unary_op_sqr_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_sqr(x[i]);
+    }
+}
+
+template<typename T>
+static void unary_op_clamp_kernel(const T * x, T * dst, const int k, const sycl::nd_item<1> &item_ct1, float min_val, float max_val) {
+    SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
+        dst[i] = op_clamp(x[i], min_val, max_val);
+    }
+}
@@ -262,18 +358,10 @@ static void clamp(const T * x, T * dst, const float min, const float max, const
 template<typename T>
 static void gated_op_fused_geglu(const T * x, const T * g, T * dst, const uint64_t k, const uint64_t n, const uint64_t o0, const uint64_t o1, const sycl::nd_item<1> &item_ct1) {
-    const T GELU_COEF_A = static_cast<T>(0.044715f);
-    const T SQRT_2_OVER_PI = static_cast<T>(0.79788456080286535587989211986876f);
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
         const int64_t j0 = (i / n) * o0 + (i % n);
         const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        const T x_val = x[j0];
-
-        const T x_cubed_term = static_cast<T>(1.0f) + GELU_COEF_A * x_val * x_val;
-        const T tanh_input = SQRT_2_OVER_PI * x_val * x_cubed_term;
-        const T gelu_val = static_cast<T>(0.5f) * x_val * (static_cast<T>(1.0f) + sycl::tanh(tanh_input));
-
-        dst[i] = gelu_val * g[j1];
+        dst[i] = op_gelu(x[j0]) * g[j1];
     }
 }
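The j0/j1 arithmetic maps a flat output index i onto the two GLU input halves: as I read it, n is the row width, o0 and o1 are the element strides of the x and g views, so (i / n) selects the row and (i % n) the column, and when both halves share a stride the second offset is simply reused. A host-side sketch of the same mapping with hypothetical values:

#include <cstdint>
#include <cstdio>

// Hypothetical: output rows of width n = 4, both input halves living in a
// tensor of row stride 8 (the caller passes g already offset by n).
int main() {
    const uint64_t n = 4, o0 = 8, o1 = 8;
    for (uint64_t i = 0; i < 8; ++i) {              // two output rows
        const int64_t j0 = (i / n) * o0 + (i % n);  // index into the x half
        const int64_t j1 = (o0 == o1) ? j0 : (int64_t) ((i / n) * o1 + (i % n));
        std::printf("i=%llu -> j0=%lld j1=%lld\n",
                    (unsigned long long) i, (long long) j0, (long long) j1);
    }
    // i = 0..3 maps to j0 = 0..3 (row 0); i = 4..7 maps to j0 = 8..11 (row 1).
}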
@@ -282,7 +370,7 @@ static void gated_op_fused_reglu(const T * x, const T * g, T * dst, const uint64
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
         const int64_t j0 = (i / n) * o0 + (i % n);
         const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = sycl::max((x[j0]), static_cast<T>(0)) * g[j1];
+        dst[i] = op_relu(x[j0]) * g[j1];
     }
 }
@@ -291,13 +379,11 @@ static void gated_op_fused_swiglu(const T * x, const T * g, T * dst, const uint6
     SYCL_GLOBAL_ID_LOOP(k, item_ct1) {
         const int64_t j0 = (i / n) * o0 + (i % n);
         const int64_t j1 = o0 == o1 ? j0 : (i / n) * o1 + (i % n);
-        dst[i] = (x[j0] / (static_cast<T>(1) + sycl::native::exp(-x[j0]))) * g[j1];
+        dst[i] = op_silu(x[j0]) * g[j1];
     }
 }

 // --- Generic SYCL Kernel Launchers ---
 namespace ggml_sycl_detail {
-// acc_f32_sycl remains specific
 static void acc_f32_sycl(const float *x, const float *y, float *dst,
                          const int n_elements, const int ne10, const int ne11,
                          const int ne12, const int nb1, const int nb2,
@@ -313,7 +399,6 @@ static void acc_f32_sycl(const float *x, const float *y, float *dst,
         });
 }

-// upscale_sycl remains specific
 template<typename T>
 static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
                          const int nb02, const int nb03, const int ne10, const int ne11,
@@ -328,7 +413,6 @@ static void upscale_sycl(const T *x, T *dst, const int nb00, const int nb01,
         });
 }

-// pad_sycl remains specific
 template<typename T>
 static void pad_sycl(const T *x, T *dst, const int ne00,
                      const int ne01, const int ne02, const int ne0,
@@ -341,10 +425,8 @@ static void pad_sycl(const T *x, T *dst, const int ne00,
         [=](sycl::nd_item<3> item_ct1) { pad(x, dst, ne0, ne00, ne01, ne02, item_ct1); });
 }

-// Common dispatcher for 1-input, 1-output element-wise ops, handling type switching.
-// KernelInvoker is a lambda that takes (const T* src, T* dst, int k, queue_ptr stream, Args...)
 template<typename KernelInvoker, typename... Args>
-inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
+static inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
 #if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
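The dispatcher itself is mostly a type switch: validate that source and destination are F32 (or F16 when built with GGML_SYCL_F16), then invoke the supplied lambda with correctly typed pointers. A condensed, illustrative sketch of that shape, not the verbatim implementation (`dispatch_unary_sketch` is a hypothetical name; `queue_ptr` is the backend's queue typedef, and strides/error paths are omitted):

// Condensed sketch of the type-switch pattern, under the assumptions above.
template <typename KernelInvoker, typename... Args>
static inline void dispatch_unary_sketch(ggml_backend_sycl_context & ctx, ggml_tensor * dst,
                                         KernelInvoker invoke, Args &&... args) {
    queue_ptr stream = ctx.stream();
    const int k = (int) ggml_nelements(dst->src[0]);
    switch (dst->type) {
#if defined(GGML_SYCL_F16)
        case GGML_TYPE_F16:
            invoke((const sycl::half *) dst->src[0]->data, (sycl::half *) dst->data,
                   k, stream, std::forward<Args>(args)...);
            break;
#endif
        case GGML_TYPE_F32:
            invoke((const float *) dst->src[0]->data, (float *) dst->data,
                   k, stream, std::forward<Args>(args)...);
            break;
        default:
            GGML_ABORT("unsupported type");
    }
}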
@@ -375,9 +457,8 @@ inline void dispatch_ggml_sycl_op_unary(ggml_backend_sycl_context & ctx, ggml_te
     }
 }

-// Dispatcher for fused GLU ops, handling specific input pointer setup and type switching.
 template<typename KernelInvoker, typename... Args>
-inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
+static inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
 #if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -455,9 +536,8 @@ inline void dispatch_ggml_sycl_op_fused_glu(ggml_backend_sycl_context & ctx, ggm
     }
 }

-// Dispatcher for upscale
 template<typename KernelInvoker, typename... Args>
-inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
+static inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
 #if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -498,9 +578,8 @@ inline void dispatch_ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_
     }
 }

-// Dispatcher for pad
 template<typename KernelInvoker, typename... Args>
-inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
+static inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst, KernelInvoker kernel_invoker, Args&&... args) {
 #if defined (GGML_SYCL_F16)
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32 || dst->src[0]->type == GGML_TYPE_F16);
     GGML_ASSERT(dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16);
@@ -537,9 +616,8 @@ inline void dispatch_ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tens
 } // namespace ggml_sycl_detail


 // --- Backend Operation Functions (ggml_sycl_op_...) ---

-inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
@@ -547,12 +625,12 @@ inline void ggml_sycl_op_sgn(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    sgn(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sgn_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }
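Each launcher follows the same recipe: round the element count up to whole work-groups with ceil_div, then launch a 1-D nd_range whose kernel body is just the matching unary_op_*_kernel wrapper; the grid-stride loop makes any surplus work-items in the last group harmless. A hypothetical distillation of that recipe:

#include <sycl/sycl.hpp>

// Sketch of the launch shape used by every launcher above (illustrative
// helper, not part of the actual code).
template <typename Body>
void launch_1d(sycl::queue & q, int k_elements, int block, Body body) {
    const int num_blocks = (k_elements + block - 1) / block;  // ceil_div(k, block)
    q.parallel_for(
        sycl::nd_range<1>(sycl::range<1>((size_t) num_blocks * block),  // global size
                          sycl::range<1>((size_t) block)),              // local size
        [=](sycl::nd_item<1> item_ct1) { body(item_ct1); });
}
// Usage sketch:
//   launch_1d(q, k, 256, [=](sycl::nd_item<1> it) {
//       unary_op_sgn_kernel(src, dst, k, it);
//   });
// e.g. k = 1000, block = 256 gives 4 groups = 1024 work-items; the 24 extra
// work-items see i >= k on their first loop test and do nothing.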

-inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
@@ -560,12 +638,12 @@ inline void ggml_sycl_op_abs(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    abs_op(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_abs_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, 256);
@@ -573,12 +651,12 @@ inline void ggml_sycl_op_elu(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(256),
                                   sycl::range<1>(256)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    elu_op(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_elu_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SILU_BLOCK_SIZE);
@@ -586,12 +664,12 @@ inline void ggml_sycl_op_silu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SILU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SILU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    silu(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_silu_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@@ -599,12 +677,12 @@ inline void ggml_sycl_op_gelu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    gelu(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_gelu_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+static inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@@ -612,12 +690,12 @@ inline void ggml_sycl_op_gelu_quick(ggml_backend_sycl_context & ctx, ggml_tensor
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    gelu_quick(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_gelu_quick_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+static inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_GELU_BLOCK_SIZE);
@@ -625,12 +703,12 @@ inline void ggml_sycl_op_gelu_erf(ggml_backend_sycl_context & ctx, ggml_tensor *
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_GELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_GELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    gelu_erf(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_gelu_erf_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_TANH_BLOCK_SIZE);
@@ -638,12 +716,12 @@ inline void ggml_sycl_op_tanh(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_TANH_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_TANH_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    tanh(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_tanh_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_RELU_BLOCK_SIZE);
@@ -651,12 +729,12 @@ inline void ggml_sycl_op_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    relu(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_relu_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_HARDSIGMOID_BLOCK_SIZE);
@@ -664,12 +742,12 @@ inline void ggml_sycl_op_hardsigmoid(ggml_backend_sycl_context & ctx, ggml_tenso
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_HARDSIGMOID_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    hardsigmoid(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_hardsigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_HARDSWISH_BLOCK_SIZE);
@@ -677,12 +755,12 @@ inline void ggml_sycl_op_hardswish(ggml_backend_sycl_context & ctx, ggml_tensor
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_HARDSWISH_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    hardswish(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_hardswish_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE);
@@ -690,12 +768,12 @@ inline void ggml_sycl_op_exp(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    exp(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_exp_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_EXP_BLOCK_SIZE); // Using EXP block size
@@ -703,12 +781,12 @@ inline void ggml_sycl_op_log(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_EXP_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_EXP_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    log(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_log_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE);
@@ -716,12 +794,12 @@ inline void ggml_sycl_op_neg(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    neg(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_neg_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_NEG_BLOCK_SIZE); // Using NEG block size
@@ -729,12 +807,12 @@ inline void ggml_sycl_op_step(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_NEG_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_NEG_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    step(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_step_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIGMOID_BLOCK_SIZE);
@@ -742,12 +820,12 @@ inline void ggml_sycl_op_sigmoid(ggml_backend_sycl_context & ctx, ggml_tensor *
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIGMOID_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    sigmoid(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sigmoid_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SQRT_BLOCK_SIZE);
@@ -755,12 +833,12 @@ inline void ggml_sycl_op_sqrt(ggml_backend_sycl_context & ctx, ggml_tensor * dst
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQRT_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SQRT_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    sqrt(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sqrt_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE);
@@ -768,12 +846,12 @@ inline void ggml_sycl_op_sin(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    sin(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sin_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SIN_BLOCK_SIZE); // Using SIN block size
@@ -781,12 +859,12 @@ inline void ggml_sycl_op_cos(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SIN_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SIN_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    cos(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_cos_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     float negative_slope;
     memcpy(&negative_slope, dst->op_params, sizeof(float));
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
@@ -796,12 +874,12 @@ inline void ggml_sycl_op_leaky_relu(ggml_backend_sycl_context & ctx, ggml_tensor
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_RELU_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_RELU_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    leaky_relu(src, dst_ptr, k_elements, slope, item_ct1);
+                    unary_op_leaky_relu_kernel(src, dst_ptr, k_elements, slope, item_ct1);
                 });
         }, negative_slope);
 }
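leaky_relu (and clamp below) show how scalar operator parameters travel: they are stored in dst->op_params as raw bytes, recovered on the host with memcpy, and forwarded through the dispatcher's variadic tail into the launch lambda. A sketch of the host-side read, assuming ggml's usual packing of op_params as consecutive 32-bit slots (`op_param_f32` is a hypothetical helper):

#include <cstring>

// Sketch: recovering a scalar parameter from ggml's op_params byte array.
// A type-punned memcpy is the portable way to read a float back out of the
// int32 slots without aliasing UB.
static inline float op_param_f32(const ggml_tensor * dst, int slot) {
    float v;
    std::memcpy(&v, (const char *) dst->op_params + slot * sizeof(float), sizeof(float));
    return v;
}
// e.g. negative_slope = op_param_f32(dst, 0);
//      min_val = op_param_f32(dst, 0); max_val = op_param_f32(dst, 1);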

-inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_unary(ctx, dst,
         [](const auto* src, auto* dst_ptr, int k_elements, queue_ptr stream) {
             const int num_blocks = ceil_div(k_elements, SYCL_SQR_BLOCK_SIZE);
@@ -809,12 +887,12 @@ inline void ggml_sycl_op_sqr(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
                 sycl::nd_range<1>(sycl::range<1>(num_blocks) * sycl::range<1>(SYCL_SQR_BLOCK_SIZE),
                                   sycl::range<1>(SYCL_SQR_BLOCK_SIZE)),
                 [=](sycl::nd_item<1> item_ct1) {
-                    sqr(src, dst_ptr, k_elements, item_ct1);
+                    unary_op_sqr_kernel(src, dst_ptr, k_elements, item_ct1);
                 });
         });
 }

-inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_upscale(ctx, dst,
         [](const auto* src, auto* dst_ptr, int nb00, int nb01, int nb02, int nb03,
            int ne10, int ne11, int ne12, int ne13, float sf0, float sf1, float sf2, float sf3,
@@ -823,7 +901,7 @@ inline void ggml_sycl_op_upscale(ggml_backend_sycl_context & ctx, ggml_tensor *
         });
 }

-inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_pad(ctx, dst,
         [](const auto* src, auto* dst_ptr, int ne00, int ne01, int ne02, int ne0, int ne1, int ne2,
            queue_ptr stream) {
@@ -831,7 +909,7 @@ inline void ggml_sycl_op_pad(ggml_backend_sycl_context & ctx, ggml_tensor * dst)
         });
 }

-inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     float min_val;
     float max_val;
     memcpy(&min_val, dst->op_params, sizeof(float));
@@ -848,7 +926,7 @@ inline void ggml_sycl_op_clamp(ggml_backend_sycl_context & ctx, ggml_tensor * ds
         }, min_val, max_val);
 }

-inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
+static inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst) {
     GGML_ASSERT(dst->src[0]->type == GGML_TYPE_F32);
     GGML_ASSERT(dst->src[1]->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
@@ -867,7 +945,7 @@ inline void ggml_sycl_op_acc(ggml_backend_sycl_context & ctx, ggml_tensor *dst)
     ggml_sycl_detail::acc_f32_sycl(src0_dd, src1_dd, dst_dd, (int)ggml_nelements(dst), (int)dst->src[1]->ne[0], (int)dst->src[1]->ne[1], (int)dst->src[1]->ne[2], nb1, nb2, offset, main_stream);
 }

-inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div(k, SYCL_GELU_BLOCK_SIZE);
@@ -878,7 +956,7 @@ inline void ggml_sycl_op_geglu(ggml_backend_sycl_context & ctx, ggml_tensor * ds
         });
 }

-inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_RELU_BLOCK_SIZE); // Using RELU block size for reglu
@@ -889,7 +967,7 @@ inline void ggml_sycl_op_reglu(ggml_backend_sycl_context & ctx, ggml_tensor * ds
         });
 }

-inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
+static inline void ggml_sycl_op_swiglu(ggml_backend_sycl_context & ctx, ggml_tensor * dst) {
     ggml_sycl_detail::dispatch_ggml_sycl_op_fused_glu(ctx, dst,
         [](const auto* x_ptr, const auto* g_ptr, auto* dst_ptr, uint64_t k, uint64_t n, uint64_t o0, uint64_t o1, queue_ptr main_stream) {
             const uint32_t num_blocks = ceil_div((uint32_t)k, SYCL_SILU_BLOCK_SIZE); // Using SILU block size for swiglu