falcon : fix CUDA inference by making K and Q contiguous (#2830)

* falcon : fix CUDA inference by making K and Q contiguous

ggml-ci

* cuda : add assert to guard from non-cont ropes
2025-08-18 05:56:00 -04:00 · 2023-08-27 16:40:48 +03:00
parent da7455d046
commit eaa13a48ff
2 changed files with 8 additions and 4 deletions
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6337,9 +6337,11 @@ void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml

 // Entry point for the rope op on the CUDA backend: checks the operands,
 // decodes the mode from dst->op_params and dispatches through ggml_cuda_op,
 // passing ggml_cuda_op_rope as the op implementation.
 //
 //   src0 - must be F32 and contiguous (both asserted below)
 //   src1 - not inspected here; forwarded unchanged to ggml_cuda_op
 //   dst  - must be F32; its op_params are read to obtain the mode
 void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
    // only the F32 -> F32 case is accepted
    GGML_ASSERT(src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32);
+    GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented

    // op_params is reinterpreted as an int32 array; element 2 is the mode value
    const int mode = ((int32_t *) dst->op_params)[2];
    const bool is_glm = mode & 4; // true when the bit with value 4 is set in mode
+
    // NOTE(review): the meaning of the literal `true` argument is not visible in this hunk - confirm
    // against ggml_cuda_op's signature. The last argument is presumably the "flatten" switch, disabled for glm.
    ggml_cuda_op(src0, src1, dst, ggml_cuda_op_rope, true, !is_glm); // flatten support not implemented for glm
 }