mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-13 11:57:43 -04:00
CUDA: noncont MMVQ + batched bs1 MUL_MAT_ID (#13014)
* CUDA: noncont MMVQ + batched bs1 MUL_MAT_ID * fix logic for RoPE support, CUDA graphs
This commit is contained in:
@@ -3,7 +3,7 @@
|
||||
// maximum number of src0 rows with which to use mul_mat_vec over cuBLAS if FP16 tensor cores are available
|
||||
#define MMV_MAX_ROWS 512
|
||||
|
||||
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst);
|
||||
void ggml_cuda_mul_mat_vec(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, const ggml_tensor * ids, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_mul_mat_vec(
|
||||
ggml_backend_cuda_context & ctx,
|
||||
|
Reference in New Issue
Block a user