mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-21 07:03:43 -04:00
ggml, llama : avoid heavy V transpose + improvements (#775)
ggml : - added ggml_view_3d() - ggml_view_tensor() now inherits the stride too - reimplement ggml_cpy() to account for dst stride - no longer require tensor->data to be memory aligned llama : - compute RoPE on 32-bit tensors (should be more accurate) - store RoPE-ed K in the KV cache - store transposed V in the KV cache (significant speed-up) - avoid unnecessary Q copy
This commit is contained in:
10
ggml.h
10
ggml.h
@@ -558,6 +558,16 @@ struct ggml_tensor * ggml_view_2d(
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_view_3d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
int64_t ne0,
|
||||
int64_t ne1,
|
||||
int64_t ne2,
|
||||
size_t nb1, // row stride in bytes
|
||||
size_t nb2, // slice stride in bytes
|
||||
size_t offset);
|
||||
|
||||
struct ggml_tensor * ggml_permute(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
Reference in New Issue
Block a user