Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-06-27 20:05:20 +00:00)
cuBLAS: use host pinned memory and dequantize while copying (#1207)
* cuBLAS: dequantize simultaneously while copying memory
* cuBLAS: use host pinned memory
* cuBLAS: improve ggml_compute_forward_mul_mat_f16_f32 with pinned memory
* cuBLAS: also pin kv cache
* fix rebase
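The two ideas in the commit message combine naturally: page-locked (pinned) host memory is required for cudaMemcpyAsync to be truly asynchronous, and once copies are asynchronous, the dequantize kernel for one chunk can run while the next chunk is still in flight. The sketch below illustrates the pattern only; it is not the commit's actual code. The toy block_q8 format, the chunk sizes, and the kernel are stand-ins for ggml's real quantization formats and CUDA kernels.

#include <cuda_runtime.h>
#include <cstdint>

// Toy quantized block: 32 int8 values sharing one float scale.
// (ggml's q4_0 and friends are more involved; this is illustrative.)
struct block_q8 {
    float  scale;
    int8_t qs[32];
};

// One thread dequantizes one block into 32 floats.
__global__ void dequantize_q8(const block_q8 * x, float * y, int nblocks) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= nblocks) return;
    for (int j = 0; j < 32; ++j) {
        y[i*32 + j] = x[i].qs[j] * x[i].scale;
    }
}

int main() {
    const int nblocks = 1 << 16;
    const int nchunks = 8;
    const int chunk   = nblocks / nchunks;

    // Pinned host staging buffer: required for asynchronous H2D copies,
    // and gives higher DMA throughput than pageable memory.
    block_q8 * h_src;
    cudaMallocHost((void **) &h_src, nblocks * sizeof(block_q8));
    // ... fill h_src with quantized data ...

    block_q8 * d_src;
    float    * d_dst;
    cudaMalloc((void **) &d_src, nblocks * sizeof(block_q8));
    cudaMalloc((void **) &d_dst, (size_t) nblocks * 32 * sizeof(float));

    cudaStream_t streams[2];
    cudaStreamCreate(&streams[0]);
    cudaStreamCreate(&streams[1]);

    // Alternate chunks across two streams: the copy of chunk c+1 can
    // overlap with the dequantize kernel of chunk c. Within a stream,
    // the kernel correctly waits for its own chunk's copy to finish.
    for (int c = 0; c < nchunks; ++c) {
        cudaStream_t s = streams[c % 2];
        cudaMemcpyAsync(d_src + (size_t) c*chunk, h_src + (size_t) c*chunk,
                        chunk * sizeof(block_q8),
                        cudaMemcpyHostToDevice, s);
        dequantize_q8<<<(chunk + 255)/256, 256, 0, s>>>(
            d_src + (size_t) c*chunk, d_dst + (size_t) c*chunk*32, chunk);
    }
    cudaDeviceSynchronize();

    cudaFreeHost(h_src);
    cudaFree(d_src);
    cudaFree(d_dst);
    cudaStreamDestroy(streams[0]);
    cudaStreamDestroy(streams[1]);
    return 0;
}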
@@ -136,7 +136,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -167,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;
 
     // the model memory buffer
-    llama_buffer buf;
+    llama_ctx_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    llama_buffer buf_compute;
-    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
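The rename from llama_buffer to llama_ctx_buffer captures the point of the change: buffers that feed the GPU (compute, scratch, KV cache, model buffer) should live in pinned host memory when cuBLAS is enabled, and fall back to an ordinary heap buffer otherwise. A minimal sketch of what such a type could look like, using raw CUDA runtime calls in place of whatever wrapper helpers the commit actually uses:

#ifdef GGML_USE_CUBLAS
#include <cuda_runtime.h>
#include <cstdint>

// Sketch only: allocates page-locked host memory so H2D/D2H copies of
// this buffer can be asynchronous and run at full DMA bandwidth.
struct llama_ctx_buffer {
    uint8_t * addr = NULL;
    size_t    size = 0;

    void resize(size_t new_size) {
        if (addr) {
            cudaFreeHost(addr);               // release previous pinned allocation
        }
        cudaMallocHost((void **) &addr, new_size);  // pinned (page-locked) memory
        size = new_size;
    }

    ~llama_ctx_buffer() {
        if (addr) {
            cudaFreeHost(addr);
        }
    }
};
#else
// Without CUDA there is nothing to pin: reuse the existing plain buffer type.
typedef llama_buffer llama_ctx_buffer;
#endif

Pinned memory is a limited resource and slows down allocation, so reserving it for the handful of large, long-lived buffers that actually cross the PCIe bus (rather than pinning everything) is the sensible trade-off this commit makes.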