mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-16 13:12:51 -04:00
metal : use shared buffers between CPU and GPU (#1696)
* Use MTLDevice.newBufferWithBytesNoCopy to share buffers between CPU and GPU * Page-align buffers used by Metal * Remove trailing whitespace * Only import unistd.h for Metal builds * metal : remove unnecessary copies --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
13
llama.cpp
13
llama.cpp
@@ -53,7 +53,6 @@ enum e_model {
|
||||
MODEL_65B,
|
||||
};
|
||||
|
||||
|
||||
static const size_t MB = 1024*1024;
|
||||
|
||||
// computed for n_ctx == 2048
|
||||
@@ -1281,12 +1280,6 @@ static bool llama_eval_internal(
|
||||
ggml_set_name(embd, "embd");
|
||||
memcpy(embd->data, tokens, N*ggml_element_size(embd));
|
||||
|
||||
#ifdef GGML_USE_METAL
|
||||
if (lctx.ctx_metal && N == 1) {
|
||||
ggml_metal_set_tensor(lctx.ctx_metal, embd);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct ggml_tensor * cur;
|
||||
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
|
||||
|
||||
@@ -1484,12 +1477,6 @@ static bool llama_eval_internal(
|
||||
}
|
||||
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
||||
if (lctx.ctx_metal) {
|
||||
// We need to sync the CPU KV cache with the GPU KV cache
|
||||
ggml_metal_set_tensor(lctx.ctx_metal, kv_self.k);
|
||||
ggml_metal_set_tensor(lctx.ctx_metal, kv_self.v);
|
||||
}
|
||||
}
|
||||
#else
|
||||
ggml_graph_compute(ctx0, &gf);
|
||||
|
Reference in New Issue
Block a user