From 6ee86e5e0f45e99fe2f0c3b322fe3ab82e632f9b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 12 Feb 2025 16:29:15 +0200
Subject: [PATCH] graph : restore ubatch in build_cb

ggml-ci
---
 src/llama-context.cpp | 6 ++----
 src/llama-context.h   | 1 +
 src/llama-graph.h     | 1 +
 src/llama-model.cpp   | 3 ++-
 4 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 74d6a67bb..62f76f48b 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -196,6 +196,7 @@ bool llama_context::apply_adapter_cvec(
 void llama_context::build_cb(
          ggml_tensor * cur,
           const char * name,
+  const llama_ubatch & ubatch,
                   int il) {
     if (il >= 0) {
         ggml_format_name(cur, "%s-%d", name, il);
@@ -213,10 +214,7 @@ void llama_context::build_cb(
     // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends
     // FIXME: fix in ggml_backend_sched
     const bool full_offload = model.params.n_gpu_layers > (int) model.hparams.n_layer;
-    // TODO: during #11213, the requirement for ubatch.n_tokens < 32 was removed to simplify
-    // not sure if this is still needed, but it can be brought back if needed
-    //if (ubatch.n_tokens < 32 || full_offload) {
-    if (full_offload) {
+    if (ubatch.n_tokens < 32 || full_offload) {
         if (il != -1 && strcmp(name, "norm") == 0) {
             const auto & dev_layer = model.dev_layer(il);
             for (auto & backend : backends) {
diff --git a/src/llama-context.h b/src/llama-context.h
index 8d7a6ad58..dc85c7971 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -85,6 +85,7 @@ struct llama_context : public llama_graph_i {
     virtual void build_cb(
              ggml_tensor * cur,
               const char * name,
+      const llama_ubatch & ubatch,
                       int il);
 
     // TODO: add encode/decode graphs
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 0084d99cc..d111d76e9 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -14,6 +14,7 @@ public:
     virtual void build_cb(
              ggml_tensor * cur,
               const char * name,
+      const llama_ubatch & ubatch,
                       int il) = 0;
 
     // apply control vector for layer il
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index bded48be6..ba11f1e15 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -248,6 +248,7 @@ static ggml_backend_buffer_type_t select_weight_buft(const llama_hparams & hpara
             return cur_buft;
         }
     }
+
     return nullptr;
 }
 
@@ -3888,7 +3889,7 @@ struct llm_build_context {
 
     // TODO: tmp
     void cb(struct ggml_tensor * cur, const char * name, int il) {
-        lgf.build_cb(cur, name, il);
+        lgf.build_cb(cur, name, ubatch, il);
     }
 
     // TODO: tmp
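
For reference, below is a minimal standalone sketch of the heuristic this patch
restores: when everything is offloaded, or when the micro-batch is small
(ubatch.n_tokens < 32), per-layer "norm" tensors are pinned to their layer's
backend instead of being left to ggml_backend_sched's automatic assignment.
The Ubatch struct and should_pin_norm helper are hypothetical stand-ins for
illustration only, not llama.cpp API.

#include <cstring>
#include <cstdio>

// Hypothetical stand-in for llama_ubatch; only the field used here.
struct Ubatch {
    int n_tokens; // number of tokens in this micro-batch
};

// Returns true when a per-layer "norm" tensor should be pinned to the backend
// of its layer's device: either all layers are offloaded, or the batch is
// small enough that the extra backend-to-backend transfer is worth avoiding.
static bool should_pin_norm(const char * name, int il, const Ubatch & ubatch,
                            int n_gpu_layers, int n_layer) {
    const bool full_offload = n_gpu_layers > n_layer;
    if (!(ubatch.n_tokens < 32 || full_offload)) {
        return false;
    }
    return il != -1 && std::strcmp(name, "norm") == 0;
}

int main() {
    const Ubatch small {16}, large {512};
    std::printf("small batch, partial offload: %d\n",
                should_pin_norm("norm", 3, small, 10, 32)); // 1: pinned
    std::printf("large batch, partial offload: %d\n",
                should_pin_norm("norm", 3, large, 10, 32)); // 0: left to the scheduler
    std::printf("large batch, full offload:    %d\n",
                should_pin_norm("norm", 3, large, 33, 32)); // 1: pinned
    return 0;
}

The n_tokens < 32 guard restricts the pinning to small batches, where the cost
of an extra transfer between backends for the norm outweighs whatever the
scheduler might gain by assigning it elsewhere.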