diff --git a/src/llama-context.cpp b/src/llama-context.cpp index 9e77fe6d8..bd637f3df 100644 --- a/src/llama-context.cpp +++ b/src/llama-context.cpp @@ -113,6 +113,15 @@ llama_context::llama_context( } } + { + const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE"); + graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable; + + if (graph_reuse_disable) { + LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__); + } + } + const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max; LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max); @@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters const auto gparams = graph_params(res, ubatch, mctx, gtype); - if (res->can_reuse(gparams)) { + if (!graph_reuse_disable && res->can_reuse(gparams)) { //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__); n_reused++; diff --git a/src/llama-context.h b/src/llama-context.h index 5c3a1c098..7cfdc6a51 100644 --- a/src/llama-context.h +++ b/src/llama-context.h @@ -291,6 +291,9 @@ private: // ref: https://github.com/ggml-org/llama.cpp/pull/14285 bool supports_set_rows = false; + // env: LLAMA_GRAPH_REUSE_DISABLE + bool graph_reuse_disable = false; + // perf mutable int64_t t_start_us = 0; mutable int64_t t_load_us = 0; diff --git a/src/llama-graph.h b/src/llama-graph.h index 94d778f38..8614d4967 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -423,7 +423,9 @@ struct llm_graph_params { (!ubatch.embd && !other.ubatch.embd) ); - if (can_reuse_ubatch && !ubatch.equal_seqs()) { + // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same + // the reason is because the set of attention streams would be different for different sequences + if (can_reuse_ubatch && ubatch.equal_seqs()) { if (!ubatch.data) { // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and // therefore we cannot perform the sequence id check. normally should never happen