Mirror of https://github.com/ggml-org/llama.cpp.git, synced 2025-08-15 04:33:06 -04:00
@@ -113,6 +113,15 @@ llama_context::llama_context(
         }
     }
+
+    {
+        const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
+        graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;
+
+        if (graph_reuse_disable) {
+            LLAMA_LOG_WARN("%s: graph reuse disabled\n", __func__);
+        }
+    }
 
     const uint32_t n_ctx_per_seq = cparams.n_ctx / cparams.n_seq_max;
 
     LLAMA_LOG_INFO("%s: n_seq_max = %u\n", __func__, cparams.n_seq_max);
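For context: the hunk above follows the same idiom this constructor already uses for its other environment toggles (note the "// env:" convention in the header hunk further down): read the variable once, override the compiled-in default only when the variable is actually set, and warn loudly so that an intentionally slower run is visible in the log. Below is a minimal standalone sketch of that idiom; the static flag, the function name and the stderr logging are stand-ins for illustration, not the patch's actual code.

#include <cstdio>
#include <cstdlib>

// Stand-in for the context member added by the patch.
static bool graph_reuse_disable = false; // compiled-in default

static void apply_env_override(void) {
    // getenv() returns nullptr when the variable is unset, in which case the
    // ternary keeps the existing default, the same pattern as in the hunk.
    const char * LLAMA_GRAPH_REUSE_DISABLE = getenv("LLAMA_GRAPH_REUSE_DISABLE");
    graph_reuse_disable = LLAMA_GRAPH_REUSE_DISABLE ? (atoi(LLAMA_GRAPH_REUSE_DISABLE) != 0) : graph_reuse_disable;

    if (graph_reuse_disable) {
        // stand-in for LLAMA_LOG_WARN
        fprintf(stderr, "%s: graph reuse disabled\n", __func__);
    }
}

int main(void) {
    apply_env_override();
    printf("graph_reuse_disable = %d\n", graph_reuse_disable ? 1 : 0);
    return 0;
}

Running the sketch with LLAMA_GRAPH_REUSE_DISABLE=1 in the environment prints the warning; leaving the variable unset, or setting it to 0, keeps the default. Any value that atoi() parses as non-zero enables the toggle.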
@@ -716,7 +725,7 @@ llm_graph_result * llama_context::process_ubatch(const llama_ubatch & ubatch, ll
     // in order to correctly reuse a graph, it's full topology has to be uniquely determined by these parameters
     const auto gparams = graph_params(res, ubatch, mctx, gtype);
 
-    if (res->can_reuse(gparams)) {
+    if (!graph_reuse_disable && res->can_reuse(gparams)) {
         //LLAMA_LOG_DEBUG("%s: reusing previous graph\n", __func__);
 
         n_reused++;
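For context, the change above is a deliberate short-circuit: when the new flag is set, can_reuse() is never consulted and every ubatch gets a freshly built graph, which is the behavior you want when bisecting a suspected reuse bug. The comment kept in the hunk states the underlying contract: a graph may be reused only if its full topology is uniquely determined by the compared parameters. Below is a self-contained sketch of that contract using simplified stand-ins (graph_params_t, graph_result_t); the real llm_graph_params and llm_graph_result compare far more state.

#include <cstdint>
#include <cstdio>

// Hypothetical, reduced set of parameters that determine graph topology.
struct graph_params_t {
    uint32_t n_tokens;
    uint32_t n_seqs;
    bool     embd_input;

    bool operator==(const graph_params_t & o) const {
        return n_tokens == o.n_tokens && n_seqs == o.n_seqs && embd_input == o.embd_input;
    }
};

// Hypothetical stand-in for the cached result of the previous graph build.
struct graph_result_t {
    graph_params_t params;

    bool can_reuse(const graph_params_t & gparams) const {
        return params == gparams;
    }
};

int main(void) {
    graph_result_t res  = { { 512, 1, false } }; // previous build
    graph_params_t next = { 512, 1, false };     // current ubatch

    bool graph_reuse_disable = false; // flipped by LLAMA_GRAPH_REUSE_DISABLE in the patch

    // same shape as the patched condition: the env toggle wins before any comparison
    if (!graph_reuse_disable && res.can_reuse(next)) {
        printf("reusing previous graph\n");
    } else {
        printf("rebuilding graph\n");
    }
    return 0;
}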
@@ -291,6 +291,9 @@ private:
     // ref: https://github.com/ggml-org/llama.cpp/pull/14285
     bool supports_set_rows = false;
 
+    // env: LLAMA_GRAPH_REUSE_DISABLE
+    bool graph_reuse_disable = false;
+
     // perf
     mutable int64_t t_start_us = 0;
     mutable int64_t t_load_us = 0;
@@ -423,7 +423,9 @@ struct llm_graph_params {
                 (!ubatch.embd && !other.ubatch.embd)
             );
 
-        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
+        // when we split the batch using "equal_seqs" we have to verify that the participating sequences are the same
+        // the reason is because the set of attention streams would be different for different sequences
+        if (can_reuse_ubatch && ubatch.equal_seqs()) {
             if (!ubatch.data) {
                 // if the old ubatch does not own it's data, then we cannot guarantee that it is still alive, and
                 // therefore we cannot perform the sequence id check. normally should never happen
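The final hunk fixes an inverted condition: the sequence-id verification belongs on the path where equal_seqs IS set, because with that splitting mode each participating sequence has its own attention stream, so two ubatches over different sequence sets can never share a graph topology. The inner !ubatch.data guard visible in the context lines covers the case where the old ubatch no longer owns its data, so the sequence ids cannot be inspected safely. Below is a simplified sketch of the corrected logic; ubatch_t and its fields are illustrative stand-ins for the real llama_ubatch, not its actual layout.

#include <cstdio>
#include <vector>

// Illustrative stand-in for llama_ubatch; only the parts relevant to the
// corrected check are modeled here.
struct ubatch_t {
    bool equal_seqs;          // was the batch split with "equal_seqs"?
    bool owns_data;           // stand-in for the real ubatch.data ownership
    std::vector<int> seq_ids; // participating sequence ids
};

static bool can_reuse(const ubatch_t & cur, const ubatch_t & prev) {
    bool ok = (cur.equal_seqs == prev.equal_seqs);

    // corrected direction: verify the sequence sets when equal_seqs IS used,
    // since each sequence corresponds to a distinct attention stream
    if (ok && cur.equal_seqs) {
        if (!prev.owns_data) {
            // without owned data the old sequence ids may be dangling,
            // so the check cannot be performed: refuse to reuse
            return false;
        }
        ok = (cur.seq_ids == prev.seq_ids);
    }

    return ok;
}

int main(void) {
    const ubatch_t prev = { true, true, { 0, 1 } };
    const ubatch_t same = { true, true, { 0, 1 } };
    const ubatch_t diff = { true, true, { 0, 2 } };

    printf("same seqs -> reuse: %d\n", can_reuse(same, prev) ? 1 : 0); // prints 1
    printf("diff seqs -> reuse: %d\n", can_reuse(diff, prev) ? 1 : 0); // prints 0
    return 0;
}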