mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-07-19 17:17:40 +00:00
graph : update attn/kv_self names
This commit is contained in:
@ -2491,7 +2491,7 @@ void llama_context_kv_self::kv_self_update() {
|
|||||||
|
|
||||||
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
||||||
|
|
||||||
build_k_shift(ctx0, gf);
|
build_kv_self_shift(ctx0, gf);
|
||||||
|
|
||||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||||
|
|
||||||
@ -2520,7 +2520,7 @@ void llama_context_kv_self::kv_self_update() {
|
|||||||
|
|
||||||
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
||||||
|
|
||||||
build_defrag(ctx0, gf);
|
build_kv_self_defrag(ctx0, gf);
|
||||||
|
|
||||||
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
ggml_backend_sched_alloc_graph(sched.get(), gf);
|
||||||
|
|
||||||
@ -2762,7 +2762,7 @@ ggml_tensor * llama_context_kv_self::build_attn_qkv(
|
|||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
ggml_tensor * llama_context_kv_self::build_soft_max_ext(
|
ggml_tensor * llama_context_kv_self::build_attn_soft_max(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_tensor * kq,
|
ggml_tensor * kq,
|
||||||
float kq_scale) {
|
float kq_scale) {
|
||||||
@ -2771,7 +2771,7 @@ ggml_tensor * llama_context_kv_self::build_soft_max_ext(
|
|||||||
return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias);
|
return ggml_soft_max_ext(ctx0, kq, inp_KQ_mask_cnv, kq_scale, hparams.f_max_alibi_bias);
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_context_kv_self::build_k_shift(
|
void llama_context_kv_self::build_kv_self_shift(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_cgraph * graph) {
|
ggml_cgraph * graph) {
|
||||||
const auto & n_ctx = cparams.n_ctx;
|
const auto & n_ctx = cparams.n_ctx;
|
||||||
@ -2843,7 +2843,7 @@ void llama_context_kv_self::build_k_shift(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void llama_context_kv_self::build_defrag(
|
void llama_context_kv_self::build_kv_self_defrag(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_cgraph * graph) {
|
ggml_cgraph * graph) {
|
||||||
const auto & hparams = model.hparams;
|
const auto & hparams = model.hparams;
|
||||||
@ -2860,7 +2860,7 @@ void llama_context_kv_self::build_defrag(
|
|||||||
// number of cells moved
|
// number of cells moved
|
||||||
uint32_t n_moves = 0;
|
uint32_t n_moves = 0;
|
||||||
|
|
||||||
// each move requires 6*n_layer tensors (see build_defrag)
|
// each move requires 6*n_layer tensors (see build_kv_self_defrag)
|
||||||
// - source view, destination view, copy operation
|
// - source view, destination view, copy operation
|
||||||
// - x2 for keys and values
|
// - x2 for keys and values
|
||||||
//const uint32_t max_moves = model.max_nodes()/(6*n_layer);
|
//const uint32_t max_moves = model.max_nodes()/(6*n_layer);
|
||||||
|
@ -379,17 +379,17 @@ public:
|
|||||||
int il,
|
int il,
|
||||||
bool worst_case) override;
|
bool worst_case) override;
|
||||||
|
|
||||||
virtual ggml_tensor * build_soft_max_ext(
|
virtual ggml_tensor * build_attn_soft_max(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_tensor * kq,
|
ggml_tensor * kq,
|
||||||
float kq_scale) override;
|
float kq_scale) override;
|
||||||
|
|
||||||
virtual void build_k_shift(
|
virtual void build_kv_self_shift(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_cgraph * graph) override;
|
ggml_cgraph * graph) override;
|
||||||
|
|
||||||
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
||||||
virtual void build_defrag(
|
virtual void build_kv_self_defrag(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_cgraph * graph) override;
|
ggml_cgraph * graph) override;
|
||||||
|
|
||||||
|
@ -92,17 +92,17 @@ public:
|
|||||||
int il,
|
int il,
|
||||||
bool worst_case) = 0;
|
bool worst_case) = 0;
|
||||||
|
|
||||||
virtual ggml_tensor * build_soft_max_ext(
|
virtual ggml_tensor * build_attn_soft_max(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_tensor * kq,
|
ggml_tensor * kq,
|
||||||
float kq_scale) = 0;
|
float kq_scale) = 0;
|
||||||
|
|
||||||
virtual void build_k_shift(
|
virtual void build_kv_self_shift(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_cgraph * graph) = 0;
|
ggml_cgraph * graph) = 0;
|
||||||
|
|
||||||
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
||||||
virtual void build_defrag(
|
virtual void build_kv_self_defrag(
|
||||||
ggml_context * ctx0,
|
ggml_context * ctx0,
|
||||||
ggml_cgraph * graph) = 0;
|
ggml_cgraph * graph) = 0;
|
||||||
|
|
||||||
|
@ -4251,18 +4251,18 @@ struct llm_build_context {
|
|||||||
return cur;
|
return cur;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_k_shift() {
|
struct ggml_cgraph * build_kv_self_shift() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
||||||
|
|
||||||
lgf.build_k_shift(ctx0, gf);
|
lgf.build_kv_self_shift(ctx0, gf);
|
||||||
|
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_cgraph * build_defrag() {
|
struct ggml_cgraph * build_kv_self_defrag() {
|
||||||
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, model.max_nodes(), false);
|
||||||
|
|
||||||
lgf.build_defrag(ctx0, gf);
|
lgf.build_kv_self_defrag(ctx0, gf);
|
||||||
|
|
||||||
return gf;
|
return gf;
|
||||||
}
|
}
|
||||||
@ -5638,7 +5638,7 @@ struct llm_build_context {
|
|||||||
cb(kq, "kq", il);
|
cb(kq, "kq", il);
|
||||||
|
|
||||||
//kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
//kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
|
||||||
kq = lgf.build_soft_max_ext(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
|
kq = lgf.build_attn_soft_max(ctx0, kq, 1.0f/sqrtf(float(n_embd_head)));
|
||||||
cb(kq, "kq_soft_max_ext", il);
|
cb(kq, "kq_soft_max_ext", il);
|
||||||
|
|
||||||
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
|
||||||
|
Reference in New Issue
Block a user