Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-06-29 04:35:05 +00:00)
@@ -350,6 +350,12 @@ void llm_graph_input_mem_hybrid::set_input(const llama_ubatch * ubatch) {
     }
 }
 
+void llm_graph_input_one::set_input(const llama_ubatch *) {
+    GGML_ASSERT(one && ggml_nelements(one) == 1);
+    float f_one = 1.0f;
+    ggml_backend_tensor_set(one, &f_one, 0, sizeof(float));
+}
+
 //
 // llm_graph_context
 //
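The new llm_graph_input_one::set_input uploads a single constant into a backend tensor with ggml_backend_tensor_set. Below is a minimal standalone sketch of the same pattern, assuming a CPU backend; the allocation boilerplate (context, backend, buffer) and the main function are illustrative only and are not part of this commit:

// sketch: create a 1-element F32 tensor in a backend buffer and write 1.0f into it
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include "ggml-cpu.h"
#include <cassert>

int main() {
    ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8, // room for a few tensor headers
        /*.mem_buffer =*/ nullptr,
        /*.no_alloc   =*/ true,                       // tensor data lives in the backend buffer
    };
    ggml_context * ctx = ggml_init(params);

    ggml_tensor * one = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1);

    ggml_backend_t        backend = ggml_backend_cpu_init();
    ggml_backend_buffer_t buf     = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // same pattern as llm_graph_input_one::set_input above
    assert(one && ggml_nelements(one) == 1);
    const float f_one = 1.0f;
    ggml_backend_tensor_set(one, &f_one, 0, sizeof(float));

    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    ggml_free(ctx);
    return 0;
}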
@@ -1267,8 +1273,14 @@ ggml_tensor * llm_graph_context::build_attn(
     // these nodes are added to the graph together so that they are not reordered
     // by doing so, the number of splits in the graph is reduced
     ggml_build_forward_expand(gf, q_cur);
-    ggml_build_forward_expand(gf, k_cur);
-    ggml_build_forward_expand(gf, v_cur);
+
+    if (k_cur) {
+        ggml_build_forward_expand(gf, k_cur);
+    }
+
+    if (v_cur) {
+        ggml_build_forward_expand(gf, v_cur);
+    }
 
     const auto * mctx_iswa = static_cast<const llama_kv_cache_unified_iswa_context *>(mctx);
 
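The q/k/v nodes are expanded into the graph back-to-back so the scheduler does not interleave them with other nodes, which keeps the number of backend splits down; after this change k_cur and v_cur may be absent. A hedged sketch of the same guard using a hypothetical expand_all helper (not part of llama.cpp):

#include "ggml.h"
#include <initializer_list>

// expand a group of optional nodes together, skipping null ones, so the
// non-null nodes stay adjacent in the graph and cause fewer splits
static void expand_all(ggml_cgraph * gf, std::initializer_list<ggml_tensor *> nodes) {
    for (ggml_tensor * t : nodes) {
        if (t) {
            ggml_build_forward_expand(gf, t);
        }
    }
}

// usage, mirroring the hunk above (k_cur/v_cur may be nullptr):
//   expand_all(gf, { q_cur, k_cur, v_cur });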
@@ -1276,9 +1288,12 @@ ggml_tensor * llm_graph_context::build_attn(
 
     const auto * mctx_cur = is_swa ? mctx_iswa->get_swa() : mctx_iswa->get_base();
 
-    // store to KV cache
-    {
+    // optionally store to KV cache
+    if (k_cur) {
         ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il));
+    }
+
+    if (v_cur) {
         ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il));
     }
 
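With the block above, writing into the KV cache becomes optional: a caller that already has this layer's K/V in the cache can pass nullptr and only the attention read path is built. A hedged sketch of the same guard as a free function; the name store_kv_optional and the template parameter are hypothetical, while the cpy_k/cpy_v calls follow the hunk above:

#include "ggml.h"

// templated so the cache-context type does not have to be named here;
// TCacheCtx is assumed to expose cpy_k/cpy_v as shown in the hunk above
template <typename TCacheCtx>
static void store_kv_optional(
        ggml_context    * ctx0,
        ggml_cgraph     * gf,
        const TCacheCtx * mctx_cur,
        ggml_tensor     * k_cur,   // may be nullptr: skip the K store
        ggml_tensor     * v_cur,   // may be nullptr: skip the V store
        int               il) {
    if (k_cur) {
        ggml_build_forward_expand(gf, mctx_cur->cpy_k(ctx0, k_cur, il));
    }
    if (v_cur) {
        ggml_build_forward_expand(gf, mctx_cur->cpy_v(ctx0, v_cur, il));
    }
}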