mirror of https://github.com/ggml-org/llama.cpp.git
@@ -329,6 +329,17 @@ public:
     const llama_memory_hybrid_context * mctx;
 };
 
+// TODO: remove this when ggml_scale_add is implemented
+class llm_graph_input_one : public llm_graph_input_i {
+public:
+    llm_graph_input_one() {}
+    virtual ~llm_graph_input_one() = default;
+
+    void set_input(const llama_ubatch *) override;
+
+    ggml_tensor * one = nullptr; // F32
+};
+
 //
 // llm_graph_result
 //
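This hunk only declares llm_graph_input_one; the matching set_input definition is not part of it. A minimal sketch of what that override plausibly does, assuming `one` is a 1-element F32 tensor allocated on a backend buffer that just needs to be filled with the constant 1.0f:

    void llm_graph_input_one::set_input(const llama_ubatch *) {
        // the ubatch is irrelevant here: this input is a constant
        GGML_ASSERT(one != nullptr && ggml_nelements(one) == 1);
        const float f_one = 1.0f;
        ggml_backend_tensor_set(one, &f_one, 0, sizeof(f_one));
    }

With only ggml_scale (a pure multiply) available, a graph that needs something like `x*s + 1` has to materialize the constant and broadcast it through ggml_add; a fused ggml_scale_add op would make the extra graph input unnecessary, which is what the TODO refers to.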
@@ -589,14 +600,15 @@ struct llm_graph_context {
 
     llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;
 
+    // note: if k_cur or v_cur are not provided, they will not be stored in the memory
     ggml_tensor * build_attn(
             llm_graph_input_attn_kv_unified_iswa * inp,
             ggml_cgraph * gf,
             ggml_tensor * wo,
             ggml_tensor * wo_b,
             ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
-            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
-            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
+            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
+            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
             ggml_tensor * kq_b,
             ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
             float kq_scale,
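The added note makes k_cur and v_cur optional: a caller can pass nullptr for both to compute attention over whatever the memory already holds, without appending new K/V entries. A hypothetical call site for illustration only (the tensor names, `store_kv`, and any arguments after kq_scale are assumptions, not shown in this hunk):

    ggml_tensor * cur = build_attn(inp, gf,
            model.layers[il].wo, model.layers[il].bo,  // output projection
            q_cur,                                     // always required
            store_kv ? k_cur : nullptr,                // nullptr -> K not written to memory
            store_kv ? v_cur : nullptr,                // nullptr -> V not written to memory
            nullptr,                                   // kq_b: no attention bias
            nullptr,                                   // v_mla: not an MLA model
            kq_scale, il);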