#pragma once

#include "llama-arch.h"
#include "llama-batch.h"
#include "llama-hparams.h"
#include "llama-adapter.h"

#include <cstdint>
#include <vector>
#include <memory>
#include <set>
#include <functional>

struct ggml_cgraph;
struct ggml_context;
struct ggml_tensor;

struct llama_cparams;

struct llama_memory_context_i;

class llama_kv_cache_unified_context;
class llama_kv_cache_unified_iswa_context;
class llama_memory_recurrent_context;
class llama_memory_hybrid_context;

// certain models (typically multi-modal) can produce different types of graphs
enum llm_graph_type {
    LLM_GRAPH_TYPE_DEFAULT,
    LLM_GRAPH_TYPE_ENCODER,
    LLM_GRAPH_TYPE_DECODER,
};

enum llm_ffn_op_type {
    LLM_FFN_SILU,
    LLM_FFN_GELU,
    LLM_FFN_RELU,
    LLM_FFN_RELU_SQR,
    LLM_FFN_SWIGLU,
    LLM_FFN_GEGLU,
    LLM_FFN_REGLU,
};

enum llm_ffn_gate_type {
    LLM_FFN_SEQ,
    LLM_FFN_PAR, // ffn_gate is parallel to ffn_up
};

enum llm_norm_type {
    LLM_NORM,
    LLM_NORM_RMS,
    LLM_NORM_GROUP,
};

// TODO: tmp - need something better to pass the data from the encoder to the decoder
struct llama_cross {
    // the output embeddings from the encoder as a ggml tensor
    // TODO: this needs more work to be correct, for now copy the embeddings data to host memory
    // ref: https://github.com/ggml-org/llama.cpp/pull/11213#discussion_r1969892524
    //ggml_tensor * t_embd = nullptr;

    int64_t n_embd = 0;
    int64_t n_enc  = 0;

    // embeddings data copied to host memory (tmp)
    std::vector<float> v_embd;

    // needed to construct the cross-attention mask in the decoder
    std::vector<std::set<llama_seq_id>> seq_ids_enc;
};
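
// illustrative note on llama_cross (an interpretation, not a formal spec): for the decoder,
// encoder output position i should be visible to a token of sequence s only if seq_ids_enc[i]
// contains s - this is what the cross-attention mask built from this data is expected to encode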

struct llm_graph_params;

//
// llm_graph_input
//

class llm_graph_input_i {
public:
    virtual ~llm_graph_input_i() = default;

    virtual void set_input(const llama_ubatch * ubatch) = 0;

    // return true if the resulting input tensors using the provided graph parameters would be
    // the same as the previous input tensors that we have currently stored in the object
    virtual bool can_reuse(const llm_graph_params & params) {
        // returning false here by default will prevent reusing the graph if the check
        // for the input type has not been implemented yet
        GGML_UNUSED(params);
        return false;
    }
};

using llm_graph_input_ptr = std::unique_ptr<llm_graph_input_i>;
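
// illustrative sketch (not part of the API): a custom input type overrides set_input() to
// upload its data from the ubatch and, optionally, can_reuse() to opt in to graph reuse.
// the names below (llm_graph_input_example, inp) are hypothetical:
//
//   class llm_graph_input_example : public llm_graph_input_i {
//   public:
//       void set_input(const llama_ubatch * ubatch) override {
//           // e.g. copy per-token data from the ubatch into `inp` with ggml_backend_tensor_set()
//       }
//       bool can_reuse(const llm_graph_params & params) override {
//           // same tensor shape for the new params -> the stored input tensor can be reused
//           return inp->ne[0] == params.ubatch.n_tokens;
//       }
//       ggml_tensor * inp = nullptr; // I32 [n_batch]
//   };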

class llm_graph_input_embd : public llm_graph_input_i {
public:
    llm_graph_input_embd()          = default;
    virtual ~llm_graph_input_embd() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * tokens = nullptr; // I32 [n_batch]
    ggml_tensor * embd   = nullptr; // F32 [n_embd, n_batch]
};

class llm_graph_input_pos : public llm_graph_input_i {
public:
    llm_graph_input_pos(uint32_t n_pos_per_embd) : n_pos_per_embd(n_pos_per_embd) {}
    virtual ~llm_graph_input_pos() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * pos = nullptr; // I32 [n_batch]

    const uint32_t n_pos_per_embd = 1;
};

// temperature tuning, used by llama4
class llm_graph_input_attn_temp : public llm_graph_input_i {
public:
    llm_graph_input_attn_temp(uint32_t n_attn_temp_floor_scale, float f_attn_temp_scale)
        : n_attn_temp_floor_scale(n_attn_temp_floor_scale), f_attn_temp_scale(f_attn_temp_scale) {}
    virtual ~llm_graph_input_attn_temp() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * attn_scale = nullptr; // F32 [n_batch]

    const uint32_t n_attn_temp_floor_scale;
    const float    f_attn_temp_scale;
};

class llm_graph_input_pos_bucket : public llm_graph_input_i {
public:
    llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
    virtual ~llm_graph_input_pos_bucket() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * pos_bucket = nullptr; // I32 [n_batch, n_batch]

    const llama_hparams & hparams;
};

class llm_graph_input_pos_bucket_kv : public llm_graph_input_i {
public:
    llm_graph_input_pos_bucket_kv(
            const llama_hparams & hparams,
            const llama_kv_cache_unified_context * mctx) : hparams(hparams), mctx(mctx) {}
    virtual ~llm_graph_input_pos_bucket_kv() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * pos_bucket = nullptr; // I32 [n_kv, n_batch]

    const llama_hparams & hparams;

    const llama_kv_cache_unified_context * mctx;
};

class llm_graph_input_out_ids : public llm_graph_input_i {
public:
    llm_graph_input_out_ids(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            uint32_t n_outputs) : hparams(hparams), cparams(cparams), n_outputs(n_outputs) {}
    virtual ~llm_graph_input_out_ids() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * out_ids; // I32 [n_outputs]

    const llama_hparams & hparams;
    const llama_cparams & cparams;

    const uint32_t n_outputs;
};

class llm_graph_input_mean : public llm_graph_input_i {
public:
    llm_graph_input_mean(const llama_cparams & cparams) : cparams(cparams) {}
    virtual ~llm_graph_input_mean() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * mean; // F32 [n_batch, n_batch]

    const llama_cparams & cparams;
};

class llm_graph_input_cls : public llm_graph_input_i {
public:
    llm_graph_input_cls(const llama_cparams & cparams) : cparams(cparams) {}
    virtual ~llm_graph_input_cls() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * cls; // I32 [n_batch]

    const llama_cparams & cparams;
};

class llm_graph_input_rs : public llm_graph_input_i {
public:
    llm_graph_input_rs(const llama_memory_recurrent_context * mctx) : mctx(mctx) {}
    virtual ~llm_graph_input_rs() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * s_copy; // I32 [kv_size]

    const llama_memory_recurrent_context * mctx;
};

class llm_graph_input_cross_embd : public llm_graph_input_i {
public:
    llm_graph_input_cross_embd(
            const llama_cross * cross) : cross(cross) {}
    virtual ~llm_graph_input_cross_embd() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * cross_embd; // F32 [n_embd, n_outputs_enc]

    const llama_cross * cross;
};

class llm_graph_input_attn_no_cache : public llm_graph_input_i {
public:
    llm_graph_input_attn_no_cache(const llama_hparams & hparams, const llama_cparams & cparams) :
        hparams(hparams),
        cparams(cparams) {
    }
    ~llm_graph_input_attn_no_cache() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * get_kq_mask() const { return kq_mask_cnv; }

    ggml_tensor * kq_mask     = nullptr; // F32 [n_tokens, n_batch, 1, 1]
    ggml_tensor * kq_mask_cnv = nullptr; //     [n_tokens, n_batch, 1, 1]

    const llama_hparams & hparams;
    const llama_cparams & cparams;
};

class llm_graph_input_attn_kv_unified : public llm_graph_input_i {
public:
    llm_graph_input_attn_kv_unified(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_unified_context * mctx) :
        hparams(hparams),
        cparams(cparams),
        mctx(mctx) {
    }
    ~llm_graph_input_attn_kv_unified() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * get_k_idxs() const { return self_k_idxs; }
    ggml_tensor * get_v_idxs() const { return self_v_idxs; }

    ggml_tensor * get_kq_mask() const { return self_kq_mask_cnv; }

    ggml_tensor * self_k_idxs = nullptr; // I64 [n_batch]
    ggml_tensor * self_v_idxs = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]

    ggml_tensor * self_kq_mask     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
    ggml_tensor * self_kq_mask_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

    const llama_hparams & hparams;
    const llama_cparams & cparams;

    const llama_kv_cache_unified_context * mctx;
};

class llm_graph_input_attn_kv_unified_iswa : public llm_graph_input_i {
public:
    llm_graph_input_attn_kv_unified_iswa(
            const llama_hparams & hparams,
            const llama_cparams & cparams,
            const llama_kv_cache_unified_iswa_context * mctx) :
        hparams(hparams),
        cparams(cparams),
        mctx(mctx) {
    }
    ~llm_graph_input_attn_kv_unified_iswa() = default;

    void set_input(const llama_ubatch * ubatch) override;

    bool can_reuse(const llm_graph_params & params) override;

    ggml_tensor * get_k_idxs()     const { return self_k_idxs; }
    ggml_tensor * get_v_idxs()     const { return self_v_idxs; }
    ggml_tensor * get_k_idxs_swa() const { return self_k_idxs_swa; }
    ggml_tensor * get_v_idxs_swa() const { return self_v_idxs_swa; }

    ggml_tensor * get_kq_mask()     const { return self_kq_mask_cnv; }
    ggml_tensor * get_kq_mask_swa() const { return self_kq_mask_swa_cnv; }

    ggml_tensor * self_k_idxs     = nullptr; // I64 [n_batch]
    ggml_tensor * self_v_idxs     = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]
    ggml_tensor * self_k_idxs_swa = nullptr; // I64 [n_batch]
    ggml_tensor * self_v_idxs_swa = nullptr; // I64 [n_batch] or [n_batch*n_embd_v_gqa]

    ggml_tensor * self_kq_mask         = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
    ggml_tensor * self_kq_mask_cnv     = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]
    ggml_tensor * self_kq_mask_swa     = nullptr; // F32 [n_kv, n_batch/n_stream, 1, n_stream]
    ggml_tensor * self_kq_mask_swa_cnv = nullptr; //     [n_kv, n_batch/n_stream, 1, n_stream]

    const llama_hparams & hparams;
    const llama_cparams & cparams;

    const llama_kv_cache_unified_iswa_context * mctx;
};

class llm_graph_input_attn_cross : public llm_graph_input_i {
public:
    llm_graph_input_attn_cross(const llama_cross * cross) : cross(cross) {}
    ~llm_graph_input_attn_cross() = default;

    void set_input(const llama_ubatch * ubatch) override;

    ggml_tensor * get_kq_mask_cross() const { return cross_kq_mask_cnv; }

    ggml_tensor * cross_kq_mask     = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]
    ggml_tensor * cross_kq_mask_cnv = nullptr; // F32 [n_outputs_enc, n_batch, 1, 1]

    const llama_cross * cross = nullptr;
};

class llm_graph_input_mem_hybrid : public llm_graph_input_i {
public:
    llm_graph_input_mem_hybrid(
            std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn,
            std::unique_ptr<llm_graph_input_rs> inp_rs,
            const llama_memory_hybrid_context * mctx) :
        inp_attn(std::move(inp_attn)),
        inp_rs(std::move(inp_rs)),
        mctx(mctx) { }
    virtual ~llm_graph_input_mem_hybrid() = default;

    void set_input(const llama_ubatch * ubatch) override;

    std::unique_ptr<llm_graph_input_attn_kv_unified> inp_attn;
    std::unique_ptr<llm_graph_input_rs> inp_rs;

    llm_graph_input_attn_kv_unified * get_attn() const { return inp_attn.get(); }
    llm_graph_input_rs * get_recr() const { return inp_rs.get(); }

    const llama_memory_hybrid_context * mctx;
};

//
// llm_graph_result
//

// these objects deliver the result from the graph build process back to the llama_context
// note that the input tensors created for the graph are referenced here - the goal is to be able to populate their
// specific data, by calling the set_inputs() method
// along with the input tensors, the object also provides commonly used output tensors, such as logits, embeddings, etc.
// these are used by the llama_context to extract the relevant data, based on the compute parameters

// callback that allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
using llm_graph_cb = std::function<void(const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il)>;
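
// illustrative sketch (an assumption, not the callback that llama_context installs): the callback
// is invoked for each intermediate tensor, e.g. to name it per layer or to decide where it runs:
//
//   llm_graph_cb cb = [](const llama_ubatch & ubatch, ggml_tensor * cur, const char * name, int il) {
//       if (il >= 0) {
//           ggml_format_name(cur, "%s-%d", name, il);
//       } else {
//           ggml_set_name(cur, name);
//       }
//       GGML_UNUSED(ubatch);
//   };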

class llm_graph_result;

struct llm_graph_params {
    llm_arch arch = LLM_ARCH_UNKNOWN;

    llama_hparams hparams;
    llama_cparams cparams;

    llama_ubatch ubatch; // note: intentionally make a copy

    llm_graph_type gtype;

    ggml_backend_sched_t sched;
    ggml_backend_t backend_cpu;

    const llama_adapter_cvec     * cvec;
    const llama_adapter_loras    * loras;
    const llama_memory_context_i * mctx;
    const llama_cross            * cross;

    uint32_t n_outputs;

    llm_graph_cb cb;

    llm_graph_result * res;

    // return true if the "other" params would result in a graph with the same topology as with the current params
    // having the same topology allows us to reuse the graph in some cases
    bool allow_reuse(const llm_graph_params & other) const {
        // first check the ubatch
        bool can_reuse_ubatch =
            ubatch.equal_seqs() == other.ubatch.equal_seqs() &&
            ubatch.n_tokens     == other.ubatch.n_tokens     &&
            ubatch.n_seq_tokens == other.ubatch.n_seq_tokens &&
            ubatch.n_seqs       == other.ubatch.n_seqs       &&
            ubatch.n_seqs_unq   == other.ubatch.n_seqs_unq   &&
            (
                (!ubatch.token && !other.ubatch.token) ||
                (!ubatch.embd  && !other.ubatch.embd)
            );

        if (can_reuse_ubatch && !ubatch.equal_seqs()) {
            if (!ubatch.data) {
                // if the old ubatch does not own its data, then we cannot guarantee that it is still alive, and
                // therefore we cannot perform the sequence id check. normally, this should never happen
                can_reuse_ubatch = false;
            } else {
                for (uint32_t s = 0; s < ubatch.n_seqs_unq; ++s) {
                    can_reuse_ubatch &= ubatch.seq_id_unq[s] == other.ubatch.seq_id_unq[s];
                }
            }
        }

        if (!can_reuse_ubatch) {
            return false;
        }

        return
            cparams.embeddings  == other.cparams.embeddings  &&
            cparams.causal_attn == other.cparams.causal_attn &&
            arch  == other.arch  &&
            gtype == other.gtype &&
            cvec  == other.cvec  &&
            loras == other.loras &&
            cross == other.cross &&
            n_outputs == other.n_outputs;
    }
};

class llm_graph_result {
public:
    llm_graph_result(int64_t max_nodes);

    virtual ~llm_graph_result() = default;

    ggml_tensor * get_tokens()      const { return t_tokens; }
    ggml_tensor * get_logits()      const { return t_logits; }
    ggml_tensor * get_embd()        const { return t_embd; }
    ggml_tensor * get_embd_pooled() const { return t_embd_pooled; }

    ggml_cgraph  * get_gf()  const { return gf; }
    ggml_context * get_ctx() const { return ctx_compute.get(); }

    int64_t get_max_nodes() const;

    void reset();

    void set_inputs(const llama_ubatch * ubatch);

    // try to update the existing graph result using the new graph parameters in order to reuse it
    // this can only be done if we determine that the resulting graph using the new graph parameters
    // would be identical to the existing graph. in that case, we simply have to update the memory
    // contexts of the input tensors of the graph and we can reuse it for another computation
    // return true if the graph was updated and can be reused
    bool can_reuse(const llm_graph_params & params);

    llm_graph_input_i * add_input(llm_graph_input_ptr input);

    void set_params(const llm_graph_params & params);

    // important graph nodes
    ggml_tensor * t_tokens      = nullptr;
    ggml_tensor * t_logits      = nullptr;
    ggml_tensor * t_embd        = nullptr;
    ggml_tensor * t_embd_pooled = nullptr;

    std::vector<llm_graph_input_ptr> inputs;

    ggml_context_ptr ctx_compute;

    // memory buffers used to evaluate the model
    std::vector<uint8_t> buf_compute_meta;

    ggml_cgraph * gf;

    int64_t max_nodes;

private:
    // keep a copy of the previous graph parameters
    // we will use this to determine whether the graph can be reused by comparing them with the new parameters
    // note: these are updated after constructing the new graph
    llm_graph_params params;

    // env: LLAMA_GRAPH_RESULT_DEBUG
    int debug = 0;
};

using llm_graph_result_ptr = std::unique_ptr<llm_graph_result>;
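
// illustrative sketch of the intended build/reuse flow (an approximation, not the exact
// llama_context logic; `res_prev`, `params` and `ubatch` are hypothetical locals):
//
//   if (res_prev->can_reuse(params)) {
//       // same topology as the previous graph - only refresh the input data
//       res_prev->set_inputs(&ubatch);
//   } else {
//       res_prev->reset();
//       // ... build a new graph into res_prev (see llm_graph_context below) ...
//       res_prev->set_params(params);
//       res_prev->set_inputs(&ubatch);
//   }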

//
// llm_graph_context
//
// used in build_rs to properly order writes and avoid unnecessary copies
using llm_graph_get_rows_fn = std::function<ggml_tensor * (ggml_context *, ggml_tensor * states, ggml_tensor * ids)>;
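
// illustrative sketch (hypothetical): a caller of build_rs() can supply a custom getter,
// e.g. to post-process the gathered recurrent states before they are used:
//
//   llm_graph_get_rows_fn get_rows = [](ggml_context * ctx, ggml_tensor * states, ggml_tensor * ids) {
//       return ggml_get_rows(ctx, states, ids); // the default behavior
//   };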

struct llm_graph_context {
    const llm_arch arch;

    const llama_hparams & hparams;
    const llama_cparams & cparams;
    const llama_ubatch  & ubatch;

    const int64_t n_embd;
    const int64_t n_layer;
    const int64_t n_rot;
    const int64_t n_ctx;       // user-specified context size (can be different from n_ctx_train)
    const int64_t n_head;
    const int64_t n_head_kv;
    const int64_t n_embd_head_k;
    const int64_t n_embd_k_gqa;
    const int64_t n_embd_head_v;
    const int64_t n_embd_v_gqa;
    const int64_t n_expert;
    const int64_t n_expert_used;

    const float freq_base;
    const float freq_scale;
    const float ext_factor;
    const float attn_factor;
    const float beta_fast;
    const float beta_slow;
    const float norm_eps;
    const float norm_rms_eps;

    const int64_t n_tokens;
    const int64_t n_outputs;
    const int32_t n_ctx_orig; // yarn

    const enum llama_pooling_type pooling_type;
    const enum llama_rope_type    rope_type;

    ggml_backend_sched_t sched;

    ggml_backend_t backend_cpu; // TODO: needed by build_attn_mha, figure out a way to remove?

    const llama_adapter_cvec     * cvec;
    const llama_adapter_loras    * loras;
    const llama_memory_context_i * mctx;
    const llama_cross            * cross;

    const llm_graph_cb & cb_func;

    llm_graph_result * res;

    ggml_context * ctx0 = nullptr;
    ggml_cgraph  * gf   = nullptr;

    llm_graph_context(const llm_graph_params & params);
    virtual ~llm_graph_context() = default;

    void cb(ggml_tensor * cur, const char * name, int il) const;

    //
    // common
    //

    ggml_tensor * build_cvec(
             ggml_tensor * cur,
                     int   il) const;

    // do mat_mul, while optionally applying LoRA
    ggml_tensor * build_lora_mm(
              ggml_tensor * w,
              ggml_tensor * cur) const;

    // do mat_mul_id, while optionally applying LoRA
    ggml_tensor * build_lora_mm_id(
              ggml_tensor * w,   // ggml_tensor * as
              ggml_tensor * cur, // ggml_tensor * b
              ggml_tensor * ids) const;

    ggml_tensor * build_norm(
             ggml_tensor * cur,
             ggml_tensor * mw,
             ggml_tensor * mb,
           llm_norm_type   type,
                     int   il) const;

    ggml_tensor * build_ffn(
             ggml_tensor * cur,
             ggml_tensor * up,
             ggml_tensor * up_b,
             ggml_tensor * up_s,
             ggml_tensor * gate,
             ggml_tensor * gate_b,
             ggml_tensor * gate_s,
             ggml_tensor * down,
             ggml_tensor * down_b,
             ggml_tensor * down_s,
             ggml_tensor * act_scales,
         llm_ffn_op_type   type_op,
       llm_ffn_gate_type   type_gate,
                     int   il) const;
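
    // illustrative call (an assumption modeled on typical usage in the model builders;
    // `model.layers[il]` is not part of this header):
    //
    //   cur = build_ffn(cur,
    //           model.layers[il].ffn_up,   NULL, NULL,
    //           model.layers[il].ffn_gate, NULL, NULL,
    //           model.layers[il].ffn_down, NULL, NULL,
    //           NULL,
    //           LLM_FFN_SILU, LLM_FFN_PAR, il);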

    ggml_tensor * build_moe_ffn(
             ggml_tensor * cur,
             ggml_tensor * gate_inp,
             ggml_tensor * up_exps,
             ggml_tensor * gate_exps,
             ggml_tensor * down_exps,
             ggml_tensor * exp_probs_b,
                 int64_t   n_expert,
                 int64_t   n_expert_used,
         llm_ffn_op_type   type_op,
                    bool   norm_w,
                    bool   scale_w,
                   float   w_scale,
            llama_expert_gating_func_type gating_op,
                     int   il) const;

    //
    // inputs
    //

    ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
    ggml_tensor * build_inp_pos() const;
    ggml_tensor * build_inp_attn_scale() const;
    ggml_tensor * build_inp_out_ids() const;
    ggml_tensor * build_inp_mean() const;
    ggml_tensor * build_inp_cls() const;

    ggml_tensor * build_inp_cross_embd() const;
    ggml_tensor * build_inp_pos_bucket_enc() const;
    ggml_tensor * build_inp_pos_bucket_dec() const;
    ggml_tensor * build_pos_bias(ggml_tensor * pos_bucket, ggml_tensor * attn_rel_b) const;

    //
    // attention
    //

    ggml_tensor * build_attn_mha(
            ggml_tensor * q,       // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k,       // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v,       // [n_embd_head_v, n_head_v, n_tokens] (v_trans == false)
            ggml_tensor * kq_b,
            ggml_tensor * kq_mask,
            ggml_tensor * v_mla,   // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                  float   kq_scale) const;

    llm_graph_input_attn_no_cache * build_attn_inp_no_cache() const;

    ggml_tensor * build_attn(
            llm_graph_input_attn_no_cache * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                  float   kq_scale,
                    int   il) const;

    llm_graph_input_attn_kv_unified * build_attn_inp_kv_unified() const;

    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                  float   kq_scale,
                    int   il) const;
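
    // illustrative call (an assumption modeled on typical usage in the model builders;
    // `model.layers[il]` and the Qcur/Kcur/Vcur tensors are not part of this header):
    //
    //   cur = build_attn(inp_attn,
    //           model.layers[il].wo, model.layers[il].bo,
    //           Qcur, Kcur, Vcur, nullptr, nullptr, 1.0f/sqrtf(float(n_embd_head)), il);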

    llm_graph_input_attn_kv_unified_iswa * build_attn_inp_kv_unified_iswa() const;

    // note: if k_cur or v_cur are not provided, they will not be stored in the memory
    ggml_tensor * build_attn(
            llm_graph_input_attn_kv_unified_iswa * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens] optional
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens] optional
            ggml_tensor * kq_b,
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                  float   kq_scale,
                    int   il) const;

    llm_graph_input_attn_cross * build_attn_inp_cross() const;

    ggml_tensor * build_attn(
            llm_graph_input_attn_cross * inp,
            ggml_tensor * wo,
            ggml_tensor * wo_b,
            ggml_tensor * q_cur, // [n_embd_head_q, n_head_q, n_tokens]
            ggml_tensor * k_cur, // [n_embd_head_k, n_head_k, n_tokens]
            ggml_tensor * v_cur, // [n_embd_head_v, n_head_v, n_tokens]
            ggml_tensor * kq_b,
            ggml_tensor * v_mla, // [n_embd_head_v_mla, n_embd_head_v, n_head_v]
                  float   kq_scale,
                    int   il) const;

    //
    // recurrent
    //

    // TODO: avoid notion of "kv"
    // TODO: move this implementation to llama_memory_recurrent.
    //       this is analogous to llama_kv_cache_unified::cpy_k / cpy_v
    //       when moving, avoid passing `ggml_cgraph` - only pass `ggml_context`. would likely need to split the
    //       implementation in 2 separate methods. the goal is to avoid calling `ggml_build_forward_expand` in
    //       `llama_memory_recurrent`
    ggml_tensor * build_rs(
            ggml_tensor * s,
            ggml_tensor * state_copy,
                int32_t   state_size,
                int32_t   n_seqs,
               uint32_t   n_kv,
               uint32_t   kv_head,
               uint32_t   kv_size,
                int32_t   rs_zero,
            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;

    llm_graph_input_rs * build_rs_inp() const;

    ggml_tensor * build_rs(
            llm_graph_input_rs * inp,
            ggml_tensor * s,
                int32_t   state_size,
                int32_t   n_seqs,
            const llm_graph_get_rows_fn & get_state_rows = ggml_get_rows) const;

    ggml_tensor * build_rwkv_token_shift_load(
            llm_graph_input_rs * inp,
            const llama_ubatch & ubatch,
                    int   il) const;

    ggml_tensor * build_rwkv_token_shift_store(
            ggml_tensor * token_shift,
            const llama_ubatch & ubatch,
                    int   il) const;

    //
    // hybrid
    //

    llm_graph_input_mem_hybrid * build_inp_mem_hybrid() const;

    //
    // pooling
    //

    void build_pooling(
            ggml_tensor * cls,
            ggml_tensor * cls_b,
            ggml_tensor * cls_out,
            ggml_tensor * cls_out_b) const;
};

// TODO: better name
int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional);
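
// illustrative note (a sketch of intent, not a spec): this maps a relative position (x - y) to one of
// n_buckets discrete buckets, e.g. for T5-style relative attention bias - see build_pos_bias() and the
// pos_bucket inputs above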