2023-11-10 22:04:50 -07:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
from enum import Enum, IntEnum, auto
|
|
|
|
from typing import Any
|
|
|
|
|
|
|
|
#
|
|
|
|
# constants
|
|
|
|
#
|
|
|
|
|
|
|
|
# File-format constants.
# 0x46554747 holds the ASCII bytes 'G','G','U','F' when the integer is laid
# out least-significant byte first (0x47, 0x47, 0x55, 0x46).
GGUF_MAGIC = 0x46554747 # "GGUF"
|
|
|
|
# Format version number (presumably the version written to / expected in file headers — confirm against the reader/writer).
GGUF_VERSION = 3
|
|
|
|
# Default alignment value (presumably in bytes, used when "general.alignment" is absent — TODO confirm).
GGUF_DEFAULT_ALIGNMENT = 32
|
2024-05-11 11:06:26 -04:00
|
|
|
# Must be kept in sync with the C-side constant named in the trailing comment.
GGML_QUANT_VERSION = 2 # GGML_QNT_VERSION from ggml.h
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
#
|
|
|
|
# metadata keys
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
|
|
class Keys:
|
|
|
|
# Metadata key names in the "general." namespace: core file fields, authorship,
# licensing, provenance (URL/DOI/UUID/repo), base-model and dataset entries,
# and array-valued tags/languages.
# Keys containing "{id}" are templates; the placeholder is presumably filled
# with the entry index by the caller (e.g. via str.format) — TODO confirm.
class General:
|
2024-07-18 20:40:15 +10:00
|
|
|
TYPE = "general.type"
|
|
|
|
ARCHITECTURE = "general.architecture"
|
|
|
|
QUANTIZATION_VERSION = "general.quantization_version"
|
|
|
|
ALIGNMENT = "general.alignment"
|
|
|
|
FILE_TYPE = "general.file_type"
|
|
|
|
|
|
|
|
# Authorship Metadata
|
|
|
|
NAME = "general.name"
|
|
|
|
AUTHOR = "general.author"
|
|
|
|
VERSION = "general.version"
|
|
|
|
ORGANIZATION = "general.organization"
|
|
|
|
|
|
|
|
FINETUNE = "general.finetune"
|
|
|
|
BASENAME = "general.basename"
|
|
|
|
|
|
|
|
DESCRIPTION = "general.description"
|
|
|
|
QUANTIZED_BY = "general.quantized_by"
|
|
|
|
|
|
|
|
SIZE_LABEL = "general.size_label"
|
|
|
|
|
|
|
|
# Licensing details
|
|
|
|
LICENSE = "general.license"
|
|
|
|
LICENSE_NAME = "general.license.name"
|
|
|
|
LICENSE_LINK = "general.license.link"
|
|
|
|
|
|
|
|
# Typically represents the converted GGUF repo (Unless native)
|
|
|
|
URL = "general.url" # Model Website/Paper
|
|
|
|
DOI = "general.doi"
|
|
|
|
UUID = "general.uuid"
|
|
|
|
REPO_URL = "general.repo_url" # Model Source Repository (git/svn/etc...)
|
|
|
|
|
|
|
|
# Model Source during conversion
|
|
|
|
SOURCE_URL = "general.source.url" # Model Website/Paper
|
|
|
|
SOURCE_DOI = "general.source.doi"
|
|
|
|
SOURCE_UUID = "general.source.uuid"
|
|
|
|
SOURCE_REPO_URL = "general.source.repo_url" # Model Source Repository (git/svn/etc...)
|
|
|
|
|
|
|
|
# Base Model Source. There can be more than one source if it's a merged
|
|
|
|
# model like with 'Mistral-7B-Merge-14-v0.1'. This will assist in
|
|
|
|
# tracing the lineage of models as they are finetuned or merged over time.
|
|
|
|
BASE_MODEL_COUNT = "general.base_model.count"
|
|
|
|
BASE_MODEL_NAME = "general.base_model.{id}.name"
|
|
|
|
BASE_MODEL_AUTHOR = "general.base_model.{id}.author"
|
|
|
|
BASE_MODEL_VERSION = "general.base_model.{id}.version"
|
|
|
|
BASE_MODEL_ORGANIZATION = "general.base_model.{id}.organization"
|
2024-11-13 21:10:38 +11:00
|
|
|
BASE_MODEL_DESCRIPTION = "general.base_model.{id}.description"
|
2024-07-18 20:40:15 +10:00
|
|
|
BASE_MODEL_URL = "general.base_model.{id}.url" # Model Website/Paper
|
|
|
|
BASE_MODEL_DOI = "general.base_model.{id}.doi"
|
|
|
|
BASE_MODEL_UUID = "general.base_model.{id}.uuid"
|
|
|
|
BASE_MODEL_REPO_URL = "general.base_model.{id}.repo_url" # Model Source Repository (git/svn/etc...)
|
|
|
|
|
2024-11-13 21:10:38 +11:00
|
|
|
# Dataset Source
|
|
|
|
DATASET_COUNT = "general.dataset.count"
|
|
|
|
DATASET_NAME = "general.dataset.{id}.name"
|
|
|
|
DATASET_AUTHOR = "general.dataset.{id}.author"
|
|
|
|
DATASET_VERSION = "general.dataset.{id}.version"
|
|
|
|
DATASET_ORGANIZATION = "general.dataset.{id}.organization"
|
|
|
|
DATASET_DESCRIPTION = "general.dataset.{id}.description"
|
|
|
|
DATASET_URL = "general.dataset.{id}.url" # Dataset Website/Paper
|
|
|
|
DATASET_DOI = "general.dataset.{id}.doi"
|
|
|
|
DATASET_UUID = "general.dataset.{id}.uuid"
|
|
|
|
DATASET_REPO_URL = "general.dataset.{id}.repo_url" # Dataset Source Repository (git/svn/etc...)
|
|
|
|
|
2024-07-18 20:40:15 +10:00
|
|
|
# Array based KV stores
|
|
|
|
TAGS = "general.tags"
|
|
|
|
LANGUAGES = "general.languages"
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
# Per-architecture model hyperparameter keys. Every value is a template that
# starts with "{arch}."; the placeholder is presumably replaced with the
# architecture name by the caller (e.g. via str.format) — TODO confirm.
class LLM:
|
2024-06-17 22:08:46 +03:00
|
|
|
VOCAB_SIZE = "{arch}.vocab_size"
|
|
|
|
CONTEXT_LENGTH = "{arch}.context_length"
|
|
|
|
EMBEDDING_LENGTH = "{arch}.embedding_length"
|
2024-12-18 19:27:21 +02:00
|
|
|
FEATURES_LENGTH = "{arch}.features_length"
|
2024-06-17 22:08:46 +03:00
|
|
|
BLOCK_COUNT = "{arch}.block_count"
|
|
|
|
LEADING_DENSE_BLOCK_COUNT = "{arch}.leading_dense_block_count"
|
|
|
|
FEED_FORWARD_LENGTH = "{arch}.feed_forward_length"
|
|
|
|
EXPERT_FEED_FORWARD_LENGTH = "{arch}.expert_feed_forward_length"
|
|
|
|
EXPERT_SHARED_FEED_FORWARD_LENGTH = "{arch}.expert_shared_feed_forward_length"
|
|
|
|
USE_PARALLEL_RESIDUAL = "{arch}.use_parallel_residual"
|
|
|
|
TENSOR_DATA_LAYOUT = "{arch}.tensor_data_layout"
|
|
|
|
EXPERT_COUNT = "{arch}.expert_count"
|
|
|
|
EXPERT_USED_COUNT = "{arch}.expert_used_count"
|
|
|
|
EXPERT_SHARED_COUNT = "{arch}.expert_shared_count"
|
|
|
|
EXPERT_WEIGHTS_SCALE = "{arch}.expert_weights_scale"
|
2025-01-04 21:06:11 +01:00
|
|
|
EXPERT_WEIGHTS_NORM = "{arch}.expert_weights_norm"
|
|
|
|
EXPERT_GATING_FUNC = "{arch}.expert_gating_func"
|
2025-04-28 15:52:15 -04:00
|
|
|
MOE_EVERY_N_LAYERS = "{arch}.moe_every_n_layers"
|
2024-06-17 22:08:46 +03:00
|
|
|
POOLING_TYPE = "{arch}.pooling_type"
|
|
|
|
LOGIT_SCALE = "{arch}.logit_scale"
|
2024-06-24 07:06:05 +02:00
|
|
|
DECODER_START_TOKEN_ID = "{arch}.decoder_start_token_id"
|
2024-06-29 20:44:08 -07:00
|
|
|
ATTN_LOGIT_SOFTCAPPING = "{arch}.attn_logit_softcapping"
|
|
|
|
FINAL_LOGIT_SOFTCAPPING = "{arch}.final_logit_softcapping"
|
2024-09-28 12:08:43 +00:00
|
|
|
SWIN_NORM = "{arch}.swin_norm"
|
2024-09-01 22:38:17 +08:00
|
|
|
RESCALE_EVERY_N_LAYERS = "{arch}.rescale_every_n_layers"
|
|
|
|
TIME_MIX_EXTRA_DIM = "{arch}.time_mix_extra_dim"
|
|
|
|
TIME_DECAY_EXTRA_DIM = "{arch}.time_decay_extra_dim"
|
2024-09-17 00:44:58 -06:00
|
|
|
RESIDUAL_SCALE = "{arch}.residual_scale"
|
|
|
|
EMBEDDING_SCALE = "{arch}.embedding_scale"
|
2025-01-10 09:58:08 +08:00
|
|
|
TOKEN_SHIFT_COUNT = "{arch}.token_shift_count"
|
2025-04-07 23:06:44 +02:00
|
|
|
INTERLEAVE_MOE_LAYER_STEP = "{arch}.interleave_moe_layer_step"
|
2025-06-26 19:34:02 +02:00
|
|
|
ACTIVATION_SPARSITY_SCALE = "{arch}.activation_sparsity_scale"
|
|
|
|
ALTUP_ACTIVE_IDX = "{arch}.altup.active_idx"
|
|
|
|
ALTUP_NUM_INPUTS = "{arch}.altup.num_inputs"
|
|
|
|
# Note: the constant name is abbreviated; the key itself spells out "embedding_length_per_layer_input".
EMBD_LENGTH_PER_LAYER_INP = "{arch}.embedding_length_per_layer_input"
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
# Attention-related key templates, all under the "{arch}.attention." prefix.
# "{arch}" is presumably substituted with the architecture name by the
# caller — TODO confirm.
class Attention:
|
2025-03-18 07:27:50 +08:00
|
|
|
HEAD_COUNT = "{arch}.attention.head_count"
|
|
|
|
HEAD_COUNT_KV = "{arch}.attention.head_count_kv"
|
|
|
|
MAX_ALIBI_BIAS = "{arch}.attention.max_alibi_bias"
|
|
|
|
CLAMP_KQV = "{arch}.attention.clamp_kqv"
|
|
|
|
KEY_LENGTH = "{arch}.attention.key_length"
|
|
|
|
VALUE_LENGTH = "{arch}.attention.value_length"
|
|
|
|
LAYERNORM_EPS = "{arch}.attention.layer_norm_epsilon"
|
|
|
|
LAYERNORM_RMS_EPS = "{arch}.attention.layer_norm_rms_epsilon"
|
|
|
|
GROUPNORM_EPS = "{arch}.attention.group_norm_epsilon"
|
|
|
|
GROUPNORM_GROUPS = "{arch}.attention.group_norm_groups"
|
|
|
|
CAUSAL = "{arch}.attention.causal"
|
|
|
|
Q_LORA_RANK = "{arch}.attention.q_lora_rank"
|
|
|
|
KV_LORA_RANK = "{arch}.attention.kv_lora_rank"
|
|
|
|
DECAY_LORA_RANK = "{arch}.attention.decay_lora_rank"
|
|
|
|
ICLR_LORA_RANK = "{arch}.attention.iclr_lora_rank"
|
|
|
|
VALUE_RESIDUAL_MIX_LORA_RANK = "{arch}.attention.value_residual_mix_lora_rank"
|
|
|
|
GATE_LORA_RANK = "{arch}.attention.gate_lora_rank"
|
|
|
|
REL_BUCKETS_COUNT = "{arch}.attention.relative_buckets_count"
|
|
|
|
SLIDING_WINDOW = "{arch}.attention.sliding_window"
|
|
|
|
SCALE = "{arch}.attention.scale"
|
2025-04-15 07:49:57 +01:00
|
|
|
KEY_LENGTH_MLA = "{arch}.attention.key_length_mla"
|
|
|
|
VALUE_LENGTH_MLA = "{arch}.attention.value_length_mla"
|
2025-06-26 19:34:02 +02:00
|
|
|
SHARED_KV_LAYERS = "{arch}.attention.shared_kv_layers"
|
|
|
|
SLIDING_WINDOW_PATTERN = "{arch}.attention.sliding_window_pattern"
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
# RoPE key templates under the "{arch}.rope." prefix; the scaling-related
# keys live one level deeper under "{arch}.rope.scaling.".
# "{arch}" is presumably substituted with the architecture name by the
# caller — TODO confirm.
class Rope:
|
2024-05-22 04:28:32 +08:00
|
|
|
DIMENSION_COUNT = "{arch}.rope.dimension_count"
|
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
|
|
|
DIMENSION_SECTIONS = "{arch}.rope.dimension_sections"
|
2024-05-22 04:28:32 +08:00
|
|
|
FREQ_BASE = "{arch}.rope.freq_base"
|
|
|
|
SCALING_TYPE = "{arch}.rope.scaling.type"
|
|
|
|
SCALING_FACTOR = "{arch}.rope.scaling.factor"
|
|
|
|
SCALING_ATTN_FACTOR = "{arch}.rope.scaling.attn_factor"
|
|
|
|
# Constant name is abbreviated; the key spells out "original_context_length".
SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length"
|
|
|
|
SCALING_FINETUNED = "{arch}.rope.scaling.finetuned"
|
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
|
|
|
SCALING_YARN_LOG_MUL = "{arch}.rope.scaling.yarn_log_multiplier"
|
2023-11-10 22:04:50 -07:00
|
|
|
|
2024-06-24 05:42:03 -04:00
|
|
|
# Key names under the "split." prefix. Unlike the "{arch}."-templated keys
# elsewhere in this file, these are fixed strings with no placeholder.
# The constant names carry an LLM_KV_ prefix that the other key classes do
# not use; renaming would break callers, so it is left as is.
class Split:
|
|
|
|
LLM_KV_SPLIT_NO = "split.no"
|
|
|
|
LLM_KV_SPLIT_COUNT = "split.count"
|
|
|
|
LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"
|
|
|
|
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
|
|
|
class SSM:
|
|
|
|
CONV_KERNEL = "{arch}.ssm.conv_kernel"
|
|
|
|
INNER_SIZE = "{arch}.ssm.inner_size"
|
|
|
|
STATE_SIZE = "{arch}.ssm.state_size"
|
|
|
|
TIME_STEP_RANK = "{arch}.ssm.time_step_rank"
|
llama : initial Mamba-2 support (#9126)
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled -DGGML_SANITIZE_ADDRESS=ON
* cuda : graceful fallback for Mamba-1 models with weird embd size
2025-07-02 13:10:24 -04:00
|
|
|
GROUP_COUNT = "{arch}.ssm.group_count"
|
2024-08-21 12:06:36 +04:00
|
|
|
DT_B_C_RMS = "{arch}.ssm.dt_b_c_rms"
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
|
|
|
|
2024-09-01 22:38:17 +08:00
|
|
|
class WKV:
    """Metadata key template for the WKV section of a model's metadata.

    The string contains a literal ``{arch}`` placeholder; presumably callers
    substitute the architecture name via ``str.format`` -- confirm against
    the call sites.
    """

    HEAD_SIZE = "{arch}.wkv.head_size"
|
|
|
|
|
2024-12-18 19:27:21 +02:00
|
|
|
class PosNet:
    """Metadata key templates under the ``{arch}.posnet.*`` prefix.

    Each string keeps a literal ``{arch}`` placeholder; presumably it is
    filled in with the architecture name by callers -- confirm.
    """

    EMBEDDING_LENGTH = "{arch}.posnet.embedding_length"
    BLOCK_COUNT = "{arch}.posnet.block_count"
|
|
|
|
|
|
|
|
class ConvNext:
    """Metadata key templates under the ``{arch}.convnext.*`` prefix.

    Each string keeps a literal ``{arch}`` placeholder; presumably it is
    filled in with the architecture name by callers -- confirm.
    """

    EMBEDDING_LENGTH = "{arch}.convnext.embedding_length"
    BLOCK_COUNT = "{arch}.convnext.block_count"
|
|
|
|
|
2025-05-29 08:15:01 +02:00
|
|
|
class Classifier:
    """Metadata key template under the ``{arch}.classifier.*`` prefix.

    The string keeps a literal ``{arch}`` placeholder; presumably it is
    filled in with the architecture name by callers -- confirm.
    """

    OUTPUT_LABELS = "{arch}.classifier.output_labels"
|
|
|
|
|
2025-07-11 20:27:01 +02:00
|
|
|
class ShortConv:
    """Metadata key template under the ``{arch}.shortconv.*`` prefix.

    The string keeps a literal ``{arch}`` placeholder; presumably it is
    filled in with the architecture name by callers -- confirm.
    """

    L_CACHE = "{arch}.shortconv.l_cache"
|
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
class Tokenizer:
    """Metadata key names under the ``tokenizer.*`` prefix.

    All values are plain strings. ``CHAT_TEMPLATE_N`` is the only one with a
    placeholder (``{name}``); presumably callers fill it in with the name of
    a chat template -- confirm against the call sites.
    """

    MODEL = "tokenizer.ggml.model"
    PRE = "tokenizer.ggml.pre"
    LIST = "tokenizer.ggml.tokens"
    TOKEN_TYPE = "tokenizer.ggml.token_type"
    TOKEN_TYPE_COUNT = "tokenizer.ggml.token_type_count"  # for BERT-style token types
    SCORES = "tokenizer.ggml.scores"
    MERGES = "tokenizer.ggml.merges"
    BOS_ID = "tokenizer.ggml.bos_token_id"
    EOS_ID = "tokenizer.ggml.eos_token_id"
    EOT_ID = "tokenizer.ggml.eot_token_id"
    EOM_ID = "tokenizer.ggml.eom_token_id"
    UNK_ID = "tokenizer.ggml.unknown_token_id"
    # NOTE(review): "seperator" is the spelling used in the key string itself.
    # It is a runtime value, so it is reproduced unchanged; presumably existing
    # files depend on this exact spelling -- confirm before ever altering it.
    SEP_ID = "tokenizer.ggml.seperator_token_id"
    PAD_ID = "tokenizer.ggml.padding_token_id"
    MASK_ID = "tokenizer.ggml.mask_token_id"
    ADD_BOS = "tokenizer.ggml.add_bos_token"
    ADD_EOS = "tokenizer.ggml.add_eos_token"
    ADD_SEP = "tokenizer.ggml.add_sep_token"
    ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
    REMOVE_EXTRA_WS = "tokenizer.ggml.remove_extra_whitespaces"
    PRECOMPILED_CHARSMAP = "tokenizer.ggml.precompiled_charsmap"
    HF_JSON = "tokenizer.huggingface.json"
    RWKV = "tokenizer.rwkv.world"
    CHAT_TEMPLATE = "tokenizer.chat_template"
    CHAT_TEMPLATE_N = "tokenizer.chat_template.{name}"
    CHAT_TEMPLATES = "tokenizer.chat_templates"

    # FIM/Infill special tokens constants
    FIM_PRE_ID = "tokenizer.ggml.fim_pre_token_id"
    FIM_SUF_ID = "tokenizer.ggml.fim_suf_token_id"
    FIM_MID_ID = "tokenizer.ggml.fim_mid_token_id"
    FIM_PAD_ID = "tokenizer.ggml.fim_pad_token_id"
    FIM_REP_ID = "tokenizer.ggml.fim_rep_token_id"
    FIM_SEP_ID = "tokenizer.ggml.fim_sep_token_id"

    # deprecated:
    PREFIX_ID = "tokenizer.ggml.prefix_token_id"
    SUFFIX_ID = "tokenizer.ggml.suffix_token_id"
    MIDDLE_ID = "tokenizer.ggml.middle_token_id"
|
2023-11-10 22:04:50 -07:00
|
|
|
|
2024-07-15 20:50:47 +02:00
|
|
|
class Adapter:
    """Metadata key names under the ``adapter.*`` prefix."""

    TYPE = "adapter.type"
    LORA_ALPHA = "adapter.lora.alpha"
|
|
|
|
|
2025-05-22 20:42:48 +02:00
|
|
|
class Clip:
    """Metadata key names directly under the ``clip.*`` prefix."""

    PROJECTOR_TYPE = "clip.projector_type"
    HAS_VISION_ENCODER = "clip.has_vision_encoder"
    HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
    HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
|
2025-05-22 20:42:48 +02:00
|
|
|
|
|
|
|
class ClipVision:
    """Metadata key names for the vision side of CLIP (``clip.vision.*``).

    ``USE_GELU`` and ``USE_SILU`` are grouped here although their key strings
    sit directly under ``clip.`` rather than ``clip.vision.``.
    """

    IMAGE_SIZE = "clip.vision.image_size"
    PATCH_SIZE = "clip.vision.patch_size"
    EMBEDDING_LENGTH = "clip.vision.embedding_length"
    FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"
    PROJECTION_DIM = "clip.vision.projection_dim"
    BLOCK_COUNT = "clip.vision.block_count"
    IMAGE_MEAN = "clip.vision.image_mean"
    IMAGE_STD = "clip.vision.image_std"
    SPATIAL_MERGE_SIZE = "clip.vision.spatial_merge_size"
    USE_GELU = "clip.use_gelu"
    USE_SILU = "clip.use_silu"
    N_WA_PATTERN = "clip.vision.n_wa_pattern"  # used by qwen2.5vl

    # Nested so these names do not collide with the audio-side classes of the
    # same name; the key prefixes (clip.vision.attention.* / .projector.*)
    # match this nesting.
    class Attention:
        """Keys under ``clip.vision.attention.*``."""

        HEAD_COUNT = "clip.vision.attention.head_count"
        LAYERNORM_EPS = "clip.vision.attention.layer_norm_epsilon"

    class Projector:
        """Keys under ``clip.vision.projector.*``."""

        SCALE_FACTOR = "clip.vision.projector.scale_factor"
|
|
|
|
|
2025-05-22 20:42:48 +02:00
|
|
|
class ClipAudio:
    """Metadata key names for the audio side of CLIP (``clip.audio.*``)."""

    NUM_MEL_BINS = "clip.audio.num_mel_bins"
    EMBEDDING_LENGTH = "clip.audio.embedding_length"
    FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
    PROJECTION_DIM = "clip.audio.projection_dim"
    BLOCK_COUNT = "clip.audio.block_count"

    # Nested so these names do not collide with the vision-side classes of the
    # same name; the key prefixes (clip.audio.attention.* / .projector.*)
    # match this nesting.
    class Attention:
        """Keys under ``clip.audio.attention.*``."""

        HEAD_COUNT = "clip.audio.attention.head_count"
        LAYERNORM_EPS = "clip.audio.attention.layer_norm_epsilon"

    class Projector:
        """Keys under ``clip.audio.projector.*``."""

        STACK_FACTOR = "clip.audio.projector.stack_factor"
|
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
#
|
|
|
|
# recommended mapping of model tensor names for storage in gguf
|
|
|
|
#
|
|
|
|
|
|
|
|
|
2024-07-15 20:50:47 +02:00
|
|
|
class GGUFType:
    """String constants naming the kinds of GGUF file.

    Presumably these are the values stored under the ``general.type``
    metadata key -- confirm against the writer code.
    """

    MODEL = "model"
    ADAPTER = "adapter"
    MMPROJ = "mmproj"  # dummy, unused for now
|
2024-07-15 20:50:47 +02:00
|
|
|
|
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
class MODEL_ARCH(IntEnum):
|
2025-05-22 20:42:48 +02:00
|
|
|
MMPROJ = auto() # dummy arch for clip.cpp
|
2024-12-18 19:27:21 +02:00
|
|
|
LLAMA = auto()
|
2025-04-07 23:06:44 +02:00
|
|
|
LLAMA4 = auto()
|
2024-12-23 08:22:33 +08:00
|
|
|
DECI = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
FALCON = auto()
|
2025-07-09 12:03:49 +04:00
|
|
|
FALCON_H1 = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
BAICHUAN = auto()
|
|
|
|
GROK = auto()
|
|
|
|
GPT2 = auto()
|
|
|
|
GPTJ = auto()
|
|
|
|
GPTNEOX = auto()
|
|
|
|
MPT = auto()
|
|
|
|
STARCODER = auto()
|
|
|
|
REFACT = auto()
|
|
|
|
BERT = auto()
|
|
|
|
NOMIC_BERT = auto()
|
2025-04-28 15:52:15 -04:00
|
|
|
NOMIC_BERT_MOE = auto()
|
2025-06-16 21:53:41 +09:00
|
|
|
NEO_BERT = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
JINA_BERT_V2 = auto()
|
|
|
|
BLOOM = auto()
|
|
|
|
STABLELM = auto()
|
|
|
|
QWEN = auto()
|
|
|
|
QWEN2 = auto()
|
|
|
|
QWEN2MOE = auto()
|
|
|
|
QWEN2VL = auto()
|
2025-04-09 17:47:36 +08:00
|
|
|
QWEN3 = auto()
|
|
|
|
QWEN3MOE = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
PHI2 = auto()
|
|
|
|
PHI3 = auto()
|
2025-01-09 11:21:41 +01:00
|
|
|
PHIMOE = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
PLAMO = auto()
|
2025-07-16 01:11:42 +09:00
|
|
|
PLAMO2 = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
CODESHELL = auto()
|
|
|
|
ORION = auto()
|
|
|
|
INTERNLM2 = auto()
|
|
|
|
MINICPM = auto()
|
|
|
|
MINICPM3 = auto()
|
|
|
|
GEMMA = auto()
|
|
|
|
GEMMA2 = auto()
|
2025-03-12 09:30:24 +01:00
|
|
|
GEMMA3 = auto()
|
2025-06-26 19:34:02 +02:00
|
|
|
GEMMA3N = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
STARCODER2 = auto()
|
|
|
|
RWKV6 = auto()
|
2025-01-10 09:58:08 +08:00
|
|
|
RWKV6QWEN2 = auto()
|
2025-03-18 07:27:50 +08:00
|
|
|
RWKV7 = auto()
|
|
|
|
ARWKV7 = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
MAMBA = auto()
|
llama : initial Mamba-2 support (#9126)
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled -DGGML_SANITIZE_ADDRESS=ON
* cuda : graceful fallback for Mamba-1 models with weird embd size
2025-07-02 13:10:24 -04:00
|
|
|
MAMBA2 = auto()
|
2025-07-09 14:59:57 -04:00
|
|
|
JAMBA = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
XVERSE = auto()
|
|
|
|
COMMAND_R = auto()
|
2025-01-04 09:33:31 -05:00
|
|
|
COHERE2 = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
DBRX = auto()
|
|
|
|
OLMO = auto()
|
|
|
|
OLMO2 = auto()
|
|
|
|
OLMOE = auto()
|
|
|
|
OPENELM = auto()
|
|
|
|
ARCTIC = auto()
|
|
|
|
DEEPSEEK = auto()
|
|
|
|
DEEPSEEK2 = auto()
|
|
|
|
CHATGLM = auto()
|
2025-04-11 18:10:10 +08:00
|
|
|
GLM4 = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
BITNET = auto()
|
|
|
|
T5 = auto()
|
|
|
|
T5ENCODER = auto()
|
|
|
|
JAIS = auto()
|
|
|
|
NEMOTRON = auto()
|
|
|
|
EXAONE = auto()
|
|
|
|
GRANITE = auto()
|
|
|
|
GRANITE_MOE = auto()
|
model : Granite Four (#13550)
* wip: llama : separate recurrent states from the KV cache
This will be necessary to support Jamba
(and other recurrent models mixed with Attention).
Doesn't compile yet, and finding a slot isn't yet done correctly for recurrent states.
* llama : use std::find for seq_nodes in llama_rs_cache
* llama : state checkpoints for recurrent models
* llama : correctly handle more edge cases for the rs cache
* llama : rename many llama_kv_cache_* functions
* llama : remove useless return value for some llama_cache_* functions
* llama : rethink recurrent state cell counts
* llama : begin work on support for variable GQA
This will also be useful for Jamba if we consider the Mamba layers
to have 0 KV heads.
* llama : gracefully fail when not finding hybrid slot
* llama : support Jamba
* llama : fix BERT inference without KV cache
* convert-hf : check for unprocessed Jamba experts
* convert-hf : support Mini-Jamba conversion
* llama : fix Jamba quantization sanity checks
* llama : sequence-length-aware batch splitting
* llama : use equal-sequence-length sub-batches for recurrent models
* ggml : simplify SSM-related operators
* llama : make recurrent state slot allocation contiguous
* llama : adapt internal uses of batches to llama_ubatch
* llama : fix batch split output count for embeddings
* llama : minimize swaps when reordering logits
This reduces overhead when running hellaswag
on thousands of sequences with very small 100k params Mamba models.
* llama : fix edge case finding batch seq_id of split recurrent cell
This otherwise was a problem when running the HellaSwag benchmark
with small batch sizes, making it crash.
* llama : avoid copies for simple batch splits
* llama : use im2col and mul_mat to perform convolution for Mamba
This removes the need for ggml_ssm_conv!!!
But performance seems slightly worse on my system,
especially for prompt processing.
Maybe ggml_mul_mat isn't optimized for small row sizes?
More performance testing is necessary until GGML_OP_SSM_CONV is removed.
* ggml : make ggml_ssm_scan not modify its source tensors
* llama : fix shared recurrent tail cell count for small ubatch sizes
Otherwise it was impossible to run the 'parallel' example with '-ub 1'
with a Mamba or Jamba model.
* llama : fix .base() compilation error on Windows
* llama : allow doing the equivalent of SSM_CONV with SUM_ROWS and MUL
* ggml : allow GGML_OP_CONCAT to work on non-contiguous tensors
The implementation already supported it,
and this makes Mamba's conv step slightly faster.
* llama : rename llama_cache to llama_past
This can be changed back later if the name change is wrong.
I was renaming the functions anyway to generalize kv-cache-related
functions to hybrid and recurrent model architectures.
I think llama_past is a better name than llama_cache for a combined
kv cache and recurrent state cache, because the states it contains
pretty much always come before the newly-added ones for any particular
sequence. Also 'llama_past_clear' sounds more obvious in what it does
than 'llama_kv_cache_clear'. The future is what the models generate.
(For embeddings, the kv cache isn't really used anyway)
Still, I'm open to better suggestions.
* examples : replace llama_kv_cache_seq_* with llama_past_seq_*
* mamba : fix non-contiguous usage of ggml_silu
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : session saving and reloading for hybrid models
* convert_hf : fix Jamba conversion
* llama : fix mixed signedness comparison
* llama : use unused n_embd_k_gqa in k_shift
This also slightly reduces the diff from the master branch
* llama : begin renaming llama_past back to llama_kv_cache
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* llama : remove implicit recurrent state rollbacks
* llama : partially apply clang-format style
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* feat: Add conversion for Bamba models
This is borrowed and adapted from the original implementation
https://github.com/ggml-org/llama.cpp/pull/10810
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add Granite 4 conversion
This is a manual copy from my draft branch
https://github.com/gabe-l-hart/llama.cpp/blob/GraniteFourDraft/convert_hf_to_gguf.py#L5076
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Plumb bamba through llama-arch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add bamba to llama_arch_is_hybrid_recurrent
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add optional mamba ssm_in bias tensor
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add template specialization for get_arr to load a vector<uint32_t> for layer index arr in hparams
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Use an explicit bool to determine mamba vs mamba2
This allows other architectures like bamba and granitemoehybrid to use
mamba2 without a growing architecture `if` statement inside the mamba
implementation.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Isolate mamba(2) and granite attention layer building in static methods
This will allow these layer-builder methods to be used from other build
structs without complex inheritance.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use per-layer sizes in granite build_attention_layer
Also no need to pass in kv cache since it's already in the inp_attn
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: First (broken) pass at end-to-end Bamba implementation
It generates (garbage) tokens! Still lots of debugging to do.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Only do Granite multipliers if set
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Pull granite ffn portion into a static function and reuse in hybrid
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat(py): Allow gguf duplicate keys if they match by value and type
This is helpful for hybrid models that want to do gguf param setting by
calling multiple parent classes without needing to make those parent
classes try/except on every attempt to set a gguf value.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor(py): Simplify granitemoehybrid conversion to use parents better
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add GRANITE_MOE_HYBRID through llama-arch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Support GRANITE_MOE_HYBRID in llama-model
This re-uses the Bamba code paths heavily and simply adds the missing parts
for loading MoE and the shared expert.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* style: Fix flake8 errors
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix recurrent cache get after rebase
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix hybrid granite implementation for signature changes in build_mamba*_layer
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Refactor relationship between non-hybrid classes and hybrid impl to use mixins
The challenge here is to give both the non-hybrid classes (llm_build_mamba
and llm_build_granite) AND the hybrid class (llm_build_hybrid_mamba) access
to the same intermediate "base class" functionality (build_mamba*_layer,
build_granite_attention_layer) without running into trouble with diamond
inheritance of llm_graph_context. Due to the non-trivial initialization
that happens in llm_graph_context, diamond inheritance results in multiple
initializations of the common base which cause problems around the unique
ptrs. I wanted to get away from `self->` everywhere, but this is still a
bit cleaner than making those methods static I think.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Implement the full copy-paste version to duplicate the layer builders
This follows the pattern where the type of input is pinned to the type of
memory and that is used to dispatch to the correct version of `build_rs` /
`build_attn`. There's a lot of code duplication that can hopefully be
pulled into common functions in the graph later.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Rename llm_build_hybrid_mamba -> llm_build_granite_hybrid
I've gone back and forth a lot about how/if to try to implement reuse of the
"child model" layer types for hybrid models. At the end of the day, I think
hybrid models are their own beast and even if their layers are inspired by
other models, they should maintain control of their own layer building (in
other words, the copy-paste method). Given that, the name should reflect
that this is not a generic hybrid model builder, but rather a granite-
specific hybrid model builder that can do MoE (granite 4) or dense (bamba).
As part of this, I also cleaned up dangling comments from previous attempts
at using static methods for reusability.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled -DGGML_SANITIZE_ADDRESS=ON
* memory : correctly handle failure in apply()
ggml-ci
* style: Remove TODO for adding first hybrid models to the switch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix bad merge in tensor_mapping.py w/ SSM_NORM
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix bad merge resolution with variable renames/moves in llm_build_mamba
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* docs: Fix comment about duplicate key check
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Conform to standard way of initializing inp_out_ids
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* convert : fix jamba conv1d shape squeezing
* fix: Fix input initialization in granite_hybrid after removal of hybrid inputs
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use llm_graph_context_mamba in llm_build_granite_hybrid
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Refactor mamba2/granite/jamba/granite_hybrid relationships as mixins
The key is for the mixin classes (llm_graph_context_mamba,
llm_graph_context_granite) to use virtual inheritance from
llm_graph_context. This allows the common members to exist only once in the
class hierarchy. The downside is that llm_graph_context will be
re-initialized once for each parent (i.e. 2x for single mixin, 3x for two
mixins, etc...).
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* graph : add back hybrid memory graph input
But this time it contains the sub-cache graph inputs.
This *should* make it easier to handle updating the inputs
when caching the graph (eventually).
* model : add Jamba to Mamba-specific hparams printing
* fix: Fix input setup after upstream merge
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* jamba : remove redundant nullptr initializations
* model : remove unnecessary prefix for tensor loading constants
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* model : use ggml_swiglu_split for Mamba
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* feat: Add support for dense FFN in GraniteMoeHybrid
This was already partially supported via reusing the granite ffn builder,
and there may be models that leverage this architecture going forward. The
naming is a bit odd, but in the transformers version, it reuses the same
model class and simply has zero regular experts and a single shared expert
(which is the same as a single dense FFN).
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add support for dense FFN tensor names on c++ side
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use child inputs for Falcon H1 after merge resolution
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unnecessary prefix on tensor constants
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* model : make falcon-h1 use shared mamba2 layer builder
* memory : avoid referring to KV in recurrent cache logs
* fix: Revert order changes for Falcon H1 to stay consistent with upstream
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* gguf-py : avoid adding duplicate tensor mappings for Jamba
Some of the tensor names are common with Llama4
* refactor: Collapse Bamba and GraniteMoeHybrid into GraniteHybrid
The only key difference is the use of rope which is now set via
rope_finetuned in the hparams
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Remove use of diamond inheritance
Per PR discussion, it's simpler to keep this with basic inheritance and not
introduce the complexity of virtual inheritance and multiple inheritance
https://github.com/ggml-org/llama.cpp/pull/13550#issuecomment-3053787556
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Log mamba params for Granite Hybrid
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unused ssm_in_b
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Remove ATTENTION_LAYER_INDICES hparam in favor of n_head_kv
This matches how recurrent vs attention heads are identified for Jamba
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unused template expansion for get_arr
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Review cleanup in convert_hf_to_gguf
The gist is to be explicit about which base class is being used with the
multiple inheritance setup
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Undo hidden warnings about duplicate identical keys in add_key_value
After further discussion, this encourages sloppy overwriting in the model
converters
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: If not using ROPE, context is "infinite"
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* doc: Add a comment outlining expected duplicate key warnings
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unnecessary duplicate keys in converter
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
(thanks for the sharp eyes and patience!)
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---------
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-07-10 18:20:13 -06:00
|
|
|
GRANITE_HYBRID = auto()
|
2024-12-18 19:27:21 +02:00
|
|
|
CHAMELEON = auto()
|
|
|
|
WAVTOKENIZER_DEC = auto()
|
2025-03-27 10:49:15 +00:00
|
|
|
PLM = auto()
|
2025-03-30 22:21:03 +02:00
|
|
|
BAILINGMOE = auto()
|
2025-06-15 00:52:06 -07:00
|
|
|
DOTS1 = auto()
|
2025-06-16 00:04:06 +01:00
|
|
|
ARCEE = auto()
|
2025-06-28 22:08:21 +08:00
|
|
|
ERNIE4_5 = auto()
|
2025-07-08 10:24:06 +02:00
|
|
|
HUNYUAN_MOE = auto()
|
2025-07-08 18:07:01 +02:00
|
|
|
SMOLLM3 = auto()
|
2025-07-11 20:27:01 +02:00
|
|
|
LFM2 = auto()
|
2025-07-16 20:03:51 +08:00
|
|
|
DREAM = auto()
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
|
2025-04-20 23:29:36 +02:00
|
|
|
class VISION_PROJECTOR_TYPE(IntEnum):
|
|
|
|
MLP = auto()
|
|
|
|
LDP = auto()
|
|
|
|
LDPV2 = auto()
|
|
|
|
RESAMPLER = auto()
|
|
|
|
GLM_EDGE = auto()
|
|
|
|
MERGER = auto()
|
|
|
|
GEMMA3 = auto()
|
|
|
|
|
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
class MODEL_TENSOR(IntEnum):
|
2024-06-24 07:06:05 +02:00
|
|
|
TOKEN_EMBD = auto()
|
|
|
|
TOKEN_EMBD_NORM = auto()
|
|
|
|
TOKEN_TYPES = auto()
|
|
|
|
POS_EMBD = auto()
|
|
|
|
OUTPUT = auto()
|
|
|
|
OUTPUT_NORM = auto()
|
|
|
|
ROPE_FREQS = auto()
|
|
|
|
ROPE_FACTORS_LONG = auto()
|
|
|
|
ROPE_FACTORS_SHORT = auto()
|
|
|
|
ATTN_Q = auto()
|
|
|
|
ATTN_K = auto()
|
|
|
|
ATTN_V = auto()
|
|
|
|
ATTN_QKV = auto()
|
|
|
|
ATTN_OUT = auto()
|
|
|
|
ATTN_NORM = auto()
|
|
|
|
ATTN_NORM_2 = auto()
|
|
|
|
ATTN_OUT_NORM = auto()
|
2024-06-28 00:00:43 -04:00
|
|
|
ATTN_POST_NORM = auto()
|
2024-06-24 07:06:05 +02:00
|
|
|
ATTN_ROT_EMBD = auto()
|
|
|
|
FFN_GATE_INP = auto()
|
|
|
|
FFN_GATE_INP_SHEXP = auto()
|
|
|
|
FFN_NORM = auto()
|
2024-06-28 00:00:43 -04:00
|
|
|
FFN_PRE_NORM = auto()
|
|
|
|
FFN_POST_NORM = auto()
|
2024-06-24 07:06:05 +02:00
|
|
|
FFN_GATE = auto()
|
|
|
|
FFN_DOWN = auto()
|
|
|
|
FFN_UP = auto()
|
|
|
|
FFN_ACT = auto()
|
|
|
|
FFN_NORM_EXP = auto()
|
|
|
|
FFN_GATE_EXP = auto()
|
|
|
|
FFN_DOWN_EXP = auto()
|
|
|
|
FFN_UP_EXP = auto()
|
|
|
|
FFN_GATE_SHEXP = auto()
|
|
|
|
FFN_DOWN_SHEXP = auto()
|
|
|
|
FFN_UP_SHEXP = auto()
|
2025-01-04 21:06:11 +01:00
|
|
|
FFN_EXP_PROBS_B = auto()
|
2024-06-24 07:06:05 +02:00
|
|
|
ATTN_Q_NORM = auto()
|
|
|
|
ATTN_K_NORM = auto()
|
|
|
|
LAYER_OUT_NORM = auto()
|
2025-06-26 19:34:02 +02:00
|
|
|
PER_LAYER_TOKEN_EMBD = auto() # gemma3n
|
|
|
|
PER_LAYER_MODEL_PROJ = auto() # gemma3n
|
|
|
|
PER_LAYER_INP_GATE = auto() # gemma3n
|
|
|
|
PER_LAYER_PROJ = auto() # gemma3n
|
|
|
|
PER_LAYER_PROJ_NORM = auto() # gemma3n
|
|
|
|
PER_LAYER_POST_NORM = auto() # gemma3n
|
|
|
|
ALTUP_PROJ = auto() # gemma3n
|
|
|
|
ALTUP_UNEMBD_PROJ = auto() # gemma3n
|
|
|
|
ALTUP_CORRECT_COEF = auto() # gemma3n
|
|
|
|
ALTUP_CORRECT_SCALE = auto() # gemma3n
|
|
|
|
ALTUP_PREDICT_COEF = auto() # gemma3n
|
|
|
|
ALTUP_ROUTER = auto() # gemma3n
|
|
|
|
ALTUP_ROUTER_NORM = auto() # gemma3n
|
|
|
|
LAUREL_L = auto() # gemma3n
|
|
|
|
LAUREL_R = auto() # gemma3n
|
|
|
|
LAUREL_POST_NORM = auto() # gemma3n
|
2024-06-24 07:06:05 +02:00
|
|
|
SSM_IN = auto()
|
|
|
|
SSM_CONV1D = auto()
|
|
|
|
SSM_X = auto()
|
|
|
|
SSM_DT = auto()
|
2025-07-09 14:59:57 -04:00
|
|
|
SSM_DT_NORM = auto()
|
2024-06-24 07:06:05 +02:00
|
|
|
SSM_A = auto()
|
2025-07-09 14:59:57 -04:00
|
|
|
SSM_B_NORM = auto()
|
|
|
|
SSM_C_NORM = auto()
|
2024-06-24 07:06:05 +02:00
|
|
|
SSM_D = auto()
|
llama : initial Mamba-2 support (#9126)
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled -DGGML_SANITIZE_ADDRESS=ON
* cuda : graceful fallback for Mamba-1 models with weird embd size
2025-07-02 13:10:24 -04:00
|
|
|
SSM_NORM = auto()
|
2024-06-24 07:06:05 +02:00
|
|
|
SSM_OUT = auto()
|
2025-03-18 07:27:50 +08:00
|
|
|
TIME_MIX_W0 = auto()
|
2024-09-01 22:38:17 +08:00
|
|
|
TIME_MIX_W1 = auto()
|
|
|
|
TIME_MIX_W2 = auto()
|
2025-03-18 07:27:50 +08:00
|
|
|
TIME_MIX_A0 = auto()
|
|
|
|
TIME_MIX_A1 = auto()
|
|
|
|
TIME_MIX_A2 = auto()
|
|
|
|
TIME_MIX_V0 = auto()
|
|
|
|
TIME_MIX_V1 = auto()
|
|
|
|
TIME_MIX_V2 = auto()
|
|
|
|
TIME_MIX_G1 = auto()
|
|
|
|
TIME_MIX_G2 = auto()
|
|
|
|
TIME_MIX_K_K = auto()
|
|
|
|
TIME_MIX_K_A = auto()
|
|
|
|
TIME_MIX_R_K = auto()
|
2024-09-01 22:38:17 +08:00
|
|
|
TIME_MIX_LERP_X = auto()
|
|
|
|
TIME_MIX_LERP_K = auto()
|
|
|
|
TIME_MIX_LERP_V = auto()
|
|
|
|
TIME_MIX_LERP_R = auto()
|
|
|
|
TIME_MIX_LERP_G = auto()
|
2025-01-10 09:58:08 +08:00
|
|
|
TIME_MIX_LERP_FUSED = auto()
|
2024-09-01 22:38:17 +08:00
|
|
|
TIME_MIX_LERP_W = auto()
|
|
|
|
TIME_MIX_FIRST = auto()
|
|
|
|
TIME_MIX_DECAY = auto()
|
|
|
|
TIME_MIX_DECAY_W1 = auto()
|
|
|
|
TIME_MIX_DECAY_W2 = auto()
|
|
|
|
TIME_MIX_KEY = auto()
|
|
|
|
TIME_MIX_VALUE = auto()
|
|
|
|
TIME_MIX_RECEPTANCE = auto()
|
|
|
|
TIME_MIX_GATE = auto()
|
|
|
|
TIME_MIX_LN = auto()
|
|
|
|
TIME_MIX_OUTPUT = auto()
|
|
|
|
CHANNEL_MIX_LERP_K = auto()
|
|
|
|
CHANNEL_MIX_LERP_R = auto()
|
|
|
|
CHANNEL_MIX_KEY = auto()
|
|
|
|
CHANNEL_MIX_RECEPTANCE = auto()
|
|
|
|
CHANNEL_MIX_VALUE = auto()
|
2024-06-24 07:06:05 +02:00
|
|
|
ATTN_Q_A = auto()
|
|
|
|
ATTN_Q_B = auto()
|
|
|
|
ATTN_KV_A_MQA = auto()
|
|
|
|
ATTN_KV_B = auto()
|
2025-04-15 07:49:57 +01:00
|
|
|
ATTN_K_B = auto()
|
|
|
|
ATTN_V_B = auto()
|
2024-06-24 07:06:05 +02:00
|
|
|
ATTN_Q_A_NORM = auto()
|
|
|
|
ATTN_KV_A_NORM = auto()
|
|
|
|
FFN_SUB_NORM = auto()
|
|
|
|
ATTN_SUB_NORM = auto()
|
|
|
|
DEC_ATTN_NORM = auto()
|
|
|
|
DEC_ATTN_Q = auto()
|
|
|
|
DEC_ATTN_K = auto()
|
|
|
|
DEC_ATTN_V = auto()
|
|
|
|
DEC_ATTN_OUT = auto()
|
|
|
|
DEC_ATTN_REL_B = auto()
|
|
|
|
DEC_CROSS_ATTN_NORM = auto()
|
|
|
|
DEC_CROSS_ATTN_Q = auto()
|
|
|
|
DEC_CROSS_ATTN_K = auto()
|
|
|
|
DEC_CROSS_ATTN_V = auto()
|
|
|
|
DEC_CROSS_ATTN_OUT = auto()
|
|
|
|
DEC_CROSS_ATTN_REL_B = auto()
|
|
|
|
DEC_FFN_NORM = auto()
|
|
|
|
DEC_FFN_GATE = auto()
|
|
|
|
DEC_FFN_DOWN = auto()
|
|
|
|
DEC_FFN_UP = auto()
|
|
|
|
DEC_OUTPUT_NORM = auto()
|
|
|
|
ENC_ATTN_NORM = auto()
|
|
|
|
ENC_ATTN_Q = auto()
|
|
|
|
ENC_ATTN_K = auto()
|
|
|
|
ENC_ATTN_V = auto()
|
|
|
|
ENC_ATTN_OUT = auto()
|
|
|
|
ENC_ATTN_REL_B = auto()
|
|
|
|
ENC_FFN_NORM = auto()
|
|
|
|
ENC_FFN_GATE = auto()
|
|
|
|
ENC_FFN_DOWN = auto()
|
|
|
|
ENC_FFN_UP = auto()
|
|
|
|
ENC_OUTPUT_NORM = auto()
|
2024-09-28 17:42:03 +03:00
|
|
|
CLS = auto() # classifier
|
|
|
|
CLS_OUT = auto() # classifier output projection
|
2024-12-18 19:27:21 +02:00
|
|
|
CONV1D = auto()
|
|
|
|
CONVNEXT_DW = auto()
|
|
|
|
CONVNEXT_NORM = auto()
|
|
|
|
CONVNEXT_PW1 = auto()
|
|
|
|
CONVNEXT_PW2 = auto()
|
|
|
|
CONVNEXT_GAMMA = auto()
|
|
|
|
POSNET_CONV1 = auto()
|
|
|
|
POSNET_CONV2 = auto()
|
|
|
|
POSNET_NORM = auto()
|
|
|
|
POSNET_NORM1 = auto()
|
|
|
|
POSNET_NORM2 = auto()
|
|
|
|
POSNET_ATTN_NORM = auto()
|
|
|
|
POSNET_ATTN_Q = auto()
|
|
|
|
POSNET_ATTN_K = auto()
|
|
|
|
POSNET_ATTN_V = auto()
|
|
|
|
POSNET_ATTN_OUT = auto()
|
2025-07-11 20:27:01 +02:00
|
|
|
SHORTCONV_CONV = auto()
|
|
|
|
SHORTCONV_INPROJ = auto()
|
|
|
|
SHORTCONV_OUTPROJ = auto()
|
2025-04-20 23:29:36 +02:00
|
|
|
# vision
|
|
|
|
V_MMPROJ = auto()
|
|
|
|
V_MMPROJ_FC = auto()
|
|
|
|
V_MMPROJ_MLP = auto()
|
|
|
|
V_MMPROJ_PEG = auto()
|
|
|
|
V_ENC_EMBD_CLS = auto()
|
|
|
|
V_ENC_EMBD_PATCH = auto()
|
|
|
|
V_ENC_EMBD_POS = auto()
|
2025-05-19 13:04:14 +02:00
|
|
|
V_ENC_INPUT_NORM = auto()
|
2025-04-20 23:29:36 +02:00
|
|
|
V_ENC_ATTN_Q = auto()
|
2025-05-11 11:35:52 +02:00
|
|
|
V_ENC_ATTN_Q_NORM = auto()
|
2025-04-20 23:29:36 +02:00
|
|
|
V_ENC_ATTN_K = auto()
|
2025-05-11 11:35:52 +02:00
|
|
|
V_ENC_ATTN_K_NORM = auto()
|
2025-04-20 23:29:36 +02:00
|
|
|
V_ENC_ATTN_V = auto()
|
2025-05-19 13:04:14 +02:00
|
|
|
V_ENC_ATTN_O = auto()
|
|
|
|
V_ENC_ATTN_O_NORM = auto()
|
|
|
|
V_ENC_POST_ATTN_NORM = auto()
|
2025-04-20 23:29:36 +02:00
|
|
|
V_ENC_FFN_UP = auto()
|
2025-04-23 20:21:59 +02:00
|
|
|
V_ENC_FFN_GATE = auto()
|
2025-04-20 23:29:36 +02:00
|
|
|
V_ENC_FFN_DOWN = auto()
|
2025-05-10 16:26:42 +02:00
|
|
|
V_LAYER_SCALE_1 = auto()
|
|
|
|
V_LAYER_SCALE_2 = auto()
|
2025-04-20 23:29:36 +02:00
|
|
|
V_PRE_NORM = auto()
|
|
|
|
V_POST_NORM = auto()
|
2025-05-01 17:05:42 +02:00
|
|
|
V_MM_INP_NORM = auto()
|
2025-04-20 23:29:36 +02:00
|
|
|
V_MM_INP_PROJ = auto() # gemma3
|
|
|
|
V_MM_SOFT_EMB_NORM = auto() # gemma3
|
|
|
|
V_RESMPL_POS_EMBD_K = auto() # minicpmv
|
|
|
|
V_RESMPL_ATTN_Q = auto() # minicpmv
|
|
|
|
V_RESMPL_ATTN_K = auto() # minicpmv
|
|
|
|
V_RESMPL_ATTN_V = auto() # minicpmv
|
|
|
|
V_RESMPL_ATTN_OUT = auto() # minicpmv
|
|
|
|
V_RESMPL_KV = auto() # minicpmv
|
|
|
|
V_RESMPL_KV_NORM = auto() # minicpmv
|
|
|
|
V_RESMPL_POST_NORM = auto() # minicpmv
|
|
|
|
V_RESMPL_Q_NORM = auto() # minicpmv
|
|
|
|
V_RESMPL_PROJ = auto() # minicpmv
|
|
|
|
V_RESMPL_QUERY = auto() # minicpmv
|
2025-04-23 20:21:59 +02:00
|
|
|
V_TOK_EMBD_IMG_BREAK = auto() # pixtral
|
2025-05-01 17:05:42 +02:00
|
|
|
V_MM_PATCH_MERGER = auto() # mistral small 3.1
|
2025-05-22 20:42:48 +02:00
|
|
|
# audio (mtmd)
|
|
|
|
A_ENC_EMBD_POS = auto()
|
|
|
|
A_ENC_CONV1D = auto()
|
|
|
|
A_PRE_NORM = auto()
|
|
|
|
A_POST_NORM = auto()
|
|
|
|
A_ENC_ATTN_Q = auto()
|
|
|
|
A_ENC_ATTN_K = auto()
|
|
|
|
A_ENC_ATTN_V = auto()
|
|
|
|
A_ENC_INPUT_NORM = auto()
|
|
|
|
A_ENC_OUTPUT = auto()
|
|
|
|
A_ENC_OUTPUT_NORM = auto()
|
|
|
|
A_ENC_FFN_UP = auto()
|
|
|
|
A_ENC_FFN_GATE = auto()
|
|
|
|
A_ENC_FFN_DOWN = auto()
|
|
|
|
A_MMPROJ = auto()
|
2025-05-25 14:06:32 +02:00
|
|
|
A_MMPROJ_FC = auto()
|
2025-05-22 20:42:48 +02:00
|
|
|
A_MM_NORM_PRE = auto()
|
|
|
|
A_MM_NORM_MID = auto()
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
|
|
|
|
MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
|
2025-05-22 20:42:48 +02:00
|
|
|
MODEL_ARCH.MMPROJ: "clip", # dummy arch for clip.cpp
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.LLAMA: "llama",
|
2025-04-07 23:06:44 +02:00
|
|
|
MODEL_ARCH.LLAMA4: "llama4",
|
2024-12-23 08:22:33 +08:00
|
|
|
MODEL_ARCH.DECI: "deci",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.FALCON: "falcon",
|
|
|
|
MODEL_ARCH.BAICHUAN: "baichuan",
|
|
|
|
MODEL_ARCH.GROK: "grok",
|
|
|
|
MODEL_ARCH.GPT2: "gpt2",
|
|
|
|
MODEL_ARCH.GPTJ: "gptj",
|
|
|
|
MODEL_ARCH.GPTNEOX: "gptneox",
|
|
|
|
MODEL_ARCH.MPT: "mpt",
|
|
|
|
MODEL_ARCH.STARCODER: "starcoder",
|
|
|
|
MODEL_ARCH.REFACT: "refact",
|
|
|
|
MODEL_ARCH.BERT: "bert",
|
|
|
|
MODEL_ARCH.NOMIC_BERT: "nomic-bert",
|
2025-04-28 15:52:15 -04:00
|
|
|
MODEL_ARCH.NOMIC_BERT_MOE: "nomic-bert-moe",
|
2025-06-16 21:53:41 +09:00
|
|
|
MODEL_ARCH.NEO_BERT: "neo-bert",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.JINA_BERT_V2: "jina-bert-v2",
|
|
|
|
MODEL_ARCH.BLOOM: "bloom",
|
|
|
|
MODEL_ARCH.STABLELM: "stablelm",
|
|
|
|
MODEL_ARCH.QWEN: "qwen",
|
|
|
|
MODEL_ARCH.QWEN2: "qwen2",
|
|
|
|
MODEL_ARCH.QWEN2MOE: "qwen2moe",
|
|
|
|
MODEL_ARCH.QWEN2VL: "qwen2vl",
|
2025-04-09 17:47:36 +08:00
|
|
|
MODEL_ARCH.QWEN3: "qwen3",
|
|
|
|
MODEL_ARCH.QWEN3MOE: "qwen3moe",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.PHI2: "phi2",
|
|
|
|
MODEL_ARCH.PHI3: "phi3",
|
2025-01-09 11:21:41 +01:00
|
|
|
MODEL_ARCH.PHIMOE: "phimoe",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.PLAMO: "plamo",
|
2025-07-16 01:11:42 +09:00
|
|
|
MODEL_ARCH.PLAMO2: "plamo2",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.CODESHELL: "codeshell",
|
|
|
|
MODEL_ARCH.ORION: "orion",
|
|
|
|
MODEL_ARCH.INTERNLM2: "internlm2",
|
|
|
|
MODEL_ARCH.MINICPM: "minicpm",
|
|
|
|
MODEL_ARCH.MINICPM3: "minicpm3",
|
|
|
|
MODEL_ARCH.GEMMA: "gemma",
|
|
|
|
MODEL_ARCH.GEMMA2: "gemma2",
|
2025-03-12 09:30:24 +01:00
|
|
|
MODEL_ARCH.GEMMA3: "gemma3",
|
2025-06-26 19:34:02 +02:00
|
|
|
MODEL_ARCH.GEMMA3N: "gemma3n",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.STARCODER2: "starcoder2",
|
|
|
|
MODEL_ARCH.RWKV6: "rwkv6",
|
2025-01-10 09:58:08 +08:00
|
|
|
MODEL_ARCH.RWKV6QWEN2: "rwkv6qwen2",
|
2025-03-18 07:27:50 +08:00
|
|
|
MODEL_ARCH.RWKV7: "rwkv7",
|
|
|
|
MODEL_ARCH.ARWKV7: "arwkv7",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.MAMBA: "mamba",
|
llama : initial Mamba-2 support (#9126)
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled -DGGML_SANITIZE_ADDRESS=ON
* cuda : graceful fallback for Mamba-1 models with weird embd size
2025-07-02 13:10:24 -04:00
|
|
|
MODEL_ARCH.MAMBA2: "mamba2",
|
2025-07-09 14:59:57 -04:00
|
|
|
MODEL_ARCH.JAMBA: "jamba",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.XVERSE: "xverse",
|
|
|
|
MODEL_ARCH.COMMAND_R: "command-r",
|
2025-01-04 09:33:31 -05:00
|
|
|
MODEL_ARCH.COHERE2: "cohere2",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.DBRX: "dbrx",
|
|
|
|
MODEL_ARCH.OLMO: "olmo",
|
|
|
|
MODEL_ARCH.OLMO2: "olmo2",
|
|
|
|
MODEL_ARCH.OLMOE: "olmoe",
|
|
|
|
MODEL_ARCH.OPENELM: "openelm",
|
|
|
|
MODEL_ARCH.ARCTIC: "arctic",
|
|
|
|
MODEL_ARCH.DEEPSEEK: "deepseek",
|
|
|
|
MODEL_ARCH.DEEPSEEK2: "deepseek2",
|
|
|
|
MODEL_ARCH.CHATGLM: "chatglm",
|
2025-04-11 18:10:10 +08:00
|
|
|
MODEL_ARCH.GLM4: "glm4",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.BITNET: "bitnet",
|
|
|
|
MODEL_ARCH.T5: "t5",
|
|
|
|
MODEL_ARCH.T5ENCODER: "t5encoder",
|
|
|
|
MODEL_ARCH.JAIS: "jais",
|
|
|
|
MODEL_ARCH.NEMOTRON: "nemotron",
|
|
|
|
MODEL_ARCH.EXAONE: "exaone",
|
|
|
|
MODEL_ARCH.GRANITE: "granite",
|
|
|
|
MODEL_ARCH.GRANITE_MOE: "granitemoe",
|
model : Granite Four (#13550)
* wip: llama : separate recurrent states from the KV cache
This will be necessary to support Jamba
(and other recurrent models mixed with Attention).
Doesn't compile yet, and finding a slot isn't yet done correctly for recurrent states.
* llama : use std::find for seq_nodes in llama_rs_cache
* llama : state checkpoints for recurrent models
* llama : correctly handle more edge cases for the rs cache
* llama : rename many llama_kv_cache_* functions
* llama : remove useless return value for some llama_cache_* functions
* llama : rethink recurrent state cell counts
* llama : begin work on support for variable GQA
This will also be useful for Jamba if we consider the Mamba layers
to have 0 KV heads.
* llama : gracefully fail when not finding hybrid slot
* llama : support Jamba
* llama : fix BERT inference without KV cache
* convert-hf : check for unprocessed Jamba experts
* convert-hf : support Mini-Jamba conversion
* llama : fix Jamba quantization sanity checks
* llama : sequence-length-aware batch splitting
* llama : use equal-sequence-length sub-batches for recurrent models
* ggml : simplify SSM-related operators
* llama : make recurrent state slot allocation contiguous
* llama : adapt internal uses of batches to llama_ubatch
* llama : fix batch split output count for embeddings
* llama : minimize swaps when reordering logits
This reduces overhead when running hellaswag
on thousands of sequences with very small 100k params Mamba models.
* llama : fix edge case finding batch seq_id of split recurrent cell
This otherwise was a problem when running the HellaSwag benchmark
with small batch sizes, making it crash.
* llama : avoid copies for simple batch splits
* llama : use im2col and mul_mat to perform convolution for Mamba
This removes the need for ggml_ssm_conv!!!
But performance seems slightly worse on my system,
especially for prompt processing.
Maybe ggml_mul_mat isn't optimized for small row sizes?
More performance testing is necessary until GGML_OP_SSM_CONV is removed.
* ggml : make ggml_ssm_scan not modify its source tensors
* llama : fix shared recurrent tail cell count for small ubatch sizes
Otherwise it was impossible to run the 'parallel' example with '-ub 1'
with a Mamba or Jamba model.
* llama : fix .base() compilation error on Windows
* llama : allow doing the equivalent of SSM_CONV with SUM_ROWS and MUL
* ggml : allow GGML_OP_CONCAT to work on non-contiguous tensors
The implementation already supported it,
and this makes Mamba's conv step slightly faster.
* llama : rename llama_cache to llama_past
This can be changed back later if the name change is wrong.
I was renaming the functions anyway to generalize kv-cache-related
functions to hybrid and recurrent model architectures.
I think llama_past is a better name than llama_cache for a combined
kv cache and recurrent state cache, because the states it contains
pretty much always come before the newly-added ones for any particular
sequence. Also 'llama_past_clear' sounds more obvious in what it does
than 'llama_kv_cache_clear'. The future is what the models generate.
(For embeddings, the kv cache isn't really used anyway)
Still, I'm open to better suggestions.
* examples : replace llama_kv_cache_seq_* with llama_past_seq_*
* mamba : fix non-contiguous usage of ggml_silu
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : session saving and reloading for hybrid models
* convert_hf : fix Jamba conversion
* llama : fix mixed signedness comparison
* llama : use unused n_embd_k_gqa in k_shift
This also slightly reduces the diff from the master branch
* llama : begin renaming llama_past back to llama_kv_cache
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* llama : remove implicit recurrent state rollbacks
* llama : partially apply clang-format style
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* feat: Add conversion for Bamba models
This is borrowed and adapted from the original implementation
https://github.com/ggml-org/llama.cpp/pull/10810
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add Granite 4 conversion
This is a manual copy from my draft branch
https://github.com/gabe-l-hart/llama.cpp/blob/GraniteFourDraft/convert_hf_to_gguf.py#L5076
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Plumb bamba through llama-arch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add bamba to llama_arch_is_hybrid_recurrent
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add optional mamba ssm_in bias tensor
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add template specialization for get_arr to load a vector<uint32_t> for layer index arr in hparams
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Use an explicit bool to determine mamba vs mamba2
This allows other architectures like bamba and granitemoehybrid to use
mamba2 without a growing architecture `if` statement inside the mamba
implementation.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Isolate mamba(2) and granite attention layer building in static methods
This will allow these layer-builder methods to be used from other build
structs without complex inheritance.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use per-layer sizes in granite build_attention_layer
Also no need to pass in kv cache since it's already in the inp_attn
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: First (broken) pass at end-to-end Bamba implementation
It generates (garbage) tokens! Still lots of debugging to do.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Only do Granite multipliers if set
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Pull granite ffn portion into a static function and reuse in hybrid
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat(py): Allow gguf duplicate keys if they match by value and type
This is helpful for hybrid models that want to do gguf param setting by
calling multiple parent classes without needing to make those parent
classes try/except on every attempt to set a gguf value.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor(py): Simplify granitemoehybrid conversion to use parents better
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add GRANITE_MOE_HYBRID through llama-arch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Support GRANITE_MOE_HYBRID in llama-model
This re-uses the Bamba code paths heavily and simply adds the missing parts
for loading MoE and the shared expert.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* style: Fix flake8 errors
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix recurrent cache get after rebase
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix hybrid granite implementation for signature changes in build_mamba*_layer
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Refactor relationship between non-hybrid classes and hybrid impl to use mixins
The challenge here is to give both the non-hybrid classes (llm_build_mamba
and llm_build_granite) AND the hybrid class (llm_build_hybrid_mamba) access
to the same intermediate "base class" functionality (build_mamba*_layer,
build_granite_attention_layer) without running into trouble with diamond
inheritance of llm_graph_context. Due to the non-trivial initialization
that happens in llm_graph_context, diamond inheritance results in multiple
initializations of the common base which cause problems around the unique
ptrs. I wanted to get away from `self->` everywhere, but this is still a
bit cleaner than making those methods static I think.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Implement the full copy-paste version to duplicate the layer builders
This follows the pattern where the type of input is pinned to the type of
memory and that is used to dispatch to the correct version of `build_rs` /
`build_attn`. There's a lot of code duplication that can hopefully be
pulled into common functions in the graph later.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Rename llm_build_hybrid_mamba -> llm_build_granite_hybrid
I've gone back and forth a lot about how/if to try to implement reuse of the
"child model" layer types for hybrid models. At the end of the day, I think
hybrid models are their own beast and even if their layers are inspired by
other models, they should maintain control of their own layer building (in
other words, the copy-paste method). Given that, the name should reflect
that this is not a generic hybrid model builder, but rather a granite-
specific hybrid model builder that can do MoE (granite 4) or dense (bamba).
As part of this, I also cleaned up dangling comments from previous attempts
at using static methods for reusability.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled with -DGGML_SANITIZE_ADDRESS=ON
* memory : correctly handle failure in apply()
ggml-ci
* style: Remove TODO for adding first hybrid models to the switch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix bad merge in tensor_mapping.py w/ SSM_NORM
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix bad merge resolution with variable renames/moves in llm_build_mamba
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* docs: Fix comment about duplicate key check
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Conform to standard way of initializing inp_out_ids
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* convert : fix jamba conv1d shape squeezing
* fix: Fix input initialization in granite_hybrid after removal of hybrid inputs
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use llm_graph_context_mamba in llm_build_granite_hybrid
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Refactor mamba2/granite/jamba/granite_hybrid relationships as mixins
The key is for the mixin classes (llm_graph_context_mamba,
llm_graph_context_granite) to use virtual inheritance from
llm_graph_context. This allows the common members to exist only once in the
class hierarchy. The downside is that llm_graph_context will be
re-initialized once for each parent (ie 2x for single mixin, 3x for two
mixins, etc...).
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* graph : add back hybrid memory graph input
But this time it contains the sub-cache graph inputs.
This *should* make it easier to handle updating the inputs
when caching the graph (eventually).
* model : add Jamba to Mamba-specific hparams printing
* fix: Fix input setup after upstream merge
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* jamba : remove redundant nullptr initializations
* model : remove unnecessary prefix for tensor loading constants
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* model : use ggml_swiglu_split for Mamba
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* feat: Add support for dense FFN in GraniteMoeHybrid
This was already partially supported via reusing the granite ffn builder,
and there may be models that leverage this architecture going forward. The
naming is a bit odd, but in the transformers version, it reuses the same
model class and simply has zero regular experts and a single shared expert
(which is the same as a single dense FFN).
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add support for dense FFN tensor names on c++ side
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use child inputs for Falcon H1 after merge resolution
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unnecessary prefix on tensor constants
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* model : make falcon-h1 use shared mamba2 layer builder
* memory : avoid referring to KV in recurrent cache logs
* fix: Revert order changes for Falcon H1 to stay consistent with upstream
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* gguf-py : avoid adding duplicate tensor mappings for Jamba
Some of the tensor names are common with Llama4
* refactor: Collapse Bamba and GraniteMoeHybrid into GraniteHybrid
The only key difference is the use of rope which is now set via
rope_finetuned in the hparams
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Remove use of diamond inheritance
Per PR discussion, it's simpler to keep this with basic inheritance and not
introduce the complexity of virtual inheritance and multiple inheritance
https://github.com/ggml-org/llama.cpp/pull/13550#issuecomment-3053787556
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Log mamba params for Granite Hybrid
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unused ssm_in_b
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Remove ATTENTION_LAYER_INDICES hparam in favor of n_head_kv
This matches how recurrent vs attention heads are identified for Jamba
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unused template expansion for get_arr
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Review cleanup in convert_hf_to_gguf
The gist is to be explicit about which base class is being used with the
multiple inheritance setup
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Undo hidden warnings about duplicate identical keys in add_key_value
After further discussion, this encourages sloppy overwriting in the model
converters
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: If not using ROPE, context is "infinite"
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* doc: Add a comment outlining expected duplicate key warnings
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unnecessary duplicate keys in converter
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
(thanks for the sharp eyes and patience!)
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---------
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-07-10 18:20:13 -06:00
|
|
|
MODEL_ARCH.GRANITE_HYBRID: "granitehybrid",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.CHAMELEON: "chameleon",
|
|
|
|
MODEL_ARCH.WAVTOKENIZER_DEC: "wavtokenizer-dec",
|
2025-03-27 10:49:15 +00:00
|
|
|
MODEL_ARCH.PLM: "plm",
|
2025-03-30 22:21:03 +02:00
|
|
|
MODEL_ARCH.BAILINGMOE: "bailingmoe",
|
2025-06-16 00:04:06 +01:00
|
|
|
MODEL_ARCH.DOTS1: "dots1",
|
|
|
|
MODEL_ARCH.ARCEE: "arcee",
|
2025-06-28 22:08:21 +08:00
|
|
|
MODEL_ARCH.ERNIE4_5: "ernie4_5",
|
2025-07-09 12:03:49 +04:00
|
|
|
MODEL_ARCH.FALCON_H1: "falcon-h1",
|
2025-07-08 10:24:06 +02:00
|
|
|
MODEL_ARCH.HUNYUAN_MOE: "hunyuan-moe",
|
2025-07-08 18:07:01 +02:00
|
|
|
MODEL_ARCH.SMOLLM3: "smollm3",
|
2025-07-11 20:27:01 +02:00
|
|
|
MODEL_ARCH.LFM2: "lfm2",
|
2025-07-16 20:03:51 +08:00
|
|
|
MODEL_ARCH.DREAM: "dream",
|
2023-11-10 22:04:50 -07:00
|
|
|
}
|
|
|
|
|
2025-04-20 23:29:36 +02:00
|
|
|
# Maps each VISION_PROJECTOR_TYPE member listed below to its string name.
# NOTE(review): GLM_EDGE and MERGER map to strings that differ from their
# member names ("adapter", "qwen2vl_merger") - presumably intentional; confirm
# against the consumers of these strings before renaming either side.
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
|
|
|
|
VISION_PROJECTOR_TYPE.MLP: "mlp",
|
|
|
|
VISION_PROJECTOR_TYPE.LDP: "ldp",
|
|
|
|
VISION_PROJECTOR_TYPE.LDPV2: "ldpv2",
|
|
|
|
VISION_PROJECTOR_TYPE.RESAMPLER: "resampler",
|
|
|
|
VISION_PROJECTOR_TYPE.GLM_EDGE: "adapter",
|
|
|
|
VISION_PROJECTOR_TYPE.MERGER: "qwen2vl_merger",
|
|
|
|
VISION_PROJECTOR_TYPE.GEMMA3: "gemma3",
|
|
|
|
}
|
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.TOKEN_EMBD: "token_embd",
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM: "token_embd_norm",
|
|
|
|
MODEL_TENSOR.TOKEN_TYPES: "token_types",
|
|
|
|
MODEL_TENSOR.POS_EMBD: "position_embd",
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM: "output_norm",
|
|
|
|
MODEL_TENSOR.OUTPUT: "output",
|
|
|
|
MODEL_TENSOR.ROPE_FREQS: "rope_freqs",
|
|
|
|
MODEL_TENSOR.ROPE_FACTORS_LONG: "rope_factors_long",
|
|
|
|
MODEL_TENSOR.ROPE_FACTORS_SHORT: "rope_factors_short",
|
|
|
|
MODEL_TENSOR.ATTN_NORM: "blk.{bid}.attn_norm",
|
|
|
|
MODEL_TENSOR.ATTN_NORM_2: "blk.{bid}.attn_norm_2",
|
|
|
|
MODEL_TENSOR.ATTN_QKV: "blk.{bid}.attn_qkv",
|
|
|
|
MODEL_TENSOR.ATTN_Q: "blk.{bid}.attn_q",
|
|
|
|
MODEL_TENSOR.ATTN_K: "blk.{bid}.attn_k",
|
|
|
|
MODEL_TENSOR.ATTN_V: "blk.{bid}.attn_v",
|
|
|
|
MODEL_TENSOR.ATTN_OUT: "blk.{bid}.attn_output",
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD: "blk.{bid}.attn_rot_embd",
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM: "blk.{bid}.attn_q_norm",
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM: "blk.{bid}.attn_k_norm",
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM: "blk.{bid}.attn_output_norm",
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM: "blk.{bid}.post_attention_norm",
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP: "blk.{bid}.ffn_gate_inp",
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP: "blk.{bid}.ffn_gate_inp_shexp",
|
|
|
|
MODEL_TENSOR.FFN_NORM: "blk.{bid}.ffn_norm",
|
|
|
|
MODEL_TENSOR.FFN_PRE_NORM: "blk.{bid}.ffn_norm",
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM: "blk.{bid}.post_ffw_norm",
|
|
|
|
MODEL_TENSOR.FFN_GATE: "blk.{bid}.ffn_gate",
|
|
|
|
MODEL_TENSOR.FFN_DOWN: "blk.{bid}.ffn_down",
|
|
|
|
MODEL_TENSOR.FFN_UP: "blk.{bid}.ffn_up",
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP: "blk.{bid}.ffn_gate_shexp",
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP: "blk.{bid}.ffn_down_shexp",
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP: "blk.{bid}.ffn_up_shexp",
|
|
|
|
MODEL_TENSOR.FFN_ACT: "blk.{bid}.ffn",
|
|
|
|
MODEL_TENSOR.FFN_NORM_EXP: "blk.{bid}.ffn_norm_exps",
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP: "blk.{bid}.ffn_gate_exps",
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down_exps",
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up_exps",
|
2025-01-04 21:06:11 +01:00
|
|
|
MODEL_TENSOR.FFN_EXP_PROBS_B: "blk.{bid}.exp_probs_b",
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm",
|
2025-06-26 19:34:02 +02:00
|
|
|
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD: "per_layer_token_embd", # gemma3n
|
|
|
|
MODEL_TENSOR.PER_LAYER_MODEL_PROJ: "per_layer_model_proj", # gemma3n
|
|
|
|
MODEL_TENSOR.PER_LAYER_PROJ_NORM: "per_layer_proj_norm", # gemma3n
|
|
|
|
MODEL_TENSOR.ALTUP_UNEMBD_PROJ: "altup_unembd_proj", # gemma3n
|
|
|
|
MODEL_TENSOR.ALTUP_PROJ: "altup_proj", # gemma3n
|
|
|
|
MODEL_TENSOR.PER_LAYER_INP_GATE: "blk.{bid}.inp_gate", # gemma3n
|
|
|
|
MODEL_TENSOR.PER_LAYER_PROJ: "blk.{bid}.proj", # gemma3n
|
|
|
|
MODEL_TENSOR.PER_LAYER_POST_NORM: "blk.{bid}.post_norm", # gemma3n
|
|
|
|
MODEL_TENSOR.ALTUP_CORRECT_COEF: "blk.{bid}.altup_correct_coef", # gemma3n
|
|
|
|
MODEL_TENSOR.ALTUP_CORRECT_SCALE: "blk.{bid}.altup_correct_scale", # gemma3n
|
|
|
|
MODEL_TENSOR.ALTUP_PREDICT_COEF: "blk.{bid}.altup_predict_coef", # gemma3n
|
|
|
|
MODEL_TENSOR.ALTUP_ROUTER: "blk.{bid}.altup_router", # gemma3n
|
|
|
|
MODEL_TENSOR.ALTUP_ROUTER_NORM: "blk.{bid}.altup_router_norm", # gemma3n
|
|
|
|
MODEL_TENSOR.LAUREL_L: "blk.{bid}.laurel_l", # gemma3n
|
|
|
|
MODEL_TENSOR.LAUREL_R: "blk.{bid}.laurel_r", # gemma3n
|
|
|
|
MODEL_TENSOR.LAUREL_POST_NORM: "blk.{bid}.laurel_post_norm", # gemma3n
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in",
|
|
|
|
MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d",
|
|
|
|
MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x",
|
|
|
|
MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt",
|
2025-07-09 14:59:57 -04:00
|
|
|
MODEL_TENSOR.SSM_DT_NORM: "blk.{bid}.ssm_dt_norm",
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a",
|
2025-07-09 14:59:57 -04:00
|
|
|
MODEL_TENSOR.SSM_B_NORM: "blk.{bid}.ssm_b_norm",
|
|
|
|
MODEL_TENSOR.SSM_C_NORM: "blk.{bid}.ssm_c_norm",
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d",
|
llama : initial Mamba-2 support (#9126)
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled with -DGGML_SANITIZE_ADDRESS=ON
* cuda : graceful fallback for Mamba-1 models with weird embd size
2025-07-02 13:10:24 -04:00
|
|
|
MODEL_TENSOR.SSM_NORM: "blk.{bid}.ssm_norm",
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out",
|
2025-03-18 07:27:50 +08:00
|
|
|
MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0",
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1",
|
|
|
|
MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2",
|
2025-03-18 07:27:50 +08:00
|
|
|
MODEL_TENSOR.TIME_MIX_A0: "blk.{bid}.time_mix_a0",
|
|
|
|
MODEL_TENSOR.TIME_MIX_A1: "blk.{bid}.time_mix_a1",
|
|
|
|
MODEL_TENSOR.TIME_MIX_A2: "blk.{bid}.time_mix_a2",
|
|
|
|
MODEL_TENSOR.TIME_MIX_V0: "blk.{bid}.time_mix_v0",
|
|
|
|
MODEL_TENSOR.TIME_MIX_V1: "blk.{bid}.time_mix_v1",
|
|
|
|
MODEL_TENSOR.TIME_MIX_V2: "blk.{bid}.time_mix_v2",
|
|
|
|
MODEL_TENSOR.TIME_MIX_G1: "blk.{bid}.time_mix_g1",
|
|
|
|
MODEL_TENSOR.TIME_MIX_G2: "blk.{bid}.time_mix_g2",
|
|
|
|
MODEL_TENSOR.TIME_MIX_K_K: "blk.{bid}.time_mix_k_k",
|
|
|
|
MODEL_TENSOR.TIME_MIX_K_A: "blk.{bid}.time_mix_k_a",
|
|
|
|
MODEL_TENSOR.TIME_MIX_R_K: "blk.{bid}.time_mix_r_k",
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_X: "blk.{bid}.time_mix_lerp_x",
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_K: "blk.{bid}.time_mix_lerp_k",
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_V: "blk.{bid}.time_mix_lerp_v",
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_R: "blk.{bid}.time_mix_lerp_r",
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_G: "blk.{bid}.time_mix_lerp_g",
|
2025-01-10 09:58:08 +08:00
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_FUSED: "blk.{bid}.time_mix_lerp_fused",
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_W: "blk.{bid}.time_mix_lerp_w",
|
|
|
|
MODEL_TENSOR.TIME_MIX_FIRST: "blk.{bid}.time_mix_first",
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY: "blk.{bid}.time_mix_decay",
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W1: "blk.{bid}.time_mix_decay_w1",
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W2: "blk.{bid}.time_mix_decay_w2",
|
|
|
|
MODEL_TENSOR.TIME_MIX_KEY: "blk.{bid}.time_mix_key",
|
|
|
|
MODEL_TENSOR.TIME_MIX_VALUE: "blk.{bid}.time_mix_value",
|
|
|
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE: "blk.{bid}.time_mix_receptance",
|
|
|
|
MODEL_TENSOR.TIME_MIX_GATE: "blk.{bid}.time_mix_gate",
|
|
|
|
MODEL_TENSOR.TIME_MIX_LN: "blk.{bid}.time_mix_ln",
|
|
|
|
MODEL_TENSOR.TIME_MIX_OUTPUT: "blk.{bid}.time_mix_output",
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_K: "blk.{bid}.channel_mix_lerp_k",
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_R: "blk.{bid}.channel_mix_lerp_r",
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_KEY: "blk.{bid}.channel_mix_key",
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE: "blk.{bid}.channel_mix_receptance",
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_VALUE: "blk.{bid}.channel_mix_value",
|
|
|
|
MODEL_TENSOR.ATTN_Q_A: "blk.{bid}.attn_q_a",
|
|
|
|
MODEL_TENSOR.ATTN_Q_B: "blk.{bid}.attn_q_b",
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_MQA: "blk.{bid}.attn_kv_a_mqa",
|
|
|
|
MODEL_TENSOR.ATTN_KV_B: "blk.{bid}.attn_kv_b",
|
2025-04-15 07:49:57 +01:00
|
|
|
MODEL_TENSOR.ATTN_K_B: "blk.{bid}.attn_k_b",
|
|
|
|
MODEL_TENSOR.ATTN_V_B: "blk.{bid}.attn_v_b",
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.ATTN_Q_A_NORM: "blk.{bid}.attn_q_a_norm",
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_NORM: "blk.{bid}.attn_kv_a_norm",
|
|
|
|
MODEL_TENSOR.ATTN_SUB_NORM: "blk.{bid}.attn_sub_norm",
|
|
|
|
MODEL_TENSOR.FFN_SUB_NORM: "blk.{bid}.ffn_sub_norm",
|
|
|
|
MODEL_TENSOR.DEC_ATTN_NORM: "dec.blk.{bid}.attn_norm",
|
|
|
|
MODEL_TENSOR.DEC_ATTN_Q: "dec.blk.{bid}.attn_q",
|
|
|
|
MODEL_TENSOR.DEC_ATTN_K: "dec.blk.{bid}.attn_k",
|
|
|
|
MODEL_TENSOR.DEC_ATTN_V: "dec.blk.{bid}.attn_v",
|
|
|
|
MODEL_TENSOR.DEC_ATTN_OUT: "dec.blk.{bid}.attn_o",
|
|
|
|
MODEL_TENSOR.DEC_ATTN_REL_B: "dec.blk.{bid}.attn_rel_b",
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_NORM: "dec.blk.{bid}.cross_attn_norm",
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_Q: "dec.blk.{bid}.cross_attn_q",
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_K: "dec.blk.{bid}.cross_attn_k",
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_V: "dec.blk.{bid}.cross_attn_v",
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_OUT: "dec.blk.{bid}.cross_attn_o",
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B: "dec.blk.{bid}.cross_attn_rel_b",
|
|
|
|
MODEL_TENSOR.DEC_FFN_NORM: "dec.blk.{bid}.ffn_norm",
|
|
|
|
MODEL_TENSOR.DEC_FFN_GATE: "dec.blk.{bid}.ffn_gate",
|
|
|
|
MODEL_TENSOR.DEC_FFN_DOWN: "dec.blk.{bid}.ffn_down",
|
|
|
|
MODEL_TENSOR.DEC_FFN_UP: "dec.blk.{bid}.ffn_up",
|
|
|
|
MODEL_TENSOR.DEC_OUTPUT_NORM: "dec.output_norm",
|
|
|
|
MODEL_TENSOR.ENC_ATTN_NORM: "enc.blk.{bid}.attn_norm",
|
|
|
|
MODEL_TENSOR.ENC_ATTN_Q: "enc.blk.{bid}.attn_q",
|
|
|
|
MODEL_TENSOR.ENC_ATTN_K: "enc.blk.{bid}.attn_k",
|
|
|
|
MODEL_TENSOR.ENC_ATTN_V: "enc.blk.{bid}.attn_v",
|
|
|
|
MODEL_TENSOR.ENC_ATTN_OUT: "enc.blk.{bid}.attn_o",
|
|
|
|
MODEL_TENSOR.ENC_ATTN_REL_B: "enc.blk.{bid}.attn_rel_b",
|
|
|
|
MODEL_TENSOR.ENC_FFN_NORM: "enc.blk.{bid}.ffn_norm",
|
|
|
|
MODEL_TENSOR.ENC_FFN_GATE: "enc.blk.{bid}.ffn_gate",
|
|
|
|
MODEL_TENSOR.ENC_FFN_DOWN: "enc.blk.{bid}.ffn_down",
|
|
|
|
MODEL_TENSOR.ENC_FFN_UP: "enc.blk.{bid}.ffn_up",
|
|
|
|
MODEL_TENSOR.ENC_OUTPUT_NORM: "enc.output_norm",
|
2024-09-28 17:42:03 +03:00
|
|
|
MODEL_TENSOR.CLS: "cls",
|
|
|
|
MODEL_TENSOR.CLS_OUT: "cls.output",
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_TENSOR.CONV1D: "conv1d",
|
|
|
|
MODEL_TENSOR.CONVNEXT_DW: "convnext.{bid}.dw",
|
|
|
|
MODEL_TENSOR.CONVNEXT_NORM: "convnext.{bid}.norm",
|
|
|
|
MODEL_TENSOR.CONVNEXT_PW1: "convnext.{bid}.pw1",
|
|
|
|
MODEL_TENSOR.CONVNEXT_PW2: "convnext.{bid}.pw2",
|
|
|
|
MODEL_TENSOR.CONVNEXT_GAMMA: "convnext.{bid}.gamma",
|
|
|
|
MODEL_TENSOR.POSNET_CONV1: "posnet.{bid}.conv1",
|
|
|
|
MODEL_TENSOR.POSNET_CONV2: "posnet.{bid}.conv2",
|
|
|
|
MODEL_TENSOR.POSNET_NORM: "posnet.{bid}.norm",
|
|
|
|
MODEL_TENSOR.POSNET_NORM1: "posnet.{bid}.norm1",
|
|
|
|
MODEL_TENSOR.POSNET_NORM2: "posnet.{bid}.norm2",
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_NORM: "posnet.{bid}.attn_norm",
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_Q: "posnet.{bid}.attn_q",
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_K: "posnet.{bid}.attn_k",
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_V: "posnet.{bid}.attn_v",
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_OUT: "posnet.{bid}.attn_output",
|
2025-07-11 20:27:01 +02:00
|
|
|
MODEL_TENSOR.SHORTCONV_CONV: "blk.{bid}.shortconv.conv",
|
|
|
|
MODEL_TENSOR.SHORTCONV_INPROJ: "blk.{bid}.shortconv.in_proj",
|
|
|
|
MODEL_TENSOR.SHORTCONV_OUTPROJ: "blk.{bid}.shortconv.out_proj",
|
2025-04-20 23:29:36 +02:00
|
|
|
# vision
|
|
|
|
MODEL_TENSOR.V_MMPROJ: "mm.{bid}",
|
|
|
|
MODEL_TENSOR.V_MMPROJ_FC: "mm.model.fc",
|
|
|
|
MODEL_TENSOR.V_MMPROJ_MLP: "mm.model.mlp.{bid}",
|
|
|
|
MODEL_TENSOR.V_MMPROJ_PEG: "mm.model.peg.{bid}",
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_CLS: "v.class_embd",
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_PATCH: "v.patch_embd",
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_POS: "v.position_embd",
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_Q: "v.blk.{bid}.attn_q",
|
2025-05-11 11:35:52 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_Q_NORM: "v.blk.{bid}.attn_q_norm",
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_K: "v.blk.{bid}.attn_k",
|
2025-05-11 11:35:52 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_K_NORM: "v.blk.{bid}.attn_k_norm",
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_V: "v.blk.{bid}.attn_v",
|
|
|
|
MODEL_TENSOR.V_ENC_INPUT_NORM: "v.blk.{bid}.ln1",
|
2025-05-19 13:04:14 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_O: "v.blk.{bid}.attn_out",
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_O_NORM: "v.blk.{bid}.attn_out_norm",
|
|
|
|
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: "v.blk.{bid}.ln2",
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_FFN_UP: "v.blk.{bid}.ffn_up",
|
2025-04-23 20:21:59 +02:00
|
|
|
MODEL_TENSOR.V_ENC_FFN_GATE: "v.blk.{bid}.ffn_gate",
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_FFN_DOWN: "v.blk.{bid}.ffn_down",
|
2025-05-10 16:26:42 +02:00
|
|
|
MODEL_TENSOR.V_LAYER_SCALE_1: "v.blk.{bid}.ls1",
|
|
|
|
MODEL_TENSOR.V_LAYER_SCALE_2: "v.blk.{bid}.ls2",
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_PRE_NORM: "v.pre_ln",
|
|
|
|
MODEL_TENSOR.V_POST_NORM: "v.post_ln",
|
|
|
|
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
|
2025-05-01 17:05:42 +02:00
|
|
|
MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
|
|
|
|
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k",
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q",
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k",
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_V: "resampler.attn.v",
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_OUT: "resampler.attn.out",
|
|
|
|
MODEL_TENSOR.V_RESMPL_KV: "resampler.kv",
|
|
|
|
MODEL_TENSOR.V_RESMPL_KV_NORM: "resampler.ln_kv",
|
|
|
|
MODEL_TENSOR.V_RESMPL_POST_NORM: "resampler.ln_post",
|
|
|
|
MODEL_TENSOR.V_RESMPL_Q_NORM: "resampler.ln_q",
|
|
|
|
MODEL_TENSOR.V_RESMPL_PROJ: "resampler.proj",
|
|
|
|
MODEL_TENSOR.V_RESMPL_QUERY: "resampler.query",
|
2025-04-23 20:21:59 +02:00
|
|
|
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK: "v.token_embd.img_break", # pixtral
|
2025-05-01 17:05:42 +02:00
|
|
|
MODEL_TENSOR.V_MM_PATCH_MERGER: "mm.patch_merger", # mistral small 3.1
|
2025-05-22 20:42:48 +02:00
|
|
|
# audio (mtmd)
|
|
|
|
MODEL_TENSOR.A_ENC_EMBD_POS: "a.position_embd",
|
|
|
|
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
|
|
|
|
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
|
|
|
|
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
|
|
|
|
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
|
|
|
|
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
|
|
|
|
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
|
|
|
|
MODEL_TENSOR.A_MMPROJ: "mm.a.mlp.{bid}",
|
2025-05-25 14:06:32 +02:00
|
|
|
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
|
2025-05-22 20:42:48 +02:00
|
|
|
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
|
|
|
|
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
|
2023-11-10 22:04:50 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
|
2025-05-22 20:42:48 +02:00
|
|
|
MODEL_ARCH.MMPROJ: [
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_MMPROJ,
|
|
|
|
MODEL_TENSOR.V_MMPROJ_FC,
|
|
|
|
MODEL_TENSOR.V_MMPROJ_MLP,
|
|
|
|
MODEL_TENSOR.V_MMPROJ_PEG,
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_CLS,
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_PATCH,
|
|
|
|
MODEL_TENSOR.V_ENC_EMBD_POS,
|
2025-05-19 13:04:14 +02:00
|
|
|
MODEL_TENSOR.V_ENC_INPUT_NORM,
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_Q,
|
2025-05-11 11:35:52 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_Q_NORM,
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_K,
|
2025-05-11 11:35:52 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_K_NORM,
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_V,
|
2025-05-19 13:04:14 +02:00
|
|
|
MODEL_TENSOR.V_ENC_ATTN_O,
|
|
|
|
MODEL_TENSOR.V_ENC_ATTN_O_NORM,
|
|
|
|
MODEL_TENSOR.V_ENC_POST_ATTN_NORM,
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_FFN_UP,
|
2025-04-23 20:21:59 +02:00
|
|
|
MODEL_TENSOR.V_ENC_FFN_GATE,
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_ENC_FFN_DOWN,
|
2025-05-10 16:26:42 +02:00
|
|
|
MODEL_TENSOR.V_LAYER_SCALE_1,
|
|
|
|
MODEL_TENSOR.V_LAYER_SCALE_2,
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_PRE_NORM,
|
|
|
|
MODEL_TENSOR.V_POST_NORM,
|
|
|
|
MODEL_TENSOR.V_MM_INP_PROJ,
|
2025-05-01 17:05:42 +02:00
|
|
|
MODEL_TENSOR.V_MM_INP_NORM,
|
2025-04-20 23:29:36 +02:00
|
|
|
MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
|
|
|
|
MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_Q,
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_K,
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_V,
|
|
|
|
MODEL_TENSOR.V_RESMPL_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.V_RESMPL_KV,
|
|
|
|
MODEL_TENSOR.V_RESMPL_KV_NORM,
|
|
|
|
MODEL_TENSOR.V_RESMPL_POST_NORM,
|
|
|
|
MODEL_TENSOR.V_RESMPL_Q_NORM,
|
|
|
|
MODEL_TENSOR.V_RESMPL_PROJ,
|
|
|
|
MODEL_TENSOR.V_RESMPL_QUERY,
|
2025-04-23 20:21:59 +02:00
|
|
|
MODEL_TENSOR.V_TOK_EMBD_IMG_BREAK,
|
2025-05-01 17:05:42 +02:00
|
|
|
MODEL_TENSOR.V_MM_PATCH_MERGER,
|
2025-05-22 20:42:48 +02:00
|
|
|
# audio
|
|
|
|
MODEL_TENSOR.A_ENC_EMBD_POS,
|
|
|
|
MODEL_TENSOR.A_ENC_CONV1D,
|
|
|
|
MODEL_TENSOR.A_PRE_NORM,
|
|
|
|
MODEL_TENSOR.A_POST_NORM,
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_Q,
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_K,
|
|
|
|
MODEL_TENSOR.A_ENC_ATTN_V,
|
|
|
|
MODEL_TENSOR.A_ENC_INPUT_NORM,
|
|
|
|
MODEL_TENSOR.A_ENC_OUTPUT,
|
|
|
|
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_UP,
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_GATE,
|
|
|
|
MODEL_TENSOR.A_ENC_FFN_DOWN,
|
|
|
|
MODEL_TENSOR.A_MMPROJ,
|
2025-05-25 14:06:32 +02:00
|
|
|
MODEL_TENSOR.A_MMPROJ_FC,
|
2025-05-22 20:42:48 +02:00
|
|
|
MODEL_TENSOR.A_MM_NORM_PRE,
|
|
|
|
MODEL_TENSOR.A_MM_NORM_MID,
|
2025-04-20 23:29:36 +02:00
|
|
|
],
|
2023-11-10 22:04:50 -07:00
|
|
|
MODEL_ARCH.LLAMA: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
2023-12-13 13:04:25 +01:00
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
2023-11-10 22:04:50 -07:00
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2023-12-13 13:04:25 +01:00
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
2023-11-10 22:04:50 -07:00
|
|
|
],
|
2025-04-07 23:06:44 +02:00
|
|
|
MODEL_ARCH.LLAMA4: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
],
|
2024-12-23 08:22:33 +08:00
|
|
|
MODEL_ARCH.DECI: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2024-03-23 17:41:53 +01:00
|
|
|
MODEL_ARCH.GROK: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
|
|
|
],
|
2023-11-10 22:04:50 -07:00
|
|
|
MODEL_ARCH.GPTNEOX: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.FALCON: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM_2,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.BAICHUAN: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.STARCODER: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.BERT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2024-02-11 10:21:38 -06:00
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
2023-11-10 22:04:50 -07:00
|
|
|
MODEL_TENSOR.TOKEN_TYPES,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
2024-02-11 10:21:38 -06:00
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
2025-05-29 21:42:31 +02:00
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
2023-11-10 22:04:50 -07:00
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-02-11 10:21:38 -06:00
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
2024-09-28 17:42:03 +03:00
|
|
|
MODEL_TENSOR.CLS,
|
|
|
|
MODEL_TENSOR.CLS_OUT,
|
2023-11-10 22:04:50 -07:00
|
|
|
],
|
2024-02-13 12:03:53 -05:00
|
|
|
MODEL_ARCH.NOMIC_BERT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.TOKEN_TYPES,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
|
|
|
],
|
2025-04-28 15:52:15 -04:00
|
|
|
MODEL_ARCH.NOMIC_BERT_MOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.TOKEN_TYPES,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
|
|
|
],
|
2025-06-16 21:53:41 +09:00
|
|
|
MODEL_ARCH.NEO_BERT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.CLS,
|
|
|
|
MODEL_TENSOR.CLS_OUT,
|
|
|
|
],
|
2024-05-11 09:46:09 +02:00
|
|
|
MODEL_ARCH.JINA_BERT_V2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.TOKEN_TYPES,
|
2024-06-06 09:22:41 +02:00
|
|
|
MODEL_TENSOR.ATTN_NORM_2,
|
2024-05-11 09:46:09 +02:00
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.LAYER_OUT_NORM,
|
2024-09-28 17:42:03 +03:00
|
|
|
MODEL_TENSOR.CLS,
|
2024-05-11 09:46:09 +02:00
|
|
|
],
|
2023-11-10 22:04:50 -07:00
|
|
|
MODEL_ARCH.MPT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2023-12-27 22:39:45 +07:00
|
|
|
MODEL_TENSOR.FFN_ACT,
|
2024-04-04 02:05:10 +08:00
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
2023-11-10 22:04:50 -07:00
|
|
|
],
|
|
|
|
MODEL_ARCH.GPTJ: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.REFACT: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.BLOOM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
2023-11-14 11:17:12 +01:00
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.STABLELM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
2023-11-10 22:04:50 -07:00
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-04-16 08:48:35 -07:00
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
2023-11-10 22:04:50 -07:00
|
|
|
],
|
2023-12-02 02:16:31 +08:00
|
|
|
MODEL_ARCH.QWEN: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-01-19 19:53:13 +08:00
|
|
|
MODEL_ARCH.QWEN2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
2024-12-07 16:12:27 -05:00
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
2024-01-19 19:53:13 +08:00
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2025-07-16 20:03:51 +08:00
|
|
|
MODEL_ARCH.DREAM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
llama : add Qwen2VL support + multimodal RoPE (#10361)
* Barebone Qwen2VL LLM convertor
* Add Qwen2VL cli entrypoint
* [WIP] add qwen2vl arch
* Verify m-rope output
* Add vl-rope/2d-rope support for qwen2vl ViT
* update qwen2vl cli tool
* update 5D tensor op workaround
* [WIP] qwen2vl vision model
* make batch and clip utils compatible with qwen2vl
* [WIP] create inference workflow, gguf convert script but fix
* correcting vision-rope behavior, add the missing last layer back to ViT
* add arg parser to qwen2vl_surgery
* replace variable size array with vector
* cuda-gdb cmake preset
* add fp32 mrope, vision rope kernel
* add fp16 support for qwen2vl and m-rope
* add `GGML_ROPE_TYPE_MROPE`, `GGML_ROPE_TYPE_VISION`
* fix rope op mode switching, out dated func args
* update `llama_hparams`
* update to keep up stream changes
* resolve linter, test errors
* add makefile entry, update speical image padding token
* add mrope unit test, fix few compiler warnings
* rename `mrope` related function, params
* minor updates on debug util, bug fixs
* add `m-rope` testcase to `test-backend-ops`
* Apply suggestions from code review
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
* fix traililng whitespce
* store `llama_hparams.rope_sections` with fixed size array
* update position id tensor size check in GGML_OP_ROPE
* minor updates
* update `ggml_backend_*_supports_op` of unsupported backends
* remote old `rope_section` compare operator
---------
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
2024-12-14 20:43:46 +08:00
|
|
|
MODEL_ARCH.QWEN2VL: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
2024-01-19 19:53:13 +08:00
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-04-16 23:40:48 +08:00
|
|
|
MODEL_ARCH.QWEN2MOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
],
|
2025-04-09 17:47:36 +08:00
|
|
|
MODEL_ARCH.QWEN3: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.QWEN3MOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2023-12-24 22:35:49 +09:00
|
|
|
MODEL_ARCH.PLAMO: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2025-07-16 01:11:42 +09:00
|
|
|
MODEL_ARCH.PLAMO2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
|
|
MODEL_TENSOR.SSM_IN,
|
|
|
|
MODEL_TENSOR.SSM_CONV1D,
|
|
|
|
MODEL_TENSOR.SSM_X,
|
|
|
|
MODEL_TENSOR.SSM_DT,
|
|
|
|
MODEL_TENSOR.SSM_A,
|
|
|
|
MODEL_TENSOR.SSM_D,
|
|
|
|
MODEL_TENSOR.SSM_OUT,
|
|
|
|
MODEL_TENSOR.SSM_DT_NORM,
|
|
|
|
MODEL_TENSOR.SSM_B_NORM,
|
|
|
|
MODEL_TENSOR.SSM_C_NORM,
|
|
|
|
],
|
2023-11-10 22:04:50 -07:00
|
|
|
MODEL_ARCH.GPT2: [
|
2023-12-28 09:03:57 -05:00
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2023-11-10 22:04:50 -07:00
|
|
|
],
|
2023-12-18 17:27:47 +00:00
|
|
|
MODEL_ARCH.PHI2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2024-04-24 15:00:37 +08:00
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.PHI3: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2023-12-18 17:27:47 +00:00
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
2024-10-01 02:31:36 -04:00
|
|
|
MODEL_TENSOR.ROPE_FACTORS_LONG,
|
|
|
|
MODEL_TENSOR.ROPE_FACTORS_SHORT,
|
2023-12-18 17:27:47 +00:00
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
2024-01-13 13:44:37 +02:00
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
2023-12-18 17:27:47 +00:00
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-01-19 17:07:27 +08:00
|
|
|
],
|
2025-01-09 11:21:41 +01:00
|
|
|
MODEL_ARCH.PHIMOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FACTORS_LONG,
|
|
|
|
MODEL_TENSOR.ROPE_FACTORS_SHORT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2024-01-19 17:07:27 +08:00
|
|
|
MODEL_ARCH.CODESHELL: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.POS_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-01-28 16:00:30 +08:00
|
|
|
],
|
|
|
|
MODEL_ARCH.ORION: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-02-01 17:19:51 +08:00
|
|
|
MODEL_ARCH.INTERNLM2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-02-07 14:15:56 +08:00
|
|
|
MODEL_ARCH.MINICPM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2024-06-03 15:49:30 +08:00
|
|
|
MODEL_TENSOR.OUTPUT,
|
2024-02-07 14:15:56 +08:00
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
2024-12-04 17:42:50 +08:00
|
|
|
MODEL_TENSOR.ROPE_FACTORS_LONG,
|
|
|
|
MODEL_TENSOR.ROPE_FACTORS_SHORT,
|
2024-02-07 14:15:56 +08:00
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2024-09-16 14:45:20 +08:00
|
|
|
MODEL_ARCH.MINICPM3: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
2024-10-01 02:31:36 -04:00
|
|
|
MODEL_TENSOR.ROPE_FACTORS_LONG,
|
|
|
|
MODEL_TENSOR.ROPE_FACTORS_SHORT,
|
2024-09-16 14:45:20 +08:00
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q_A,
|
|
|
|
MODEL_TENSOR.ATTN_Q_B,
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_MQA,
|
|
|
|
MODEL_TENSOR.ATTN_KV_B,
|
|
|
|
MODEL_TENSOR.ATTN_Q_A_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-02-21 05:08:22 -08:00
|
|
|
MODEL_ARCH.GEMMA: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
],
|
2024-06-28 00:00:43 -04:00
|
|
|
MODEL_ARCH.GEMMA2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM,
|
|
|
|
MODEL_TENSOR.FFN_PRE_NORM,
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
|
|
],
|
2025-03-12 09:30:24 +01:00
|
|
|
MODEL_ARCH.GEMMA3: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
2025-03-22 23:28:19 +01:00
|
|
|
MODEL_TENSOR.OUTPUT,
|
2025-03-12 09:30:24 +01:00
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM,
|
|
|
|
MODEL_TENSOR.FFN_PRE_NORM,
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
|
|
],
|
2025-06-26 19:34:02 +02:00
|
|
|
MODEL_ARCH.GEMMA3N: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM,
|
|
|
|
MODEL_TENSOR.FFN_PRE_NORM,
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
|
|
# altup / laurel
|
|
|
|
MODEL_TENSOR.PER_LAYER_TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.PER_LAYER_MODEL_PROJ,
|
|
|
|
MODEL_TENSOR.PER_LAYER_INP_GATE,
|
|
|
|
MODEL_TENSOR.PER_LAYER_PROJ,
|
|
|
|
MODEL_TENSOR.PER_LAYER_PROJ_NORM,
|
|
|
|
MODEL_TENSOR.PER_LAYER_POST_NORM,
|
|
|
|
MODEL_TENSOR.ALTUP_PROJ,
|
|
|
|
MODEL_TENSOR.ALTUP_UNEMBD_PROJ,
|
|
|
|
MODEL_TENSOR.ALTUP_CORRECT_COEF,
|
|
|
|
MODEL_TENSOR.ALTUP_CORRECT_SCALE,
|
|
|
|
MODEL_TENSOR.ALTUP_PREDICT_COEF,
|
|
|
|
MODEL_TENSOR.ALTUP_ROUTER,
|
|
|
|
MODEL_TENSOR.ALTUP_ROUTER_NORM,
|
|
|
|
MODEL_TENSOR.LAUREL_L,
|
|
|
|
MODEL_TENSOR.LAUREL_R,
|
|
|
|
MODEL_TENSOR.LAUREL_POST_NORM,
|
|
|
|
],
|
2024-03-02 01:00:46 +05:30
|
|
|
MODEL_ARCH.STARCODER2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_ARCH.RWKV6: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM_2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_X,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_K,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_V,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_R,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_G,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_W,
|
2025-01-10 09:58:08 +08:00
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_FUSED,
|
2024-09-01 22:38:17 +08:00
|
|
|
MODEL_TENSOR.TIME_MIX_FIRST,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_KEY,
|
|
|
|
MODEL_TENSOR.TIME_MIX_VALUE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_GATE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LN,
|
|
|
|
MODEL_TENSOR.TIME_MIX_OUTPUT,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_R,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_KEY,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_RECEPTANCE,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_VALUE,
|
|
|
|
],
|
2025-01-10 09:58:08 +08:00
|
|
|
MODEL_ARCH.RWKV6QWEN2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_X,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_K,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_V,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_R,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_G,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_W,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_FUSED,
|
|
|
|
MODEL_TENSOR.TIME_MIX_FIRST,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_DECAY_W2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_KEY,
|
|
|
|
MODEL_TENSOR.TIME_MIX_VALUE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_GATE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LN,
|
|
|
|
MODEL_TENSOR.TIME_MIX_OUTPUT,
|
2025-03-18 07:27:50 +08:00
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.RWKV7: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM_2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_FUSED,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W0,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_A0,
|
|
|
|
MODEL_TENSOR.TIME_MIX_A1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_A2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_V0,
|
|
|
|
MODEL_TENSOR.TIME_MIX_V1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_V2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_G1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_G2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_K_K,
|
|
|
|
MODEL_TENSOR.TIME_MIX_K_A,
|
|
|
|
MODEL_TENSOR.TIME_MIX_R_K,
|
|
|
|
MODEL_TENSOR.TIME_MIX_KEY,
|
|
|
|
MODEL_TENSOR.TIME_MIX_VALUE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LN,
|
|
|
|
MODEL_TENSOR.TIME_MIX_OUTPUT,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_LERP_K,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_KEY,
|
|
|
|
MODEL_TENSOR.CHANNEL_MIX_VALUE,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.ARWKV7: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LERP_FUSED,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W0,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_W2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_A0,
|
|
|
|
MODEL_TENSOR.TIME_MIX_A1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_A2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_V0,
|
|
|
|
MODEL_TENSOR.TIME_MIX_V1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_V2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_G1,
|
|
|
|
MODEL_TENSOR.TIME_MIX_G2,
|
|
|
|
MODEL_TENSOR.TIME_MIX_K_K,
|
|
|
|
MODEL_TENSOR.TIME_MIX_K_A,
|
|
|
|
MODEL_TENSOR.TIME_MIX_R_K,
|
|
|
|
MODEL_TENSOR.TIME_MIX_KEY,
|
|
|
|
MODEL_TENSOR.TIME_MIX_VALUE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_RECEPTANCE,
|
|
|
|
MODEL_TENSOR.TIME_MIX_LN,
|
|
|
|
MODEL_TENSOR.TIME_MIX_OUTPUT,
|
2025-01-10 09:58:08 +08:00
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of offical Mamba models
I was using Q-bert/Mamba-* models before, which have a slighlty different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
|
|
|
MODEL_ARCH.MAMBA: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.SSM_IN,
|
|
|
|
MODEL_TENSOR.SSM_CONV1D,
|
|
|
|
MODEL_TENSOR.SSM_X,
|
|
|
|
MODEL_TENSOR.SSM_DT,
|
|
|
|
MODEL_TENSOR.SSM_A,
|
|
|
|
MODEL_TENSOR.SSM_D,
|
|
|
|
MODEL_TENSOR.SSM_OUT,
|
|
|
|
],
|
llama : initial Mamba-2 support (#9126)
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled with -DGGML_SANITIZE_ADDRESS=ON
* cuda : graceful fallback for Mamba-1 models with weird embd size
2025-07-02 13:10:24 -04:00
|
|
|
MODEL_ARCH.MAMBA2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.SSM_IN,
|
|
|
|
MODEL_TENSOR.SSM_CONV1D,
|
|
|
|
MODEL_TENSOR.SSM_DT,
|
|
|
|
MODEL_TENSOR.SSM_A,
|
|
|
|
MODEL_TENSOR.SSM_D,
|
|
|
|
MODEL_TENSOR.SSM_NORM,
|
|
|
|
MODEL_TENSOR.SSM_OUT,
|
|
|
|
],
|
2025-07-09 14:59:57 -04:00
|
|
|
MODEL_ARCH.JAMBA: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.SSM_IN,
|
|
|
|
MODEL_TENSOR.SSM_CONV1D,
|
|
|
|
MODEL_TENSOR.SSM_X,
|
|
|
|
MODEL_TENSOR.SSM_DT,
|
|
|
|
MODEL_TENSOR.SSM_DT_NORM,
|
|
|
|
MODEL_TENSOR.SSM_A,
|
|
|
|
MODEL_TENSOR.SSM_B_NORM,
|
|
|
|
MODEL_TENSOR.SSM_C_NORM,
|
|
|
|
MODEL_TENSOR.SSM_D,
|
|
|
|
MODEL_TENSOR.SSM_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2024-03-29 21:37:03 +08:00
|
|
|
MODEL_ARCH.XVERSE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-03-15 16:41:22 -04:00
|
|
|
MODEL_ARCH.COMMAND_R: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
2024-04-09 09:16:13 +01:00
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
2024-03-15 16:41:22 -04:00
|
|
|
],
|
2025-01-04 09:33:31 -05:00
|
|
|
MODEL_ARCH.COHERE2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-04-13 11:33:52 +02:00
|
|
|
MODEL_ARCH.DBRX: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_OUT_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2024-04-19 09:35:54 +00:00
|
|
|
MODEL_ARCH.OLMO: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-11-25 10:36:09 -08:00
|
|
|
MODEL_ARCH.OLMO2: [
|
2024-11-19 01:04:08 -08:00
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-09-15 23:47:37 -07:00
|
|
|
MODEL_ARCH.OLMOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
],
|
2024-07-05 05:14:21 +12:00
|
|
|
MODEL_ARCH.OPENELM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-05-24 14:31:13 +02:00
|
|
|
MODEL_ARCH.ARCTIC: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_NORM_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
],
|
2024-12-16 00:02:46 +07:00
|
|
|
MODEL_ARCH.DEEPSEEK: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
],
|
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
|
|
|
MODEL_ARCH.DEEPSEEK2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_A,
|
|
|
|
MODEL_TENSOR.ATTN_Q_B,
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_MQA,
|
|
|
|
MODEL_TENSOR.ATTN_KV_B,
|
2025-04-15 07:49:57 +01:00
|
|
|
MODEL_TENSOR.ATTN_K_B,
|
|
|
|
MODEL_TENSOR.ATTN_V_B,
|
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
|
|
|
MODEL_TENSOR.ATTN_Q_A_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
2025-01-04 21:06:11 +01:00
|
|
|
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
Add support for DeepseekV2ForCausalLM (#7519)
* common : increase max number of experts to 160
* common : add tensors ATTN_Q_A, ATTN_Q_A_NORM, ATTN_Q_B, ATTN_KV_A_MQA, ATTN_KV_A_NORM, ATTN_KV_B needed by DeepSeek-V2 MLA (multi-head latent attention) architecture
* common : add model header parameters: leading_dense_block_count, expert_feed_forward_length, expert_shared_count, expert_weights_scale, attention.q_lora_rank, attention.kv_lora_rank, rope.scaling.yarn_log_multiplier
* convert-hf : add model conversion support for DeepseekV2ForCausalLM
* llama : add model types for DeepSeek-V2 and DeepSeek-V2-Lite models
* llama : add two new llm_build_moe_ffn() arguments: scale_w (whether to scale weights of selected MoE experts) and w_scale (numerical value of the scaling factor)
* llama : add inference support for LLM_ARCH_DEEPSEEK2
---------
Co-authored-by: Stanisław Szymczyk <sszymczy@gmail.com>
2024-05-28 17:07:05 +02:00
|
|
|
],
|
2025-03-27 10:49:15 +00:00
|
|
|
MODEL_ARCH.PLM: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_MQA,
|
|
|
|
MODEL_TENSOR.ATTN_KV_A_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_KV_B,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
],
|
2024-07-07 20:52:10 +08:00
|
|
|
MODEL_ARCH.CHATGLM : [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
2025-02-02 15:48:46 +08:00
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
2024-07-07 20:52:10 +08:00
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2025-04-11 18:10:10 +08:00
|
|
|
MODEL_ARCH.GLM4 : [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.ATTN_POST_NORM,
|
|
|
|
MODEL_TENSOR.FFN_POST_NORM,
|
|
|
|
],
|
2024-06-24 02:27:57 +08:00
|
|
|
MODEL_ARCH.BITNET: [
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.ATTN_SUB_NORM,
|
|
|
|
MODEL_TENSOR.FFN_SUB_NORM,
|
|
|
|
],
|
2024-06-24 07:06:05 +02:00
|
|
|
MODEL_ARCH.T5: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_Q,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_K,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_V,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.DEC_ATTN_REL_B,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_Q,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_K,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_V,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.DEC_CROSS_ATTN_REL_B,
|
|
|
|
MODEL_TENSOR.DEC_FFN_NORM,
|
|
|
|
MODEL_TENSOR.DEC_FFN_GATE,
|
|
|
|
MODEL_TENSOR.DEC_FFN_DOWN,
|
|
|
|
MODEL_TENSOR.DEC_FFN_UP,
|
|
|
|
MODEL_TENSOR.DEC_OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_Q,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_K,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_V,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_REL_B,
|
|
|
|
MODEL_TENSOR.ENC_FFN_NORM,
|
|
|
|
MODEL_TENSOR.ENC_FFN_GATE,
|
|
|
|
MODEL_TENSOR.ENC_FFN_DOWN,
|
2024-08-10 11:43:26 +02:00
|
|
|
MODEL_TENSOR.ENC_FFN_UP,
|
|
|
|
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.T5ENCODER: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_Q,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_K,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_V,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ENC_ATTN_REL_B,
|
|
|
|
MODEL_TENSOR.ENC_FFN_NORM,
|
|
|
|
MODEL_TENSOR.ENC_FFN_GATE,
|
|
|
|
MODEL_TENSOR.ENC_FFN_DOWN,
|
2024-06-24 07:06:05 +02:00
|
|
|
MODEL_TENSOR.ENC_FFN_UP,
|
|
|
|
MODEL_TENSOR.ENC_OUTPUT_NORM,
|
|
|
|
],
|
2024-07-02 10:36:00 -04:00
|
|
|
MODEL_ARCH.JAIS: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_QKV,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-08-15 19:23:33 -07:00
|
|
|
MODEL_ARCH.NEMOTRON: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-08-16 15:35:18 +09:00
|
|
|
MODEL_ARCH.EXAONE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
2024-09-17 00:44:58 -06:00
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
|
|
|
MODEL_ARCH.GRANITE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
2024-09-25 01:06:52 -06:00
|
|
|
MODEL_TENSOR.OUTPUT,
|
2024-09-17 00:44:58 -06:00
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
2024-08-16 15:35:18 +09:00
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-09-25 01:06:52 -06:00
|
|
|
MODEL_ARCH.GRANITE_MOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
2025-05-13 07:12:01 -06:00
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
2024-09-25 01:06:52 -06:00
|
|
|
],
|
model : Granite Four (#13550)
* wip: llama : separate recurrent states from the KV cache
This will be necessary to support Jamba
(and other recurrent models mixed with Attention).
Doesn't compile yet, and finding a slot isn't yet done correctly for recurrent states.
* llama : use std::find for seq_nodes in llama_rs_cache
* llama : state checkpoints for recurrent models
* llama : correctly handle more edge cases for the rs cache
* llama : rename many llama_kv_cache_* functions
* llama : remove useless return value for some llama_cache_* functions
* llama : rethink recurrent state cell counts
* llama : begin work on support for variable GQA
This will also be useful for Jamba if we consider the Mamba layers
to have 0 KV heads.
* llama : gracefully fail when not finding hybrid slot
* llama : support Jamba
* llama : fix BERT inference without KV cache
* convert-hf : check for unprocessed Jamba experts
* convert-hf : support Mini-Jamba conversion
* llama : fix Jamba quantization sanity checks
* llama : sequence-length-aware batch splitting
* llama : use equal-sequence-length sub-batches for recurrent models
* ggml : simplify SSM-related operators
* llama : make recurrent state slot allocation contiguous
* llama : adapt internal uses of batches to llama_ubatch
* llama : fix batch split output count for embeddings
* llama : minimize swaps when reordering logits
This reduces overhead when running hellaswag
on thousands of sequences with very small 100k params Mamba models.
* llama : fix edge case finding batch seq_id of split recurrent cell
This otherwise was a problem when running the HellaSwag benchmark
with small batch sizes, making it crash.
* llama : avoid copies for simple batch splits
* llama : use im2col and mul_mat to perform convolution for Mamba
This removes the need for ggml_ssm_conv!!!
But performance seems slightly worse on my system,
especially for prompt processing.
Maybe ggml_mul_mat isn't optimized for small row sizes?
More performance testing is necessary until GGML_OP_SSM_CONV is removed.
* ggml : make ggml_ssm_scan not modify its source tensors
* llama : fix shared recurrent tail cell count for small ubatch sizes
Otherwise it was impossible to run the 'parallel' example with '-ub 1'
with a Mamba or Jamba model.
* llama : fix .base() compilation error on Windows
* llama : allow doing the equivalent of SSM_CONV with SUM_ROWS and MUL
* ggml : allow GGML_OP_CONCAT to work on non-contiguous tensors
The implementation already supported it,
and this makes Mamba's conv step slightly faster.
* llama : rename llama_cache to llama_past
This can be changed back later if the name change is wrong.
I was renaming the functions anyway to generalize kv-cache-related
functions to hybrid and recurrent model architectures.
I think llama_past is a better name than llama_cache for a combined
kv cache and recurrent state cache, because the states it contains
pretty much always come before the newly-added ones for any particular
sequence. Also 'llama_past_clear' sounds more obvious in what it does
than 'llama_kv_cache_clear'. The future is what the models generate.
(For embeddings, the kv cache isn't really used anyway)
Still, I'm open to better suggestions.
* examples : replace llama_kv_cache_seq_* with llama_past_seq_*
* mamba : fix non-contiguous usage of ggml_silu
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : session saving and reloading for hybrid models
* convert_hf : fix Jamba conversion
* llama : fix mixed signedness comparison
* llama : use unused n_embd_k_gqa in k_shift
This also slightly reduces the diff from the master branch
* llama : begin renaming llama_past back to llama_kv_cache
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* llama : remove implicit recurrent state rollbacks
* llama : partially apply clang-format style
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* feat: Add conversion for Bamba models
This is borrowed and adapted from the original implementation
https://github.com/ggml-org/llama.cpp/pull/10810
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add Granite 4 conversion
This is a manual copy from my draft branch
https://github.com/gabe-l-hart/llama.cpp/blob/GraniteFourDraft/convert_hf_to_gguf.py#L5076
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Plumb bamba through llama-arch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add bamba to llama_arch_is_hybrid_recurrent
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add optional mamba ssm_in bias tensor
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add template specialization for get_arr to load a vector<uint32_t> for layer index arr in hparams
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Use an explicit bool to determine mamba vs mamba2
This allows other architectures like bamba and granitemoehybrid to use
mamba2 without a growing architecture `if` statement inside the mamba
implementation.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Isolate mamba(2) and granite attention layer building in static methods
This will allow these layer-builder methods to be used from other build
structs without complex inheritance.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use per-layer sizes in granite build_attention_layer
Also no need to pass in kv cache since it's already in the inp_attn
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: First (broken) pass at end-to-end Bamba implementation
It generates (garbage) tokens! Still lots of debugging to do.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Only do Granite multipliers if set
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Pull granite ffn portion into a static function and reuse in hybrid
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat(py): Allow gguf duplicate keys if they match by value and type
This is helpful for hybrid models that want to do gguf param setting by
calling multiple parent classes without needing to make those parent
classes try/except on every attempt to set a gguf value.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor(py): Simplify granitemoehybrid conversion to use parents better
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add GRANITE_MOE_HYBRID through llama-arch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Support GRANITE_MOE_HYBRID in llama-model
This re-uses the Bamba code paths heavily and simply adds the missing parts
for loading MoE and the shared expert.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* style: Fix flake8 errors
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix recurrent cache get after rebase
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix hybrid granite implementation for signature changes in build_mamba*_layer
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Refactor relationship between non-hybrid classes and hybrid impl to use mixins
The challenge here is to give both the non-hybrid classes (llm_build_mamba
and llm_build_granite) AND the hybrid class (llm_build_hybrid_mamba) access
to the same intermediate "base class" functionality (build_mamba*_layer,
build_granite_attention_layer) without running into trouble with diamond
inheritance of llm_graph_context. Due to the non-trivial initialization
that happens in llm_graph_context, diamond inheritance results in multiple
initializations of the common base which cause problems around the unique
ptrs. I wanted to get away from `self->` everywhere, but this is still a
bit cleaner than making those methods static I think.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Implement the full copy-paste version to duplicate the layer builders
This follows the pattern where the type of input is pinned to the type of
memory and that is used to dispatch to the correct version of `build_rs` /
`build_attn`. There's a lot of code duplication that can hopefully be
pulled into common functions in the graph later.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Rename llm_build_hybrid_mamba -> llm_build_granite_hybrid
I've gone back and forth a lot about how/if to try to implement reuse of the
"child model" layer types for hybrid models. At the end of the day, I think
hybrid models are their own beast and even if their layers are inspired by
other models, they should maintain control of their own layer building (in
other words, the copy-paste method). Given that, the name should reflect
that this is not a generic hybrid model builder, but rather a granite-
specific hybrid model builder that can do MoE (granite 4) or dense (bamba).
As part of this, I also cleaned up dangling comments from previous attempts
at using static methods for reusability.
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled -DGGML_SANITIZE_ADDRESS=ON
* memory : correctly handle failure in apply()
ggml-ci
* style: Remove TODO for adding first hybrid models to the switch
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix bad merge in tensor_mapping.py w/ SSM_NORM
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Fix bad merge resolution with variable renames/moves in llm_build_mamba
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* docs: Fix comment about duplicate key check
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Conform to standard way of initializing inp_out_ids
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* convert : fix jamba conv1d shape squeezing
* fix: Fix input initialization in granite_hybrid after removal of hybrid inputs
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use llm_graph_context_mamba in llm_build_granite_hybrid
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Refactor mamba2/granite/jamba/granite_hybrid relationships as mixins
The key is for the mixin classes (llm_graph_context_mamba,
llm_graph_context_granite) to use virtual inheritance from
llm_graph_context. This allows the common members to exist only once in the
class hierarchy. The downside is that llm_graph_context will be
re-initialized once for each parent (ie 2x for single mixin, 3x for two
mixins, etc...).
Branch: GraniteFourWithJamba
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* graph : add back hybrid memory graph input
But this time it contains the sub-cache graph inputs.
This *should* make it easier to handle updating the inputs
when caching the graph (eventually).
* model : add Jamba to Mamba-specific hparams printing
* fix: Fix input setup after upstream merge
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* jamba : remove redundant nullptr initializations
* model : remove unnecessary prefix for tensor loading constants
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* model : use ggml_swiglu_split for Mamba
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* feat: Add support for dense FFN in GraniteMoeHybrid
This was already partially supported via reusing the granite ffn builder,
and there may be models that leverage this architecture going forward. The
naming is a bit odd, but in the transformers version, it reuses the same
model class and simply has zero regular experts and a single shared expert
(which is the same as a single dense FFN).
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Add support for dense FFN tensor names on c++ side
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Use child inputs for Falcon H1 after merge resolution
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unnecessary prefix on tensor constants
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
* model : make falcon-h1 use shared mamba2 layer builder
* memory : avoid referring to KV in recurrent cache logs
* fix: Revert order changes for Falcon H1 to stay consistent with upstream
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* gguf-py : avoid adding duplicate tensor mappings for Jamba
Some of the tensor names are common with Llama4
* refactor: Collapse Bamba and GraniteMoeHybrid into GraniteHybrid
The only key difference is the use of rope which is now set via
rope_finetuned in the hparams
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Remove use of diamond inheritance
Per PR discussion, it's simpler to keep this with basic inheritance and not
introduce the complexity of virtual inheritance and multiple inheritance
https://github.com/ggml-org/llama.cpp/pull/13550#issuecomment-3053787556
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* feat: Log mamba params for Granite Hybrid
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unused ssm_in_b
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* refactor: Remove ATTENTION_LAYER_INDICES hparam in favor of n_head_kv
This matches how recurrent vs attention heads are identified for Jamba
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unused template expansion for get_arr
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Review cleanup in convert_hf_to_gguf
The gist is to be explicit about which base class is being used with the
multiple inheritance setup
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Undo hidden warnings about duplicate identical keys in add_key_value
After further discussion, this encourages sloppy overwriting in the model
converters
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: If not using ROPE, context is "infinite"
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* doc: Add a comment outlining expected duplicate key warnings
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
* fix: Remove unnecessary duplicate keys in converter
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
(thanks for the sharp eyes and patience!)
Branch: GraniteFour
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
---------
Signed-off-by: Gabe Goodhart <ghart@us.ibm.com>
Co-authored-by: Francis Couture-Harpin <git@compilade.net>
Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
Co-authored-by: Sigbjørn Skjæret <sigbjorn.skjaeret@scala.com>
2025-07-10 18:20:13 -06:00
|
|
|
MODEL_ARCH.GRANITE_HYBRID: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.SSM_IN,
|
|
|
|
MODEL_TENSOR.SSM_CONV1D,
|
|
|
|
MODEL_TENSOR.SSM_DT,
|
|
|
|
MODEL_TENSOR.SSM_A,
|
|
|
|
MODEL_TENSOR.SSM_D,
|
|
|
|
MODEL_TENSOR.SSM_NORM,
|
|
|
|
MODEL_TENSOR.SSM_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
# MoE
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
# Dense
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-09-28 12:08:43 +00:00
|
|
|
MODEL_ARCH.CHAMELEON: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2024-12-18 19:27:21 +02:00
|
|
|
MODEL_ARCH.WAVTOKENIZER_DEC: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.CONV1D,
|
|
|
|
MODEL_TENSOR.CONVNEXT_DW,
|
|
|
|
MODEL_TENSOR.CONVNEXT_NORM,
|
|
|
|
MODEL_TENSOR.CONVNEXT_PW1,
|
|
|
|
MODEL_TENSOR.CONVNEXT_PW2,
|
|
|
|
MODEL_TENSOR.CONVNEXT_GAMMA,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.POSNET_CONV1,
|
|
|
|
MODEL_TENSOR.POSNET_CONV2,
|
|
|
|
MODEL_TENSOR.POSNET_NORM,
|
|
|
|
MODEL_TENSOR.POSNET_NORM1,
|
|
|
|
MODEL_TENSOR.POSNET_NORM2,
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_NORM,
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_Q,
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_K,
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_V,
|
|
|
|
MODEL_TENSOR.POSNET_ATTN_OUT,
|
|
|
|
],
|
2025-03-30 22:21:03 +02:00
|
|
|
MODEL_ARCH.BAILINGMOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
],
|
2025-06-15 00:52:06 -07:00
|
|
|
MODEL_ARCH.DOTS1: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_EXP_PROBS_B,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
],
|
2025-06-16 00:04:06 +01:00
|
|
|
MODEL_ARCH.ARCEE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2025-06-28 22:08:21 +08:00
|
|
|
MODEL_ARCH.ERNIE4_5: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2025-07-09 12:03:49 +04:00
|
|
|
MODEL_ARCH.FALCON_H1: [
|
|
|
|
# Token embedding
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
|
|
|
|
# Input layernorm
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
|
|
|
|
# Attention components
|
|
|
|
MODEL_TENSOR.ATTN_Q, # Query projection
|
|
|
|
MODEL_TENSOR.ATTN_K, # Key projection
|
|
|
|
MODEL_TENSOR.ATTN_V, # Value projection
|
|
|
|
MODEL_TENSOR.ATTN_OUT, # Output projection
|
|
|
|
|
|
|
|
# SSM components (Mamba2 specific)
|
|
|
|
MODEL_TENSOR.SSM_IN, # Input projection for SSM
|
|
|
|
MODEL_TENSOR.SSM_CONV1D, # Convolution layer
|
|
|
|
MODEL_TENSOR.SSM_DT, # Delta time projection
|
|
|
|
MODEL_TENSOR.SSM_A, # A parameter (log form)
|
|
|
|
MODEL_TENSOR.SSM_D, # D parameter
|
|
|
|
MODEL_TENSOR.SSM_NORM, # Normalization in SSM
|
|
|
|
MODEL_TENSOR.SSM_OUT, # Output projection
|
|
|
|
|
|
|
|
# Pre-feedforward layernorm
|
|
|
|
MODEL_TENSOR.FFN_PRE_NORM,
|
|
|
|
|
|
|
|
# Feed-forward network components
|
|
|
|
MODEL_TENSOR.FFN_GATE, # Gate projection (SwiGLU)
|
|
|
|
MODEL_TENSOR.FFN_DOWN, # Down projection
|
|
|
|
MODEL_TENSOR.FFN_UP, # Up projection
|
|
|
|
|
|
|
|
# Post-feedforward layernorm
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM, # Final layer norm
|
|
|
|
MODEL_TENSOR.OUTPUT, # Output projection (lm_head)
|
|
|
|
],
|
2025-07-08 10:24:06 +02:00
|
|
|
MODEL_ARCH.HUNYUAN_MOE: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.FFN_GATE_INP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE_EXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_EXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_EXP,
|
|
|
|
MODEL_TENSOR.FFN_GATE_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_DOWN_SHEXP,
|
|
|
|
MODEL_TENSOR.FFN_UP_SHEXP,
|
|
|
|
],
|
2025-07-08 18:07:01 +02:00
|
|
|
MODEL_ARCH.SMOLLM3: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.OUTPUT_NORM,
|
|
|
|
MODEL_TENSOR.OUTPUT,
|
|
|
|
MODEL_TENSOR.ROPE_FREQS,
|
|
|
|
MODEL_TENSOR.ATTN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
MODEL_TENSOR.ATTN_ROT_EMBD,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
],
|
2025-07-11 20:27:01 +02:00
|
|
|
MODEL_ARCH.LFM2: [
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD,
|
|
|
|
MODEL_TENSOR.TOKEN_EMBD_NORM,
|
|
|
|
MODEL_TENSOR.SHORTCONV_CONV,
|
|
|
|
MODEL_TENSOR.SHORTCONV_INPROJ,
|
|
|
|
MODEL_TENSOR.SHORTCONV_OUTPROJ,
|
|
|
|
MODEL_TENSOR.FFN_GATE,
|
|
|
|
MODEL_TENSOR.FFN_DOWN,
|
|
|
|
MODEL_TENSOR.FFN_UP,
|
|
|
|
MODEL_TENSOR.FFN_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_NORM, # operator_norm
|
|
|
|
MODEL_TENSOR.ATTN_Q_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_K_NORM,
|
|
|
|
MODEL_TENSOR.ATTN_Q,
|
|
|
|
MODEL_TENSOR.ATTN_K,
|
|
|
|
MODEL_TENSOR.ATTN_V,
|
|
|
|
MODEL_TENSOR.ATTN_OUT,
|
|
|
|
],
|
2023-11-10 22:04:50 -07:00
|
|
|
# TODO
|
|
|
|
}
|
|
|
|
|
|
|
|
# tensors that will not be serialized
#
# Maps an architecture to the tensor kinds listed for it here. Most entries
# list both ROPE_FREQS and ATTN_ROT_EMBD; CHATGLM and BAILINGMOE list only
# ROPE_FREQS.
MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
    MODEL_ARCH.LLAMA: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.DECI: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.BAICHUAN: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.QWEN: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.CODESHELL: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.ORION: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.STARCODER2: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.XVERSE: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.DEEPSEEK: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.DEEPSEEK2: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.CHATGLM: [
        MODEL_TENSOR.ROPE_FREQS,
    ],
    MODEL_ARCH.NEMOTRON: [
        MODEL_TENSOR.ROPE_FREQS,
        MODEL_TENSOR.ATTN_ROT_EMBD,
    ],
    MODEL_ARCH.BAILINGMOE: [
        MODEL_TENSOR.ROPE_FREQS,
    ],
}
|
|
|
|
|
|
|
|
#
|
|
|
|
# types
|
|
|
|
#
|
|
|
|
|
|
|
|
|
|
|
|
class TokenType(IntEnum):
    """Integer codes for token types.

    Values start at 1 and are contiguous through 6; no member has value 0.
    """

    NORMAL = 1
    UNKNOWN = 2
    CONTROL = 3
    USER_DEFINED = 4
    UNUSED = 5
    BYTE = 6
|
|
|
|
|
|
|
|
|
|
|
|
class RopeScalingType(Enum):
    """String-valued identifiers for RoPE scaling types.

    Each member's value is the lowercase string form of its name.
    """

    NONE = 'none'
    LINEAR = 'linear'
    YARN = 'yarn'
    LONGROPE = 'longrope'
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
|
2024-02-15 11:21:49 -06:00
|
|
|
class PoolingType(IntEnum):
    """Integer codes for pooling types, numbered contiguously from 0."""

    NONE = 0
    MEAN = 1
    CLS = 2
    LAST = 3
    RANK = 4
|
2024-02-15 11:21:49 -06:00
|
|
|
|
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
class GGMLQuantizationType(IntEnum):
    """Integer codes for GGML tensor data / quantization types.

    The numbering is deliberately not contiguous: values 4, 5 and 31-33 have
    no member in this enum, so do not derive a code from a member's position.
    """

    F32 = 0
    F16 = 1
    Q4_0 = 2
    Q4_1 = 3
    # 4 and 5 are not assigned here
    Q5_0 = 6
    Q5_1 = 7
    Q8_0 = 8
    Q8_1 = 9
    Q2_K = 10
    Q3_K = 11
    Q4_K = 12
    Q5_K = 13
    Q6_K = 14
    Q8_K = 15
    IQ2_XXS = 16
    IQ2_XS = 17
    IQ3_XXS = 18
    IQ1_S = 19
    IQ4_NL = 20
    IQ3_S = 21
    IQ2_S = 22
    IQ4_XS = 23
    I8 = 24
    I16 = 25
    I32 = 26
    I64 = 27
    F64 = 28
    IQ1_M = 29
    BF16 = 30
    # 31-33 are not assigned here
    TQ1_0 = 34
    TQ2_0 = 35
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
|
2025-01-04 21:06:11 +01:00
|
|
|
class ExpertGatingFuncType(IntEnum):
    """Integer codes for expert gating functions; numbering starts at 1."""

    SOFTMAX = 1
    SIGMOID = 2
|
|
|
|
|
|
|
|
|
2024-05-11 11:06:26 -04:00
|
|
|
# TODO: add GGMLFileType from ggml_ftype in ggml.h
|
|
|
|
|
|
|
|
|
|
|
|
# from llama_ftype in llama.h
|
|
|
|
# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
|
|
|
|
class LlamaFileType(IntEnum):
|
|
|
|
ALL_F32 = 0
|
|
|
|
MOSTLY_F16 = 1 # except 1d tensors
|
|
|
|
MOSTLY_Q4_0 = 2 # except 1d tensors
|
|
|
|
MOSTLY_Q4_1 = 3 # except 1d tensors
|
2024-08-08 13:33:09 -04:00
|
|
|
# MOSTLY_Q4_1_SOME_F16 = 4 # tok_embeddings.weight and output.weight are F16
|
2024-05-11 11:06:26 -04:00
|
|
|
# MOSTLY_Q4_2 = 5 # support has been removed
|
|
|
|
# MOSTLY_Q4_3 = 6 # support has been removed
|
|
|
|
MOSTLY_Q8_0 = 7 # except 1d tensors
|
|
|
|
MOSTLY_Q5_0 = 8 # except 1d tensors
|
|
|
|
MOSTLY_Q5_1 = 9 # except 1d tensors
|
|
|
|
MOSTLY_Q2_K = 10 # except 1d tensors
|
|
|
|
MOSTLY_Q3_K_S = 11 # except 1d tensors
|
|
|
|
MOSTLY_Q3_K_M = 12 # except 1d tensors
|
|
|
|
MOSTLY_Q3_K_L = 13 # except 1d tensors
|
|
|
|
MOSTLY_Q4_K_S = 14 # except 1d tensors
|
|
|
|
MOSTLY_Q4_K_M = 15 # except 1d tensors
|
|
|
|
MOSTLY_Q5_K_S = 16 # except 1d tensors
|
|
|
|
MOSTLY_Q5_K_M = 17 # except 1d tensors
|
|
|
|
MOSTLY_Q6_K = 18 # except 1d tensors
|
|
|
|
MOSTLY_IQ2_XXS = 19 # except 1d tensors
|
|
|
|
MOSTLY_IQ2_XS = 20 # except 1d tensors
|
|
|
|
MOSTLY_Q2_K_S = 21 # except 1d tensors
|
|
|
|
MOSTLY_IQ3_XS = 22 # except 1d tensors
|
|
|
|
MOSTLY_IQ3_XXS = 23 # except 1d tensors
|
|
|
|
MOSTLY_IQ1_S = 24 # except 1d tensors
|
|
|
|
MOSTLY_IQ4_NL = 25 # except 1d tensors
|
|
|
|
MOSTLY_IQ3_S = 26 # except 1d tensors
|
|
|
|
MOSTLY_IQ3_M = 27 # except 1d tensors
|
|
|
|
MOSTLY_IQ2_S = 28 # except 1d tensors
|
|
|
|
MOSTLY_IQ2_M = 29 # except 1d tensors
|
|
|
|
MOSTLY_IQ4_XS = 30 # except 1d tensors
|
|
|
|
MOSTLY_IQ1_M = 31 # except 1d tensors
|
|
|
|
MOSTLY_BF16 = 32 # except 1d tensors
|
2024-12-07 13:37:50 +01:00
|
|
|
# MOSTLY_Q4_0_4_4 = 33 # removed from gguf files, use Q4_0 and runtime repack
|
|
|
|
# MOSTLY_Q4_0_4_8 = 34 # removed from gguf files, use Q4_0 and runtime repack
|
|
|
|
# MOSTLY_Q4_0_8_8 = 35 # removed from gguf files, use Q4_0 and runtime repack
|
ggml-quants : ternary packing for TriLMs and BitNet b1.58 (#8151)
* ggml-quants : 1.625 bpw ternary packing for BitNet 1.58b
* ggml-quants : faster 1.625 bpw AVX2 vec_dot
Not using a lookup table anymore makes it match q4_0 speed.
* gguf-py : fix formatting
* llama : remove spaces on empty line
* ggml-quants : subtract 1 when back in epi8
This makes the 1.625 bpw type go faster than q4_0. Still not the fastest.
* ggml-quants : Q2_2 now faster than Q4_K on with AVX2
* ggml-quants : cleanup Q1_3 code formatting
* ggml-quants : ARM NEON vec_dot for q2_2 and q1_3
* ggml-quants : use ceiling division when quantizing q1_3
* convert-hf : simplify BitNet pre-quantization
This still results in the exact same tensor weights and scales,
but it reveals some weirdness in the current algorithm.
* convert-hf : allow converting the weird BitNet 1.3B
Its FFN size is 5460 which is not convenient.
The offending tensors are kept in F16,
which makes the final model 5.01 bpw.
* bitnet : replace 1.58b with b1.58, as in the paper
* ggml-quants : fix build failure on Windows
* ggml-quants : attempt to fix Arm 32-bit support
* ggml : add some informative comments in q1_3 vec_dot
* ggml : add TQ1_0 and TQ2_0 ternary quantization types
* ggml : even faster TQ2_0
* ggml : also faster TQ1_0
Same optimization as for TQ2_0 by offsetting the sum instead of the weights.
This makes TQ1_0 almost as fast as Q8_0 on AVX2.
* ggml : fix build issues in certain environments
* ggml : add NEON vec_dot implementation for TQ1_0 and TQ2_0
* ggml : avoid directly using vmlal_high_s8, for 32-bit ARM compat
The compiler seems smart enough to use the same instruction
even when using vget_high_s8 instead.
* ggml : remove q1_3 and q2_2
No more 1.625 bpw and 2.000 bpw,
now instead using 1.6875 bpw and 2.0625 bpw
with TQ1_0 and TQ2_0, respectively.
* llama : remove the separate scale tensors of BitNet b1.58
They won't be needed, since the remaining ternary quant types have
built-in scales.
* ggml-quants : rename fields of TQ1_0 and TQ2_0 structs for consistency
* ggml-quants : allow using vdotq_s32 in TQ2_0 vec_dot
Not yet tested on hardware which supports it,
might not work or might not even compile. But also it might.
It should make the performance better on recent ARM CPUs.
* ggml-quants : remove comment about possible format change of TQ2_0
Making it slightly more convenient for AVX512
but less convenient for everything else is not worth the trouble.
* gguf-py : Numpy (de)quantization for TQ1_0 and TQ2_0
* ggml-quants : use roundf instead of nearest_int for TQ1_0 and TQ2_0
This does not change anything for ternary models,
since their values should never end up being in halfway cases anyway.
* convert : allow direct conversion to TQ1_0 and TQ2_0
The token embeddings and output tensors are kept in F16
to allow quantizing them to Q4_K and Q6_K with llama-quantize.
* llama : handle fallback for TQ1_0 and TQ2_0 with Q4_0
Q4_0 is not completely symmetric (so not lossless for ternary models),
but it should be good enough.
* ggml-quants : allow using ARM dot product instructions for TQ1_0
* ggml-quants : deduplicate TQ1_0 and TQ2_0 __ARM_FEATURE_DOTPROD support
* ggml : remove unused ggml_mul special case
It would otherwise conflict with the more general
optimization coming with Mamba-2.
* ggml : handle TQ1_0 and TQ2_0 in dequantization-based operators
* test-backend-ops : add TQ1_0 and TQ2_0 comments for later
Not yet adding uncommented, because some backends like SYCL and Metal
do not properly handle unknown types in supports_op for GGML_OP_MUL_MAT.
(and Metal also doesn't handle it with GGML_OP_GET_ROWS)
Support for TQ1_0 and TQ2_0 for other backends than CPU
will be added in follow-up pull requests.
2024-09-05 21:48:47 -04:00
|
|
|
MOSTLY_TQ1_0 = 36 # except 1d tensors
|
|
|
|
MOSTLY_TQ2_0 = 37 # except 1d tensors
|
2024-05-11 11:06:26 -04:00
|
|
|
|
|
|
|
GUESSED = 1024 # not specified in the model file
|
|
|
|
|
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
class GGUFEndian(IntEnum):
    """Byte-order identifiers: ``LITTLE`` is 0 and ``BIG`` is 1."""

    LITTLE = 0
    BIG = 1
|
|
|
|
|
|
|
|
|
|
|
|
class GGUFValueType(IntEnum):
    """Integer type tags for values, plus a helper to pick a tag for a Python object."""

    UINT8   = 0
    INT8    = 1
    UINT16  = 2
    INT16   = 3
    UINT32  = 4
    INT32   = 5
    FLOAT32 = 6
    BOOL    = 7
    STRING  = 8
    ARRAY   = 9
    UINT64  = 10
    INT64   = 11
    FLOAT64 = 12

    @staticmethod
    def get_type(val: Any) -> GGUFValueType:
        """Return the value type tag that matches the Python object *val*.

        Mapping:
          * ``str`` / ``bytes`` / ``bytearray`` -> ``STRING``
          * ``list``                            -> ``ARRAY``
          * ``float``                           -> ``FLOAT32``
          * ``bool``                            -> ``BOOL``
          * ``int`` within signed 32-bit range  -> ``INT32``
          * ``int`` within signed 64-bit range  -> ``INT64``
          * any larger positive ``int``         -> ``UINT64``

        Integers that fit in 32 bits keep the historical ``INT32`` tag; the
        wider tags are only chosen for values ``INT32`` cannot represent.
        No range error is raised here for integers beyond 64 bits.

        Raises:
            ValueError: if *val* is of none of the types listed above.
        """
        if isinstance(val, (str, bytes, bytearray)):
            return GGUFValueType.STRING
        if isinstance(val, list):
            return GGUFValueType.ARRAY
        if isinstance(val, float):
            return GGUFValueType.FLOAT32
        # bool is a subclass of int, so it has to be tested before int.
        if isinstance(val, bool):
            return GGUFValueType.BOOL
        if isinstance(val, int):
            if -(1 << 31) <= val < (1 << 31):
                return GGUFValueType.INT32
            if val < (1 << 63):
                # Covers every negative value below the 32-bit range as well.
                return GGUFValueType.INT64
            return GGUFValueType.UINT64
        raise ValueError(f"Unknown type: {type(val)}")
|
2023-11-10 22:04:50 -07:00
|
|
|
|
|
|
|
|
2025-04-22 16:24:54 +02:00
|
|
|
class VisionProjectorType:
    """String identifiers for the known projector types.

    This is a plain namespace of string constants, not an ``Enum``.
    """

    GEMMA3   = "gemma3"
    IDEFICS3 = "idefics3"
    PIXTRAL  = "pixtral"
    LLAMA4   = "llama4"
    QWEN2VL  = "qwen2vl_merger"
    QWEN25VL = "qwen2.5vl_merger"
    ULTRAVOX = "ultravox"
    INTERNVL = "internvl"
    QWEN2A   = "qwen2a"    # audio
    QWEN25O  = "qwen2.5o"  # omni
|
2025-04-22 16:24:54 +02:00
|
|
|
|
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
# Items here are (block size, type size)
# Each type-size expression is spelled out as a sum of terms rather than a
# single total; QK_K is the block size shared by the 256-element types.
QK_K = 256
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
    GGMLQuantizationType.F32:     (1, 4),
    GGMLQuantizationType.F16:     (1, 2),
    GGMLQuantizationType.Q4_0:    (32, 2 + 16),
    GGMLQuantizationType.Q4_1:    (32, 2 + 2 + 16),
    GGMLQuantizationType.Q5_0:    (32, 2 + 4 + 16),
    GGMLQuantizationType.Q5_1:    (32, 2 + 2 + 4 + 16),
    GGMLQuantizationType.Q8_0:    (32, 2 + 32),
    GGMLQuantizationType.Q8_1:    (32, 4 + 4 + 32),
    GGMLQuantizationType.Q2_K:    (256, 2 + 2 + QK_K // 16 + QK_K // 4),
    GGMLQuantizationType.Q3_K:    (256, 2 + QK_K // 4 + QK_K // 8 + 12),
    GGMLQuantizationType.Q4_K:    (256, 2 + 2 + QK_K // 2 + 12),
    GGMLQuantizationType.Q5_K:    (256, 2 + 2 + QK_K // 2 + QK_K // 8 + 12),
    GGMLQuantizationType.Q6_K:    (256, 2 + QK_K // 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.Q8_K:    (256, 4 + QK_K + QK_K // 8),
    GGMLQuantizationType.IQ2_XXS: (256, 2 + QK_K // 4),
    GGMLQuantizationType.IQ2_XS:  (256, 2 + QK_K // 4 + QK_K // 32),
    GGMLQuantizationType.IQ3_XXS: (256, 2 + QK_K // 4 + QK_K // 8),
    GGMLQuantizationType.IQ1_S:   (256, 2 + QK_K // 8 + QK_K // 16),
    GGMLQuantizationType.IQ4_NL:  (32, 2 + 16),
    GGMLQuantizationType.IQ3_S:   (256, 2 + QK_K // 4 + QK_K // 8 + QK_K // 32 + 4),
    GGMLQuantizationType.IQ2_S:   (256, 2 + QK_K // 4 + QK_K // 16),
    GGMLQuantizationType.IQ4_XS:  (256, 2 + 2 + QK_K // 2 + QK_K // 64),
    GGMLQuantizationType.I8:      (1, 1),
    GGMLQuantizationType.I16:     (1, 2),
    GGMLQuantizationType.I32:     (1, 4),
    GGMLQuantizationType.I64:     (1, 8),
    GGMLQuantizationType.F64:     (1, 8),
    GGMLQuantizationType.IQ1_M:   (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
    GGMLQuantizationType.BF16:    (1, 2),
    GGMLQuantizationType.TQ1_0:   (256, 2 + 4 * 13),
    GGMLQuantizationType.TQ2_0:   (256, 2 + 64),
}
|
|
|
|
|
|
|
|
|
|
|
|
# Aliases for backward compatibility.
# Each KEY_* name is bound to the corresponding attribute of the nested
# Keys.* classes, so both spellings refer to the same value.

# general
KEY_GENERAL_ARCHITECTURE         = Keys.General.ARCHITECTURE
KEY_GENERAL_QUANTIZATION_VERSION = Keys.General.QUANTIZATION_VERSION
KEY_GENERAL_ALIGNMENT            = Keys.General.ALIGNMENT
KEY_GENERAL_NAME                 = Keys.General.NAME
KEY_GENERAL_AUTHOR               = Keys.General.AUTHOR
KEY_GENERAL_URL                  = Keys.General.URL
KEY_GENERAL_DESCRIPTION          = Keys.General.DESCRIPTION
KEY_GENERAL_LICENSE              = Keys.General.LICENSE
KEY_GENERAL_SOURCE_URL           = Keys.General.SOURCE_URL
KEY_GENERAL_FILE_TYPE            = Keys.General.FILE_TYPE

# LLM
KEY_VOCAB_SIZE            = Keys.LLM.VOCAB_SIZE
KEY_CONTEXT_LENGTH        = Keys.LLM.CONTEXT_LENGTH
KEY_EMBEDDING_LENGTH      = Keys.LLM.EMBEDDING_LENGTH
KEY_BLOCK_COUNT           = Keys.LLM.BLOCK_COUNT
KEY_FEED_FORWARD_LENGTH   = Keys.LLM.FEED_FORWARD_LENGTH
KEY_USE_PARALLEL_RESIDUAL = Keys.LLM.USE_PARALLEL_RESIDUAL
KEY_TENSOR_DATA_LAYOUT    = Keys.LLM.TENSOR_DATA_LAYOUT

# attention
KEY_ATTENTION_HEAD_COUNT        = Keys.Attention.HEAD_COUNT
KEY_ATTENTION_HEAD_COUNT_KV     = Keys.Attention.HEAD_COUNT_KV
KEY_ATTENTION_MAX_ALIBI_BIAS    = Keys.Attention.MAX_ALIBI_BIAS
KEY_ATTENTION_CLAMP_KQV         = Keys.Attention.CLAMP_KQV
KEY_ATTENTION_LAYERNORM_EPS     = Keys.Attention.LAYERNORM_EPS
KEY_ATTENTION_LAYERNORM_RMS_EPS = Keys.Attention.LAYERNORM_RMS_EPS

# RoPE
KEY_ROPE_DIMENSION_COUNT      = Keys.Rope.DIMENSION_COUNT
KEY_ROPE_FREQ_BASE            = Keys.Rope.FREQ_BASE
KEY_ROPE_SCALING_TYPE         = Keys.Rope.SCALING_TYPE
KEY_ROPE_SCALING_FACTOR       = Keys.Rope.SCALING_FACTOR
KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN
KEY_ROPE_SCALING_FINETUNED    = Keys.Rope.SCALING_FINETUNED
|
|
|
|
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was waaaay too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something else than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
|
|
|
# SSM
# Backward-compatible module-level names bound to the Keys.SSM attributes.
KEY_SSM_CONV_KERNEL    = Keys.SSM.CONV_KERNEL
KEY_SSM_INNER_SIZE     = Keys.SSM.INNER_SIZE
KEY_SSM_STATE_SIZE     = Keys.SSM.STATE_SIZE
KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK
|
llama : initial Mamba-2 support (#9126)
* llama : initial Mamba-2 support
* ggml : SIMD ggml_ssm_scan for Mamba-2
* ggml : improve ggml_mul speed when masking recurrent states
* llama : support running Mamba-Codestral-7B-v0.1
* llama : fix Mamba-2 conv state saving
* ggml : make the ggml_mul fast broadcast path more consistently formatted
* llama : remove unused variable
* llama : add missing break
* convert_hf : prefer SentencePiece tokenizer for Mamba-2 when present
The tokenizer.json of Mamba-Codestral-7B-v0.1 otherwise requires
workarounds to work correctly.
* llama : avoid redundant state copy for Mamba 1 and 2
* metal : attempt to adapt SSM_SCAN for Mamba-2
* metal : fix SSM_SCAN pipeline scope
* metal : use log and exp instead of log1pf and expf in SSM_SCAN
* metal : remove unused arguments for SSM_SCAN
The max index is 31, so trimming the arguments is necessary.
* metal : add back n_seqs to SSM_SCAN args
Whoops, this is needed for the offset in the concatenated output.
* metal : fix SSM_SCAN state head offset
* metal : fix wrong number of tokens per sequence in SSM_SCAN
* ggml : remove unused fast broadcast path in GGML_MUL
This was initially added because states were masked with ggml_mul,
but this is no longer done and so this "optimisation" is no longer
necessary, or at least not worth the additional code complexity.
* ggml : avoid multiply by D in GGML_OP_SSM_SCAN
This makes the weight buft detection in src/llama.cpp simpler.
* convert : transpose Mamba-2 A, D and reshape SSM_NORM
This breaks existing conversions of Mamba-2 models
to avoid some reshapes.
Not sure if it's a good idea,
but it makes the graph slightly cleaner.
* llama : more appropriate SSM_SCAN and SSM_CONV buft support checks
* convert : fix flake8 lint
* metal : fix confusion between ; and ,
* metal : add missing args for nb references in ssm_scan_f32_group
* metal : single-user mamba2 inference works
* kv-cache : remove const_cast when setting inputs for s_copy
And also fix multi-user inference for recurrent models
by using cell_id instead of i as the kv cell index
when populating s_copy.
* convert : avoid AutoConfig for Mamba and Mamba2 hparams
* kv-cache : allow context shift for recurrent models
* graph : fix recurrent state copies when avoiding copies
Works, but using lambda functions might not be that clean.
* ggml : fix mamba2 ssm scan when compiled with SVE
* ggml-cpu : reorder SVE FMA for consistency with other SIMD arches
* cuda : implement ssm scan for Mamba2
There is still room for improvement, but it works!
* cuda : adapt Mamba1 ssm scan to shape changes from Mamba2
* mamba : fix mismatched new and delete size for llm_build_mamba
Subclasses of llm_graph_context cannot have extra fields,
because the called destructor is not the one from the subclass.
This otherwise would cause problems when running Mamba-(1|2) inference
when compiled -DGGML_SANITIZE_ADDRESS=ON
* cuda : graceful fallback for Mamba-1 models with weird embd size
2025-07-02 13:10:24 -04:00
|
|
|
# Flat module-level alias for Keys.SSM.GROUP_COUNT.
KEY_SSM_GROUP_COUNT = Keys.SSM.GROUP_COUNT
|
2024-08-21 12:06:36 +04:00
|
|
|
# Flat module-level alias for Keys.SSM.DT_B_C_RMS.
KEY_SSM_DT_B_C_RMS = Keys.SSM.DT_B_C_RMS
|
llama : support Mamba Selective State Space Models (#5328)
* mamba : begin working on support for Mamba SSM
* mamba : begin figuring out how to (ab)use the kv cache for Mamba
* mamba : recurrent inference almost works, but incoherent
* mamba : recurrent inference WORKS!!!
* convert : optionally use d_conv and d_state from config.json for Mamba
* mamba : refactor recurrent conv, resulting in 20% perf increase
It's still slower than I'd like, but I did not really optimize `ggml_exp` yet.
I also refactored `ggml_exp` to work with tensors with more than 2 dimensions.
* ggml : parallelize ggml_exp
This results in 8% faster token generation for Mamba-130M.
* mamba : simplify the conv step with a self-overlapping view
Turns out the conv_state can be made smaller by one column.
Note that this breaks existing GGUFs of Mamba,
because the key_value_length field is tied to the conv_state size.
Convolution with a self-overlapping view is cool!
And it's much simpler than what I initially thought would be necessary
to make the convolution step work with more than 1 token at a time.
Next step is to make the SSM step work on batches of tokens too,
and thus I need to figure out a way to make a parallel selective scan
which will keep the ssm_state small and won't make it bigger
by a factor of (n_layer * batch_size).
* llama : fix Mamba KV self size wrongly displaying as f16 instead of f32
Relatedly, I also tried to see if other types than f32 worked for the states,
but they don't, because of the operators used.
It's probably better anyway to keep lots of precision there,
since the states are small anyway.
* mamba : fix self-overlapping view depth stride
* mamba : handle batches of more than 1 token
This means running Mamba no longer crashes when using the default settings!
And probably also slightly faster prompt processing.
Both batched and non-batched processing yield the same output.
Previously, the state was not cleared when starting a sequence.
Next step is to make the KV cache API work as expected for Mamba models.
* ggml: add ggml_ssm_scan to help with parallel selective scan
If the selective scan was implemented without a custom operator,
there would be waaay too many nodes in the graph. For example,
for Mamba-130M, with a batch size of 512 (the default),
a naive selective scan could add at least 24*512=12288 nodes,
which is more than LLAMA_MAX_NODES (8192),
and that's only for the smallest Mamba model.
So it's much cleaner with a custom operator.
Not sure about the name, though.
* ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation
This will help with performance on CPU if ggml_vec_mul_f32
and ggml_vec_add_f32 are ever optimized with SIMD.
* mamba : very basic quantization support
Mostly works, but there is currently no difference
between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same).
Most of the SSM-specific weights can be kept in f32 without affecting
the size that much, since they are relatively small.
(the linear projection weights are responsible for most of Mamba's size)
Too much quantization seems to make the state degrade quite fast, and
the model begins to output gibberish.
It seems to affect bigger models to a lesser extent than small models,
but I'm not sure by how much.
Experimentation will be needed to figure out which weights are more important
for the _M (and _L?) variants of k-quants for Mamba.
* convert : fix wrong name for layer norm weight of official Mamba models
I was using Q-bert/Mamba-* models before, which have a slightly different
naming scheme for the weights.
(they start with "model.layers" instead of "backbone.layers")
* mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator
This increases performance on CPU by around 30% for prompt processing,
and by around 20% for text generation.
However, it also makes the ggml_exp and ggml_soft_plus operators unused.
Whether or not they should be kept will be decided later.
* convert : for Mamba, also consider the "MambaLMHeadModel" arch name
It's the name of the class of the official implementation,
though they don't use it (yet) in the "architectures" field of config.json
* mamba : fix vocab size problems with official models
The perplexity was way too high for models with a non-round vocab size.
Not sure why, but it needed to be fixed in the metadata.
Note that this breaks existing GGUF-converted Mamba models,
but **only if** the vocab size was not already rounded.
* ggml : remove ggml_exp and ggml_soft_plus
They did not exist anyway outside of this branch,
and since ggml_ssm_scan fused operations together, they are unused.
It's always possible to bring them back if needed.
* mamba : remove some useless comments
No code change.
* convert : fix flake8 linter errors
* mamba : apply suggestions from code review
* mamba : remove unnecessary branch for row-wise ssm_state and C multiplication
It was previously done to avoid permuting when only one token is processed
at a time (like when generating text), but permuting is cheap,
and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts
* ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32
* mamba : multiple sequences, but one at a time
This is a step towards making this Mamba implementation usable
with the server example (the way the system prompt is kept when clearing
the client slots will need to be changed before this can work, though).
The KV cache size for this kind of model is tied to the maximum number
of sequences kept at any single time.
For now, this number is obtained from n_parallel (plus one,
to have an extra sequence to dedicate to the system prompt),
but there might be a better way to do this which won't also
make the main example use 2 cells even if only 1 is really used.
(for this specific case, --parallel 0 helps)
Simultaneous sequence processing will probably require changes to
ggml_ssm_scan, and possibly a new operator for the conv step.
* mamba : support llama_kv_cache_seq_cp
This (mis)uses the logic around K shifts, because tokens in a state
can't be shifted anyway, and because inp_K_shift has the right shape and type.
Using ggml_get_rows is a nice way to do copies, but copy chains can't work.
Fortunately, copy chains don't really seem to be used in the examples.
Each KV cell is dedicated to the sequence ID corresponding to its own index.
* mamba : use a state mask
It's cleaner than the previous heuristic of
checking for the pos of the first token in the batch.
inp_KQ_mask could not be re-used for this, because it has the wrong shape
and because it seems more suited to the next step of
simultaneous sequence processing (helping with the problem of
remembering which token belongs to which sequence(s)/state(s)).
* llama : replace the usage of n_ctx with kv_self.size in many places
* mamba : use n_tokens directly instead of n_tok
* mamba : in comments, properly refer to KV cells instead of slots
* mamba : reduce memory usage of ggml_ssm_scan
From 290.37 MiB to 140.68 MiB of CPU compute buffer size
with Mamba 3B with a batch size of 512.
The result tensor of ggml_ssm_scan was previously a big part
of the CPU compute buffer size. To make it smaller,
it does not contain the intermediate ssm states anymore.
Both y and the last ssm state are combined in the result tensor,
because it seems only a single tensor can be returned by an operator
with the way the graph is built.
* mamba : simultaneous sequence processing
A batch can now contain tokens from multiple sequences.
This is necessary for at least the parallel example, the server example,
and the HellaSwag test in the perplexity example.
However, for this to be useful, uses of llama_kv_cache_seq_rm/cp
will need to be changed to work on whole sequences.
* ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba
This operator makes it possible to use and update the correct states
for each token of the batch in the same way as ggml_ssm_scan.
Other solutions which use existing operators would need loops which would
add too many nodes to the graph (at least the ones I thought of).
Using this operator further reduces the size of the CPU compute buffer
from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512.
And (at least on CPU), it's a bit faster than before.
Note that "ggml_ssm_conv" is probably not the most appropriate name,
and it could be changed if a better one is found.
* llama : add inp_s_seq as a new input tensor
The most convenient implementation to select the correct state (for Mamba)
for each token is to directly get the correct index from a tensor.
This is why inp_s_seq is storing int32_t and not floats.
The other, less convenient way to select the correct state would be
to have inp_KQ_mask contain 1.0f for each state used by a token
and 0.0f otherwise. This complicates quickly fetching the first used
state of a token, and is also less efficient because a whole row
of the mask would always need to be read for each token.
Using indexes makes it easy to stop searching when there are
no more sequences for a token, and the first sequence assigned
is always very quickly available (it's the first element of each row).
* mamba : support llama_kv_cache_seq_cp copy chains
* mamba : support shifting and dividing the kv cache pos
* mamba : make the server and parallel examples work with whole sequences
A seq_id is dedicated to the system prompt in both cases.
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not
* mamba : dedicate an input tensor for state copy indices
This is cleaner and makes it easier to adapt when/if token positions
(and by extension, inp_K_shift) are no longer integers.
* mamba : adapt perplexity, batched, and batched-bench examples
* perplexity : limit the max number of sequences
This adapts to what the loaded model can provide.
* llama : add llama_n_max_seq to get the upper limit for seq_ids
Used by the perplexity example.
* batched : pass n_parallel to the model's context params
This should have been there already, but it wasn't.
* batched-bench : reserve sequences to support Mamba
* batched-bench : fix tokens being put in wrong sequences
Generation quality isn't what's measured in there anyway,
but at least using the correct sequences avoids using non-consecutive
token positions.
* mamba : stop abusing attention metadata
This breaks existing converted-to-GGUF Mamba models,
but will allow supporting mixed architectures like MambaFormer
without needing to break Mamba models.
This will also allow changing the size of Mamba's states
without having to reconvert models in the future.
(e.g. using something other than d_conv - 1 columns for the conv_states
will not require breaking existing converted Mamba models again)
* gguf-py : add new KV metadata key-value pairs for Mamba
* llama : add new metadata key-value pairs for Mamba
* llama : guard against divisions by zero when n_head is 0
* mamba : rename "unlimited" KV cache property to "recurrent"
* mamba : more correctly update the "used" field of the KV cache
* ggml : in ggml_ssm_scan, use a threshold for soft_plus
This is how the official Mamba implementation does it,
and it's also what torch.nn.Softplus does.
* convert : for Mamba, fallback to internal NeoX tokenizer
The resulting models are exactly the same
as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there.
* mamba : support state saving and restoring
* ggml : implicitly pass src tensors through dst for Mamba-related ops
* mamba : clarify some comments
* server : fix cache_tokens not getting correctly resized
Otherwise, when the "we have to evaluate at least 1 token" special case
was triggered, an extra token was kept in cache_tokens even if it was
removed from the KV cache.
For Mamba, this caused useless prompt reprocessing when the previous
request triggered the above case.
* convert-hf : support new metadata keys for Mamba
For the models available at
https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406
* mamba : rename metadata to be more similar to transformers library
This breaks existing converted-to-GGUF models,
but the metadata names are more "standard".
* mamba : support mamba-*-hf models
These models share their token_embd.weight with their output.weight
* mamba : add missing spaces
This is purely a formatting change.
* convert-hf : omit output.weight when identical with token_embd.weight
Only for Mamba for now, but it might be relevant for other models eventually.
Most Mamba models actually share these two tensors, albeit implicitly.
* readme : add Mamba to supported models, and add recent API changes
* mamba : move state_seq and state_mask views outside layer loop
A few tensors were also missing `struct` in front of `ggml_tensor`.
2024-03-08 17:31:00 -05:00
|
|
|
|
2023-11-10 22:04:50 -07:00
|
|
|
# tokenization
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.MODEL.
KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL
|
2024-04-29 16:58:41 +03:00
|
|
|
# Flat module-level alias for Keys.Tokenizer.PRE.
KEY_TOKENIZER_PRE = Keys.Tokenizer.PRE
|
2023-11-10 22:04:50 -07:00
|
|
|
# Flat module-level alias for Keys.Tokenizer.LIST.
KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.TOKEN_TYPE.
KEY_TOKENIZER_TOKEN_TYPE = Keys.Tokenizer.TOKEN_TYPE
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.SCORES.
KEY_TOKENIZER_SCORES = Keys.Tokenizer.SCORES
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.MERGES.
KEY_TOKENIZER_MERGES = Keys.Tokenizer.MERGES
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.BOS_ID.
KEY_TOKENIZER_BOS_ID = Keys.Tokenizer.BOS_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.EOS_ID.
KEY_TOKENIZER_EOS_ID = Keys.Tokenizer.EOS_ID
|
2024-10-12 08:21:51 +03:00
|
|
|
# Flat module-level alias for Keys.Tokenizer.EOT_ID.
KEY_TOKENIZER_EOT_ID = Keys.Tokenizer.EOT_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.EOM_ID.
KEY_TOKENIZER_EOM_ID = Keys.Tokenizer.EOM_ID
|
2023-11-10 22:04:50 -07:00
|
|
|
# Flat module-level alias for Keys.Tokenizer.UNK_ID.
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.SEP_ID.
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.PAD_ID.
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
|
2024-02-15 14:14:37 +01:00
|
|
|
# Flat module-level alias for Keys.Tokenizer.MASK_ID.
KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
|
2023-11-10 22:04:50 -07:00
|
|
|
# Flat module-level alias for Keys.Tokenizer.HF_JSON.
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.RWKV.
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
|
2024-10-12 08:21:51 +03:00
|
|
|
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.FIM_PRE_ID.
KEY_TOKENIZER_FIM_PRE_ID = Keys.Tokenizer.FIM_PRE_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.FIM_SUF_ID.
KEY_TOKENIZER_FIM_SUF_ID = Keys.Tokenizer.FIM_SUF_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.FIM_MID_ID.
KEY_TOKENIZER_FIM_MID_ID = Keys.Tokenizer.FIM_MID_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.FIM_PAD_ID.
KEY_TOKENIZER_FIM_PAD_ID = Keys.Tokenizer.FIM_PAD_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.FIM_REP_ID.
KEY_TOKENIZER_FIM_REP_ID = Keys.Tokenizer.FIM_REP_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.FIM_SEP_ID.
KEY_TOKENIZER_FIM_SEP_ID = Keys.Tokenizer.FIM_SEP_ID
|
|
|
|
|
|
|
|
# deprecated
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.PREFIX_ID; listed under the
# file's "# deprecated" marker.
# NOTE(review): presumably superseded by the FIM_* keys — confirm.
KEY_TOKENIZER_PREFIX_ID = Keys.Tokenizer.PREFIX_ID
|
2024-04-16 08:13:13 +02:00
|
|
|
# Flat module-level alias for Keys.Tokenizer.SUFFIX_ID; listed under the
# file's "# deprecated" marker.
# NOTE(review): presumably superseded by the FIM_* keys — confirm.
KEY_TOKENIZER_SUFFIX_ID = Keys.Tokenizer.SUFFIX_ID
|
|
|
|
# Flat module-level alias for Keys.Tokenizer.MIDDLE_ID; listed under the
# file's "# deprecated" marker.
# NOTE(review): presumably superseded by the FIM_* keys — confirm.
KEY_TOKENIZER_MIDDLE_ID = Keys.Tokenizer.MIDDLE_ID
|