mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-07-18 00:27:31 +00:00)

Commit: fixes

@@ -6632,11 +6632,11 @@ class FalconH1Model(Mamba2Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_key_length(self.hparams["head_dim"])
         self.gguf_writer.add_value_length(self.hparams["head_dim"])
-        self.gguf_writer.add_float32("falcon-h1.key_multiplier", self.hparams["key_multiplier"])
+        self.gguf_writer.add_float32("falcon_h1.key_multiplier", self.hparams["key_multiplier"])
 
         ## Other params
-        self.gguf_writer.add_float32("falcon-h1.lm_head_multiplier", self.hparams["lm_head_multiplier"])
-        self.gguf_writer.add_float32("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"])
+        self.gguf_writer.add_float32("falcon_h1.lm_head_multiplier", self.hparams["lm_head_multiplier"])
+        self.gguf_writer.add_float32("falcon_h1.embedding_multiplier", self.hparams["embedding_multiplier"])
 
         ## Validation ##
         assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
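
The keys written above are plain arch-prefixed metadata entries in the converted GGUF file. As a quick way to see which of them actually landed in an output file, here is a minimal sketch using the gguf-py package (GGUFReader is the real reader class; "model.gguf" is only a placeholder path, not a file produced by this commit):

    # List the falcon_h1.* metadata keys present in a converted file.
    # "model.gguf" is a placeholder path.
    from gguf import GGUFReader

    reader = GGUFReader("model.gguf")
    for name in reader.fields:
        if name.startswith("falcon_h1."):
            print(name)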

@@ -6644,34 +6644,34 @@ class FalconH1Model(Mamba2Model):
 
 
         # Add Falcon Mamba2 specific configuration
-        self.gguf_writer.add_uint32("falcon-h1.ssm.mamba_chunk_size", self.hparams["mamba_chunk_size"])
-        self.gguf_writer.add_uint32("falcon-h1.attention.head_dim", self.hparams["head_dim"])
-        self.gguf_writer.add_uint32("falcon-h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"])
-        self.gguf_writer.add_uint32("falcon-h1.num_attention_heads", self.find_hparam(["num_attention_heads"]))
-        self.gguf_writer.add_uint32("falcon-h1.num_key_value_heads",
+        self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_chunk_size", self.hparams["mamba_chunk_size"])
+        self.gguf_writer.add_uint32("falcon_h1.attention.head_dim", self.hparams["head_dim"])
+        self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"])
+        self.gguf_writer.add_uint32("falcon_h1.num_attention_heads", self.find_hparam(["num_attention_heads"]))
+        self.gguf_writer.add_uint32("falcon_h1.num_key_value_heads",
                                     self.find_hparam(["num_key_value_heads"], optional=True) or
                                     self.find_hparam(["num_attention_heads"]))
 
         # Add multipliers as metadata instead of tensors
-        self.gguf_writer.add_float32("falcon-h1.attention_in_multiplier", self.attention_in_multiplier)
-        self.gguf_writer.add_float32("falcon-h1.attention_out_multiplier", self.attention_out_multiplier)
-        self.gguf_writer.add_float32("falcon-h1.ssm_in_multiplier", self.ssm_in_multiplier)
-        self.gguf_writer.add_float32("falcon-h1.ssm_out_multiplier", self.ssm_out_multiplier)
+        self.gguf_writer.add_float32("falcon_h1.attention_in_multiplier", self.attention_in_multiplier)
+        self.gguf_writer.add_float32("falcon_h1.attention_out_multiplier", self.attention_out_multiplier)
+        self.gguf_writer.add_float32("falcon_h1.ssm_in_multiplier", self.ssm_in_multiplier)
+        self.gguf_writer.add_float32("falcon_h1.ssm_out_multiplier", self.ssm_out_multiplier)
 
         # Add MLP multipliers
         if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2:
-            self.gguf_writer.add_float32("falcon-h1.mlp_gate_multiplier", self.mlp_multipliers[0])
-            self.gguf_writer.add_float32("falcon-h1.mlp_down_multiplier", self.mlp_multipliers[1])
+            self.gguf_writer.add_float32("falcon_h1.mlp_gate_multiplier", self.mlp_multipliers[0])
+            self.gguf_writer.add_float32("falcon_h1.mlp_down_multiplier", self.mlp_multipliers[1])
 
         # Add has MuP flag if SSM multipliers are present
         if self.ssm_multipliers is not None:
-            self.gguf_writer.add_bool("falcon-h1.ssm.has_mup", True)
+            self.gguf_writer.add_bool("falcon_h1.ssm.has_mup", True)
 
         # Add any other Falcon Mamba2 specific configuration
-        self.gguf_writer.add_bool("falcon-h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True))
-        self.gguf_writer.add_bool("falcon-h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True))
-        self.gguf_writer.add_bool("falcon-h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True))
-        self.gguf_writer.add_float32("falcon-h1.rope_theta", self.find_hparam(["rope_theta"], optional=True))
+        self.gguf_writer.add_bool("falcon_h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True))
+        self.gguf_writer.add_bool("falcon_h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True))
+        self.gguf_writer.add_bool("falcon_h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True))
+        self.gguf_writer.add_float32("falcon_h1.rope_theta", self.find_hparam(["rope_theta"], optional=True))
 
 ###### CONVERSION LOGIC ######
 
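
Each hyperparameter above gets its own add_uint32/add_float32/add_bool call with a hand-written "falcon_h1." prefix. A small helper in the same spirit, purely illustrative and not part of the converter, could write a dict of float multipliers under one prefix (write_multipliers is a made-up name; GGUFWriter.add_float32 is the real gguf-py method):

    # Hypothetical helper: write several float multipliers under one arch prefix.
    from gguf import GGUFWriter

    def write_multipliers(writer: GGUFWriter, prefix: str, multipliers: dict[str, float]) -> None:
        for name, value in multipliers.items():
            # e.g. "falcon_h1.attention_in_multiplier"
            writer.add_float32(f"{prefix}.{name}", value)

Called with the model's gguf_writer and a dict of the multiplier values, it would emit the same keys as the explicit calls in the method above.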

@@ -1177,7 +1177,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.SSM_MUP_VEC: (
-            "model.layers.{bid}.mamba.mup_vector", # falcon-h1
+            "model.layers.{bid}.mamba.mup_vector", # falcon_h1
         ),
 
         MODEL_TENSOR.SSM_NORM: (
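
The mapping entry uses the {bid} placeholder that gguf-py substitutes with the block index when matching checkpoint tensor names. A self-contained sketch of how such a template resolves to a per-layer name (the TEMPLATES dict is a toy stand-in, not the real TensorNameMap):

    # Toy stand-in for a {bid}-templated tensor-name mapping.
    TEMPLATES = {
        "ssm_mup_vec": "model.layers.{bid}.mamba.mup_vector",
    }

    def source_name(kind: str, bid: int) -> str:
        return TEMPLATES[kind].format(bid=bid)

    print(source_name("ssm_mup_vec", 3))  # model.layers.3.mamba.mup_vector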

@@ -46,7 +46,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER2, "starcoder2" },
     { LLM_ARCH_MAMBA, "mamba" },
     { LLM_ARCH_MAMBA2, "mamba2" },
-    { LLM_ARCH_FALCON_H1, "falcon-h1" },
+    { LLM_ARCH_FALCON_H1, "falcon_h1" },
     { LLM_ARCH_XVERSE, "xverse" },
     { LLM_ARCH_COMMAND_R, "command-r" },
     { LLM_ARCH_COHERE2, "cohere2" },
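
The string registered here has to agree with the architecture name the converter writes into the file; a mismatch surfaces as an unknown architecture at load time. A toy sketch of the table and its reverse lookup (ARCH_NAMES below mirrors only a few entries and is not the real C++ table):

    # Toy mirror of a few LLM_ARCH_NAMES entries with a reverse lookup.
    ARCH_NAMES = {
        "LLM_ARCH_MAMBA2": "mamba2",
        "LLM_ARCH_FALCON_H1": "falcon_h1",
        "LLM_ARCH_COHERE2": "cohere2",
    }

    def arch_from_string(s: str) -> str:
        for enum_name, arch_str in ARCH_NAMES.items():
            if arch_str == s:
                return enum_name
        return "LLM_ARCH_UNKNOWN"

    print(arch_from_string("falcon_h1"))  # LLM_ARCH_FALCON_H1
    print(arch_from_string("falcon-h1"))  # LLM_ARCH_UNKNOWN with the renamed table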

@@ -226,7 +226,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_enc = nullptr;
     struct ggml_tensor * ffn_up_enc = nullptr;
 
-    // falcon-h1
+    // falcon_h1
     struct ggml_tensor * ssm_in_b = nullptr;
     struct ggml_tensor * ssm_mup_vec = nullptr;
 

@@ -315,6 +315,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = GGML_TYPE_IQ2_S;
         }
+    } else if (name.find("ssm_in.weight") != std::string::npos) {
+        // For mamba-based models it's better to not quantize the ssm-proj layers
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+            new_type = GGML_TYPE_BF16;
+        }
     } else if (name.find("attn_q.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
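
The added branch keeps the ssm_in.weight projections at higher precision instead of quantizing them under the Q3_K_S preset. A standalone sketch of the same name-based override idea (plain strings stand in for the llama.cpp ftype and ggml type enums; this is not the real quantizer):

    # Name-based quantization override, mirroring the logic of the added branch.
    def override_type(tensor_name: str, ftype: str, default_type: str) -> str:
        if "ssm_in.weight" in tensor_name:
            # For mamba-based models it's better to not quantize the ssm-proj layers
            if ftype == "MOSTLY_Q3_K_S":
                return "BF16"
        return default_type

    print(override_type("blk.0.ssm_in.weight", "MOSTLY_Q3_K_S", "Q3_K"))  # BF16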

@@ -1522,7 +1522,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
                 tokenizer_pre == "falcon3" ||
-                tokenizer_pre == "falcon-h1" ||
+                tokenizer_pre == "falcon_h1" ||
                 tokenizer_pre == "pixtral") {
             pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
             ignore_merges = true;
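
The renamed string is matched against the pre-tokenizer name stored in the model's metadata, and falcon_h1 is grouped with the llama3-style BPE handling. A toy dispatch mirroring the visible branch (the set and the returned labels are illustrative, not the real C++ chain):

    # Toy dispatch from pre-tokenizer name to pre-type label.
    LLAMA3_STYLE = {"llama-v3", "llama-bpe", "falcon3", "falcon_h1", "pixtral"}

    def pre_type_for(tokenizer_pre: str) -> str:
        if tokenizer_pre in LLAMA3_STYLE:
            return "LLAMA_VOCAB_PRE_TYPE_LLAMA3"
        return "LLAMA_VOCAB_PRE_TYPE_DEFAULT"

    print(pre_type_for("falcon_h1"))  # LLAMA_VOCAB_PRE_TYPE_LLAMA3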