diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 356682189..764c04f10 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6632,11 +6632,11 @@ class FalconH1Model(Mamba2Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_key_length(self.hparams["head_dim"])
         self.gguf_writer.add_value_length(self.hparams["head_dim"])
-        self.gguf_writer.add_float32("falcon-h1.key_multiplier", self.hparams["key_multiplier"])
+        self.gguf_writer.add_float32("falcon_h1.key_multiplier", self.hparams["key_multiplier"])
 
         ## Other params
-        self.gguf_writer.add_float32("falcon-h1.lm_head_multiplier", self.hparams["lm_head_multiplier"])
-        self.gguf_writer.add_float32("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"])
+        self.gguf_writer.add_float32("falcon_h1.lm_head_multiplier", self.hparams["lm_head_multiplier"])
+        self.gguf_writer.add_float32("falcon_h1.embedding_multiplier", self.hparams["embedding_multiplier"])
 
         ## Validation ##
         assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported"
@@ -6644,34 +6644,34 @@ class FalconH1Model(Mamba2Model):
 
         # Add Falcon Mamba2 specific configuration
-        self.gguf_writer.add_uint32("falcon-h1.ssm.mamba_chunk_size", self.hparams["mamba_chunk_size"])
-        self.gguf_writer.add_uint32("falcon-h1.attention.head_dim", self.hparams["head_dim"])
-        self.gguf_writer.add_uint32("falcon-h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"])
-        self.gguf_writer.add_uint32("falcon-h1.num_attention_heads", self.find_hparam(["num_attention_heads"]))
-        self.gguf_writer.add_uint32("falcon-h1.num_key_value_heads",
+        self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_chunk_size", self.hparams["mamba_chunk_size"])
+        self.gguf_writer.add_uint32("falcon_h1.attention.head_dim", self.hparams["head_dim"])
+        self.gguf_writer.add_uint32("falcon_h1.ssm.mamba_d_ssm", self.hparams["mamba_d_ssm"])
+        self.gguf_writer.add_uint32("falcon_h1.num_attention_heads", self.find_hparam(["num_attention_heads"]))
+        self.gguf_writer.add_uint32("falcon_h1.num_key_value_heads",
                                     self.find_hparam(["num_key_value_heads"], optional=True) or
                                     self.find_hparam(["num_attention_heads"]))
 
         # Add multipliers as metadata instead of tensors
-        self.gguf_writer.add_float32("falcon-h1.attention_in_multiplier", self.attention_in_multiplier)
-        self.gguf_writer.add_float32("falcon-h1.attention_out_multiplier", self.attention_out_multiplier)
-        self.gguf_writer.add_float32("falcon-h1.ssm_in_multiplier", self.ssm_in_multiplier)
-        self.gguf_writer.add_float32("falcon-h1.ssm_out_multiplier", self.ssm_out_multiplier)
+        self.gguf_writer.add_float32("falcon_h1.attention_in_multiplier", self.attention_in_multiplier)
+        self.gguf_writer.add_float32("falcon_h1.attention_out_multiplier", self.attention_out_multiplier)
+        self.gguf_writer.add_float32("falcon_h1.ssm_in_multiplier", self.ssm_in_multiplier)
+        self.gguf_writer.add_float32("falcon_h1.ssm_out_multiplier", self.ssm_out_multiplier)
 
         # Add MLP multipliers
         if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2:
-            self.gguf_writer.add_float32("falcon-h1.mlp_gate_multiplier", self.mlp_multipliers[0])
-            self.gguf_writer.add_float32("falcon-h1.mlp_down_multiplier", self.mlp_multipliers[1])
+            self.gguf_writer.add_float32("falcon_h1.mlp_gate_multiplier", self.mlp_multipliers[0])
+            self.gguf_writer.add_float32("falcon_h1.mlp_down_multiplier", self.mlp_multipliers[1])
 
         # Add has MuP flag if SSM multipliers are present
         if self.ssm_multipliers is not None:
-            self.gguf_writer.add_bool("falcon-h1.ssm.has_mup", True)
+            self.gguf_writer.add_bool("falcon_h1.ssm.has_mup", True)
 
         # Add any other Falcon Mamba2 specific configuration
-        self.gguf_writer.add_bool("falcon-h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True))
-        self.gguf_writer.add_bool("falcon-h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True))
-        self.gguf_writer.add_bool("falcon-h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True))
-        self.gguf_writer.add_float32("falcon-h1.rope_theta", self.find_hparam(["rope_theta"], optional=True))
+        self.gguf_writer.add_bool("falcon_h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True))
+        self.gguf_writer.add_bool("falcon_h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True))
+        self.gguf_writer.add_bool("falcon_h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True))
+        self.gguf_writer.add_float32("falcon_h1.rope_theta", self.find_hparam(["rope_theta"], optional=True))
 
 
 ###### CONVERSION LOGIC ######
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index be3fff848..1ce339bd1 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -1177,7 +1177,7 @@ class TensorNameMap:
         ),
 
         MODEL_TENSOR.SSM_MUP_VEC: (
-            "model.layers.{bid}.mamba.mup_vector",  # falcon-h1
+            "model.layers.{bid}.mamba.mup_vector",  # falcon_h1
         ),
 
         MODEL_TENSOR.SSM_NORM: (
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 635a5bdbc..4b555754e 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -46,7 +46,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_STARCODER2,       "starcoder2"       },
     { LLM_ARCH_MAMBA,            "mamba"            },
     { LLM_ARCH_MAMBA2,           "mamba2"           },
-    { LLM_ARCH_FALCON_H1,        "falcon-h1"        },
+    { LLM_ARCH_FALCON_H1,        "falcon_h1"        },
     { LLM_ARCH_XVERSE,           "xverse"           },
     { LLM_ARCH_COMMAND_R,        "command-r"        },
     { LLM_ARCH_COHERE2,          "cohere2"          },
diff --git a/src/llama-model.h b/src/llama-model.h
index fc235cd23..8e14be82b 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -226,7 +226,7 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_enc = nullptr;
     struct ggml_tensor * ffn_up_enc = nullptr;
 
-    // falcon-h1
+    // falcon_h1
    struct ggml_tensor * ssm_in_b = nullptr;
    struct ggml_tensor * ssm_mup_vec = nullptr;
 
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index f4b5713d7..2039ed523 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -315,6 +315,11 @@ static ggml_type llama_tensor_get_type(quantize_state_impl & qs, ggml_type new_t
             else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
                 new_type = GGML_TYPE_IQ2_S;
             }
+    } else if (name.find("ssm_in.weight") != std::string::npos) {
+        // For mamba-based models it's better to not quantize the ssm-proj layers
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) {
+            new_type = GGML_TYPE_BF16;
+        }
     } else if (name.find("attn_q.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
             new_type = GGML_TYPE_IQ3_XXS;
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index d49251560..7d2a41301 100644
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1522,7 +1522,7 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                 tokenizer_pre == "llama-v3" ||
                 tokenizer_pre == "llama-bpe"||
                 tokenizer_pre == "falcon3" ||
-                tokenizer_pre == "falcon-h1" ||
+                tokenizer_pre == "falcon_h1" ||
                 tokenizer_pre == "pixtral") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 ignore_merges = true;
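For anyone reviewing the key rename, a quick local check is to list the metadata keys of a converted file with gguf-py's GGUFReader and confirm they carry the new "falcon_h1." prefix. This is a minimal sketch, not part of the patch, and the file path is a placeholder:

```python
# Sketch: print Falcon-H1 metadata keys from a converted GGUF file so the
# "falcon_h1." vs. "falcon-h1." prefix can be verified at a glance.
from gguf import GGUFReader

reader = GGUFReader("falcon-h1-converted.gguf")  # placeholder path

for name in reader.fields:
    if name.startswith(("falcon_h1.", "falcon-h1.")):
        print(name)
```

Any keys that still show the "falcon-h1." prefix would indicate a file produced by the convert script before this change.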