From 8bea92261eaf5984818dbae487ebb7e0e82c6914 Mon Sep 17 00:00:00 2001
From: ibrahimkhadraoui
Date: Fri, 4 Jul 2025 14:32:11 +0400
Subject: [PATCH 1/2] python fixes

---
 convert_hf_to_gguf.py          | 14 ++++++++++++--
 gguf-py/gguf/tensor_mapping.py |  1 +
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 356682189..193fee8d2 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -686,6 +686,9 @@ class TextModel(ModelBase):
         if chkhsh == "9d032fcbd5501f4a38150912590928bfb36091efb5df11b8e2124b0390e3fb1e":
             # ref: https://huggingface.co/tiiuae/Falcon3-7B-Base
             res = "falcon3"
+        if chkhsh == "60476e1243776c4fb1b993dbd7a5f15ac22f83c80afdf425fa5ae01c8d44ef86":
+            # ref: https://huggingface.co/collections/tiiuae/falcon-h1-6819f2795bc406da60fab8df
+            res = "falcon-H1"
         if chkhsh == "8e62295832751ca1e8f92f2226f403dea30dc5165e448b5bfa05af5340c64ec7":
             # ref: https://huggingface.co/BAAI/bge-large-zh-v1.5
             res = "bert-bge-large"
@@ -4905,8 +4908,11 @@ class Mamba2Model(TextModel):

         # Fail early for models which don't have a block expansion factor of 2
         # TODO: does this really matter?
-        assert d_inner == 2 * d_model
-        assert d_inner % head_dim == 0
+        # skip the assertion for the FalconH1 model
+        architectures = self.hparams.get("architectures")
+        if architectures is None or architectures[0] != "FalconH1ForCausalLM":
+            assert d_inner == 2 * d_model
+            assert d_inner % head_dim == 0

         self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default
         self.gguf_writer.add_embedding_length(d_model)
@@ -4945,6 +4951,10 @@ class Mamba2Model(TextModel):
             d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
             d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model
             n_group = self.hparams.get("n_groups", 1)
+            architectures = self.hparams.get("architectures")
+            if architectures is not None and architectures[0] == "FalconH1ForCausalLM":
+                # FalconH1 has a different d_inner
+                d_inner = self.hparams.get("mamba_d_ssm")
             data_torch = data_torch.reshape((n_group, d_inner // n_group))

         if name.endswith(".A_log"):
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index be3fff848..45a6a70d4 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -589,6 +589,7 @@ class TensorNameMap:
         MODEL_TENSOR.SSM_OUT: (
             "model.layers.{bid}.out_proj",
             "backbone.layers.{bid}.mixer.out_proj",
+            "model.layers.{bid}.mamba.out_proj", # falcon-h1
         ),

         MODEL_TENSOR.TIME_MIX_W0: (

From 071f4b7fd86e097c444b02e33e8a14123a9116a7 Mon Sep 17 00:00:00 2001
From: ibrahimkhadraoui
Date: Fri, 4 Jul 2025 14:37:02 +0400
Subject: [PATCH 2/2] changed precision for multipliers float 32->64

---
 convert_hf_to_gguf.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 193fee8d2..6c3f66721 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6642,11 +6642,11 @@ class FalconH1Model(Mamba2Model):
         self.gguf_writer.add_layer_norm_rms_eps(self.hparams["rms_norm_eps"])
         self.gguf_writer.add_key_length(self.hparams["head_dim"])
         self.gguf_writer.add_value_length(self.hparams["head_dim"])
-        self.gguf_writer.add_float32("falcon-h1.key_multiplier", self.hparams["key_multiplier"])
+        self.gguf_writer.add_float64("falcon-h1.key_multiplier", self.hparams["key_multiplier"])

         ## Other params
-        self.gguf_writer.add_float32("falcon-h1.lm_head_multiplier", self.hparams["lm_head_multiplier"])
-        self.gguf_writer.add_float32("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"])
self.gguf_writer.add_float32("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"]) + self.gguf_writer.add_float64("falcon-h1.lm_head_multiplier", self.hparams["lm_head_multiplier"]) + self.gguf_writer.add_float64("falcon-h1.embedding_multiplier", self.hparams["embedding_multiplier"]) ## Validation ## assert self.hparams.get("hidden_act") in [None, "silu"], "Only SILU activation supported" @@ -6663,15 +6663,15 @@ class FalconH1Model(Mamba2Model): self.find_hparam(["num_attention_heads"])) # Add multipliers as metadata instead of tensors - self.gguf_writer.add_float32("falcon-h1.attention_in_multiplier", self.attention_in_multiplier) - self.gguf_writer.add_float32("falcon-h1.attention_out_multiplier", self.attention_out_multiplier) - self.gguf_writer.add_float32("falcon-h1.ssm_in_multiplier", self.ssm_in_multiplier) - self.gguf_writer.add_float32("falcon-h1.ssm_out_multiplier", self.ssm_out_multiplier) + self.gguf_writer.add_float64("falcon-h1.attention_in_multiplier", self.attention_in_multiplier) + self.gguf_writer.add_float64("falcon-h1.attention_out_multiplier", self.attention_out_multiplier) + self.gguf_writer.add_float64("falcon-h1.ssm_in_multiplier", self.ssm_in_multiplier) + self.gguf_writer.add_float64("falcon-h1.ssm_out_multiplier", self.ssm_out_multiplier) # Add MLP multipliers if isinstance(self.mlp_multipliers, (list, tuple)) and len(self.mlp_multipliers) == 2: - self.gguf_writer.add_float32("falcon-h1.mlp_gate_multiplier", self.mlp_multipliers[0]) - self.gguf_writer.add_float32("falcon-h1.mlp_down_multiplier", self.mlp_multipliers[1]) + self.gguf_writer.add_float64("falcon-h1.mlp_gate_multiplier", self.mlp_multipliers[0]) + self.gguf_writer.add_float64("falcon-h1.mlp_down_multiplier", self.mlp_multipliers[1]) # Add has MuP flag if SSM multipliers are present if self.ssm_multipliers is not None: @@ -6681,7 +6681,7 @@ class FalconH1Model(Mamba2Model): self.gguf_writer.add_bool("falcon-h1.mamba_use_mlp", self.find_hparam(["mamba_use_mlp"], optional=True)) self.gguf_writer.add_bool("falcon-h1.mamba_norm_before_gate", self.find_hparam(["mamba_norm_before_gate"], optional=True)) self.gguf_writer.add_bool("falcon-h1.mamba_rms_norm", self.find_hparam(["mamba_rms_norm"], optional=True)) - self.gguf_writer.add_float32("falcon-h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) + self.gguf_writer.add_float64("falcon-h1.rope_theta", self.find_hparam(["rope_theta"], optional=True)) ###### CONVERSION LOGIC ######