From 024bd29445ee5cb84abbd959b2fdb2d149aa8126 Mon Sep 17 00:00:00 2001
From: Vaibhavs10
Date: Tue, 17 Jun 2025 15:03:34 +0200
Subject: [PATCH] Init - first pass.

---
 convert_hf_to_gguf.py               | 10 ++++
 docs/development/HOWTO-add-model.md | 20 ++++----
 gguf-py/gguf/constants.py           | 17 +++++++
 src/llama-arch.cpp                  | 19 ++++++++
 src/llama-arch.h                    |  1 +
 src/llama-model.cpp                 | 75 +++++++++++++++++++++++++++++
 6 files changed, 133 insertions(+), 9 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 58e455ae6..9d12a1e56 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6298,6 +6298,16 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])
 
+@Model.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        if (no_rope_layers := self.hparams.get("no_rope_layers")) is not None:
+            self.gguf_writer.add_array("smollm3.no_rope_layers", no_rope_layers)
+
 
 ###### CONVERSION LOGIC ######
 
diff --git a/docs/development/HOWTO-add-model.md b/docs/development/HOWTO-add-model.md
index 7f71e0247..51e0b0b20 100644
--- a/docs/development/HOWTO-add-model.md
+++ b/docs/development/HOWTO-add-model.md
@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv
 
 ### 2. Define the model architecture in `llama.cpp`
 
-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+The model params and tensors layout must be defined in the `llama.cpp` source files:
+1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
+2. In `src/llama-arch.cpp`:
+    - Add the architecture name to the `LLM_ARCH_NAMES` map.
+    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
+4. If the model has a RoPE operation, add a case for the architecture in the `llama_model_rope_type` function in `src/llama-model.cpp`.
 
 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.
 
 ### 3. Build the GGML graph implementation
 
-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+This is the most fun part: you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
+Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
+Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
+Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.
 
 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
 
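For quick verification of the converter change, the sketch below (not part of the patch) uses `gguf-py` to write the same `smollm3.no_rope_layers` key-value array that `SmolLM3Model.set_gguf_parameters()` emits during conversion. The output filename and the layer indices are made-up placeholders; `GGUFWriter.add_array` infers the element type from the Python integers.

```python
# Minimal sketch, assuming a standalone illustration outside the converter:
# write a GGUF file that carries only the per-layer "no RoPE" metadata.
import gguf

no_rope_layers = [3, 7, 11]  # hypothetical layer indices that skip RoPE

writer = gguf.GGUFWriter("smollm3-metadata-only.gguf", "smollm3")
writer.add_architecture()
writer.add_array("smollm3.no_rope_layers", no_rope_layers)  # integer array KV

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_tensors_to_file()  # no tensors in this sketch
writer.close()
```
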
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 834a1d5e1..054591dcf 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -346,6 +346,7 @@ class MODEL_ARCH(IntEnum):
     BAILINGMOE = auto()
     DOTS1 = auto()
     ARCEE = auto()
+    SMOLLM3 = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -629,6 +630,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.BAILINGMOE: "bailingmoe",
     MODEL_ARCH.DOTS1: "dots1",
     MODEL_ARCH.ARCEE: "arcee",
+    MODEL_ARCH.SMOLLM3: "smollm3",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -2101,6 +2103,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.SMOLLM3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
 
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index de8d289cf..ec8e5278a 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE, "bailingmoe" },
     { LLM_ARCH_DOTS1, "dots1" },
     { LLM_ARCH_ARCEE, "arcee" },
+    { LLM_ARCH_SMOLLM3, "smollm3" },
     { LLM_ARCH_UNKNOWN, "(unknown)" },
 };
 
@@ -1625,6 +1626,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_SMOLLM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
 };
 
 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 3e8a61da3..11b71e536 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -79,6 +79,7 @@ enum llm_arch {
    LLM_ARCH_BAILINGMOE,
    LLM_ARCH_DOTS1,
    LLM_ARCH_ARCEE,
+   LLM_ARCH_SMOLLM3,
    LLM_ARCH_UNKNOWN,
 };
 
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index a5eb122f9..611b3f6fc 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -13734,6 +13734,75 @@ struct llm_build_arcee : public llm_graph_context {
     }
 };
 
+struct llm_build_smollm3 : public llm_graph_context {
+    llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        // layers listed under "smollm3.no_rope_layers" skip RoPE entirely (NoPE layers)
+        std::vector<int32_t> no_rope_layers;
+        if (arch == LLM_ARCH_SMOLLM3) {
+            const int kid = gguf_find_key(model.meta, "smollm3.no_rope_layers");
+            if (kid != -1) {
+                const uint32_t n = gguf_get_arr_n(model.meta, kid);
+                const int32_t * data = (const int32_t *) gguf_get_arr_data(model.meta, kid);
+                no_rope_layers.assign(data, data + n);
+            }
+        }
+
+        const int64_t n_tokens = params.n_tokens;
+        const int64_t n_layer = hparams.n_layer;
+
+        gf->n_threads = params.n_threads;
+
+        // build the graph
+        inp_tokens->set_input(ubatch);
+        inp_pos->set_input(ubatch);
+        inp_attn_temp->set_input(ubatch);
+
+        struct ggml_tensor * cur = build_inp_embd();
+        struct ggml_tensor * lay_out = nullptr;
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inp_norm = build_norm(cur, hparams.f_norm_eps, il, tn(LLM_TENSOR_ATTN_NORM, il));
+            struct ggml_tensor * qkv = build_attn(inp_norm, il);
+            struct ggml_tensor * q = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_v, 0, 0, 0);
+            struct ggml_tensor * k = ggml_view_4d(ctx, qkv, hparams.n_embd_head_k, hparams.n_head_kv(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_k, ggml_element_size(qkv)*hparams.n_embd_k_gqa(il), 0, 0);
+            struct ggml_tensor * v = ggml_view_4d(ctx, qkv, hparams.n_embd_head_v, hparams.n_head_kv(il), n_tokens, 1, ggml_element_size(qkv)*hparams.n_embd_head_v, ggml_element_size(qkv)*hparams.n_embd_k_gqa(il) + ggml_element_size(qkv)*hparams.n_embd_k_gqa(il), 0, 0);
+
+            ggml_set_name(q, "q");
+            ggml_set_name(k, "k");
+            ggml_set_name(v, "v");
+
+            struct ggml_tensor * qcur = q;
+            struct ggml_tensor * kcur = k;
+
+            bool apply_rope = true;
+            if (arch == LLM_ARCH_SMOLLM3) {
+                if (std::find(no_rope_layers.begin(), no_rope_layers.end(), il) != no_rope_layers.end()) {
+                    apply_rope = false;
+                }
+            }
+
+            if (apply_rope && get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il))) {
+                qcur = ggml_rope_ext(ctx, q, inp_pos->pos, get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il)), hparams.rope_type, 0, hparams.n_rot, hparams.n_gqa(il), hparams.rope_freq_base_train, hparams.rope_freq_scale_train, hparams.n_ctx_orig_yarn, hparams.rope_yarn_log_mul);
+                kcur = ggml_rope_ext(ctx, k, inp_pos->pos, get_tensor_meta(tn(LLM_TENSOR_ROPE_FREQS, il)), hparams.rope_type, 0, hparams.n_rot, hparams.n_gqa(il), hparams.rope_freq_base_train, hparams.rope_freq_scale_train, hparams.n_ctx_orig_yarn, hparams.rope_yarn_log_mul);
+            }
+
+            struct ggml_tensor * attn_out = build_attn_out(inp_norm, qcur, kcur, v, il);
+
+            if (hparams.use_par_res) {
+                // parallel residual
+                lay_out = ggml_add(ctx, attn_out, build_ff_par(inp_norm, il));
+            } else {
+                // sequential residual
+                lay_out = ggml_add(ctx, cur, attn_out);
+                lay_out = build_ff_seq(lay_out, il);
+            }
+            cur = lay_out;
+        }
+
+        build_output(cur, lay_out);
+    }
+};
+
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
@@ -14085,6 +14154,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arcee>(*this, params, gf);
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -14235,9 +14308,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
         case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_SMOLLM3:
        case LLM_ARCH_ARCEE:
            return LLAMA_ROPE_TYPE_NORM;
+
        // the pairs of head values are offset by n_rot/2
        case LLM_ARCH_FALCON:
        case LLM_ARCH_GROK:
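To make the intent of the new graph code easy to check at a glance, here is a tiny Python sketch (illustrative only, not part of the patch) of the per-layer decision that `llm_build_smollm3` derives from the `smollm3.no_rope_layers` array: a layer gets RoPE unless its index appears in that list. The layer indices and layer count below are hypothetical.

```python
# Sketch of the per-layer gate applied in llm_build_smollm3:
# a layer uses RoPE unless its index is listed in "smollm3.no_rope_layers".
def uses_rope(il: int, no_rope_layers: list[int]) -> bool:
    return il not in set(no_rope_layers)

no_rope_layers = [3, 7, 11]  # hypothetical NoPE layers
for il in range(12):         # hypothetical 12-layer model
    print(il, "rope" if uses_rope(il, no_rope_layers) else "nope")
```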