mirror of https://github.com/ggml-org/llama.cpp.git

Init - first pass.
convert_hf_to_gguf.py

@@ -6298,6 +6298,16 @@ class UltravoxWhisperEncoderModel(WhisperEncoderModel):
         super().set_gguf_parameters()
         self.gguf_writer.add_audio_stack_factor(self.global_config["stack_factor"])

+@Model.register("SmolLM3ForCausalLM")
+class SmolLM3Model(LlamaModel):
+    model_arch = gguf.MODEL_ARCH.SMOLLM3
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+
+        if self.hparams.get("no_rope_layers") is not None:
+            self.gguf_writer.add_array("smollm3.no_rope_layers", self.hparams["no_rope_layers"])  # element type is inferred from the values
+

###### CONVERSION LOGIC ######
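The converter stores the indices of the layers that skip RoPE under the `smollm3.no_rope_layers` key (note that `GGUFWriter.add_array()` takes only a key and the values, and the model config is read through `self.hparams`). As a sanity check on a converted file, a minimal standalone sketch using ggml's public `gguf.h` API could look like the following; the key name comes from the hunk above, while the program itself is illustrative and not part of this commit:

```cpp
// check_no_rope.cpp - illustrative sketch, not part of this commit.
// Reads the "smollm3.no_rope_layers" array from a GGUF file and prints it.
#include "gguf.h"

#include <cstdint>
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "usage: %s model.gguf\n", argv[0]);
        return 1;
    }

    struct gguf_init_params params = {
        /*.no_alloc =*/ true,   // metadata only, do not allocate tensor data
        /*.ctx      =*/ nullptr,
    };

    struct gguf_context * ctx = gguf_init_from_file(argv[1], params);
    if (!ctx) {
        fprintf(stderr, "failed to open %s\n", argv[1]);
        return 1;
    }

    const int64_t kid = gguf_find_key(ctx, "smollm3.no_rope_layers");
    if (kid >= 0) {
        const size_t n = gguf_get_arr_n(ctx, kid);
        const int32_t * vals = (const int32_t *) gguf_get_arr_data(ctx, kid);
        for (size_t i = 0; i < n; ++i) {
            printf("layer %d skips RoPE\n", vals[i]);
        }
    } else {
        printf("key not present\n");
    }

    gguf_free(ctx);
    return 0;
}
```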
docs/development/HOWTO-add-model.md

@@ -83,20 +83,22 @@ NOTE: Tensor names must end with `.weight` or `.bias` suffixes, that is the conv

 ### 2. Define the model architecture in `llama.cpp`

-The model params and tensors layout must be defined in `llama.cpp`:
-1. Define a new `llm_arch`
-2. Define the tensors layout in `LLM_TENSOR_NAMES`
-3. Add any non-standard metadata in `llm_load_hparams`
-4. Create the tensors for inference in `llm_load_tensors`
-5. If the model has a RoPE operation, add the rope type in `llama_rope_type`
+The model params and tensors layout must be defined in `llama.cpp` source files:
+1. Define a new `llm_arch` enum value in `src/llama-arch.h`.
+2. In `src/llama-arch.cpp`:
+    - Add the architecture name to the `LLM_ARCH_NAMES` map.
+    - Add the tensor mappings to the `LLM_TENSOR_NAMES` map.
+3. Add any non-standard metadata loading in the `llama_model_loader` constructor in `src/llama-model-loader.cpp`.
+4. If the model has a RoPE operation, add a case for the architecture in the `llama_model_rope_type` function in `src/llama-model.cpp`.

 NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorch` dimensions.

 ### 3. Build the GGML graph implementation

-This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
-
-Have a look at existing implementations like `build_llama`, `build_dbrx` or `build_bert`.
+This is the funniest part: you have to provide the inference graph implementation of the new model architecture in `src/llama-model.cpp`.
+Create a new struct that inherits from `llm_graph_context` and implement the graph-building logic in its constructor.
+Have a look at existing implementations like `llm_build_llama`, `llm_build_dbrx` or `llm_build_bert`.
+Then, in the `llama_model::build_graph` method, add a case for your architecture to instantiate your new graph-building struct.

 Some `ggml` backends do not support all operations. Backend implementations can be added in a separate PR.
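To make the pattern described in section 3 concrete, here is a minimal hypothetical skeleton (not part of this commit) of such a graph-building struct; the base class and constructor signature match the `llm_build_smollm3` code later in this commit, while the architecture name and the body are illustrative only:

```cpp
// Hypothetical skeleton of a graph-building struct for a new architecture,
// following the workflow described in the HOWTO above.
struct llm_build_mymodel : public llm_graph_context {
    llm_build_mymodel(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf)
        : llm_graph_context(params) {
        // 1. build the token-embedding input node
        // 2. for each of the n_layer transformer blocks:
        //      norm -> self-attention -> residual add -> norm -> FFN -> residual add
        // 3. apply the final norm and the output (lm_head) projection
        // the nodes are recorded into the ggml_cgraph `gf` as they are created
    }
};

// ... then dispatch on the architecture in llama_model::build_graph:
//     case LLM_ARCH_MYMODEL:
//         {
//             llm = std::make_unique<llm_build_mymodel>(*this, params, gf);
//         } break;
```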
gguf-py/gguf/constants.py

@@ -346,6 +346,7 @@ class MODEL_ARCH(IntEnum):
     BAILINGMOE       = auto()
     DOTS1            = auto()
     ARCEE            = auto()
+    SMOLLM3          = auto()


 class VISION_PROJECTOR_TYPE(IntEnum):

@@ -629,6 +630,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = {
     MODEL_ARCH.BAILINGMOE:       "bailingmoe",
     MODEL_ARCH.DOTS1:            "dots1",
     MODEL_ARCH.ARCEE:            "arcee",
+    MODEL_ARCH.SMOLLM3:          "smollm3",
 }

 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {

@@ -2101,6 +2103,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.FFN_DOWN,
         MODEL_TENSOR.FFN_UP,
     ],
+    MODEL_ARCH.SMOLLM3: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ROPE_FREQS,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.ATTN_ROT_EMBD,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
src/llama-arch.cpp

@@ -75,6 +75,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_BAILINGMOE,       "bailingmoe"  },
     { LLM_ARCH_DOTS1,            "dots1"       },
     { LLM_ARCH_ARCEE,            "arcee"       },
+    { LLM_ARCH_SMOLLM3,          "smollm3"     },
     { LLM_ARCH_UNKNOWN,          "(unknown)"   },
 };

@@ -1625,6 +1626,24 @@ static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_N
             { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
         },
     },
+    {
+        LLM_ARCH_SMOLLM3,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,     "token_embd" },   // base names: no ".weight" suffix here
+            { LLM_TENSOR_OUTPUT_NORM,    "output_norm" },
+            { LLM_TENSOR_OUTPUT,         "output" },
+            { LLM_TENSOR_ROPE_FREQS,     "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,      "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_Q,         "blk.%d.attn_q" },
+            { LLM_TENSOR_ATTN_K,         "blk.%d.attn_k" },
+            { LLM_TENSOR_ATTN_V,         "blk.%d.attn_v" },
+            { LLM_TENSOR_ATTN_OUT,       "blk.%d.attn_output" },
+            { LLM_TENSOR_ATTN_ROT_EMBD,  "blk.%d.attn_rot_embd" },
+            { LLM_TENSOR_FFN_GATE,       "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,       "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,         "blk.%d.ffn_up" },
+        },
+    },
 };

 static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
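A detail worth calling out here: entries in `LLM_TENSOR_NAMES` are base names. The `%d` placeholder is filled with the layer index, and the `.weight`/`.bias` suffix is appended when the full tensor name is formed at load time. A small self-contained sketch of that formatting; the helper below is hypothetical (llama.cpp does this through its internal `tn()` helper):

```cpp
#include <cstdio>
#include <string>

// hypothetical stand-in for llama.cpp's internal tn() name-formatting helper
static std::string make_tensor_name(const char * templ, int il, const char * suffix) {
    char buf[256];
    snprintf(buf, sizeof(buf), templ, il);  // substitute the layer index into "blk.%d...."
    return std::string(buf) + "." + suffix; // append "weight" or "bias"
}

int main() {
    // "blk.%d.attn_q" + layer 3 + "weight" -> "blk.3.attn_q.weight"
    printf("%s\n", make_tensor_name("blk.%d.attn_q", 3, "weight").c_str());
    return 0;
}
```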
src/llama-arch.h

@@ -79,6 +79,7 @@ enum llm_arch {
    LLM_ARCH_BAILINGMOE,
    LLM_ARCH_DOTS1,
    LLM_ARCH_ARCEE,
+   LLM_ARCH_SMOLLM3,
    LLM_ARCH_UNKNOWN,
 };
src/llama-model.cpp

@@ -13734,6 +13734,75 @@ struct llm_build_arcee : public llm_graph_context {
     }
 };

+struct llm_build_smollm3 : public llm_graph_context {
+    llm_build_smollm3(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
+        // layers listed under "smollm3.no_rope_layers" skip RoPE ("NoPE" layers);
+        // this assumes the model keeps its GGUF metadata context around as model.meta
+        std::vector<int32_t> no_rope_layers;
+        const int64_t kid = gguf_find_key(model.meta, "smollm3.no_rope_layers");
+        if (kid >= 0) {
+            const size_t n = gguf_get_arr_n(model.meta, kid);
+            const int32_t * data = (const int32_t *) gguf_get_arr_data(model.meta, kid);
+            no_rope_layers.assign(data, data + n);
+        }
+
+        // note: input tensors are bound to the ubatch at decode time, not during
+        // graph construction, and the thread count is set when the graph is computed
+
+        struct ggml_tensor * cur     = build_inp_embd(model.tok_embd);
+        struct ggml_tensor * inp_pos = build_inp_pos();
+        struct ggml_tensor * lay_out = nullptr;
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inp_norm = build_norm(cur, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, il);
+
+            // build_attn() is this first pass's fused-QKV helper; its output is
+            // assumed to be laid out per token as [ n_embd | n_embd_k_gqa | n_embd_v_gqa ]
+            struct ggml_tensor * qkv = build_attn(inp_norm, il);
+
+            const size_t es = ggml_element_size(qkv);
+
+            struct ggml_tensor * q = ggml_view_3d(ctx0, qkv, hparams.n_embd_head_k, hparams.n_head(il),    n_tokens,
+                    es*hparams.n_embd_head_k, qkv->nb[1], 0);
+            struct ggml_tensor * k = ggml_view_3d(ctx0, qkv, hparams.n_embd_head_k, hparams.n_head_kv(il), n_tokens,
+                    es*hparams.n_embd_head_k, qkv->nb[1], es*hparams.n_embd);
+            struct ggml_tensor * v = ggml_view_3d(ctx0, qkv, hparams.n_embd_head_v, hparams.n_head_kv(il), n_tokens,
+                    es*hparams.n_embd_head_v, qkv->nb[1], es*(hparams.n_embd + hparams.n_embd_k_gqa(il)));
+
+            ggml_set_name(q, "q");
+            ggml_set_name(k, "k");
+            ggml_set_name(v, "v");
+
+            struct ggml_tensor * qcur = q;
+            struct ggml_tensor * kcur = k;
+
+            const bool apply_rope = std::find(no_rope_layers.begin(), no_rope_layers.end(), il) == no_rope_layers.end();
+
+            if (apply_rope) {
+                // optional per-layer frequency-factors tensor (may be nullptr)
+                struct ggml_tensor * rope_freqs = model.layers[il].rope_freqs;
+
+                qcur = ggml_rope_ext(ctx0, q, inp_pos, rope_freqs,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+                kcur = ggml_rope_ext(ctx0, k, inp_pos, rope_freqs,
+                        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            }
+
+            // build_attn_out/build_ff_par/build_ff_seq/build_output are also
+            // helpers introduced by this first pass
+            struct ggml_tensor * attn_out = build_attn_out(inp_norm, qcur, kcur, v, il);
+
+            if (hparams.use_par_res) {
+                // parallel residual: attention and FFN both read the normed input
+                lay_out = ggml_add(ctx0, attn_out, build_ff_par(inp_norm, il));
+            } else {
+                // sequential residual: the FFN runs on the attention output
+                lay_out = ggml_add(ctx0, cur, attn_out);
+                lay_out = build_ff_seq(lay_out, il);
+            }
+            cur = lay_out;
+        }
+
+        build_output(cur, lay_out);
+    }
+};

 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
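For intuition about the `no_rope_layers` list this builder consumes: SmolLM3 interleaves NoPE layers at a fixed stride, e.g. skipping RoPE on every 4th layer. Below is a hypothetical converter-side helper that derives such a list; the stride value and the function name are illustrative, not from this commit:

```cpp
#include <cstdint>
#include <vector>

// hypothetical helper: derive the NoPE layer indices for a model that skips
// RoPE at a fixed stride (e.g. step = 4 gives layers 3, 7, 11, ...)
static std::vector<int32_t> make_no_rope_layers(int32_t n_layer, int32_t step) {
    std::vector<int32_t> out;
    for (int32_t il = step - 1; il < n_layer; il += step) {
        out.push_back(il);  // layer indices that skip rotary embeddings
    }
    return out;
}

// make_no_rope_layers(36, 4) -> {3, 7, 11, 15, 19, 23, 27, 31, 35}
```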
@@ -14085,6 +14154,10 @@ llm_graph_result_ptr llama_model::build_graph(
             {
                 llm = std::make_unique<llm_build_arcee>(*this, params, gf);
             } break;
+        case LLM_ARCH_SMOLLM3:
+            {
+                llm = std::make_unique<llm_build_smollm3>(*this, params, gf);
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
@@ -14235,9 +14308,11 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_CHAMELEON:
         case LLM_ARCH_BAILINGMOE:
         case LLM_ARCH_NEO_BERT:
+        case LLM_ARCH_SMOLLM3:
         case LLM_ARCH_ARCEE:
             return LLAMA_ROPE_TYPE_NORM;

         // the pairs of head values are offset by n_rot/2
         case LLM_ARCH_FALCON:
         case LLM_ARCH_GROK: