llama : add support for DistilBert (#13907)

* add distilbert

* small fixes

* add note for LLM_ARCH_DISTIL_BERT

* Use MODEL_ARCH.BERT for DistilBert

---------

Co-authored-by: dinhhuy <huy.dinh@brains-tech.co.jp>
This commit is contained in:
Đinh Trọng Huy
2025-05-30 18:56:02 +09:00
committed by GitHub
parent 2c90da4c7e
commit 291f2b6913
3 changed files with 37 additions and 6 deletions

View File

@ -523,15 +523,15 @@ class TextModel(ModelBase):
self.gguf_writer.add_context_length(n_ctx)
logger.info(f"gguf: context length = {n_ctx}")
if (n_embd := self.find_hparam(["hidden_size", "n_embd"], optional=True)) is not None:
if (n_embd := self.find_hparam(["hidden_size", "n_embd", "dim"], optional=True)) is not None:
self.gguf_writer.add_embedding_length(n_embd)
logger.info(f"gguf: embedding length = {n_embd}")
if (n_ff := self.find_hparam(["intermediate_size", "n_inner"], optional=True)) is not None:
if (n_ff := self.find_hparam(["intermediate_size", "n_inner", "hidden_dim"], optional=True)) is not None:
self.gguf_writer.add_feed_forward_length(n_ff)
logger.info(f"gguf: feed forward length = {n_ff}")
if (n_head := self.find_hparam(["num_attention_heads", "n_head"], optional=True)) is not None:
if (n_head := self.find_hparam(["num_attention_heads", "n_head", "n_heads"], optional=True)) is not None:
self.gguf_writer.add_head_count(n_head)
logger.info(f"gguf: head count = {n_head}")
@ -3907,6 +3907,26 @@ class BertModel(TextModel):
self.gguf_writer.add_add_eos_token(True)
@ModelBase.register("DistilBertModel", "DistilBertForMaskedLM", "DistilBertForSequenceClassification")
class DistilBertModel(BertModel):
    """Converter for DistilBERT checkpoints, written out under the BERT architecture.

    Registered for the ``DistilBertModel``, ``DistilBertForMaskedLM`` and
    ``DistilBertForSequenceClassification`` architecture names.
    """

    model_arch = gguf.MODEL_ARCH.BERT

    def set_gguf_parameters(self):
        """Write a fixed layer-norm epsilon, then the parameters of the parent class."""
        # The epsilon is a hardcoded constant here; it is emitted (and logged)
        # before delegating to the parent implementation.
        self.gguf_writer.add_layer_norm_eps(1e-12)
        logger.info("gguf: layer norm epsilon = 1e-12")
        super().set_gguf_parameters()

    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
        """Normalize a DistilBERT tensor name and hand it to the parent mapping.

        A leading ``distilbert.`` prefix is removed from ``name``. Tensors whose
        (prefix-stripped) name starts with ``vocab_`` are dropped by returning an
        empty list; everything else is forwarded to ``BertModel.modify_tensors``.
        """
        hf_prefix = "distilbert."
        tensor_name = name[len(hf_prefix):] if name.startswith(hf_prefix) else name

        # These layers act as MLM head, so we don't need them
        if tensor_name.startswith("vocab_"):
            return []

        return super().modify_tensors(data_torch, tensor_name, bid)
@ModelBase.register("RobertaModel", "RobertaForSequenceClassification")
class RobertaModel(BertModel):
model_arch = gguf.MODEL_ARCH.BERT

View File

@ -169,6 +169,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.layer.{bid}.attention.q_lin", # distilbert
"transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2
@ -183,6 +184,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.layer.{bid}.attention.k_lin", # distilbert
"transformer.h.{bid}.attn.k_proj", # gpt-j
"transformer.h.{bid}.attn.k", # refact
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
@ -197,6 +199,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.layer.{bid}.attention.v_lin", # distilbert
"transformer.h.{bid}.attn.v_proj", # gpt-j
"transformer.h.{bid}.attn.v", # refact
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
@ -217,6 +220,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert
"transformer.layer.{bid}.attention.out_lin", # distilbert
"transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
"model.layers.{bid}.self_attn.dense", # persimmon
@ -237,6 +241,7 @@ class TensorNameMap:
# Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
"transformer.layer.{bid}.sa_layer_norm", # distilbert
"encoder.layers.{bid}.norm1", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
@ -313,6 +318,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
"layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert
"transformer.layer.{bid}.ffn.lin1", # distilbert
"transformer.h.{bid}.mlp.fc_in", # gpt-j
"transformer.h.{bid}.mlp.linear_3", # refact
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
@ -396,6 +402,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
"layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert
"transformer.layer.{bid}.ffn.lin2", # distilbert
"transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
@ -457,6 +464,7 @@ class TensorNameMap:
MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert
"transformer.layer.{bid}.output_layer_norm", # distilbert
"encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
"encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
@ -827,6 +835,7 @@ class TensorNameMap:
MODEL_TENSOR.CLS: (
"classifier", # jina
"classifier.dense", # roberta
"pre_classifier", # distilbert
),
MODEL_TENSOR.CLS_OUT: (

View File

@ -2114,7 +2114,7 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
case LLM_ARCH_NOMIC_BERT_MOE:
{
tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, 0);
type_embd = create_tensor(tn(LLM_TENSOR_TOKEN_TYPES, "weight"), {n_embd, n_token_types}, TENSOR_NOT_REQUIRED);
if (arch == LLM_ARCH_BERT) {
pos_embd = create_tensor(tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, n_ctx_train}, 0);
@ -5885,8 +5885,10 @@ struct llm_build_bert : public llm_graph_context {
inpL = build_inp_embd(model.tok_embd);
// token types are hardcoded to zero ("Sentence A")
if (model.type_embd) {
ggml_tensor * type_row0 = ggml_view_1d(ctx0, model.type_embd, n_embd, 0);
inpL = ggml_add(ctx0, inpL, type_row0);
}
if (model.arch == LLM_ARCH_BERT) {
inpL = ggml_add(ctx0, ggml_get_rows(ctx0, model.pos_embd, inp_pos), inpL);
}