llama : add support for DistilBert (#13907)

* add distilbert

* small fixes

* add note for LLM_ARCH_DISTIL_BERT

* Use MODEL_ARCH.BERT for DistilBert

---------

Co-authored-by: dinhhuy <huy.dinh@brains-tech.co.jp>
This commit is contained in:
Đinh Trọng Huy
2025-05-30 18:56:02 +09:00
committed by GitHub
parent 2c90da4c7e
commit 291f2b6913
3 changed files with 37 additions and 6 deletions

View File

@ -169,6 +169,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.q_proj_no_perm", # llama-custom
"layers.{bid}.attention.wq", # llama-pth
"encoder.layer.{bid}.attention.self.query", # bert
"transformer.layer.{bid}.attention.q_lin", # distillbert
"transformer.h.{bid}.attn.q_proj", # gpt-j
"model.layers.layers.{bid}.self_attn.q_proj", # plamo
"model.layers.{bid}.attention.wq", # internlm2
@ -183,6 +184,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.k_proj_no_perm", # llama-custom
"layers.{bid}.attention.wk", # llama-pth
"encoder.layer.{bid}.attention.self.key", # bert
"transformer.layer.{bid}.attention.k_lin", # distillbert
"transformer.h.{bid}.attn.k_proj", # gpt-j
"transformer.h.{bid}.attn.k", # refact
"model.layers.layers.{bid}.self_attn.k_proj", # plamo
@ -197,6 +199,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.v_proj", # llama-hf nemotron olmoe olmo2 phimoe
"layers.{bid}.attention.wv", # llama-pth
"encoder.layer.{bid}.attention.self.value", # bert
"transformer.layer.{bid}.attention.v_lin", # distillbert
"transformer.h.{bid}.attn.v_proj", # gpt-j
"transformer.h.{bid}.attn.v", # refact
"model.layers.layers.{bid}.self_attn.v_proj", # plamo
@ -217,6 +220,7 @@ class TensorNameMap:
"model.layers.{bid}.self_attn.linear_attn", # deci
"layers.{bid}.attention.wo", # llama-pth
"encoder.layer.{bid}.attention.output.dense", # bert
"transformer.layer.{bid}.attention.out_lin", # distillbert
"transformer.h.{bid}.attn.out_proj", # gpt-j
"language_model.encoder.layers.{bid}.self_attention.dense", # persimmon
"model.layers.{bid}.self_attn.dense", # persimmon
@ -237,6 +241,7 @@ class TensorNameMap:
# Attention output norm
MODEL_TENSOR.ATTN_OUT_NORM: (
"encoder.layer.{bid}.attention.output.LayerNorm", # bert
"transformer.layer.{bid}.sa_layer_norm", # distillbert
"encoder.layers.{bid}.norm1", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_1", # Grok
"transformer.blocks.{bid}.norm_attn_norm.norm_2", # dbrx
@ -313,6 +318,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.up_proj", # llama-hf refact nemotron olmo2
"layers.{bid}.feed_forward.w3", # llama-pth
"encoder.layer.{bid}.intermediate.dense", # bert
"transformer.layer.{bid}.ffn.lin1", # distillbert
"transformer.h.{bid}.mlp.fc_in", # gpt-j
"transformer.h.{bid}.mlp.linear_3", # refact
"language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon
@ -396,6 +402,7 @@ class TensorNameMap:
"model.layers.{bid}.mlp.down_proj", # llama-hf nemotron olmo2
"layers.{bid}.feed_forward.w2", # llama-pth
"encoder.layer.{bid}.output.dense", # bert
"transformer.layer.{bid}.ffn.lin2", # distillbert
"transformer.h.{bid}.mlp.fc_out", # gpt-j
"language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon
"model.layers.{bid}.mlp.dense_4h_to_h", # persimmon
@ -457,6 +464,7 @@ class TensorNameMap:
MODEL_TENSOR.LAYER_OUT_NORM: (
"encoder.layer.{bid}.output.LayerNorm", # bert
"transformer.layer.{bid}.output_layer_norm", # distillbert
"encoder.layers.{bid}.norm2", # nomic-bert
"transformer.decoder_layer.{bid}.rms_norm_3", # Grok
"encoder.layer.{bid}.mlp.layernorm", # jina-bert-v2
@ -827,6 +835,7 @@ class TensorNameMap:
MODEL_TENSOR.CLS: (
"classifier", # jina
"classifier.dense", # roberta
"pre_classifier", # distillbert
),
MODEL_TENSOR.CLS_OUT: (