Mirror of https://github.com/ggml-org/llama.cpp.git (synced 2025-06-29 12:35:16 +00:00)
llama : add support for GPT2, Bloom and CodeShell tied word embeddings (#12456)
* Add support for GPT2, Bloom and CodeShell tied word embeddings
* Deduplicate tied word embeddings weights
* Workaround for incorrect weight map: transformer.wte.weight appears in the weight map even though the weights are not in the tensor files, so remove it if the output weights are encountered first
* check++
* fatfingers--
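Background for the change: "tied" word embeddings means the checkpoint reuses its token-embedding matrix as the output projection instead of shipping a separate lm_head.weight/output.weight. The sketch below is illustrative only (the helper and tensor names are not from this commit); it shows the general idea that a converter can skip writing a duplicate copy and a reader can fall back to the embedding when no output weight exists.

import torch

def resolve_output_weight(tensors: dict[str, torch.Tensor]) -> torch.Tensor:
    # Return the output-projection weight; fall back to the tied token embedding.
    # Illustrative only: real models use architecture-specific tensor names.
    for name in ("lm_head.weight", "output.weight"):
        if name in tensors:
            return tensors[name]
    # tied case: reuse the token embedding instead of storing a duplicate copy
    return tensors["token_embd.weight"]

# toy usage: a "checkpoint" with no separate output weight
toy = {"token_embd.weight": torch.randn(16, 8)}
assert resolve_output_weight(toy) is toy["token_embd.weight"]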
convert_hf_to_gguf.py

@@ -180,7 +180,8 @@ class Model:
             extra = sorted(tensor_names_from_parts.difference(self.tensor_names))
             missing_files = sorted(set(weight_map[n] for n in missing if n in weight_map))
             if len(extra) == 0 and len(missing_files) > 0:
-                raise ValueError(f"Missing or incomplete model files: {missing_files}")
+                raise ValueError(f"Missing or incomplete model files: {missing_files}\n"
+                                 f"Missing tensors: {missing}")
             else:
                 raise ValueError("Mismatch between weight map and model parts for tensor names:\n"
                                  f"Missing tensors: {missing}\n"
@@ -1099,13 +1100,6 @@ class BloomModel(Model):
 
         tensors.append((self.map_tensor_name(name), data_torch))
 
-        if name == "word_embeddings.weight":
-            assert self.tensor_names is not None
-
-            # TODO: tie them at runtime, don't duplicate in the model file
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors
 
 
@@ -2423,10 +2417,6 @@ class GPT2Model(Model):
 
         tensors.append((new_name, data_torch))
 
-        # note: GPT2 output is tied to (same as) wte in original model
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
         return tensors
 
 
@@ -2756,21 +2746,26 @@ class CodeShellModel(Model):
         self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.LINEAR)
         self.gguf_writer.add_rope_scaling_factor(1.0)
 
+    _has_tok_embd = False
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unused
 
+        output_name = self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT)
+        tok_embd_name = self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD)
+
         new_name = self.map_tensor_name(name)
 
-        tensors: list[tuple[str, Tensor]] = [(new_name, data_torch)]
-
-        if new_name == self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD):
-            assert self.tensor_names is not None
-
-            if all(s not in self.tensor_names for s in ("lm_head.weight", "output.weight")):
-                # copy tok_embd.weight to output.weight
-                tensors.append((self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT), data_torch))
-
-        return tensors
+        # assuming token_embd.weight is seen before output.weight
+        if not self._has_tok_embd and new_name == self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT):
+            # even though the tensor file(s) does not contain the word embeddings they are still in the weight map
+            if self.tensor_names and "transformer.wte.weight" in self.tensor_names:
+                logger.debug(f"{tok_embd_name} not found before {output_name}, assuming they are tied")
+                self.tensor_names.remove("transformer.wte.weight")
+        elif new_name == tok_embd_name:
+            self._has_tok_embd = True
+
+        return [(new_name, data_torch)]
 
 
 @Model.register("InternLM2ForCausalLM")
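The converter-side logic above depends on iteration order: when the output projection shows up before any token embedding, the weights are assumed to be tied, and the stale transformer.wte.weight entry is dropped from the expected tensor names so the completeness check in Model does not flag it as missing. Below is a rough standalone sketch of that scan, assuming a plain list of (name, tensor) pairs instead of the converter's real classes (the function name is made up for illustration).

import torch

def drop_tied_embedding_entry(named_tensors: list[tuple[str, torch.Tensor]],
                              expected_names: set[str]) -> None:
    # Simplified mirror of the workaround above: if an output weight is seen
    # before the word embedding, assume tying and stop expecting the embedding.
    has_tok_embd = False
    for name, _ in named_tensors:
        if not has_tok_embd and name in ("lm_head.weight", "output.weight"):
            expected_names.discard("transformer.wte.weight")
        elif name == "transformer.wte.weight":
            has_tok_embd = True

# toy usage: the weight map lists the embedding, but the shards only carry lm_head
expected = {"transformer.wte.weight", "lm_head.weight"}
drop_tied_embedding_entry([("lm_head.weight", torch.zeros(4, 4))], expected)
assert "transformer.wte.weight" not in expected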
src/llama-model.cpp

@@ -2020,7 +2020,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2381,7 +2386,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
                     output_norm_b = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, 0);
-                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0);
+                    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if output is NULL, init from the input tok embed
+                    if (output == NULL) {
+                        output = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     for (int i = 0; i < n_layer; ++i) {
                         auto & layer = layers[i];
@@ -2407,7 +2417,12 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 } break;
             case LLM_ARCH_CODESHELL:
                 {
-                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+                    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+
+                    // if tok embd is NULL, init from output
+                    if (tok_embd == NULL) {
+                        tok_embd = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_DUPLICATED);
+                    }
 
                     // output
                     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
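On the loader side, both hunks follow the same pattern: request the optional tensor with TENSOR_NOT_REQUIRED, and if it comes back NULL, create it again from its tied counterpart with TENSOR_DUPLICATED. The following is a conceptual Python sketch of that fallback, assuming a plain dict of tensors; it is not the llama.cpp C++ API, and the names are illustrative.

import torch

def load_with_tied_fallback(tensors: dict[str, torch.Tensor],
                            name: str, tied_name: str) -> torch.Tensor:
    # Conceptual stand-in for create_tensor(..., TENSOR_NOT_REQUIRED) followed by
    # a TENSOR_DUPLICATED fallback in llama_model::load_tensors.
    weight = tensors.get(name)       # "not required": absence is allowed
    if weight is None:
        weight = tensors[tied_name]  # "duplicated": share the tied weight instead
    return weight

# e.g. a GPT2-style GGUF that stores only the token embedding
gguf_tensors = {"token_embd.weight": torch.randn(32, 8)}
output = load_with_tied_fallback(gguf_tensors, "output.weight", "token_embd.weight")
assert output is gguf_tensors["token_embd.weight"]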