convert : support models with multiple chat templates (#6588)

* Support converting models with multiple chat templates

Adds the following metadata:
* tokenizer.chat_templates
* tokenizer.chat_template.<name1>
* tokenizer.chat_template.<name2>
* tokenizer.chat_template.<...>

Here `tokenizer.chat_templates` is an array of the template names (excluding `default`); the `default` template is stored under the regular `tokenizer.chat_template` key.

* replace filtered characters with underscore

* New script to add/modify/remove metadata

This script creates a copy of a GGUF file and allows you to add/modify/remove metadata in the process.

Most importantly this allows you to update chat templates, either as a string or directly from an updated tokenizer_config.json file.

* Add files via upload

add new script to project/readme

* flake--
This commit is contained in:
Sigbjørn Skjæret
2024-04-18 13:49:01 +02:00
committed by GitHub
parent e11b2e6e1e
commit 03c0946d73
7 changed files with 226 additions and 3 deletions

View File

@ -6,7 +6,8 @@ import struct
import tempfile
from enum import Enum, auto
from io import BufferedWriter
from typing import IO, Any, Sequence
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits
import numpy as np
@ -466,7 +467,33 @@ class GGUFWriter:
# Writes the boolean `value` under the Keys.Tokenizer.ADD_PREFIX metadata key
# by delegating to self.add_bool.
def add_add_space_prefix(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
def add_chat_template(self, value: str) -> None:
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
    """Write chat template metadata for one or several templates.

    Args:
        value: Either a single template string, or a list/tuple of mappings
            that each carry a ``'name'`` and a ``'template'`` entry.

    When a list/tuple is given:
      * every template name has each character outside ASCII letters and
        digits replaced by ``'_'``;
      * entries with an empty name or without a ``'template'`` are skipped;
      * the entry named ``default`` is written to
        ``Keys.Tokenizer.CHAT_TEMPLATE``; if there is none, that key is
        not written at all;
      * every other entry is written to ``Keys.Tokenizer.CHAT_TEMPLATE_N``
        formatted with its name, and the names are listed, in first-seen
        order, in the ``Keys.Tokenizer.CHAT_TEMPLATES`` array.

    If several entries end up with the same sanitized name, the last one
    wins and the key is written only once.
    """
    if isinstance(value, (list, tuple)):
        # Built once instead of concatenating the two strings per character.
        allowed = frozenset(ascii_letters + digits)
        template_default = None
        # A dict keeps insertion order, so the emitted names array is
        # deterministic across runs (a set of str is not, because of hash
        # randomization) and colliding names collapse into a single key.
        named_templates: dict[str, str] = {}

        for choice in value:
            name = choice.get('name', '')
            template = choice.get('template')

            # Allowing non-alphanumerical characters in template name is
            # probably not a good idea, so filter it
            name = ''.join(c if c in allowed else '_' for c in name)

            if not name or template is None:
                continue

            if name == 'default':
                template_default = template
            else:
                named_templates[name] = template

        for name, template in named_templates.items():
            self.add_string(Keys.Tokenizer.CHAT_TEMPLATE_N.format(name=name), template)

        if named_templates:
            self.add_array(Keys.Tokenizer.CHAT_TEMPLATES, list(named_templates))

        if template_default is None:
            return

        value = template_default

    self.add_string(Keys.Tokenizer.CHAT_TEMPLATE, value)
def add_prefix_token_id(self, id: int) -> None: