Minor improvements in GPT2 tokenizer (#3567)

* Fixing minor bugs in bpe_gpt2_preprocess
* Don't add bos token in test
2025-08-18 14:18:50 -04:00 · 2023-10-10 18:59:52 +02:00
parent c5b49360d0
commit 233fc1c69f
5 changed files with 17 additions and 20 deletions
--- a/tests/test-tokenizer-0-falcon.py
+++ b/tests/test-tokenizer-0-falcon.py
@@ -41,6 +41,8 @@ tests = [
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
+        "\n =",
+        "' era",
    ]

 for text in tests:
@@ -69,15 +71,14 @@ fname_tok = args.fname_tok
 if fname_tok:
    print('tokenizing file: ', fname_tok)
    fname_out = fname_tok + '.tok'
-    with open(fname_tok, 'r') as f:
+    with open(fname_tok, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        s = ''.join(lines)
        res = tokenizer.encode(s)
        # write to file
-        with open(fname_out, 'w') as f:
+        with open(fname_out, 'w', encoding='utf-8') as f:
            for x in res:
-                f.write(str(x) + ' ')
-            f.write('\n')
+                f.write(str(x) + ' \'' + tokenizer.decode(x) + '\'\n')
        print('len(res): ', len(res))
        print('len(lines): ', len(lines))
    print('results written to: ', fname_out)