mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-06-26 19:55:04 +00:00
tests : add test-tokenizer-0.sh + fix some tokenizers (#7036)
* tests : add test-tokenizer-0.sh * unicode : add all unicode number ranges * starcoder : fix pre-tokenizer * tests : add test that fails with DeepSeek tokenizers * falcon : fix regex * unicode : regenerate unicode tables * refact : add tokenizer model * lint : fix * tests : disable failing tests ggml-ci * refact : add tests files ggml-ci * convert : print -> logging ggml-ci * lint : fix * unicode : digit -> number * phi-3 : update
This commit is contained in:
34
tests/test-tokenizer-0.sh
Executable file
34
tests/test-tokenizer-0.sh
Executable file
@ -0,0 +1,34 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Usage:
|
||||
#
|
||||
# test-tokenizer-0.sh <name> <input>
|
||||
#
|
||||
|
||||
if [ $# -ne 2 ]; then
|
||||
printf "Usage: $0 <name> <input>\n"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
name=$1
|
||||
input=$2
|
||||
|
||||
make -j tests/test-tokenizer-0
|
||||
|
||||
printf "Testing %s on %s ...\n" $name $input
|
||||
|
||||
python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
|
||||
cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
|
||||
|
||||
./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
|
||||
cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
|
||||
|
||||
diff $input.tok $input.tokcpp > /dev/null 2>&1
|
||||
|
||||
if [ $? -eq 0 ]; then
|
||||
printf "Tokenization is correct!\n"
|
||||
else
|
||||
diff $input.tok $input.tokcpp | head -n 32
|
||||
|
||||
printf "Tokenization differs!\n"
|
||||
fi
|
Reference in New Issue
Block a user