From cc66a7f78f95dcb5208420f4dd1abc3fc6aec0cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sigbj=C3=B8rn=20Skj=C3=A6ret?= Date: Wed, 11 Jun 2025 17:16:32 +0200 Subject: [PATCH] tests : add test-tokenizers-repo (#14017) --- tests/CMakeLists.txt | 38 +++++++++++++++++++++++++++++++++-- tests/test-tokenizers-repo.sh | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) create mode 100755 tests/test-tokenizers-repo.sh diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 2f7bad2cf..85299837c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -42,6 +42,34 @@ function(llama_test target) set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL}) endfunction() +function(llama_test_cmd target) + include(CMakeParseArguments) + set(options) + set(oneValueArgs NAME LABEL WORKING_DIRECTORY) + set(multiValueArgs ARGS) + cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + if (NOT DEFINED LLAMA_TEST_LABEL) + set(LLAMA_TEST_LABEL "main") + endif() + if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY) + set(LLAMA_TEST_WORKING_DIRECTORY .) + endif() + if (DEFINED LLAMA_TEST_NAME) + set(TEST_NAME ${LLAMA_TEST_NAME}) + else() + set(TEST_NAME ${target}) + endif() + + add_test( + NAME ${TEST_NAME} + WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY} + COMMAND ${target} + ${LLAMA_TEST_ARGS}) + + set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL}) +endfunction() + # Builds and runs a test source file. # Optional args: # - NAME: name of the executable & test target (defaults to the source file name without extension) @@ -97,8 +125,14 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2 ARGS ${CMAKE llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) -# TODO: missing HF tokenizer for this model in convert_hf_to_gguf_update.py, see https://github.com/ggml-org/llama.cpp/pull/13847 -# llama_test(test-tokenizer-0 NAME test-tokenizer-0-nomic-bert-moe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-nomic-bert-moe.gguf) +if (NOT WIN32) + llama_test_cmd( + ${CMAKE_CURRENT_SOURCE_DIR}/test-tokenizers-repo.sh + NAME test-tokenizers-ggml-vocabs + WORKING_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY} + ARGS https://huggingface.co/ggml-org/vocabs ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocabs + ) +endif() if (LLAMA_LLGUIDANCE) llama_build_and_test(test-grammar-llguidance.cpp ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf) diff --git a/tests/test-tokenizers-repo.sh b/tests/test-tokenizers-repo.sh new file mode 100755 index 000000000..86e839133 --- /dev/null +++ b/tests/test-tokenizers-repo.sh @@ -0,0 +1,36 @@ +#!/bin/bash + +if [ $# -lt 2 ]; then + printf "Usage: $0 []\n" + exit 1 +fi + +if [ $# -eq 3 ]; then + toktest=$3 +else + toktest="./test-tokenizer-0" +fi + +if [ ! -x $toktest ]; then + printf "Test executable \"$toktest\" not found!\n" + exit 1 +fi + +repo=$1 +folder=$2 + +if [ -d $folder ] && [ -d $folder/.git ]; then + (cd $folder; git pull) +else + git clone $repo $folder +fi + +shopt -s globstar +for gguf in $folder/**/*.gguf; do + if [ -f $gguf.inp ] && [ -f $gguf.out ]; then + $toktest $gguf + else + printf "Found \"$gguf\" without matching inp/out files, ignoring...\n" + fi +done +