mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-08-17 21:51:27 -04:00
Minor improvements in GPT2 tokenizer (#3567)
* Fixing minor bugs in bpe_gpt2_preprocess * Don't add bos token in test
This commit is contained in:
@@ -174,10 +174,8 @@ int main(int argc, char **argv) {
         }

         for (const auto & tok : res) {
-            ofs << tok << " ";
+            ofs << tok << " '" << llama_detokenize_spm(ctx, std::vector<int>{tok}) << "'" << std::endl;
         }

         ofs << "\n";
     }

     fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
Reference in New Issue
Block a user