mirror of
https://github.com/ggml-org/llama.cpp.git
synced 2025-07-01 13:05:52 +00:00
unicode : switch to multimap based nfd_map (#5799)
* switch to multimap-based nfd_map due to compile-time issues * simplify multimap keys * don't construct a new locale every time
This commit is contained in:
11
llama.cpp
11
llama.cpp
@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm {
|
||||
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
|
||||
std::vector<uint32_t> nfd_codepoints;
|
||||
for (uint32_t code : codepoints) {
|
||||
auto it = nfd_map.find(code);
|
||||
if (it != nfd_map.end()) {
|
||||
for (uint32_t c : it->second) {
|
||||
nfd_codepoints.push_back(c);
|
||||
auto it = nfd_map.equal_range(code);
|
||||
if (it.first != it.second) {
|
||||
for (auto jt = it.first; jt != it.second; jt++) {
|
||||
nfd_codepoints.push_back(jt->second);
|
||||
}
|
||||
} else {
|
||||
nfd_codepoints.push_back(code);
|
||||
@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm {
|
||||
}
|
||||
|
||||
uint32_t to_lower(uint32_t code) {
|
||||
static const std::locale locale("en_US.UTF-8");
|
||||
#if defined(_WIN32)
|
||||
if (code > 0xFFFF) {
|
||||
return code;
|
||||
}
|
||||
#endif
|
||||
return std::tolower(wchar_t(code), std::locale("en_US.UTF-8"));
|
||||
return std::tolower(wchar_t(code), locale);
|
||||
}
|
||||
|
||||
bool is_ascii_punct(uint32_t code) {
|
||||
|
Reference in New Issue
Block a user