We could use std::unordered_map over std::map (#305)

* Improve performance by changing std::map to std::unordered_map, and by changing `std::map<id, token> id_to_token;` to `std::vector<token> id_to_token;`

* Fix the last commit: in gpt_vocab_init, add `vocab.id_to_token.resize(vocab.token_to_id.size());`

* Removed include <map>

* Nest struct token score inside gpt_vocab

* renamed token to tok
This commit is contained in:
Fabio R. Sluzala
2023-03-21 14:21:50 -03:00
committed by GitHub
parent 89d5d90f3b
commit 353ec251a4
4 changed files with 36 additions and 24 deletions

View File

@ -8,7 +8,6 @@
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <regex>
@ -130,6 +129,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
}
std::string word;
vocab.id_to_token.resize(n_vocab);
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
finp.read ((char *) &len, sizeof(len));
@ -144,8 +144,10 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
fout.write((char *) &score, sizeof(score));
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
vocab.score[i] = score;
auto &tok_score = vocab.id_to_token[i];
tok_score.tok = word;
tok_score.score = score;
}
}