llama.cpp : split llama_context_params into model and context params (#3301)

* llama.cpp : split llama_context_params into model and context params

ggml-ci
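
For context, a minimal sketch of what loading looks like after the split. This is not part of this commit's diff; the model path and the n_gpu_layers/n_ctx values are placeholders:

```cpp
#include "llama.h"

int main() {
    llama_backend_init(false); // numa = false

    // Model-wide settings (GPU offload, mmap, vocab_only, ...) now live in
    // llama_model_params.
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 35;                       // placeholder value

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);

    // Per-context settings (context size, batch size, threads, rope overrides, ...)
    // stay in llama_context_params.
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx = 4096;                            // placeholder value

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... use ctx ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```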

* fix metal build

* fix freq_base/scale default to model value

* llama-bench : keep the same model between tests when possible

* move n_threads to llama_context_params, add n_threads_batch
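
A hedged sketch of the new threading fields; the eval_tokens helper and the thread counts below are made up for illustration:

```cpp
#include "llama.h"

// Hypothetical helper: evaluate a batch of tokens with the new thread settings.
static bool eval_tokens(llama_model * model, llama_token * tokens, int n_tokens) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_threads       = 8;   // threads for single-token (generation) decode
    cparams.n_threads_batch = 16;  // threads for batch / prompt processing

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // llama_decode no longer takes an n_threads argument; it uses the
    // thread counts stored in the context.
    const int ret = llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, 0, 0));

    llama_free(ctx);
    return ret == 0;
}
```

On the command line, the --threads-batch option documented in the READMEs is the counterpart of n_threads_batch, while --threads keeps controlling n_threads.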

* fix mpi build

* remove kv_size(), cuda scratch fixes

* remove low-vram option

* add n_threads_batch to system info, refactor to get_system_info()

* add documentation about --threads-batch to the READMEs

* llama-bench fix

* main : fix rope freq/scale warning

* llama.cpp : add llama_get_model
common : add llama_tokenize from model
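
A short sketch of how the two additions combine; the tokenize_prompt helper and the add_bos choice are illustrative, not part of the patch:

```cpp
#include "common.h"
#include "llama.h"

#include <string>
#include <vector>

// Tokenization only needs the vocab, which lives in the model, so code that
// only holds a llama_context can reach it via llama_get_model().
static std::vector<llama_token> tokenize_prompt(llama_context * ctx, const std::string & prompt) {
    const llama_model * model = llama_get_model(ctx);

    // common's llama_tokenize overload that takes a model instead of a context
    return llama_tokenize(model, prompt, /*add_bos=*/true);
}
```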

* remove duplicated ctx/model functions

ggml-ci

* cuda : print total VRAM used
Author: slaren
Date: 2023-09-28 21:42:38 +02:00
Committed by: GitHub
Commit: 16bc66d947
Parent: 0512d66670
27 changed files with 713 additions and 633 deletions

examples/server/server.cpp

@@ -200,6 +200,7 @@ struct llama_server_context
 llama_model *model = nullptr;
 llama_context *ctx = nullptr;
 gpt_params params;
+int n_ctx;
 grammar_parser::parse_state parsed_grammar;
 llama_grammar *grammar = nullptr;
@@ -239,7 +240,7 @@ struct llama_server_context
 num_prompt_tokens = 0;
 num_tokens_predicted = 0;
 generated_text = "";
-generated_text.reserve(params.n_ctx);
+generated_text.reserve(n_ctx);
 generated_token_probs.clear();
 truncated = false;
 stopped_eos = false;
@@ -265,8 +266,8 @@ struct llama_server_context
 LOG_ERROR("unable to load model", {{"model", params_.model}});
 return false;
 }
-last_n_tokens.resize(params.n_ctx);
+n_ctx = llama_n_ctx(ctx);
+last_n_tokens.resize(n_ctx);
 std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 return true;
 }
@@ -351,19 +352,19 @@ struct llama_server_context
 {
 params.n_keep = (int)num_prompt_tokens;
 }
-params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
+params.n_keep = std::min(n_ctx - 4, params.n_keep);
 // if input prompt is too big, truncate like normal
-if (num_prompt_tokens >= (size_t)params.n_ctx)
+if (num_prompt_tokens >= (size_t)n_ctx)
 {
-const int n_left = (params.n_ctx - params.n_keep) / 2;
+const int n_left = (n_ctx - params.n_keep) / 2;
 std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
 const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
 new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
-std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());
 LOG_VERBOSE("input truncated", {
-{"n_ctx", params.n_ctx},
+{"n_ctx", n_ctx},
 {"n_keep", params.n_keep},
 {"n_left", n_left},
 {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
@@ -413,7 +414,7 @@ struct llama_server_context
 completion_token_output result;
 result.tok = -1;
-if (embd.size() >= (size_t)params.n_ctx)
+if (embd.size() >= (size_t)n_ctx)
 {
 // Shift context
@@ -433,7 +434,7 @@ struct llama_server_context
 truncated = true;
 LOG_VERBOSE("input truncated", {
-{"n_ctx", params.n_ctx},
+{"n_ctx", n_ctx},
 {"n_keep", params.n_keep},
 {"n_left", n_left},
 });
@@ -447,12 +448,11 @@ struct llama_server_context
 n_eval = params.n_batch;
 }
-if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads))
+if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
 {
 LOG_ERROR("failed to eval", {
 {"n_eval", n_eval},
 {"n_past", n_past},
-{"n_threads", params.n_threads},
 {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
 });
 has_next_token = false;
@@ -470,11 +470,11 @@ struct llama_server_context
 // out of user input, sample next token
 const float temp = params.temp;
-const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
 const float top_p = params.top_p;
 const float tfs_z = params.tfs_z;
 const float typical_p = params.typical_p;
-const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
+const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
 const float repeat_penalty = params.repeat_penalty;
 const float alpha_presence = params.presence_penalty;
 const float alpha_frequency = params.frequency_penalty;
@@ -486,7 +486,7 @@ struct llama_server_context
 {
 auto *logits = llama_get_logits(ctx);
-auto n_vocab = llama_n_vocab(ctx);
+auto n_vocab = llama_n_vocab(model);
 // Apply params.logit_bias map
 for (const auto &it : params.logit_bias)
@@ -505,7 +505,7 @@ struct llama_server_context
 // Apply penalties
 float nl_logit = logits[llama_token_nl(ctx)];
-auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
+auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
 llama_sample_repetition_penalty(ctx, &candidates_p,
 last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
 last_n_repeat, repeat_penalty);
@@ -690,7 +690,7 @@ struct llama_server_context
 std::vector<float> getEmbedding()
 {
-static const int n_embd = llama_n_embd(ctx);
+static const int n_embd = llama_n_embd(model);
 if (!params.embedding)
 {
 LOG_WARNING("embedding disabled", {
@@ -734,7 +734,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
 printf(" -ts SPLIT --tensor-split SPLIT\n");
 printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
 printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n");
-printf(" -lv, --low-vram don't allocate VRAM scratch buffer\n");
 printf(" -nommq, --no-mul-mat-q\n");
 printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
 printf(" Not recommended since this is both slower and uses more VRAM.\n");
@@ -918,14 +917,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
 }
 #else
 LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
 }
-else if (arg == "--low-vram" || arg == "-lv")
-{
-#ifdef GGML_USE_CUBLAS
-params.low_vram = true;
-#else
-LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
-#endif // GGML_USE_CUBLAS
-}
 else if (arg == "--no-mul-mat-q" || arg == "-nommq")
@@ -1031,7 +1022,7 @@ static json format_generation_settings(llama_server_context &llama)
 eos_bias->second < 0.0f && std::isinf(eos_bias->second);
 return json{
-{"n_ctx", llama.params.n_ctx},
+{"n_ctx", llama.n_ctx},
 {"model", llama.params.model_alias},
 {"seed", llama.params.seed},
 {"temp", llama.params.temp},
@@ -1191,7 +1182,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
 const auto &logit_bias = body.find("logit_bias");
 if (logit_bias != body.end() && logit_bias->is_array())
 {
-const int n_vocab = llama_n_vocab(llama.ctx);
+const int n_vocab = llama_n_vocab(llama.model);
 for (const auto &el : *logit_bias)
 {
 if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
@@ -1324,6 +1315,7 @@ int main(int argc, char **argv)
 {"commit", BUILD_COMMIT}});
 LOG_INFO("system info", {
 {"n_threads", params.n_threads},
+{"n_threads_batch", params.n_threads_batch},
 {"total_threads", std::thread::hardware_concurrency()},
 {"system_info", llama_print_system_info()},
 });
@@ -1387,7 +1379,7 @@ int main(int argc, char **argv)
 if (llama.params.n_beams) {
 // Fill llama.generated_token_probs vector with final beam.
 llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
-llama.n_past, llama.n_remain, llama.params.n_threads);
+llama.n_past, llama.n_remain);
 // Translate llama.generated_token_probs to llama.generated_text.
 append_to_generated_text_from_generated_token_probs(llama);
 } else {