llama.cpp : split llama_context_params into model and context params (#3301)
* llama.cpp : split llama_context_params into model and context params (ggml-ci)
* fix metal build
* fix freq_base/scale default to model value
* llama-bench : keep the same model between tests when possible
* move n_threads to llama_context_params, add n_threads_batch
* fix mpi build
* remove kv_size(), cuda scratch fixes
* remove low-vram option
* add n_threads_batch to system info, refactor to get_system_info()
* add documentation about --threads-batch to the READMEs
* llama-bench fix
* main : fix rope freq/scale warning
* llama.cpp : add llama_get_model
* common : add llama_tokenize from model
* remove duplicated ctx/model functions (ggml-ci)
* cuda : print total VRAM used
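For orientation, a minimal sketch of how a caller sets up a model and a context after this split, based on the llama.h API this commit introduces. The model path, field values, and error handling are placeholders, and field names may have changed in later revisions:

// sketch: model-level vs. context-level parameters after the split
#include "llama.h"

int main() {
    llama_backend_init(false);

    // weights / GPU offload settings live in llama_model_params
    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 32;                       // example value

    llama_model *model = llama_load_model_from_file("model.gguf", mparams);  // placeholder path
    if (model == nullptr) { return 1; }

    // per-context settings (ctx size, batching, threads) live in llama_context_params
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx           = 2048;
    cparams.n_threads       = 8;                     // generation threads
    cparams.n_threads_batch = 8;                     // prompt/batch threads (added in this commit)

    llama_context *ctx = llama_new_context_with_model(model, cparams);

    // model-level queries take the model, context-level queries take the context
    const int n_vocab = llama_n_vocab(model);
    const int n_ctx   = llama_n_ctx(ctx);
    (void)n_vocab; (void)n_ctx;

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

The server changes below follow the same pattern: code that used to read params.n_ctx or call llama_n_vocab(ctx) now reads the cached n_ctx member or queries the model directly.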
@@ -200,6 +200,7 @@ struct llama_server_context
     llama_model *model = nullptr;
     llama_context *ctx = nullptr;
     gpt_params params;
+    int n_ctx;

     grammar_parser::parse_state parsed_grammar;
     llama_grammar *grammar = nullptr;
@@ -239,7 +240,7 @@ struct llama_server_context
         num_prompt_tokens = 0;
         num_tokens_predicted = 0;
         generated_text = "";
-        generated_text.reserve(params.n_ctx);
+        generated_text.reserve(n_ctx);
         generated_token_probs.clear();
         truncated = false;
         stopped_eos = false;
@@ -265,8 +266,8 @@ struct llama_server_context
             LOG_ERROR("unable to load model", {{"model", params_.model}});
             return false;
         }

-        last_n_tokens.resize(params.n_ctx);
+        n_ctx = llama_n_ctx(ctx);
+        last_n_tokens.resize(n_ctx);
         std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
         return true;
     }
@@ -351,19 +352,19 @@ struct llama_server_context
         {
             params.n_keep = (int)num_prompt_tokens;
         }
-        params.n_keep = std::min(params.n_ctx - 4, params.n_keep);
+        params.n_keep = std::min(n_ctx - 4, params.n_keep);

         // if input prompt is too big, truncate like normal
-        if (num_prompt_tokens >= (size_t)params.n_ctx)
+        if (num_prompt_tokens >= (size_t)n_ctx)
         {
-            const int n_left = (params.n_ctx - params.n_keep) / 2;
+            const int n_left = (n_ctx - params.n_keep) / 2;
             std::vector<llama_token> new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + params.n_keep);
             const int erased_blocks = (num_prompt_tokens - params.n_keep - n_left - 1) / n_left;
             new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + params.n_keep + erased_blocks * n_left, prompt_tokens.end());
-            std::copy(prompt_tokens.end() - params.n_ctx, prompt_tokens.end(), last_n_tokens.begin());
+            std::copy(prompt_tokens.end() - n_ctx, prompt_tokens.end(), last_n_tokens.begin());

             LOG_VERBOSE("input truncated", {
-                {"n_ctx", params.n_ctx},
+                {"n_ctx", n_ctx},
                 {"n_keep", params.n_keep},
                 {"n_left", n_left},
                 {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())},
@@ -413,7 +414,7 @@ struct llama_server_context
         completion_token_output result;
         result.tok = -1;

-        if (embd.size() >= (size_t)params.n_ctx)
+        if (embd.size() >= (size_t)n_ctx)
         {
             // Shift context

@@ -433,7 +434,7 @@ struct llama_server_context

             truncated = true;
             LOG_VERBOSE("input truncated", {
-                {"n_ctx", params.n_ctx},
+                {"n_ctx", n_ctx},
                 {"n_keep", params.n_keep},
                 {"n_left", n_left},
             });
@@ -447,12 +448,11 @@ struct llama_server_context
                 n_eval = params.n_batch;
             }

-            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0), params.n_threads))
+            if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
             {
                 LOG_ERROR("failed to eval", {
                     {"n_eval", n_eval},
                     {"n_past", n_past},
-                    {"n_threads", params.n_threads},
                     {"embd", tokens_to_str(ctx, embd.cbegin() + n_past, embd.cend())},
                 });
                 has_next_token = false;
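A note on the hunk above (not part of the diff): the trailing n_threads argument disappears because thread counts now travel with the context. A hedged sketch of the equivalent setup, assuming the llama_context_params fields introduced in this commit (n_threads for generation, n_threads_batch for prompt processing):

// threads are configured once on the context, not passed to each decode call
llama_context_params cparams = llama_context_default_params();
cparams.n_threads       = params.n_threads;        // token generation
cparams.n_threads_batch = params.n_threads_batch;  // prompt / batch evaluation
llama_context *ctx = llama_new_context_with_model(model, cparams);

// later, in the evaluation loop (same call as in the diff):
if (llama_decode(ctx, llama_batch_get_one(&embd[n_past], n_eval, n_past, 0)))
{
    // handle the failed eval, e.g. LOG_ERROR as above
}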
@@ -470,11 +470,11 @@ struct llama_server_context

         // out of user input, sample next token
         const float temp = params.temp;
-        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
+        const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k;
         const float top_p = params.top_p;
         const float tfs_z = params.tfs_z;
         const float typical_p = params.typical_p;
-        const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
+        const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n;
         const float repeat_penalty = params.repeat_penalty;
         const float alpha_presence = params.presence_penalty;
         const float alpha_frequency = params.frequency_penalty;
@@ -486,7 +486,7 @@ struct llama_server_context

         {
             auto *logits = llama_get_logits(ctx);
-            auto n_vocab = llama_n_vocab(ctx);
+            auto n_vocab = llama_n_vocab(model);

             // Apply params.logit_bias map
             for (const auto &it : params.logit_bias)
@@ -505,7 +505,7 @@ struct llama_server_context

             // Apply penalties
             float nl_logit = logits[llama_token_nl(ctx)];
-            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), params.n_ctx);
+            auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx);
             llama_sample_repetition_penalty(ctx, &candidates_p,
                                             last_n_tokens.data() + last_n_tokens.size() - last_n_repeat,
                                             last_n_repeat, repeat_penalty);
@@ -690,7 +690,7 @@ struct llama_server_context

     std::vector<float> getEmbedding()
     {
-        static const int n_embd = llama_n_embd(ctx);
+        static const int n_embd = llama_n_embd(model);
         if (!params.embedding)
         {
             LOG_WARNING("embedding disabled", {
@@ -734,7 +734,6 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("  -ts SPLIT --tensor-split SPLIT\n");
     printf("                        how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n");
     printf("  -mg i, --main-gpu i   the GPU to use for scratch and small tensors\n");
-    printf("  -lv, --low-vram       don't allocate VRAM scratch buffer\n");
     printf("  -nommq, --no-mul-mat-q\n");
     printf("                        use cuBLAS instead of custom mul_mat_q CUDA kernels.\n");
     printf("                        Not recommended since this is both slower and uses more VRAM.\n");
@@ -918,14 +917,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             }
 #else
             LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {});
 #endif // GGML_USE_CUBLAS
         }
-        else if (arg == "--low-vram" || arg == "-lv")
-        {
-#ifdef GGML_USE_CUBLAS
-            params.low_vram = true;
-#else
-            LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. It is not possible to set lower vram usage.\n", {});
-#endif // GGML_USE_CUBLAS
-        }
         else if (arg == "--no-mul-mat-q" || arg == "-nommq")
@@ -1031,7 +1022,7 @@ static json format_generation_settings(llama_server_context &llama)
         eos_bias->second < 0.0f && std::isinf(eos_bias->second);

     return json{
-        {"n_ctx", llama.params.n_ctx},
+        {"n_ctx", llama.n_ctx},
         {"model", llama.params.model_alias},
         {"seed", llama.params.seed},
         {"temp", llama.params.temp},
@@ -1191,7 +1182,7 @@ static void parse_options_completion(const json &body, llama_server_context &lla
     const auto &logit_bias = body.find("logit_bias");
     if (logit_bias != body.end() && logit_bias->is_array())
     {
-        const int n_vocab = llama_n_vocab(llama.ctx);
+        const int n_vocab = llama_n_vocab(llama.model);
         for (const auto &el : *logit_bias)
         {
             if (el.is_array() && el.size() == 2 && el[0].is_number_integer())
@@ -1324,6 +1315,7 @@ int main(int argc, char **argv)
                             {"commit", BUILD_COMMIT}});
     LOG_INFO("system info", {
                                 {"n_threads", params.n_threads},
+                                {"n_threads_batch", params.n_threads_batch},
                                 {"total_threads", std::thread::hardware_concurrency()},
                                 {"system_info", llama_print_system_info()},
                             });
@@ -1387,7 +1379,7 @@ int main(int argc, char **argv)
         if (llama.params.n_beams) {
             // Fill llama.generated_token_probs vector with final beam.
             llama_beam_search(llama.ctx, beam_search_callback, &llama, llama.params.n_beams,
-                              llama.n_past, llama.n_remain, llama.params.n_threads);
+                              llama.n_past, llama.n_remain);
             // Translate llama.generated_token_probs to llama.generated_text.
             append_to_generated_text_from_generated_token_probs(llama);
         } else {