server: --offline mode (#13804)

* server: --offline mode (env: LLAMA_OFFLINE)

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
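In short (summarizing the diff below): the new --offline switch, also settable through the LLAMA_OFFLINE environment variable, forces the download helpers in common/arg.cpp to operate from the local cache only. A cached model file is used as-is, skipping the usual verification/re-download step; a file missing from the cache becomes a hard error instead of a download; and the Hugging Face manifest lookup falls back to its cached response. The flag is threaded through every download entry point, including the stubs used when building without CURL.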
common/arg.cpp (158 changed lines)
@@ -242,7 +242,56 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
 }

 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
+    // Check if the file already exists locally
+    auto file_exists = std::filesystem::exists(path);
+
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
+    std::string etag;
+    std::string last_modified;
+
+    if (file_exists) {
+        if (offline) {
+            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+            return true; // skip verification/downloading
+        }
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
+                }
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+            }
+        }
+        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+    } else {
+        if (offline) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return false;
+        }
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct common_load_model_from_url_headers {
+        std::string etag;
+        std::string last_modified;
+    };
+
+    common_load_model_from_url_headers headers;
+    bool head_request_ok = false;
+    bool should_download = !file_exists; // by default, we should download if the file does not exist
+
     // Initialize libcurl
     curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
     curl_slist_ptr http_headers;
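For context, the metadata companion file inspected above sits next to each cached model as <path>.json. A minimal sketch of its shape, with hypothetical values; only the "etag" and "lastModified" string fields are consulted by this code (filename and field values illustrative only):

    {
        "etag": "\"5d41402abc4b2a76b9719d911017c592\"",
        "lastModified": "Thu, 22 May 2025 10:00:00 GMT"
    }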
@@ -269,49 +318,6 @@ static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
     curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
 #endif

-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-            }
-        }
-        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-    bool head_request_ok = false;
-    bool should_download = !file_exists; // by default, we should download if the file does not exist
-
-    // get ETag to see if the remote file has changed
-    {
     typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
     auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
         common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

@@ -354,7 +360,6 @@ static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
             LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
             head_request_ok = false;
         }
-    }

     // if head_request_ok is false, we don't have the etag or last-modified headers
     // we leave should_download as-is, which is true if the file does not exist
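The deletions in the two hunks above are the other half of the relocation shown in the first hunk: the cache/metadata inspection now runs at the top of common_download_file_single, before libcurl is initialized, which is what lets the offline early-returns skip creating a curl handle at all. The lone closing brace removed in the second hunk belonged to the now-dropped "// get ETag" scope.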
@@ -460,12 +465,12 @@ static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {

 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token);
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
         }, item));
     }

@@ -481,14 +486,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {

 static bool common_download_model(
         const common_params_model & model,
-        const std::string & bearer_token) {
+        const std::string & bearer_token,
+        bool offline) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }

-    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
         return false;
     }

@@ -547,7 +553,7 @@ static bool common_download_model(
         }

         // Download in parallel
-        common_download_file_multiple(urls, bearer_token);
+        common_download_file_multiple(urls, bearer_token, offline);
     }

     return true;
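Taken together, offline is threaded top-down through every download path. A sketch of the resulting call chain (simplified; argument lists abbreviated):

    // common_params_parse_ex()                       reads params.offline
    //   -> common_params_handle_model(..., offline)
    //        -> common_get_hf_file(..., offline)          // manifest lookup
    //        -> common_download_model(..., offline)
    //             -> common_download_file_single(..., offline)
    //             -> common_download_file_multiple(..., offline)  // multi-file downloads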
@@ -608,7 +614,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];

@@ -638,20 +644,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
-    try {
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
-        try {
-            res_str = read_file(cached_response_path);
-            res_code = 200;
-            use_cache = true;
-        } catch (const std::exception & e) {
-            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+    if (!offline) {
+        try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
+            res_str = read_file(cached_response_path);
+            res_code = 200;
+            use_cache = true;
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
         }
     }
     std::string ggufFile;
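The net effect of this hunk is a three-way manifest resolution, sketched below (behavior as read directly from the diff):

    // 1. if (!offline): try the network; on any exception res_code stays 0
    // 2. if res_code == 0: fall back to cached_response_path when it exists
    //    (res_code is set to 200 and use_cache is flagged)
    // 3. otherwise: throw, with the error worded for offline mode vs. a bad connection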
@@ -698,24 +709,25 @@ bool common_has_curl() {
     return false;
 }

-static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from internet\n");
     return false;
 }

-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }

 static bool common_download_model(
         const common_params_model &,
-        const std::string &) {
+        const std::string &,
+        bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }

-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
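The stubs above are the build path without CURL support; they gain the same trailing bool so that call sites compile identically whether or not downloading is available.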
@@ -742,7 +754,8 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_path_default,
+        bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {

@@ -750,7 +763,7 @@
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
-                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                     if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                         exit(1); // built without CURL, error message already printed
                     }

@@ -791,7 +804,7 @@ static handle_model_result common_params_handle_model(

     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token);
+        bool ok = common_download_model(model, bearer_token, offline);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);

@@ -934,7 +947,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {

     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {

@@ -944,12 +957,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
             // only download mmproj if the current example is using it
             for (auto & ex : mmproj_examples) {
                 if (ctx_arg.ex == ex) {
-                    common_params_handle_model(params.mmproj, params.hf_token, "");
+                    common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                     break;
                 }
             }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
     }

     if (params.escape) {
@@ -2996,6 +3009,13 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             common_log_set_verbosity_thold(INT_MAX);
         }
     ));
+    add_opt(common_arg(
+        {"--offline"},
+        "Offline mode: forces use of cache, prevents network access",
+        [](common_params & params) {
+            params.offline = true;
+        }
+    ).set_env("LLAMA_OFFLINE"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
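Since the option is registered with .set_env("LLAMA_OFFLINE"), setting that environment variable is equivalent to passing the flag: for example (model reference purely illustrative), LLAMA_OFFLINE=1 llama-server -hf <repo> behaves the same as llama-server --offline -hf <repo>.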
common/common.h

@@ -291,6 +291,7 @@ struct common_params {
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
+    bool offline = false;

     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line