server: --offline mode (#13804)

* server: --offline mode (env: LLAMA_OFFLINE)

Co-authored-by: Xuan-Son Nguyen <thichthat@gmail.com>
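In short (summarizing the diff below): the new --offline switch, also settable through the LLAMA_OFFLINE environment variable, forces the download helpers in common/arg.cpp to operate from the local cache only. A cached model file is used as-is, skipping the usual verification/re-download step; a file missing from the cache becomes a hard error instead of a download; and the Hugging Face manifest lookup falls back to its cached response. The flag is threaded through every download entry point, including the stubs used when building without CURL.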
common/arg.cpp (158 changed lines)
@@ -242,7 +242,56 @@ static bool curl_perform_with_retry(const std::string & url, CURL * curl, int max_attempts, int retry_delay_seconds, const char * method_name) {
 }

 // download one single file from remote URL to local path
-static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
+static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token, bool offline) {
+    // Check if the file already exists locally
+    auto file_exists = std::filesystem::exists(path);
+
+    // If the file exists, check its JSON metadata companion file.
+    std::string metadata_path = path + ".json";
+    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
+    std::string etag;
+    std::string last_modified;
+
+    if (file_exists) {
+        if (offline) {
+            LOG_INF("%s: using cached file (offline mode): %s\n", __func__, path.c_str());
+            return true; // skip verification/downloading
+        }
+        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
+        std::ifstream metadata_in(metadata_path);
+        if (metadata_in.good()) {
+            try {
+                metadata_in >> metadata;
+                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
+                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                    etag = metadata.at("etag");
+                }
+                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                    last_modified = metadata.at("lastModified");
+                }
+            } catch (const nlohmann::json::exception & e) {
+                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
+            }
+        }
+        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
+    } else {
+        if (offline) {
+            LOG_ERR("%s: required file is not available in cache (offline mode): %s\n", __func__, path.c_str());
+            return false;
+        }
+        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
+    }
+
+    // Send a HEAD request to retrieve the etag and last-modified headers
+    struct common_load_model_from_url_headers {
+        std::string etag;
+        std::string last_modified;
+    };
+
+    common_load_model_from_url_headers headers;
+    bool head_request_ok = false;
+    bool should_download = !file_exists; // by default, we should download if the file does not exist
+
     // Initialize libcurl
     curl_ptr curl(curl_easy_init(), &curl_easy_cleanup);
     curl_slist_ptr http_headers;
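For context, the metadata companion file inspected above sits next to each cached model as <path>.json. A minimal sketch of its shape, with hypothetical values; only the "etag" and "lastModified" string fields are consulted by this code (filename and field values illustrative only):

    {
        "etag": "\"5d41402abc4b2a76b9719d911017c592\"",
        "lastModified": "Thu, 22 May 2025 10:00:00 GMT"
    }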
@@ -269,49 +318,6 @@ static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
     curl_easy_setopt(curl.get(), CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA);
 #endif

-    // Check if the file already exists locally
-    auto file_exists = std::filesystem::exists(path);
-
-    // If the file exists, check its JSON metadata companion file.
-    std::string metadata_path = path + ".json";
-    nlohmann::json metadata; // TODO @ngxson : get rid of this json, use regex instead
-    std::string etag;
-    std::string last_modified;
-
-    if (file_exists) {
-        // Try and read the JSON metadata file (note: stream autoclosed upon exiting this block).
-        std::ifstream metadata_in(metadata_path);
-        if (metadata_in.good()) {
-            try {
-                metadata_in >> metadata;
-                LOG_DBG("%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-                if (metadata.contains("etag") && metadata.at("etag").is_string()) {
-                    etag = metadata.at("etag");
-                }
-                if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
-                    last_modified = metadata.at("lastModified");
-                }
-            } catch (const nlohmann::json::exception & e) {
-                LOG_ERR("%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
-            }
-        }
-        // if we cannot open the metadata file, we assume that the downloaded file is not valid (etag and last-modified are left empty, so we will download it again)
-    } else {
-        LOG_INF("%s: no previous model file found %s\n", __func__, path.c_str());
-    }
-
-    // Send a HEAD request to retrieve the etag and last-modified headers
-    struct common_load_model_from_url_headers {
-        std::string etag;
-        std::string last_modified;
-    };
-
-    common_load_model_from_url_headers headers;
-    bool head_request_ok = false;
-    bool should_download = !file_exists; // by default, we should download if the file does not exist
-
-    // get ETag to see if the remote file has changed
-    {
     typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *);
     auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t {
         common_load_model_from_url_headers * headers = (common_load_model_from_url_headers *) userdata;

@@ -354,7 +360,6 @@ static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {
             LOG_WRN("%s: HEAD invalid http status code received: %ld\n", __func__, http_code);
             head_request_ok = false;
         }
-    }

     // if head_request_ok is false, we don't have the etag or last-modified headers
     // we leave should_download as-is, which is true if the file does not exist
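The deletions in the two hunks above are the other half of the relocation shown in the first hunk: the cache/metadata inspection now runs at the top of common_download_file_single, before libcurl is initialized, which is what lets the offline early-returns skip creating a curl handle at all. The lone closing brace removed in the second hunk belonged to the now-dropped "// get ETag" scope.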
@@ -460,12 +465,12 @@ static bool common_download_file_single(const std::string & url, const std::string & path, const std::string & bearer_token) {

 // download multiple files from remote URLs to local paths
 // the input is a vector of pairs <url, path>
-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token, bool offline) {
     // Prepare download in parallel
     std::vector<std::future<bool>> futures_download;
     for (auto const & item : urls) {
-        futures_download.push_back(std::async(std::launch::async, [bearer_token](const std::pair<std::string, std::string> & it) -> bool {
-            return common_download_file_single(it.first, it.second, bearer_token);
+        futures_download.push_back(std::async(std::launch::async, [bearer_token, offline](const std::pair<std::string, std::string> & it) -> bool {
+            return common_download_file_single(it.first, it.second, bearer_token, offline);
         }, item));
     }

@@ -481,14 +486,15 @@ static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> & urls, const std::string & bearer_token) {

 static bool common_download_model(
         const common_params_model & model,
-        const std::string & bearer_token) {
+        const std::string & bearer_token,
+        bool offline) {
     // Basic validation of the model.url
     if (model.url.empty()) {
         LOG_ERR("%s: invalid model url\n", __func__);
         return false;
     }

-    if (!common_download_file_single(model.url, model.path, bearer_token)) {
+    if (!common_download_file_single(model.url, model.path, bearer_token, offline)) {
         return false;
     }

@@ -547,7 +553,7 @@ static bool common_download_model(
         }

         // Download in parallel
-        common_download_file_multiple(urls, bearer_token);
+        common_download_file_multiple(urls, bearer_token, offline);
     }

     return true;
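Taken together, offline is threaded top-down through every download path. A sketch of the resulting call chain (simplified; argument lists abbreviated):

    // common_params_parse_ex()                       reads params.offline
    //   -> common_params_handle_model(..., offline)
    //        -> common_get_hf_file(..., offline)          // manifest lookup
    //        -> common_download_model(..., offline)
    //             -> common_download_file_single(..., offline)
    //             -> common_download_file_multiple(..., offline)  // multi-file downloads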
@@ -608,7 +614,7 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string & url, const common_remote_params & params) {
  *
  * Note: we use the Ollama-compatible HF API, but not using the blobId. Instead, we use the special "ggufFile" field which returns the value for "hf_file". This is done to be backward-compatible with existing cache files.
  */
-static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
+static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token, bool offline) {
     auto parts = string_split<std::string>(hf_repo_with_tag, ':');
     std::string tag = parts.size() > 1 ? parts.back() : "latest";
     std::string hf_repo = parts[0];

@@ -638,20 +644,25 @@ static struct common_hf_file_res common_get_hf_file(const std::string & hf_repo_with_tag, const std::string & bearer_token) {
     long res_code = 0;
     std::string res_str;
     bool use_cache = false;
-    try {
-        auto res = common_remote_get_content(url, params);
-        res_code = res.first;
-        res_str = std::string(res.second.data(), res.second.size());
-    } catch (const std::exception & e) {
-        LOG_WRN("error: failed to get manifest: %s\n", e.what());
-        LOG_WRN("try reading from cache\n");
-        // try to read from cache
-        try {
-            res_str = read_file(cached_response_path);
-            res_code = 200;
-            use_cache = true;
-        } catch (const std::exception & e) {
-            throw std::runtime_error("error: failed to get manifest (check your internet connection)");
+    if (!offline) {
+        try {
+            auto res = common_remote_get_content(url, params);
+            res_code = res.first;
+            res_str = std::string(res.second.data(), res.second.size());
+        } catch (const std::exception & e) {
+            LOG_WRN("error: failed to get manifest at %s: %s\n", url.c_str(), e.what());
+        }
+    }
+    if (res_code == 0) {
+        if (std::filesystem::exists(cached_response_path)) {
+            LOG_WRN("trying to read manifest from cache: %s\n", cached_response_path.c_str());
+            res_str = read_file(cached_response_path);
+            res_code = 200;
+            use_cache = true;
+        } else {
+            throw std::runtime_error(
+                offline ? "error: failed to get manifest (offline mode)"
+                        : "error: failed to get manifest (check your internet connection)");
         }
     }
     std::string ggufFile;
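The net effect of this hunk is a three-way manifest resolution, sketched below (behavior as read directly from the diff):

    // 1. if (!offline): try the network; on any exception res_code stays 0
    // 2. if res_code == 0: fall back to cached_response_path when it exists
    //    (res_code is set to 200 and use_cache is flagged)
    // 3. otherwise: throw, with the error worded for offline mode vs. a bad connection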
@@ -698,24 +709,25 @@ bool common_has_curl() {
     return false;
 }

-static bool common_download_file_single(const std::string &, const std::string &, const std::string &) {
+static bool common_download_file_single(const std::string &, const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from internet\n");
     return false;
 }

-static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &) {
+static bool common_download_file_multiple(const std::vector<std::pair<std::string, std::string>> &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }

 static bool common_download_model(
         const common_params_model &,
-        const std::string &) {
+        const std::string &,
+        bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return false;
 }

-static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &) {
+static struct common_hf_file_res common_get_hf_file(const std::string &, const std::string &, bool) {
     LOG_ERR("error: built without CURL, cannot download model from the internet\n");
     return {};
 }
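The stubs above are the build path without CURL support; they gain the same trailing bool so that call sites compile identically whether or not downloading is available.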
@@ -742,7 +754,8 @@ struct handle_model_result {
 static handle_model_result common_params_handle_model(
         struct common_params_model & model,
         const std::string & bearer_token,
-        const std::string & model_path_default) {
+        const std::string & model_path_default,
+        bool offline) {
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {

@@ -750,7 +763,7 @@
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
-                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token);
+                    auto auto_detected = common_get_hf_file(model.hf_repo, bearer_token, offline);
                     if (auto_detected.repo.empty() || auto_detected.ggufFile.empty()) {
                         exit(1); // built without CURL, error message already printed
                     }

@@ -791,7 +804,7 @@ static handle_model_result common_params_handle_model(

     // then, download it if needed
     if (!model.url.empty()) {
-        bool ok = common_download_model(model, bearer_token);
+        bool ok = common_download_model(model, bearer_token, offline);
         if (!ok) {
             LOG_ERR("error: failed to download model from %s\n", model.url.c_str());
             exit(1);

@@ -934,7 +947,7 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {

     // handle model and download
     {
-        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH);
+        auto res = common_params_handle_model(params.model, params.hf_token, DEFAULT_MODEL_PATH, params.offline);
         if (params.no_mmproj) {
             params.mmproj = {};
         } else if (res.found_mmproj && params.mmproj.path.empty() && params.mmproj.url.empty()) {

@@ -944,12 +957,12 @@ static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
             // only download mmproj if the current example is using it
             for (auto & ex : mmproj_examples) {
                 if (ctx_arg.ex == ex) {
-                    common_params_handle_model(params.mmproj, params.hf_token, "");
+                    common_params_handle_model(params.mmproj, params.hf_token, "", params.offline);
                     break;
                 }
             }
-        common_params_handle_model(params.speculative.model, params.hf_token, "");
-        common_params_handle_model(params.vocoder.model, params.hf_token, "");
+        common_params_handle_model(params.speculative.model, params.hf_token, "", params.offline);
+        common_params_handle_model(params.vocoder.model, params.hf_token, "", params.offline);
     }

     if (params.escape) {
@@ -2996,6 +3009,13 @@ common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
             common_log_set_verbosity_thold(INT_MAX);
         }
     ));
+    add_opt(common_arg(
+        {"--offline"},
+        "Offline mode: forces use of cache, prevents network access",
+        [](common_params & params) {
+            params.offline = true;
+        }
+    ).set_env("LLAMA_OFFLINE"));
     add_opt(common_arg(
         {"-lv", "--verbosity", "--log-verbosity"}, "N",
         "Set the verbosity threshold. Messages with a higher verbosity will be ignored.",
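Since the option is registered with .set_env("LLAMA_OFFLINE"), setting that environment variable is equivalent to passing the flag: for example (model reference purely illustrative), LLAMA_OFFLINE=1 llama-server -hf <repo> behaves the same as llama-server --offline -hf <repo>.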
common/common.h

@@ -291,6 +291,7 @@ struct common_params {
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1; // layer range for control vector
+    bool offline = false;

     int32_t ppl_stride = 0; // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line