server : add lora hotswap endpoint (WIP) (#8857)
* server : add lora hotswap endpoint
* handle lora_no_apply
* fix build
* update docs
* clean up struct def
* fix build
* add LoRA test
* fix style
examples/server/server.cpp

@@ -78,6 +78,7 @@ enum server_task_type {
     SERVER_TASK_TYPE_SLOT_SAVE,
     SERVER_TASK_TYPE_SLOT_RESTORE,
     SERVER_TASK_TYPE_SLOT_ERASE,
+    SERVER_TASK_TYPE_SET_LORA,
 };

 struct server_task {
@@ -622,6 +623,7 @@ struct server_response {
 struct server_context {
     llama_model * model = nullptr;
     llama_context * ctx = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;

     gpt_params params;

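For context on the new member: the container type pairs the adapter file and scale given on the command line with the loaded adapter handle. A reconstruction of the bookkeeping structs from common.h of this era (a sketch, not verbatim from the commit):

#include <string>

// Sketch: what the CLI knows about an adapter (path + requested scale).
struct llama_lora_adapter_info {
    std::string path;   // adapter file passed via --lora / --lora-scaled
    float       scale;  // 0.0f means "loaded but currently not applied"
};

// Sketch: same info plus the runtime handle once the adapter is loaded.
struct llama_lora_adapter_container : llama_lora_adapter_info {
    struct llama_lora_adapter * adapter; // handle from llama_lora_adapter_init
};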
@@ -681,6 +683,7 @@ struct server_context {

         model = llama_init.model;
         ctx = llama_init.context;
+        lora_adapters = llama_init.lora_adapters;
         params.n_parallel -= 1; // but be sneaky about it
         if (model == nullptr) {
             LOG_ERROR("unable to load model", {{"model", params.model}});
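The `llama_init` object these three assignments read from is the return value of llama_init_from_gpt_params, which this PR reworked to hand the loaded adapters back alongside the model and context (the "clean up struct def" bullet in the message). A plausible sketch of that result struct, treated here as an assumption rather than the committed code:

#include <vector>

// Sketch (assumed shape of the init result; not verbatim from the commit):
struct llama_init_result {
    struct llama_model   * model   = nullptr;
    struct llama_context * context = nullptr;
    std::vector<llama_lora_adapter_container> lora_adapters; // loaded at startup
};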
@@ -1850,6 +1853,14 @@ struct server_context {
                     };
                     queue_results.send(result);
                 } break;
+            case SERVER_TASK_TYPE_SET_LORA:
+                {
+                    llama_lora_adapters_apply(ctx, lora_adapters);
+                    server_task_result result;
+                    result.id = task.id;
+                    result.data = json{{ "success", true }};
+                    queue_results.send(result);
+                } break;
         }
     }

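Because this case runs on the server's single task loop, llama_lora_adapters_apply can mutate the context without racing active inference; the HTTP handler below only edits the scale table and then waits for this task to complete. A likely shape for the helper, assuming the llama_lora_adapter_clear / llama_lora_adapter_set API that llama.h exposed in this period (a minimal sketch, not the committed code):

// Sketch: detach all adapters, then re-attach those with a non-zero scale.
static void llama_lora_adapters_apply(
        struct llama_context * ctx,
        std::vector<llama_lora_adapter_container> & lora_adapters) {
    llama_lora_adapter_clear(ctx); // remove everything currently applied
    for (auto & la : lora_adapters) {
        if (la.scale != 0.0f) {
            llama_lora_adapter_set(ctx, la.adapter, la.scale);
        }
    }
}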
@@ -3328,6 +3339,55 @@ int main(int argc, char ** argv) {
         return res.set_content(root.dump(), "application/json; charset=utf-8");
     };

+    const auto handle_lora_adapters_list = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+        json result = json::array();
+        for (size_t i = 0; i < ctx_server.lora_adapters.size(); ++i) {
+            auto & la = ctx_server.lora_adapters[i];
+            result.push_back({
+                {"id", i},
+                {"path", la.path},
+                {"scale", la.scale},
+            });
+        }
+        res.set_content(result.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
+    const auto handle_lora_adapters_apply = [&](const httplib::Request & req, httplib::Response & res) {
+        res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
+
+        const std::vector<json> body = json::parse(req.body);
+        int max_idx = ctx_server.lora_adapters.size();
+
+        // clear existing value
+        for (auto & la : ctx_server.lora_adapters) {
+            la.scale = 0.0f;
+        }
+
+        // set value
+        for (auto entry : body) {
+            int id = entry.at("id");
+            float scale = entry.at("scale");
+            if (0 <= id && id < max_idx) {
+                ctx_server.lora_adapters[id].scale = scale;
+            } else {
+                throw std::runtime_error("invalid adapter id");
+            }
+        }
+
+        server_task task;
+        task.type = SERVER_TASK_TYPE_SET_LORA;
+        const int id_task = ctx_server.queue_tasks.post(task);
+        ctx_server.queue_results.add_waiting_task_id(id_task);
+
+        server_task_result result = ctx_server.queue_results.recv(id_task);
+        ctx_server.queue_results.remove_waiting_task_id(id_task);
+
+        res.set_content(result.data.dump(), "application/json");
+        res.status = 200; // HTTP OK
+    };
+
     auto handle_static_file = [](unsigned char * content, size_t len, const char * mime_type) {
         return [content, len, mime_type](const httplib::Request &, httplib::Response & res) {
             res.set_content(reinterpret_cast<const char*>(content), len, mime_type);
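Taken together, the two lambdas define a small JSON API: GET /lora-adapters lists the loaded adapters with their current scales, and POST /lora-adapters takes an array of {"id", "scale"} pairs, resetting any adapter not mentioned to 0.0 before applying the body. A hypothetical client sketch using cpp-httplib and nlohmann::json (the same libraries the server builds on; the host and port are assumptions, not from the commit):

#include <iostream>
#include <httplib.h>
#include <nlohmann/json.hpp>

int main() {
    httplib::Client cli("localhost", 8080); // assumed server address

    // List adapters: expect e.g. [{"id":0,"path":"lora.gguf","scale":0.0}]
    if (auto res = cli.Get("/lora-adapters")) {
        std::cout << res->body << "\n";
    }

    // Apply adapter 0 at half strength. Any adapter omitted from the body
    // ends up at scale 0.0, since the handler clears every scale first.
    nlohmann::json body = nlohmann::json::array({ { {"id", 0}, {"scale", 0.5} } });
    if (auto res = cli.Post("/lora-adapters", body.dump(), "application/json")) {
        std::cout << res->body << "\n"; // {"success":true}
    }
    return 0;
}

Note the design choice visible in handle_lora_adapters_apply: the HTTP thread never touches the llama_context itself; it only edits scales and enqueues SERVER_TASK_TYPE_SET_LORA, blocking on recv() until the task loop has applied the adapters.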
@@ -3366,7 +3426,6 @@ int main(int argc, char ** argv) {

     // register API routes
     svr->Get ("/health", handle_health);
-    svr->Get ("/slots", handle_slots);
     svr->Get ("/metrics", handle_metrics);
     svr->Get ("/props", handle_props);
     svr->Get ("/v1/models", handle_models);
@@ -3381,6 +3440,11 @@ int main(int argc, char ** argv) {
     svr->Post("/v1/embeddings", handle_embeddings);
     svr->Post("/tokenize", handle_tokenize);
     svr->Post("/detokenize", handle_detokenize);
+    // LoRA adapters hotswap
+    svr->Get ("/lora-adapters", handle_lora_adapters_list);
+    svr->Post("/lora-adapters", handle_lora_adapters_apply);
+    // Save & load slots
+    svr->Get ("/slots", handle_slots);
     if (!params.slot_save_path.empty()) {
         // only enable slot endpoints if slot_save_path is set
         svr->Post("/slots/:id_slot", handle_slots_action);