feat: load multiple models #495

Closed · wants to merge 3 commits
Changes from 1 commit
1 change: 1 addition & 0 deletions .github/scripts/e2e-test-llama-linux-and-mac.sh
@@ -45,6 +45,7 @@ response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}"
--header 'Content-Type: application/json' \
--data '{
"llama_model_path": "/tmp/testllm",
"model_alias": "gpt-3.5-turbo",
"ctx_len": 50,
"ngl": 32,
"embedding": false
2 changes: 1 addition & 1 deletion .github/scripts/e2e-test-llama-windows.bat
@@ -53,7 +53,7 @@ if not exist "%MODEL_PATH%" (
rem Define JSON strings for curl data
call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%"
set "curl_data1={\"llama_model_path\":\"%MODEL_PATH_STRING%\"}"
set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":true,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":true,\"model\":\"testllm\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"

rem Print the values of curl_data1 and curl_data2 for debugging
echo curl_data1=%curl_data1%
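Taken together, the two script changes exercise both ways a loaded model can be addressed: the Linux/macOS test passes an explicit "model_alias" ("gpt-3.5-turbo"), while the Windows test leaves the alias out and refers to the model by its file name ("testllm", matching the llama_model_path). The helper below is a hypothetical sketch of that naming convention, not code from this PR; the function name and fallback rule are assumptions used only for illustration:

```cpp
#include <filesystem>
#include <string>

// Hypothetical helper illustrating the convention the e2e tests rely on:
// an explicit "model_alias" wins; otherwise the request-facing id falls back
// to the file stem of "llama_model_path", e.g. "/tmp/testllm" -> "testllm".
static std::string resolve_model_id(const std::string& llama_model_path,
                                    const std::string& model_alias) {
  if (!model_alias.empty()) {
    return model_alias;  // e.g. "gpt-3.5-turbo" in the Linux/macOS script
  }
  return std::filesystem::path(llama_model_path).stem().string();
}
```

Whichever id results is what a chat-completion payload selects through its "model" field, which appears to be why curl_data2 above switches from "gpt-3.5-turbo" to "testllm" once multiple loaded models have to be told apart.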
33 changes: 33 additions & 0 deletions context/llama_server_context.h
@@ -502,6 +502,7 @@ struct llama_server_context {
std::condition_variable condition_tasks;
std::mutex mutex_results;
std::condition_variable condition_results;
+  std::thread bgr_thread;

~llama_server_context() {
if (ctx) {
@@ -512,6 +513,7 @@
llama_free_model(model);
model = nullptr;
}
+    release_resources();
}

bool load_model(const gpt_params& params_) {
@@ -600,6 +602,10 @@ struct llama_server_context {
// empty system prompt
system_prompt = "";
system_tokens.clear();

+    model_loaded_external = true;
+    LOG_INFO << "Started background task here!";
+    bgr_thread = std::thread(std::bind(&llama_server_context::do_background_tasks, this));
}

std::vector<llama_token> tokenize(const json& json_prompt,
@@ -1879,6 +1885,33 @@ struct llama_server_context {
}
return true;
}

+  void do_background_tasks() {
+    while (model_loaded_external) {
+      update_slots();
+    }
+    LOG_INFO << "Background task stopped! ";
+    kv_cache_clear();
+    LOG_INFO << "KV cache cleared!";
+  }
+
+  void release_resources() {
+    if (model_loaded_external) {
+      LOG_INFO << "Releasing llama_server_context resources";
+      model_loaded_external = false;
+      condition_tasks.notify_one();
+
+      if (bgr_thread.joinable()) {
+        bgr_thread.join();
+      }
+
+      llama_free(ctx);
+      llama_free_model(model);
+      ctx = nullptr;
+      model = nullptr;
+      LOG_INFO << "Released llama_server_context resources";
+    }
+  }
};

static void server_print_usage(const char* argv0, const gpt_params& params,
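The additions to llama_server_context tie the slot-update loop to the lifetime of an externally loaded model: the load/initialize path sets model_loaded_external and spawns bgr_thread, do_background_tasks() keeps calling update_slots() while that flag is set, and release_resources() — reached both explicitly and from the destructor — clears the flag, notifies condition_tasks, joins the thread, and only then frees the llama context and model. Below is a stripped-down sketch of that start/stop pattern with illustrative names (background_worker, start, release, do_one_iteration are not the PR's API), assuming only standard C++:

```cpp
#include <atomic>
#include <chrono>
#include <condition_variable>
#include <iostream>
#include <mutex>
#include <thread>

// Minimal sketch of the lifecycle above: a worker thread runs while a flag is
// set; releasing clears the flag, wakes any wait, and joins the thread exactly
// once before shared resources would be freed.
struct background_worker {
  std::atomic<bool> running{false};
  std::thread worker;
  std::mutex mutex_tasks;
  std::condition_variable condition_tasks;

  void start() {
    running = true;
    worker = std::thread([this] {
      while (running) {
        do_one_iteration();  // stands in for update_slots()
        // Sleep until more work arrives or shutdown is requested,
        // mirroring a loop that blocks on condition_tasks.
        std::unique_lock<std::mutex> lock(mutex_tasks);
        condition_tasks.wait_for(lock, std::chrono::milliseconds(100),
                                 [this] { return !running.load(); });
      }
      std::cout << "background worker stopped\n";
    });
  }

  void release() {
    if (running) {
      running = false;
      condition_tasks.notify_one();  // unblock the waiting loop
      if (worker.joinable()) {
        worker.join();
      }
      // Only after the join is it safe to free model/context state.
    }
  }

  ~background_worker() { release(); }

 private:
  void do_one_iteration() {
    // Placeholder for the real slot-update / decode step.
  }
};

int main() {
  background_worker w;
  w.start();
  std::this_thread::sleep_for(std::chrono::milliseconds(250));
  w.release();  // idempotent: the destructor's second call is a no-op
}
```

The key ordering, as in the diff, is flag first, notify second, join third: freeing ctx and model before the join would let the still-running update loop touch freed memory.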