feat: multiple models for llama
vansangpfiev committed Apr 16, 2024
1 parent 7ae9928 commit 4dc2ccf
Showing 8 changed files with 319 additions and 173 deletions.
1 change: 1 addition & 0 deletions .github/scripts/e2e-test-llama-linux-and-mac.sh
@@ -45,6 +45,7 @@ response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}"
--header 'Content-Type: application/json' \
--data '{
"llama_model_path": "/tmp/testllm",
"model_alias": "gpt-3.5-turbo",
"ctx_len": 50,
"ngl": 32,
"embedding": false
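The new model_alias field registers the loaded model under a second name, presumably so that follow-up chat-completion requests can keep addressing it as gpt-3.5-turbo even though the model file itself is /tmp/testllm (the Windows script below instead switches its request to the name testllm). As a rough sketch of such a follow-up request: the payload mirrors the one used in the Windows script, but CHAT_URL is a hypothetical placeholder, since the real endpoint URLs are defined earlier in the script and are not part of this diff.

# Hypothetical endpoint placeholder; substitute whatever chat-completion
# URL the e2e script actually targets.
CHAT_URL="http://127.0.0.1:3928/v1/chat/completions"

curl --connect-timeout 60 -s "$CHAT_URL" \
  --header 'Content-Type: application/json' \
  --data '{
    "messages": [
      {"content": "Hello there", "role": "assistant"},
      {"content": "Write a long and sad story for me", "role": "user"}
    ],
    "stream": true,
    "model": "gpt-3.5-turbo",
    "max_tokens": 50,
    "stop": ["hello"],
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "temperature": 0.1
  }'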
2 changes: 1 addition & 1 deletion .github/scripts/e2e-test-llama-windows.bat
@@ -53,7 +53,7 @@ if not exist "%MODEL_PATH%" (
rem Define JSON strings for curl data
call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%"
set "curl_data1={\"llama_model_path\":\"%MODEL_PATH_STRING%\"}"
set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":true,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":true,\"model\":\"testllm\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"

rem Print the values of curl_data1 and curl_data2 for debugging
echo curl_data1=%curl_data1%
33 changes: 33 additions & 0 deletions context/llama_server_context.h
@@ -502,6 +502,7 @@ struct llama_server_context {
std::condition_variable condition_tasks;
std::mutex mutex_results;
std::condition_variable condition_results;
std::thread bgr_thread;

~llama_server_context() {
if (ctx) {
@@ -512,6 +513,7 @@
llama_free_model(model);
model = nullptr;
}
release_resources();
}

bool load_model(const gpt_params& params_) {
@@ -600,6 +602,10 @@ struct llama_server_context {
// empty system prompt
system_prompt = "";
system_tokens.clear();

model_loaded_external = true;
LOG_INFO << "Started background task here!";
bgr_thread = std::thread(std::bind(&llama_server_context::do_background_tasks, this));
}

std::vector<llama_token> tokenize(const json& json_prompt,
@@ -1879,6 +1885,33 @@ struct llama_server_context {
}
return true;
}

void do_background_tasks() {
while (model_loaded_external) {
update_slots();
}
LOG_INFO << "Background task stopped! ";
kv_cache_clear();
LOG_INFO << "KV cache cleared!";
}

void release_resources() {
if(model_loaded_external) {
LOG_INFO << "Releasing llama_server_context resources";
model_loaded_external = false;
condition_tasks.notify_one();

if (bgr_thread.joinable()) {
bgr_thread.join();
}

llama_free(ctx);
llama_free_model(model);
ctx = nullptr;
model = nullptr;
LOG_INFO << "Released llama_server_context resources";
}
}
};

static void server_print_usage(const char* argv0, const gpt_params& params,
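The new do_background_tasks()/release_resources() pair above amounts to a start/stop protocol for the per-model worker thread: model_loaded_external doubles as the run flag, and release_resources() clears it, wakes any waiter on condition_tasks, and joins bgr_thread before freeing ctx and model (the destructor calls release_resources() as well, so the loop is guaranteed to have exited before the llama objects are freed). Below is a minimal, self-contained sketch of that lifecycle; Worker, running, and do_work() are illustrative stand-ins rather than code from this commit, and the real loop body is update_slots().

#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

// Minimal sketch of the background-worker lifecycle: run a loop while a flag
// is set, then clear the flag and join before tearing anything down.
struct Worker {
  std::atomic<bool> running{false};  // stands in for model_loaded_external
  std::thread bgr_thread;

  void start() {
    running = true;
    bgr_thread = std::thread([this] {
      while (running) {
        do_work();  // stands in for update_slots()
      }
      std::cout << "Background task stopped!" << std::endl;
    });
  }

  void stop() {  // stands in for release_resources()
    if (running.exchange(false) && bgr_thread.joinable()) {
      bgr_thread.join();  // ensure the loop has exited before freeing resources
    }
  }

  ~Worker() { stop(); }

 private:
  void do_work() {
    // Placeholder unit of work.
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }
};

int main() {
  Worker w;
  w.start();
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  w.stop();  // flip the flag, then join: the same order release_resources() uses
}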

0 comments on commit 4dc2ccf
