feat: multiple models for llama
vansangpfiev committed Apr 16, 2024
1 parent 7ae9928 commit 4dc2ccf
Showing 8 changed files with 319 additions and 173 deletions.
1 change: 1 addition & 0 deletions .github/scripts/e2e-test-llama-linux-and-mac.sh
@@ -45,6 +45,7 @@ response1=$(curl --connect-timeout 60 -o /tmp/response1.log -s -w "%{http_code}"
--header 'Content-Type: application/json' \
--data '{
"llama_model_path": "/tmp/testllm",
"model_alias": "gpt-3.5-turbo",
"ctx_len": 50,
"ngl": 32,
"embedding": false
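The new model_alias field registers the loaded model under a second name, presumably so that follow-up chat-completion requests can keep addressing it as gpt-3.5-turbo even though the model file itself is /tmp/testllm (the Windows script below instead switches its request to the name testllm). As a rough sketch of such a follow-up request: the payload mirrors the one used in the Windows script, but CHAT_URL is a hypothetical placeholder, since the real endpoint URLs are defined earlier in the script and are not part of this diff.

# Hypothetical endpoint placeholder; substitute whatever chat-completion
# URL the e2e script actually targets.
CHAT_URL="http://127.0.0.1:3928/v1/chat/completions"

curl --connect-timeout 60 -s "$CHAT_URL" \
  --header 'Content-Type: application/json' \
  --data '{
    "messages": [
      {"content": "Hello there", "role": "assistant"},
      {"content": "Write a long and sad story for me", "role": "user"}
    ],
    "stream": true,
    "model": "gpt-3.5-turbo",
    "max_tokens": 50,
    "stop": ["hello"],
    "frequency_penalty": 0,
    "presence_penalty": 0,
    "temperature": 0.1
  }'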
2 changes: 1 addition & 1 deletion .github/scripts/e2e-test-llama-windows.bat
@@ -53,7 +53,7 @@ if not exist "%MODEL_PATH%" (
rem Define JSON strings for curl data
call set "MODEL_PATH_STRING=%%MODEL_PATH:\=\\%%"
set "curl_data1={\"llama_model_path\":\"%MODEL_PATH_STRING%\"}"
set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":true,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":true,\"model\":\"testllm\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"

rem Print the values of curl_data1 and curl_data2 for debugging
echo curl_data1=%curl_data1%
33 changes: 33 additions & 0 deletions context/llama_server_context.h
@@ -502,6 +502,7 @@ struct llama_server_context {
std::condition_variable condition_tasks;
std::mutex mutex_results;
std::condition_variable condition_results;
std::thread bgr_thread;

~llama_server_context() {
if (ctx) {
@@ -512,6 +513,7 @@
llama_free_model(model);
model = nullptr;
}
release_resources();
}

bool load_model(const gpt_params& params_) {
@@ -600,6 +602,10 @@ struct llama_server_context {
// empty system prompt
system_prompt = "";
system_tokens.clear();

model_loaded_external = true;
LOG_INFO << "Started background task here!";
bgr_thread = std::thread(std::bind(&llama_server_context::do_background_tasks, this));
}

std::vector<llama_token> tokenize(const json& json_prompt,
@@ -1879,6 +1885,33 @@ struct llama_server_context {
}
return true;
}

void do_background_tasks() {
while (model_loaded_external) {
update_slots();
}
LOG_INFO << "Background task stopped! ";
kv_cache_clear();
LOG_INFO << "KV cache cleared!";
}

void release_resources() {
if(model_loaded_external) {
LOG_INFO << "Releasing llama_server_context resources";
model_loaded_external = false;
condition_tasks.notify_one();

if (bgr_thread.joinable()) {
bgr_thread.join();
}

llama_free(ctx);
llama_free_model(model);
ctx = nullptr;
model = nullptr;
LOG_INFO << "Released llama_server_context resources";
}
}
};

static void server_print_usage(const char* argv0, const gpt_params& params,
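The new do_background_tasks()/release_resources() pair above amounts to a start/stop protocol for the per-model worker thread: model_loaded_external doubles as the run flag, and release_resources() clears it, wakes any waiter on condition_tasks, and joins bgr_thread before freeing ctx and model (the destructor calls release_resources() as well, so the loop is guaranteed to have exited before the llama objects are freed). Below is a minimal, self-contained sketch of that lifecycle; Worker, running, and do_work() are illustrative stand-ins rather than code from this commit, and the real loop body is update_slots().

#include <atomic>
#include <chrono>
#include <iostream>
#include <thread>

// Minimal sketch of the background-worker lifecycle: run a loop while a flag
// is set, then clear the flag and join before tearing anything down.
struct Worker {
  std::atomic<bool> running{false};  // stands in for model_loaded_external
  std::thread bgr_thread;

  void start() {
    running = true;
    bgr_thread = std::thread([this] {
      while (running) {
        do_work();  // stands in for update_slots()
      }
      std::cout << "Background task stopped!" << std::endl;
    });
  }

  void stop() {  // stands in for release_resources()
    if (running.exchange(false) && bgr_thread.joinable()) {
      bgr_thread.join();  // ensure the loop has exited before freeing resources
    }
  }

  ~Worker() { stop(); }

 private:
  void do_work() {
    // Placeholder unit of work.
    std::this_thread::sleep_for(std::chrono::milliseconds(10));
  }
};

int main() {
  Worker w;
  w.start();
  std::this_thread::sleep_for(std::chrono::milliseconds(100));
  w.stop();  // flip the flag, then join: the same order release_resources() uses
}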

0 comments on commit 4dc2ccf
