feat: multiple models (#14)
Co-authored-by: vansangpfiev <[email protected]>
vansangpfiev and sangjanai authored May 13, 2024
Parent: 672084a. Commit: 4ad76ba.
Showing 9 changed files with 1,357 additions and 1,288 deletions.

.github/scripts/e2e-test-server-linux-and-mac.sh (5 changes: 3 additions, 2 deletions)

@@ -51,6 +51,7 @@ response1=$(curl --connect-timeout 60 -o /tmp/load-llm-model-res.log -s -w "%{http_code}"
 --header 'Content-Type: application/json' \
 --data '{
 "llama_model_path": "/tmp/testllm",
+"model_alias": "testllm",
 "ctx_len": 50,
 "ngl": 32,
 "embedding": false
@@ -73,7 +74,7 @@ response2=$(
 {"content": "Write a long and sad story for me", "role": "user"}
 ],
 "stream": true,
-"model": "gpt-3.5-turbo",
+"model": "testllm",
 "max_tokens": 50,
 "stop": ["hello"],
 "frequency_penalty": 0,
@@ -83,7 +84,7 @@
 )

 # unload model
-response3=$(curl --connect-timeout 60 -o /tmp/unload-model-res.log --request GET -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/unloadmodel" \
+response3=$(curl --connect-timeout 60 -o /tmp/unload-model-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/unloadmodel" \
 --header 'Content-Type: application/json' \
 --data '{
 "llama_model_path": "/tmp/testllm"
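
With model aliases in place, the same flow extends to several models served at once. A minimal sketch of a multi-model session, assuming the server accepts repeated /loadmodel calls with distinct model_alias values and routes each chat request by its "model" field; the second model path and alias here are hypothetical:

# Load two models under distinct aliases (second path and alias are made up for illustration).
curl -s --location "http://127.0.0.1:$PORT/loadmodel" \
  --header 'Content-Type: application/json' \
  --data '{"llama_model_path": "/tmp/testllm", "model_alias": "testllm", "ctx_len": 50, "ngl": 32}'
curl -s --location "http://127.0.0.1:$PORT/loadmodel" \
  --header 'Content-Type: application/json' \
  --data '{"llama_model_path": "/tmp/testllm2", "model_alias": "testllm2", "ctx_len": 50, "ngl": 32}'

# Target one of the loaded models by alias via the "model" field.
curl -s --location "http://127.0.0.1:$PORT/v1/chat/completions" \
  --header 'Content-Type: application/json' \
  --data '{"messages": [{"content": "Hello there", "role": "user"}], "model": "testllm2", "max_tokens": 50}'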

.github/scripts/e2e-test-server-windows.bat (4 changes: 2 additions, 2 deletions)

@@ -63,7 +63,7 @@ rem Define JSON strings for curl data
 call set "MODEL_LLM_PATH_STRING=%%MODEL_LLM_PATH:\=\\%%"
 call set "MODEL_EMBEDDING_PATH_STRING=%%MODEL_EMBEDDING_PATH:\=\\%%"
 set "curl_data1={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
-set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":false,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
+set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":false,\"model\":\"testllm\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
 set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
 set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}"
 set "curl_data5={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}"
@@ -82,7 +82,7 @@ curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --
 --header "Content-Type: application/json" ^
 --data "%curl_data2%" > %TEMP%\response2.log 2>&1

-curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" --request GET -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/unloadmodel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1
+curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/unloadmodel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1

 curl.exe --connect-timeout 60 -o "%TEMP%\response4.log" --request POST -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/loadmodel" --header "Content-Type: application/json" --data "%curl_data4%" > %TEMP%\response4.log 2>&1

examples/server/server.cc (5 changes: 3 additions, 2 deletions)

@@ -200,10 +200,11 @@ int main(int argc, char** argv) {
 };

 svr->Post("/loadmodel", handle_load_model);
-svr->Get("/unloadmodel", handle_unload_model);
+// Use POST since httplib does not read request body for GET method
+svr->Post("/unloadmodel", handle_unload_model);
 svr->Post("/v1/chat/completions", handle_completions);
 svr->Post("/v1/embeddings", handle_embeddings);
-svr->Get("/modelstatus", handle_get_model_status);
+svr->Post("/modelstatus", handle_get_model_status);

 LOG_INFO << "HTTP server listening: " << hostname << ":" << port;
 svr->new_task_queue = [] {
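
The route changes above are what let the test scripts drop --request GET: cpp-httplib does not read a request body for GET handlers, and curl already defaults to POST whenever --data is present and no method override is given. A sketch of calls against the now-POST endpoints; the /modelstatus body field is an assumption, since this excerpt does not show what that handler reads:

# Unload by path, as in the e2e scripts; with --data and no --request, curl sends POST.
curl -s --location "http://127.0.0.1:$PORT/unloadmodel" \
  --header 'Content-Type: application/json' \
  --data '{"llama_model_path": "/tmp/testllm"}'

# Query model status the same way (body field assumed, not shown in this diff).
curl -s --location "http://127.0.0.1:$PORT/modelstatus" \
  --header 'Content-Type: application/json' \
  --data '{"llama_model_path": "/tmp/testllm"}'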

src/chat_completion_request.h (2 changes: 2 additions, 0 deletions)

@@ -11,6 +11,7 @@ struct ChatCompletionRequest {
 float presence_penalty = 0;
 Json::Value stop = Json::Value(Json::arrayValue);
 Json::Value messages = Json::Value(Json::arrayValue);
+std::string model_id;
 };

 inline ChatCompletionRequest fromJson(std::shared_ptr<Json::Value> jsonBody) {
@@ -26,6 +27,7 @@ inline ChatCompletionRequest fromJson(std::shared_ptr<Json::Value> jsonBody) {
 (*jsonBody).get("presence_penalty", 0).asFloat();
 completion.messages = (*jsonBody)["messages"];
 completion.stop = (*jsonBody)["stop"];
+completion.model_id = (*jsonBody).get("model", {}).asString();
 }
 return completion;
 }
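
fromJson reads the new field with jsoncpp's get(), so a request that omits "model" parses to an empty model_id rather than failing; how the server treats an empty or unknown alias is not shown in this excerpt. A request body limited to the fields visible above, with the mapping noted in comments:

# Field mapping into ChatCompletionRequest (per fromJson above):
#   "messages"         -> completion.messages
#   "stop"             -> completion.stop
#   "presence_penalty" -> completion.presence_penalty
#   "model"            -> completion.model_id (empty string when omitted)
curl -s --location "http://127.0.0.1:$PORT/v1/chat/completions" \
  --header 'Content-Type: application/json' \
  --data '{
    "messages": [{"content": "Hello there", "role": "user"}],
    "stop": ["hello"],
    "presence_penalty": 0,
    "model": "testllm"
  }'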