feat: multiple models (#14)
Co-authored-by: vansangpfiev <[email protected]>
vansangpfiev and sangjanai authored May 13, 2024
Parent: 672084a. Commit: 4ad76ba.
Showing 9 changed files with 1,357 additions and 1,288 deletions.

.github/scripts/e2e-test-server-linux-and-mac.sh (5 changes: 3 additions, 2 deletions)

@@ -51,6 +51,7 @@ response1=$(curl --connect-timeout 60 -o /tmp/load-llm-model-res.log -s -w "%{http_code}"
 --header 'Content-Type: application/json' \
 --data '{
 "llama_model_path": "/tmp/testllm",
+"model_alias": "testllm",
 "ctx_len": 50,
 "ngl": 32,
 "embedding": false
@@ -73,7 +74,7 @@ response2=$(
 {"content": "Write a long and sad story for me", "role": "user"}
 ],
 "stream": true,
-"model": "gpt-3.5-turbo",
+"model": "testllm",
 "max_tokens": 50,
 "stop": ["hello"],
 "frequency_penalty": 0,
@@ -83,7 +84,7 @@
 )

 # unload model
-response3=$(curl --connect-timeout 60 -o /tmp/unload-model-res.log --request GET -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/unloadmodel" \
+response3=$(curl --connect-timeout 60 -o /tmp/unload-model-res.log -s -w "%{http_code}" --location "http://127.0.0.1:$PORT/unloadmodel" \
 --header 'Content-Type: application/json' \
 --data '{
 "llama_model_path": "/tmp/testllm"
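
With model aliases in place, the same flow extends to several models served at once. A minimal sketch of a multi-model session, assuming the server accepts repeated /loadmodel calls with distinct model_alias values and routes each chat request by its "model" field; the second model path and alias here are hypothetical:

# Load two models under distinct aliases (second path and alias are made up for illustration).
curl -s --location "http://127.0.0.1:$PORT/loadmodel" \
  --header 'Content-Type: application/json' \
  --data '{"llama_model_path": "/tmp/testllm", "model_alias": "testllm", "ctx_len": 50, "ngl": 32}'
curl -s --location "http://127.0.0.1:$PORT/loadmodel" \
  --header 'Content-Type: application/json' \
  --data '{"llama_model_path": "/tmp/testllm2", "model_alias": "testllm2", "ctx_len": 50, "ngl": 32}'

# Target one of the loaded models by alias via the "model" field.
curl -s --location "http://127.0.0.1:$PORT/v1/chat/completions" \
  --header 'Content-Type: application/json' \
  --data '{"messages": [{"content": "Hello there", "role": "user"}], "model": "testllm2", "max_tokens": 50}'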

.github/scripts/e2e-test-server-windows.bat (4 changes: 2 additions, 2 deletions)

@@ -63,7 +63,7 @@ rem Define JSON strings for curl data
 call set "MODEL_LLM_PATH_STRING=%%MODEL_LLM_PATH:\=\\%%"
 call set "MODEL_EMBEDDING_PATH_STRING=%%MODEL_EMBEDDING_PATH:\=\\%%"
 set "curl_data1={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
-set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":false,\"model\":\"gpt-3.5-turbo\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
+set "curl_data2={\"messages\":[{\"content\":\"Hello there\",\"role\":\"assistant\"},{\"content\":\"Write a long and sad story for me\",\"role\":\"user\"}],\"stream\":false,\"model\":\"testllm\",\"max_tokens\":50,\"stop\":[\"hello\"],\"frequency_penalty\":0,\"presence_penalty\":0,\"temperature\":0.1}"
 set "curl_data3={\"llama_model_path\":\"%MODEL_LLM_PATH_STRING%\"}"
 set "curl_data4={\"llama_model_path\":\"%MODEL_EMBEDDING_PATH_STRING%\", \"embedding\": true, \"model_type\": \"embedding\"}"
 set "curl_data5={\"input\": \"Hello\", \"model\": \"test-embedding\", \"encoding_format\": \"float\"}"
@@ -82,7 +82,7 @@ curl.exe --connect-timeout 60 -o "%TEMP%\response2.log" -s -w "%%{http_code}" --
 --header "Content-Type: application/json" ^
 --data "%curl_data2%" > %TEMP%\response2.log 2>&1

-curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" --request GET -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/unloadmodel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1
+curl.exe --connect-timeout 60 -o "%TEMP%\response3.log" -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/unloadmodel" --header "Content-Type: application/json" --data "%curl_data3%" > %TEMP%\response3.log 2>&1

 curl.exe --connect-timeout 60 -o "%TEMP%\response4.log" --request POST -s -w "%%{http_code}" --location "http://127.0.0.1:%PORT%/loadmodel" --header "Content-Type: application/json" --data "%curl_data4%" > %TEMP%\response4.log 2>&1

examples/server/server.cc (5 changes: 3 additions, 2 deletions)

@@ -200,10 +200,11 @@ int main(int argc, char** argv) {
 };

 svr->Post("/loadmodel", handle_load_model);
-svr->Get("/unloadmodel", handle_unload_model);
+// Use POST since httplib does not read request body for GET method
+svr->Post("/unloadmodel", handle_unload_model);
 svr->Post("/v1/chat/completions", handle_completions);
 svr->Post("/v1/embeddings", handle_embeddings);
-svr->Get("/modelstatus", handle_get_model_status);
+svr->Post("/modelstatus", handle_get_model_status);

 LOG_INFO << "HTTP server listening: " << hostname << ":" << port;
 svr->new_task_queue = [] {
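
The route changes above are what let the test scripts drop --request GET: cpp-httplib does not read a request body for GET handlers, and curl already defaults to POST whenever --data is present and no method override is given. A sketch of calls against the now-POST endpoints; the /modelstatus body field is an assumption, since this excerpt does not show what that handler reads:

# Unload by path, as in the e2e scripts; with --data and no --request, curl sends POST.
curl -s --location "http://127.0.0.1:$PORT/unloadmodel" \
  --header 'Content-Type: application/json' \
  --data '{"llama_model_path": "/tmp/testllm"}'

# Query model status the same way (body field assumed, not shown in this diff).
curl -s --location "http://127.0.0.1:$PORT/modelstatus" \
  --header 'Content-Type: application/json' \
  --data '{"llama_model_path": "/tmp/testllm"}'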

src/chat_completion_request.h (2 changes: 2 additions, 0 deletions)

@@ -11,6 +11,7 @@ struct ChatCompletionRequest {
 float presence_penalty = 0;
 Json::Value stop = Json::Value(Json::arrayValue);
 Json::Value messages = Json::Value(Json::arrayValue);
+std::string model_id;
 };

 inline ChatCompletionRequest fromJson(std::shared_ptr<Json::Value> jsonBody) {
@@ -26,6 +27,7 @@ inline ChatCompletionRequest fromJson(std::shared_ptr<Json::Value> jsonBody) {
 (*jsonBody).get("presence_penalty", 0).asFloat();
 completion.messages = (*jsonBody)["messages"];
 completion.stop = (*jsonBody)["stop"];
+completion.model_id = (*jsonBody).get("model", {}).asString();
 }
 return completion;
 }
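
fromJson reads the new field with jsoncpp's get(), so a request that omits "model" parses to an empty model_id rather than failing; how the server treats an empty or unknown alias is not shown in this excerpt. A request body limited to the fields visible above, with the mapping noted in comments:

# Field mapping into ChatCompletionRequest (per fromJson above):
#   "messages"         -> completion.messages
#   "stop"             -> completion.stop
#   "presence_penalty" -> completion.presence_penalty
#   "model"            -> completion.model_id (empty string when omitted)
curl -s --location "http://127.0.0.1:$PORT/v1/chat/completions" \
  --header 'Content-Type: application/json' \
  --data '{
    "messages": [{"content": "Hello there", "role": "user"}],
    "stop": ["hello"],
    "presence_penalty": 0,
    "model": "testllm"
  }'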