Custom benchmark with parameters (#88)
* Custom benchmark with parameters

* Mention arguments for benchmark.py

* Tweak
guoqingbao authored Aug 23, 2024
1 parent 124fadc commit c7ff257
Showing 3 changed files with 28 additions and 25 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -134,6 +134,9 @@ After the `candle-vllm` service is running, run the Python script and enjoy effi

## Batched requests

``` shell
python3 examples/benchmark.py --batch 16 --max_tokens 1024
```
Refer to `examples/benchmark.py`
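
As a rough illustration of what `examples/benchmark.py` does, the sketch below fires a batch of concurrent chat-completion requests against a candle-vllm server assumed to be listening on `http://localhost:2000/v1/`, reusing the same module-level `openai` client setup as the script. The helper names `send_one` and `run_batch`, and the single repeated prompt, are illustrative only and not part of the repository.

``` python
import asyncio
import openai

# Same client setup as examples/benchmark.py: any non-empty key, local server.
openai.api_key = "EMPTY"
openai.base_url = "http://localhost:2000/v1/"

async def send_one(prompt: str, max_tokens: int) -> str:
    # Run the blocking client call in a worker thread so every request
    # in the batch can be in flight at the same time.
    completion = await asyncio.to_thread(
        openai.chat.completions.create,
        model="any",  # the served model is chosen on the server side
        messages=[{"role": "user", "content": prompt}],
        max_tokens=max_tokens,
    )
    return completion.choices[0].message.content or ""

async def run_batch(batch: int = 16, max_tokens: int = 1024) -> None:
    prompts = ["Explain how to best learn Rust."] * batch
    outputs = await asyncio.gather(*(send_one(p, max_tokens) for p in prompts))
    for i, text in enumerate(outputs):
        print(f"Response {i}:\n{text}\n")

if __name__ == "__main__":
    asyncio.run(run_batch())
```
The actual `benchmark.py` differs in that it streams each response chunk by chunk and cycles through a list of varied prompts, as shown in the diff below.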

48 changes: 25 additions & 23 deletions examples/benchmark.py
@@ -3,15 +3,26 @@
from openai import Stream
from openai.types.chat import ChatCompletionChunk
from typing import List
# Run: cargo run --release -- --port 2000 --model-id <MODEL_ID> <MODEL_TYPE> --repeat-last-n 64
import argparse
# Run candle-vllm service: cargo run --release -- --port 2000 --model-id <MODEL_ID> <MODEL_TYPE> --repeat-last-n 64
# MODEL_ID is the huggingface model id or local weight path
# MODEL_TYPE is one of ["llama", "llama3", "mistral", "phi2", "phi3", "qwen2", "gemma", "yi", "stable-lm"]

# Then run this file: python3 examples/benchmark.py --batch 16

openai.api_key = "EMPTY"

openai.base_url = "http://localhost:2000/v1/"

# You may add your custom prompts here
PROMPT_CANDIDATES = ["Explain how to best learn Rust.",
"Please talk about deep learning.",
"Do you know the capital city of China? Talk the details of you known.",
"Who is the best female actor in the world? Explain why.",
"Let me know how to deal with depression?",
"How to make money in short time?",
"What is the future trend of large language model?",
"The famous tech companies in the world."]

async def chat_completion(model, max_tokens, prompt):
    completion = openai.chat.completions.create(
        model=model,
@@ -34,26 +34,12 @@ async def stream_response(response_idx, stream: Stream[ChatCompletionChunk]):
            result += r
    return (response_idx, result)

async def benchmark():
model = "mistral7b"
max_tokens = 1024
# 16 requests
prompts = ["Explain how to best learn Rust.",
"Please talk about deep learning.",
"Do you know the capital city of China? Talk the details of you known.",
"Who is the best female actor in the world? Explain why.",
"Let me know how to deal with depression?",
"How to make money in short time?",
"What is the future trend of large language model?",
"The famous tech companies in the world.",
"Explain how to best learn Rust.",
"Please talk about deep learning.",
"Do you know the capital city of China? Talk the details of you known.",
"Who is the best female actor in the world? Explain why.",
"Let me know how to deal with depression?",
"How to make money in short time?",
"What is the future trend of large language model?",
"The famous tech companies in the world."]
async def benchmark(batch, max_tokens=1024):
model = "any" # model used dependent on the server side
# candidate requests
prompts = []
for i in range(batch):
prompts.append(PROMPT_CANDIDATES[i % len(PROMPT_CANDIDATES)])

    # avoid generating very short answers
    for i in range(len(prompts)):
@@ -86,4 +83,9 @@ async def benchmark():
print("\n\n Response {}: \n\n {}".format(idx, output))


asyncio.run(benchmark())
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Using 'batch' and 'max_tokens' parameters for candle-vllm benchmark.")
    parser.add_argument('--batch', default=16, type=int)
    parser.add_argument('--max_tokens', default=1024, type=int)
    args = parser.parse_args()
    asyncio.run(benchmark(args.batch, args.max_tokens))
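
With these arguments in place, a run such as `python3 examples/benchmark.py --batch 32 --max_tokens 512` cycles through the eight candidate prompts to build 32 requests, each capped at 512 output tokens. The coroutine can also be driven from another script; the snippet below assumes `benchmark.py` is importable from the working directory (for example, when run from `examples/`), which is an assumption rather than something the repository sets up.

``` python
# Hypothetical programmatic use of the benchmark coroutine; assumes the
# current working directory is examples/ so that benchmark.py is importable.
import asyncio
from benchmark import benchmark

asyncio.run(benchmark(batch=8, max_tokens=256))
```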
2 changes: 0 additions & 2 deletions src/openai/streaming.rs
@@ -50,11 +50,9 @@ impl Stream for Streamer {
                    Poll::Ready(Some(Ok(Event::default().data("[DONE]"))))
                }
            },

            Err(e) => {
                if self.status == StreamingStatus::Started && e == flume::TryRecvError::Disconnected
                {
                    //no TryRecvError::Disconnected returned even if the client closed the stream or disconnected
                    self.status = StreamingStatus::Interrupted;
                    Poll::Ready(None)
                } else {
