Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(weave): Add benchmarking script for basic openai flow #3300

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
237 changes: 237 additions & 0 deletions scripts/benchmark_openai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,237 @@
#!/usr/bin/env python3

import argparse
import os
import statistics
import subprocess
import sys
import textwrap

from rich.console import Console
from rich.progress import Progress, SpinnerColumn, TimeElapsedColumn
from rich.table import Table


def test_with_weave(vcr_mode=False):
    """Time the weave-instrumented OpenAI flow in a fresh subprocess.

    Three phases are timed inside the child process: importing
    ``openai`` + ``weave``, ``weave.init`` + client construction, and a
    single chat-completion call.

    Args:
        vcr_mode: When True, run the whole flow inside a VCR.py cassette so
            the OpenAI HTTP traffic is recorded/replayed rather than hitting
            the live API.

    Returns:
        Tuple of ``(import_time, init_time, call_time)`` in seconds.
    """
    # The timed body below is indented one level so it can live inside a
    # `with` block. In vcr mode the prelude ends with the cassette's `with`
    # header (so the cassette is properly entered AND exited, which is what
    # saves the recording); otherwise a no-op `if True:` keeps the
    # indentation valid. Formatting a flush-left body under `with ...:`
    # would make the subprocess fail with an IndentationError.
    if vcr_mode:
        prelude = (
            "import vcr\n"
            "my_vcr = vcr.VCR(record_mode='once')\n"
            "with my_vcr.use_cassette('fixtures/openai_with_weave.yaml'):"
        )
    else:
        prelude = "if True:"

    cmd = textwrap.dedent(
        """
        import time
        {}
            start_import = time.perf_counter()
            import openai
            import weave
            end_import = time.perf_counter()
            print(end_import - start_import)

            start_init = time.perf_counter()
            weave.init('openai-benchmark')
            oaiclient = openai.OpenAI()
            end_init = time.perf_counter()
            print(end_init - start_init)

            start_call = time.perf_counter()
            response = oaiclient.chat.completions.create(
                model="gpt-4o",
                messages=[{{"role": "user", "content": "Tell me a short joke"}}],
                max_tokens=60
            )
            end_call = time.perf_counter()
            print(end_call - start_call)
        """
    ).format(prelude)

    res = subprocess.run(
        [sys.executable, "-c", cmd],
        capture_output=True,
        text=True,
        env=os.environ,
    )
    print(res.stderr)
    # NOTE(review): the unpacking assumes weave emits extra stdout lines
    # (e.g. an init banner and a per-call trace URL), so it takes the first
    # line as import time, the third-from-last as init time, and the last
    # as call time, discarding everything in between — confirm against
    # weave's actual stdout output.
    import_time, *_, init_time, _, call_time = res.stdout.strip().splitlines()
    return float(import_time), float(init_time), float(call_time)


def test_without_weave(vcr_mode=False):
    """Time the plain (non-weave) OpenAI flow in a fresh subprocess.

    Three phases are timed inside the child process: importing ``openai``,
    constructing the client, and a single chat-completion call.

    Args:
        vcr_mode: When True, run the whole flow inside a VCR.py cassette so
            the OpenAI HTTP traffic is recorded/replayed rather than hitting
            the live API.

    Returns:
        Tuple of ``(import_time, init_time, call_time)`` in seconds.
    """
    # The timed body below is indented one level so it can live inside a
    # `with` block. In vcr mode the prelude ends with the cassette's `with`
    # header (so the cassette is properly entered AND exited, which is what
    # saves the recording); otherwise a no-op `if True:` keeps the
    # indentation valid. Formatting a flush-left body under `with ...:`
    # would make the subprocess fail with an IndentationError.
    if vcr_mode:
        prelude = (
            "import vcr\n"
            "my_vcr = vcr.VCR(record_mode='once')\n"
            "with my_vcr.use_cassette('fixtures/openai_without_weave.yaml'):"
        )
    else:
        prelude = "if True:"

    cmd = textwrap.dedent(
        """
        import time
        {}
            start_import = time.perf_counter()
            import openai
            end_import = time.perf_counter()
            print(end_import - start_import)

            start_init = time.perf_counter()
            oaiclient = openai.OpenAI()
            end_init = time.perf_counter()
            print(end_init - start_init)

            start_call = time.perf_counter()
            response = oaiclient.chat.completions.create(
                model="gpt-4o",
                messages=[{{"role": "user", "content": "Tell me a short joke"}}],
                max_tokens=60
            )
            end_call = time.perf_counter()
            print(end_call - start_call)
        """
    ).format(prelude)

    res = subprocess.run(
        [sys.executable, "-c", cmd],
        capture_output=True,
        text=True,
        env=os.environ,
    )
    print(res.stderr)
    # Without weave, stdout is exactly the three timing lines.
    import_time, init_time, call_time = res.stdout.strip().split("\n")
    return float(import_time), float(init_time), float(call_time)


def benchmark(iterations=5, vcr_mode=False):
    """Run the with/without-weave benchmarks and print two summary tables.

    For each iteration, runs the plain-OpenAI subprocess and the weave
    subprocess back to back, then prints (1) aggregate statistics per
    component and (2) the individual per-run times.

    Args:
        iterations: Number of benchmark runs per configuration.
        vcr_mode: Forwarded to the test helpers to enable VCR record/replay.
    """
    console = Console()
    import_times_without_weave = []
    init_times_without_weave = []
    call_times_without_weave = []
    import_times_with_weave = []
    init_times_with_weave = []
    call_times_with_weave = []

    with Progress(
        SpinnerColumn(),
        *Progress.get_default_columns(),
        TimeElapsedColumn(),
        console=console,
    ) as progress:
        task = progress.add_task("[cyan]Running OpenAI API tests...", total=iterations)

        for _ in range(iterations):
            import_time, init_time, call_time = test_without_weave(vcr_mode)
            import_times_without_weave.append(import_time)
            init_times_without_weave.append(init_time)
            call_times_without_weave.append(call_time)

            import_time, init_time, call_time = test_with_weave(vcr_mode)
            import_times_with_weave.append(import_time)
            init_times_with_weave.append(init_time)
            call_times_with_weave.append(call_time)

            progress.advance(task)

    table = Table(title="OpenAI API Benchmark Results", padding=(0, 1))
    table.add_column("Component", style="yellow")
    table.add_column("Metric", style="cyan")
    table.add_column("Without Weave", style="green", justify="right")
    table.add_column("With Weave", style="blue", justify="right")
    table.add_column("Difference", style="red", justify="right")
    table.add_column("% Increase", style="red", justify="right")

    def _stdev(values):
        # statistics.stdev raises StatisticsError for fewer than two data
        # points; report 0.0 instead so `--iterations 1` doesn't crash.
        return statistics.stdev(values) if len(values) > 1 else 0.0

    metrics = [
        ("Mean time", statistics.mean),
        ("Median time", statistics.median),
        ("Std dev", _stdev),
        ("Min time", min),
        ("Max time", max),
    ]

    # Each entry: (label, per-run times without weave, per-run times with
    # weave). "Total" is the element-wise sum of the three phases.
    components = [
        ("Import", import_times_without_weave, import_times_with_weave),
        ("Init", init_times_without_weave, init_times_with_weave),
        ("API Call", call_times_without_weave, call_times_with_weave),
        (
            "Total",
            [
                sum(x)
                for x in zip(
                    import_times_without_weave,
                    init_times_without_weave,
                    call_times_without_weave,
                )
            ],
            [
                sum(x)
                for x in zip(
                    import_times_with_weave,
                    init_times_with_weave,
                    call_times_with_weave,
                )
            ],
        ),
    ]

    for comp_name, without_times, with_times in components:
        for name, func in metrics:
            without_val = func(without_times)
            with_val = func(with_times)
            diff = with_val - without_val
            # Guard against division by zero on a zero baseline.
            pct_increase = (
                (diff / without_val) * 100 if without_val != 0 else float("inf")
            )

            table.add_row(
                comp_name,
                name,
                f"{without_val:8.4f}s",
                f"{with_val:8.4f}s",
                f"{diff:+8.4f}s",
                f"{pct_increase:+7.1f}%",
            )

    console.print("\n")
    console.print(table)

    times_table = Table(title="Individual API Call Times", padding=(0, 1))
    times_table.add_column("Run #", style="cyan")
    times_table.add_column("Component", style="yellow")
    times_table.add_column("Without Weave", style="green", justify="right")
    times_table.add_column("With Weave", style="blue", justify="right")
    times_table.add_column("Difference", style="red", justify="right")
    times_table.add_column("% Increase", style="red", justify="right")

    for i in range(iterations):
        for comp_name, without_times, with_times in components:
            without_val = without_times[i]
            with_val = with_times[i]
            diff = with_val - without_val
            pct_increase = (
                (diff / without_val) * 100 if without_val != 0 else float("inf")
            )

            times_table.add_row(
                str(i + 1),
                comp_name,
                f"{without_val:8.4f}s",
                f"{with_val:8.4f}s",
                f"{diff:+8.4f}s",
                f"{pct_increase:+7.1f}%",
            )

    console.print("\n")
    console.print(times_table)


def _main():
    """CLI entry point: parse flags and run the benchmark."""
    parser = argparse.ArgumentParser(
        description="Benchmark OpenAI API with and without Weave"
    )
    parser.add_argument(
        "--vcr", action="store_true", help="Enable VCR.py recording/playback"
    )
    parser.add_argument(
        "--iterations", type=int, default=5, help="Number of iterations to run"
    )
    args = parser.parse_args()

    # VCR cassettes are written under fixtures/; make sure it exists first.
    if args.vcr:
        os.makedirs("fixtures", exist_ok=True)

    benchmark(iterations=args.iterations, vcr_mode=args.vcr)


if __name__ == "__main__":
    _main()
Loading