Skip to content

Commit 23380e1

Browse files
committed
Update to latest version of code
1 parent e411e71 commit 23380e1

29 files changed

+1355
-351
lines changed

simulator/config/config.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
import argparse
22
import datetime
3-
#import hashlib
3+
import hashlib
44
import os
55

66
import yaml
@@ -27,7 +27,7 @@ def _parse_args(self):
2727

2828
def _add_derived_args(self):
2929
print(self._args)
30-
self._args.output_dir = f"{self._args.output_dir}/{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
30+
self._args.output_dir = f"{self._args.output_dir}/{datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S-%f')}"
3131
os.makedirs(self._args.output_dir, exist_ok=True)
3232

3333
def _update_namespace(self, config_dict, parent_key=""):

simulator/config/default.yml

+52-28
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
seed: 42
22
log_level: info
33
output_dir: ./simulator_output/
4-
cache_dir: ./cache
4+
cache_dir: ./tmpfs_cache
55
write_json_trace: false
6-
write_chrome_trace: true
6+
write_chrome_trace: false
77
write_metrics: true
88

99
cluster:
@@ -15,7 +15,7 @@ replica:
1515
memory_margin_fraction: 0.1
1616
# parallelism
1717
num_pipeline_stages: 1
18-
num_tensor_parallel_workers: 1
18+
num_tensor_parallel_workers: 8
1919
# Model Specs
2020
# GPT-3
2121
# num_layers: 96
@@ -34,30 +34,39 @@ replica:
3434
# use_gated_mlp: false
3535
# vocab_size: 65024
3636
# LLama2 7b
37+
model_name: meta-llama/Llama-2-7b-hf
3738
num_layers: 32
3839
num_q_heads: 32
3940
num_kv_heads: 32
4041
embedding_dim: 4096
4142
mlp_hidden_dim: 11008
4243
use_gated_mlp: true
4344
vocab_size: 32768
44-
# A100
45-
# fp16_tflops: 312
46-
# total_memory_gb: 80
47-
# A40
48-
fp16_tflops: 150
49-
total_memory_gb: 45
45+
# LLama2 70b
46+
# num_layers: 80
47+
# num_q_heads: 64
48+
# num_kv_heads: 8
49+
# embedding_dim: 8192
50+
# mlp_hidden_dim: 28672
51+
# use_gated_mlp: true
52+
# vocab_size: 32768
53+
# a100
54+
fp16_tflops: 312
55+
total_memory_gb: 80
56+
# a40
57+
# fp16_tflops: 150
58+
# total_memory_gb: 45
5059

5160
request_generator:
5261
provider: synthetic
5362
max_tokens: 4096
5463

5564
synthetic_request_generator:
56-
length_provider: zipf
57-
interval_provider: static
58-
min_tokens: 2048
65+
length_provider: trace
66+
interval_provider: poisson
67+
min_tokens: 1024
5968
prefill_to_decode_ratio: 10
60-
num_requests: 100
69+
num_requests: 10000
6170
# duration: 100
6271

6372
trace_request_generator:
@@ -70,7 +79,7 @@ trace_request_generator:
7079
# Config for synthetic trace generator
7180
trace_request_length_generator:
7281
trace_file: ./data/processed_traces/lmsys_chat_1m_conversation_stats_llama2_tokenizer.csv
73-
prefill_scale_factor: 0.3
82+
prefill_scale_factor: 1
7483
decode_scale_factor: 1
7584

7685
trace_request_interval_generator:
@@ -80,7 +89,7 @@ trace_request_interval_generator:
8089
time_scale_factor: 0.3
8190

8291
poisson_request_interval_generator:
83-
qps: 0.2
92+
qps: 16.0
8493

8594
gamma_request_interval_generator:
8695
cv: 0.5
@@ -95,24 +104,38 @@ execution_time_predictor:
95104
# provider: linear_regression
96105

97106
sklearn_execution_time_predictor:
98-
compute_input_file: ./data/profiling/a40/mlp.csv
99-
attention_input_file: ./data/profiling/a40/mixed_attention.csv
100-
all_reduce_input_file: ./data/profiling/a40/all_reduce.csv
101-
send_recv_input_file: ./data/profiling/a40/p2p_inter_node.csv
102-
k_fold_cv_splits: 5
107+
# compute_input_file: ./data/profiling/a100/mlp.csv
108+
compute_input_file: ./data/profiling/a100/mlp.csv
109+
# compute_input_file: ./llama7b_mlp_results_4.csv
110+
# compute_input_file: ./llama70b_mlp_results.csv
111+
# attention_input_file: ./data/profiling/a100/mixed_attention.csv
112+
attention_input_file: ./data/profiling/a100/mixed_attention.csv
113+
# attention_input_file: ./llama7b_attention_pav2_results.csv
114+
# attention_input_file: ./llama70b_attention_pav2_results.csv
115+
# all_reduce_input_file: ./data/profiling/a100/all_reduce.csv
116+
all_reduce_input_file: ./data/profiling/a100/all_reduce.csv
117+
# send_recv_input_file: ./data/profiling/a100/p2p_intra_node.csv
118+
send_recv_input_file: ./data/profiling/a100/p2p_intra_node.csv
119+
# cpu_overhead_input_file: ./data/profiling/a100/cpu_overheads.csv
120+
cpu_overhead_input_file: ./data/profiling/a100/cpu_overheads.csv
121+
k_fold_cv_splits: 10
103122
no_cache: false
104123
kv_cache_prediction_granularity: 8
105124
prediction_max_prefill_chunk_size: 4096
106125
prediction_max_batch_size: 100
107126
prediction_max_tokens_per_request: 4096
127+
attention_decode_overhead_percentage: 0.0
128+
nccl_cpu_launch_overhead_ms: 0.020
129+
nccl_cpu_skew_overhead_per_device_ms: 0
108130

109131
random_forrest_execution_time_predictor:
110132
num_estimators:
133+
# - 250
111134
- 500
112135
- 750
113136
max_depth:
114-
- 8
115-
- 16
137+
# - 8
138+
# - 16
116139
- 32
117140
min_samples_split:
118141
- 2
@@ -140,35 +163,36 @@ simulator:
140163
time_limit: null
141164

142165
global_scheduler:
143-
provider: lor
166+
provider: round_robin
144167

145168
replica_scheduler:
146169
provider: vllm
147-
batch_size_cap: 5
170+
batch_size_cap: 128
171+
num_blocks: 0
148172

149173
orca_scheduler:
150174
use_single_prefill_per_batch: false
151175

152176
sarathi_scheduler:
153-
chunk_size: 1024
177+
chunk_size: 512
154178
enable_rolling_prefills: true
155179
prefill_fitting_tolerance: 0.2
156180

157181
vllm_scheduler:
158182
watermark_blocks_fraction: 0.01
159183
max_tokens_in_batch: 4096
160-
max_batch_size_amplification_factor: 2
184+
max_batch_size_amplification_factor: 1
161185

162186
dsarathi_scheduler:
163187
chunk_size: 1024
164188
enable_rolling_prefills: true
165189
prefill_fitting_tolerance: 0.2
166190
watermark_blocks_fraction: 0.01
167-
max_batch_size_amplification_factor: 2
191+
max_batch_size_amplification_factor: 1
168192

169193
metrics_store:
170194
wandb_project: "llm-simulator"
171-
wandb_group: "vllm-benchmark-test"
195+
wandb_group: ""
172196
wandb_run_name: ""
173197
subsamples: 500
174198
save_table_to_wandb: false

simulator/entities/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -3,5 +3,6 @@
33
from simulator.entities.cluster import Cluster
44
from simulator.entities.replica import Replica
55
from simulator.entities.request import Request
6+
from simulator.entities.execution_time import ExecutionTime
67

7-
__all__ = [Request, Replica, Batch, Cluster, BatchStage]
8+
__all__ = [Request, Replica, Batch, Cluster, BatchStage, ExecutionTime]

simulator/entities/batch.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ def request_ids(self) -> List[int]:
9999
return [request.id for request in self._requests]
100100

101101
@property
102-
def allcompleted(self) -> bool:
102+
def completed(self) -> bool:
103103
return all([request.completed for request in self._requests])
104104

105105
def on_schedule(
@@ -112,8 +112,8 @@ def on_schedule(
112112
for request in self._requests:
113113
request.on_batch_schedule(time)
114114

115-
if self._id % 1000 == 0:
116-
logger.info(f"Finished scheduling {self._id} batches.")
115+
# if self._id % 1000 == 0:
116+
# logger.info(f"Finished scheduling {self._id} batches.")
117117

118118
def on_batch_end(self, time: float):
119119
self._completed = True

simulator/entities/batch_stage.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ def __init__(
2323
batch_id: int,
2424
replica_id: int,
2525
pipeline_stage: int,
26-
execution_time: int,
26+
execution_time: float,
27+
model_execution_time: float,
2728
requests: List[Request],
2829
num_tokens: List[Request],
2930
) -> None:
@@ -35,6 +36,7 @@ def __init__(
3536
self._replica_id = replica_id
3637
self._pipeline_stage = pipeline_stage
3738
self._execution_time = execution_time
39+
self._model_execution_time = model_execution_time
3840

3941
self._scheduled_at = None
4042
self._completed_at = None
@@ -58,6 +60,10 @@ def completed_at(self) -> float:
5860
def execution_time(self) -> int:
5961
return self._execution_time
6062

63+
@property
64+
def model_execution_time(self) -> int:
65+
return self._model_execution_time
66+
6167
@property
6268
def pipeline_stage(self) -> int:
6369
return self._pipeline_stage
@@ -95,13 +101,14 @@ def on_stage_end(
95101
self._completed_at = time
96102

97103
for request in self._requests:
98-
request.on_batch_stage_end(time, self._execution_time)
104+
request.on_batch_stage_end(time, self._execution_time, self._model_execution_time)
99105

100106
def to_dict(self) -> dict:
101107
return {
102108
"id": self._id,
103109
"size": self.size,
104110
"execution_time": self._execution_time,
111+
"model_execution_time": self._model_execution_time,
105112
"scheduled_at": self._scheduled_at,
106113
"completed_at": self._completed_at,
107114
"replica_id": self._replica_id,
@@ -116,8 +123,8 @@ def to_chrome_trace(self, time: int) -> dict:
116123
return {
117124
"name": f"{self.request_ids}",
118125
"ph": "X",
119-
"ts": (time - self.execution_time) * 1e6,
120-
"dur": self.execution_time * 1e6,
126+
"ts": (time - self._execution_time) * 1e6,
127+
"dur": self._execution_time * 1e6,
121128
"pid": self._replica_id,
122129
"tid": self._pipeline_stage,
123130
"args": {

simulator/entities/cluster.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -38,5 +38,4 @@ def _write_cluster_info_to_file(self) -> None:
3838
cluster_info = {"replicas": replica_dicts}
3939

4040
cluster_file = f"{self._config.output_dir}/cluster.json"
41-
with open(cluster_file, "w") as fd:
42-
json.dump(cluster_info, fd)
41+
json.dump(cluster_info, open(cluster_file, "w"))

0 commit comments

Comments
 (0)