@@ -1,9 +1,9 @@
 seed: 42
 log_level: info
 output_dir: ./simulator_output/
-cache_dir: ./cache
+cache_dir: ./tmpfs_cache
 write_json_trace: false
-write_chrome_trace: true
+write_chrome_trace: false
 write_metrics: true

 cluster:
@@ -15,7 +15,7 @@ replica:
   memory_margin_fraction: 0.1
   # parallelism
   num_pipeline_stages: 1
-  num_tensor_parallel_workers: 1
+  num_tensor_parallel_workers: 8
   # Model Specs
   # GPT-3
   # num_layers: 96
@@ -34,30 +34,39 @@ replica:
   # use_gated_mlp: false
   # vocab_size: 65024
   # LLama2 7b
+  model_name: meta-llama/Llama-2-7b-hf
   num_layers: 32
   num_q_heads: 32
   num_kv_heads: 32
   embedding_dim: 4096
   mlp_hidden_dim: 11008
   use_gated_mlp: true
   vocab_size: 32768
-  # A100
-  # fp16_tflops: 312
-  # total_memory_gb: 80
-  # A40
-  fp16_tflops: 150
-  total_memory_gb: 45
+  # LLama2 70b
+  # num_layers: 80
+  # num_q_heads: 64
+  # num_kv_heads: 8
+  # embedding_dim: 8192
+  # mlp_hidden_dim: 28672
+  # use_gated_mlp: true
+  # vocab_size: 32768
+  # a100
+  fp16_tflops: 312
+  total_memory_gb: 80
+  # a40
+  # fp16_tflops: 150
+  # total_memory_gb: 45

 request_generator:
   provider: synthetic
   max_tokens: 4096

 synthetic_request_generator:
-  length_provider: zipf
-  interval_provider: static
-  min_tokens: 2048
+  length_provider: trace
+  interval_provider: poisson
+  min_tokens: 1024
   prefill_to_decode_ratio: 10
-  num_requests: 100
+  num_requests: 10000
   # duration: 100

 trace_request_generator:
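
The replica section above now pins an explicit model (Llama-2-7B) on A100s with 8-way tensor parallelism. As a rough sanity check on why this fits comfortably within total_memory_gb: 80 given memory_margin_fraction: 0.1, here is a back-of-envelope Python sketch of the fp16 weight footprint per GPU. It is not part of the commit, uses standard transformer parameter counting from the dimensions in the config, and ignores layer norms and rotary embeddings.

num_layers, d, d_mlp, vocab = 32, 4096, 11008, 32768
tp = 8  # num_tensor_parallel_workers

# Per layer: QKV + output projections (4 * d * d, since num_q_heads == num_kv_heads)
# plus a gated MLP (gate, up, down projections: 3 * d * d_mlp).
attn_params = 4 * d * d
mlp_params = 3 * d * d_mlp
per_layer = attn_params + mlp_params

# Embedding table plus LM head (assumed untied for simplicity).
embed_params = 2 * vocab * d

total = num_layers * per_layer + embed_params
per_gpu_gb = total * 2 / tp / 1e9  # 2 bytes per fp16 weight

print(f"total params ~= {total / 1e9:.2f}B")           # ~6.74B
print(f"fp16 weights per GPU ~= {per_gpu_gb:.2f} GB")  # ~1.7 GB of the 80 GB

The remaining per-GPU memory (minus the 10% margin) is what the simulator can budget for KV-cache blocks.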
@@ -70,7 +79,7 @@ trace_request_generator:
 # Config for synthetic trace generator
 trace_request_length_generator:
   trace_file: ./data/processed_traces/lmsys_chat_1m_conversation_stats_llama2_tokenizer.csv
-  prefill_scale_factor: 0.3
+  prefill_scale_factor: 1
   decode_scale_factor: 1

 trace_request_interval_generator:
@@ -80,7 +89,7 @@ trace_request_interval_generator:
   time_scale_factor: 0.3

 poisson_request_interval_generator:
-  qps: 0.2
+  qps: 16.0

 gamma_request_interval_generator:
   cv: 0.5
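
Switching interval_provider from static to poisson at qps: 16.0 means inter-arrival gaps are drawn i.i.d. from an exponential distribution with mean 1/16 s = 62.5 ms. A minimal sketch of what such a generator does, assuming only numpy; the simulator's own implementation may differ in details:

import numpy as np

qps = 16.0            # from poisson_request_interval_generator
num_requests = 10000  # from synthetic_request_generator
rng = np.random.default_rng(seed=42)

# Poisson arrival process: exponential inter-arrival gaps with mean 1/qps seconds.
gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
arrival_times = np.cumsum(gaps)

print(f"mean gap: {gaps.mean() * 1e3:.1f} ms (expected 62.5 ms)")
print(f"last arrival: {arrival_times[-1]:.0f} s (~{num_requests / qps:.0f} s expected)")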
@@ -95,24 +104,38 @@ execution_time_predictor:
   # provider: linear_regression

 sklearn_execution_time_predictor:
-  compute_input_file: ./data/profiling/a40/mlp.csv
-  attention_input_file: ./data/profiling/a40/mixed_attention.csv
-  all_reduce_input_file: ./data/profiling/a40/all_reduce.csv
-  send_recv_input_file: ./data/profiling/a40/p2p_inter_node.csv
-  k_fold_cv_splits: 5
+  # compute_input_file: ./data/profiling/a100/mlp.csv
+  compute_input_file: ./data/profiling/a100/mlp.csv
+  # compute_input_file: ./llama7b_mlp_results_4.csv
+  # compute_input_file: ./llama70b_mlp_results.csv
+  # attention_input_file: ./data/profiling/a100/mixed_attention.csv
+  attention_input_file: ./data/profiling/a100/mixed_attention.csv
+  # attention_input_file: ./llama7b_attention_pav2_results.csv
+  # attention_input_file: ./llama70b_attention_pav2_results.csv
+  # all_reduce_input_file: ./data/profiling/a100/all_reduce.csv
+  all_reduce_input_file: ./data/profiling/a100/all_reduce.csv
+  # send_recv_input_file: ./data/profiling/a100/p2p_intra_node.csv
+  send_recv_input_file: ./data/profiling/a100/p2p_intra_node.csv
+  # cpu_overhead_input_file: ./data/profiling/a100/cpu_overheads.csv
+  cpu_overhead_input_file: ./data/profiling/a100/cpu_overheads.csv
+  k_fold_cv_splits: 10
   no_cache: false
   kv_cache_prediction_granularity: 8
   prediction_max_prefill_chunk_size: 4096
   prediction_max_batch_size: 100
   prediction_max_tokens_per_request: 4096
+  attention_decode_overhead_percentage: 0.0
+  nccl_cpu_launch_overhead_ms: 0.020
+  nccl_cpu_skew_overhead_per_device_ms: 0

 random_forrest_execution_time_predictor:
   num_estimators:
+    # - 250
     - 500
     - 750
   max_depth:
-    - 8
-    - 16
+    # - 8
+    # - 16
     - 32
   min_samples_split:
     - 2
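
The execution-time predictor now fits on A100 profiling CSVs with 10-fold CV, and the random-forest search space is trimmed to n_estimators in {500, 750} with max_depth 32 and min_samples_split 2. A hedged sketch of the equivalent scikit-learn grid search follows; the CSV schema ("time_ms" as the target column) is a placeholder assumption, not the simulator's actual format:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hypothetical schema: feature columns plus one measured-latency column.
df = pd.read_csv("./data/profiling/a100/mlp.csv")  # path from the config
X = df.drop(columns=["time_ms"])  # "time_ms" is a placeholder target name
y = df["time_ms"]

# Grid mirrors random_forrest_execution_time_predictor above.
param_grid = {
    "n_estimators": [500, 750],
    "max_depth": [32],
    "min_samples_split": [2],
}
search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=10,  # k_fold_cv_splits: 10
    scoring="neg_mean_absolute_error",
)
search.fit(X, y)
print(search.best_params_, -search.best_score_)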
@@ -140,35 +163,36 @@ simulator:
   time_limit: null

 global_scheduler:
-  provider: lor
+  provider: round_robin

 replica_scheduler:
   provider: vllm
-  batch_size_cap: 5
+  batch_size_cap: 128
+  num_blocks: 0

 orca_scheduler:
   use_single_prefill_per_batch: false

 sarathi_scheduler:
-  chunk_size: 1024
+  chunk_size: 512
   enable_rolling_prefills: true
   prefill_fitting_tolerance: 0.2

 vllm_scheduler:
   watermark_blocks_fraction: 0.01
   max_tokens_in_batch: 4096
-  max_batch_size_amplification_factor: 2
+  max_batch_size_amplification_factor: 1

 dsarathi_scheduler:
   chunk_size: 1024
   enable_rolling_prefills: true
   prefill_fitting_tolerance: 0.2
   watermark_blocks_fraction: 0.01
-  max_batch_size_amplification_factor: 2
+  max_batch_size_amplification_factor: 1

 metrics_store:
   wandb_project: "llm-simulator"
-  wandb_group: "vllm-benchmark-test"
+  wandb_group: ""
   wandb_run_name: ""
   subsamples: 500
   save_table_to_wandb: false
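
For the Sarathi scheduler, halving chunk_size to 512 doubles the number of prefill iterations per prompt: a 4096-token prefill is processed as 8 chunks of 512 rather than 4 of 1024, trading prefill throughput for lower decode interference. A toy illustration of the chunking (my own simplification; the real scheduler also applies enable_rolling_prefills and prefill_fitting_tolerance when packing a partial chunk alongside decodes):

def prefill_chunks(prompt_len: int, chunk_size: int) -> list[int]:
    """Split a prompt's prefill into fixed-size chunks; the last may be short."""
    full, rem = divmod(prompt_len, chunk_size)
    return [chunk_size] * full + ([rem] if rem else [])

print(prefill_chunks(4096, 1024))  # [1024, 1024, 1024, 1024]  (old chunk_size)
print(prefill_chunks(4096, 512))   # [512] * 8                 (new chunk_size)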