@@ -1,9 +1,9 @@
 seed: 42
 log_level: info
 output_dir: ./simulator_output/
-cache_dir: ./cache
+cache_dir: ./tmpfs_cache
 write_json_trace: false
-write_chrome_trace: true
+write_chrome_trace: false
 write_metrics: true

 cluster:
@@ -15,7 +15,7 @@ replica:
   memory_margin_fraction: 0.1
   # parallelism
   num_pipeline_stages: 1
-  num_tensor_parallel_workers: 1
+  num_tensor_parallel_workers: 8
   # Model Specs
   # GPT-3
   # num_layers: 96
@@ -34,30 +34,39 @@ replica:
   # use_gated_mlp: false
   # vocab_size: 65024
   # LLama2 7b
+  model_name: meta-llama/Llama-2-7b-hf
   num_layers: 32
   num_q_heads: 32
   num_kv_heads: 32
   embedding_dim: 4096
   mlp_hidden_dim: 11008
   use_gated_mlp: true
   vocab_size: 32768
-  # A100
-  # fp16_tflops: 312
-  # total_memory_gb: 80
-  # A40
-  fp16_tflops: 150
-  total_memory_gb: 45
+  # LLama2 70b
+  # num_layers: 80
+  # num_q_heads: 64
+  # num_kv_heads: 8
+  # embedding_dim: 8192
+  # mlp_hidden_dim: 28672
+  # use_gated_mlp: true
+  # vocab_size: 32768
+  # a100
+  fp16_tflops: 312
+  total_memory_gb: 80
+  # a40
+  # fp16_tflops: 150
+  # total_memory_gb: 45

 request_generator:
   provider: synthetic
   max_tokens: 4096

 synthetic_request_generator:
-  length_provider: zipf
-  interval_provider: static
-  min_tokens: 2048
+  length_provider: trace
+  interval_provider: poisson
+  min_tokens: 1024
   prefill_to_decode_ratio: 10
-  num_requests: 100
+  num_requests: 10000
   # duration: 100

 trace_request_generator:
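
The replica section above now pins an explicit model (Llama-2-7B) on A100s with 8-way tensor parallelism. As a rough sanity check on why this fits comfortably within total_memory_gb: 80 given memory_margin_fraction: 0.1, here is a back-of-envelope Python sketch of the fp16 weight footprint per GPU. It is not part of the commit, uses standard transformer parameter counting from the dimensions in the config, and ignores layer norms and rotary embeddings.

num_layers, d, d_mlp, vocab = 32, 4096, 11008, 32768
tp = 8  # num_tensor_parallel_workers

# Per layer: QKV + output projections (4 * d * d, since num_q_heads == num_kv_heads)
# plus a gated MLP (gate, up, down projections: 3 * d * d_mlp).
attn_params = 4 * d * d
mlp_params = 3 * d * d_mlp
per_layer = attn_params + mlp_params

# Embedding table plus LM head (assumed untied for simplicity).
embed_params = 2 * vocab * d

total = num_layers * per_layer + embed_params
per_gpu_gb = total * 2 / tp / 1e9  # 2 bytes per fp16 weight

print(f"total params ~= {total / 1e9:.2f}B")           # ~6.74B
print(f"fp16 weights per GPU ~= {per_gpu_gb:.2f} GB")  # ~1.7 GB of the 80 GB

The remaining per-GPU memory (minus the 10% margin) is what the simulator can budget for KV-cache blocks.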
@@ -70,7 +79,7 @@ trace_request_generator:
 # Config for synthetic trace generator
 trace_request_length_generator:
   trace_file: ./data/processed_traces/lmsys_chat_1m_conversation_stats_llama2_tokenizer.csv
-  prefill_scale_factor: 0.3
+  prefill_scale_factor: 1
   decode_scale_factor: 1

 trace_request_interval_generator:
@@ -80,7 +89,7 @@ trace_request_interval_generator:
   time_scale_factor: 0.3

 poisson_request_interval_generator:
-  qps: 0.2
+  qps: 16.0

 gamma_request_interval_generator:
   cv: 0.5
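
Switching interval_provider from static to poisson at qps: 16.0 means inter-arrival gaps are drawn i.i.d. from an exponential distribution with mean 1/16 s = 62.5 ms. A minimal sketch of what such a generator does, assuming only numpy; the simulator's own implementation may differ in details:

import numpy as np

qps = 16.0            # from poisson_request_interval_generator
num_requests = 10000  # from synthetic_request_generator
rng = np.random.default_rng(seed=42)

# Poisson arrival process: exponential inter-arrival gaps with mean 1/qps seconds.
gaps = rng.exponential(scale=1.0 / qps, size=num_requests)
arrival_times = np.cumsum(gaps)

print(f"mean gap: {gaps.mean() * 1e3:.1f} ms (expected 62.5 ms)")
print(f"last arrival: {arrival_times[-1]:.0f} s (~{num_requests / qps:.0f} s expected)")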
@@ -95,24 +104,38 @@ execution_time_predictor:
   # provider: linear_regression

 sklearn_execution_time_predictor:
-  compute_input_file: ./data/profiling/a40/mlp.csv
-  attention_input_file: ./data/profiling/a40/mixed_attention.csv
-  all_reduce_input_file: ./data/profiling/a40/all_reduce.csv
-  send_recv_input_file: ./data/profiling/a40/p2p_inter_node.csv
-  k_fold_cv_splits: 5
+  # compute_input_file: ./data/profiling/a100/mlp.csv
+  compute_input_file: ./data/profiling/a100/mlp.csv
+  # compute_input_file: ./llama7b_mlp_results_4.csv
+  # compute_input_file: ./llama70b_mlp_results.csv
+  # attention_input_file: ./data/profiling/a100/mixed_attention.csv
+  attention_input_file: ./data/profiling/a100/mixed_attention.csv
+  # attention_input_file: ./llama7b_attention_pav2_results.csv
+  # attention_input_file: ./llama70b_attention_pav2_results.csv
+  # all_reduce_input_file: ./data/profiling/a100/all_reduce.csv
+  all_reduce_input_file: ./data/profiling/a100/all_reduce.csv
+  # send_recv_input_file: ./data/profiling/a100/p2p_intra_node.csv
+  send_recv_input_file: ./data/profiling/a100/p2p_intra_node.csv
+  # cpu_overhead_input_file: ./data/profiling/a100/cpu_overheads.csv
+  cpu_overhead_input_file: ./data/profiling/a100/cpu_overheads.csv
+  k_fold_cv_splits: 10
   no_cache: false
   kv_cache_prediction_granularity: 8
   prediction_max_prefill_chunk_size: 4096
   prediction_max_batch_size: 100
   prediction_max_tokens_per_request: 4096
+  attention_decode_overhead_percentage: 0.0
+  nccl_cpu_launch_overhead_ms: 0.020
+  nccl_cpu_skew_overhead_per_device_ms: 0

 random_forrest_execution_time_predictor:
   num_estimators:
+    # - 250
     - 500
     - 750
   max_depth:
-    - 8
-    - 16
+    # - 8
+    # - 16
     - 32
   min_samples_split:
     - 2
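
The execution-time predictor now fits on A100 profiling CSVs with 10-fold CV, and the random-forest search space is trimmed to n_estimators in {500, 750} with max_depth 32 and min_samples_split 2. A hedged sketch of the equivalent scikit-learn grid search follows; the CSV schema ("time_ms" as the target column) is a placeholder assumption, not the simulator's actual format:

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Hypothetical schema: feature columns plus one measured-latency column.
df = pd.read_csv("./data/profiling/a100/mlp.csv")  # path from the config
X = df.drop(columns=["time_ms"])  # "time_ms" is a placeholder target name
y = df["time_ms"]

# Grid mirrors random_forrest_execution_time_predictor above.
param_grid = {
    "n_estimators": [500, 750],
    "max_depth": [32],
    "min_samples_split": [2],
}
search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=10,  # k_fold_cv_splits: 10
    scoring="neg_mean_absolute_error",
)
search.fit(X, y)
print(search.best_params_, -search.best_score_)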
@@ -140,35 +163,36 @@ simulator:
   time_limit: null

 global_scheduler:
-  provider: lor
+  provider: round_robin

 replica_scheduler:
   provider: vllm
-  batch_size_cap: 5
+  batch_size_cap: 128
+  num_blocks: 0

 orca_scheduler:
   use_single_prefill_per_batch: false

 sarathi_scheduler:
-  chunk_size: 1024
+  chunk_size: 512
   enable_rolling_prefills: true
   prefill_fitting_tolerance: 0.2

 vllm_scheduler:
   watermark_blocks_fraction: 0.01
   max_tokens_in_batch: 4096
-  max_batch_size_amplification_factor: 2
+  max_batch_size_amplification_factor: 1

 dsarathi_scheduler:
   chunk_size: 1024
   enable_rolling_prefills: true
   prefill_fitting_tolerance: 0.2
   watermark_blocks_fraction: 0.01
-  max_batch_size_amplification_factor: 2
+  max_batch_size_amplification_factor: 1

 metrics_store:
   wandb_project: "llm-simulator"
-  wandb_group: "vllm-benchmark-test"
+  wandb_group: ""
   wandb_run_name: ""
   subsamples: 500
   save_table_to_wandb: false
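
For the Sarathi scheduler, halving chunk_size to 512 doubles the number of prefill iterations per prompt: a 4096-token prefill is processed as 8 chunks of 512 rather than 4 of 1024, trading prefill throughput for lower decode interference. A toy illustration of the chunking (my own simplification; the real scheduler also applies enable_rolling_prefills and prefill_fitting_tolerance when packing a partial chunk alongside decodes):

def prefill_chunks(prompt_len: int, chunk_size: int) -> list[int]:
    """Split a prompt's prefill into fixed-size chunks; the last may be short."""
    full, rem = divmod(prompt_len, chunk_size)
    return [chunk_size] * full + ([rem] if rem else [])

print(prefill_chunks(4096, 1024))  # [1024, 1024, 1024, 1024]  (old chunk_size)
print(prefill_chunks(4096, 512))   # [512] * 8                 (new chunk_size)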