# Configuration file for the Graphite simulator
# This file is organized into sections defined in [] brackets as in [section].
# Sections may be hierarchical with sub-sections split by the '/' character as
# in [section/sub_section].
#
# Values can be "strings", numbers, or true/false; existing values
# should indicate the expected type.
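# For example, [clock_skew_management/lax_barrier] below is the sub-section
# 'lax_barrier' of the section 'clock_skew_management', and holds the numeric
# value 'quantum = 1000'.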
# This section controls various high-level simulation parameters.
[general]
# output_file: This is the name of the file that the statistics
# for each core are written to.
# The directory where this file is placed is dynamically generated.
# The default location is ./results/$(DATE)/
# where $(DATE) format is %Y-%m-%d_%H-%M-%S
# You can use OUTPUT_DIR=<directory> to place the output of the simulation
# in a specific directory
# e.g., make radix_bench_test OUTPUT_DIR=radix
# will place the output in the directory ./results/radix/
# Also, ./results/latest/ is a symbolic link that points to the
# output directory of the most recently started simulation
# Each output directory, in addition to the output_file, also contains
# (a) the configuration file used ('carbon_sim.cfg')
# (b) the command used to run the simulation ('command')
# (c) log files that are produced ('*.log')
# To remove output directories, run 'make clean_output_dirs'
# This command will remove all automatically generated output
# directories of the form ./results/$(DATE)/ and the symbolic link
# ./results/latest/
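# For example, a simulation started on 2013-01-01 at 12:00:00 (hypothetical
# timestamp) would produce:
#   ./results/2013-01-01_12-00-00/sim.out
#   ./results/2013-01-01_12-00-00/carbon_sim.cfg
#   ./results/2013-01-01_12-00-00/command
#   ./results/2013-01-01_12-00-00/*.log
#   ./results/latest -> ./results/2013-01-01_12-00-00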
output_file = "sim.out"
# Total number of cores in the simulation
total_cores = 64
# This defines the number of processes that will be used to
# perform the simulation
num_processes = 1
# These flags are used to disable certain sub-systems of the simulator
enable_core_modeling = true
enable_power_modeling = false
enable_area_modeling = false
enable_shared_mem = true
# Simulator Mode (full, lite)
mode = lite
# Trigger models within the application using CarbonEnableModels() and CarbonDisableModels()
trigger_models_within_application = false
# Technology Node: Used for area and power modeling of caches and network
# McPAT works at (22,32,45,65,90) nm and DSENT works at (11,22,32,45) nm
# Taking the intersection, the allowed values are 22, 32, and 45 (all in nanometers)
technology_node = 45
# Maximum frequency (in GHz)
max_frequency = 2.0
# Global temperature setting (in Kelvin), used for power models in McPAT and DSENT
temperature = 300
# Width of a Tile (in millimeters), used by the network performance and power models
tile_width = 1.0
# This option defines the ports on which the various processes will communicate
# in distributed simulations. Note that several ports will be used above this
# number for each process, thus requiring a port-range to be opened for
# distributed simulations.
[transport]
base_port = 2000
# This section is used to fine-tune the logging information. The logging may
# be disabled for performance runs or enabled for debugging.
[log]
enabled = true
stack_trace = false
disabled_modules = ""
enabled_modules = "memory"
[progress_trace]
enabled = false
interval = 5000
# This section defines the clock skew management schemes. For more information
# on tradeoffs between the different schemes, see the Graphite paper from HPCA 2010.
[clock_skew_management]
# Valid schemes are lax, lax_barrier and lax_p2p
scheme = lax_barrier
# These are the various parameters used for each clock skew management scheme
[clock_skew_management/lax_barrier]
# Lax-Barrier: Synchronize the clocks of all cores after every interval 'quantum'.
# This scheme does not work with message passing applications.
# (Use lax_p2p or lax for message passing applications.)
# Quantum: The time interval between successive barriers (in nanoseconds)
quantum = 1000
[clock_skew_management/lax_p2p]
# Lax-P2P: Each core picks a random core after every interval 'quantum' and synchronizes
# its clock with it. The faster core is forced to wait (i.e., put to sleep)
# if its target time is ahead of the slower core's by more than 'slack' (nanoseconds).
# Quantum: The time interval between successive P2P checks (in nanoseconds)
quantum = 1000
# Slack: The time interval above which the faster core is forced to wait (in nanoseconds)
slack = 1000
# Sleep Fraction: This is the fraction of the predicted time period for which the
# faster core sleeps. The time period is predicted using the rate of simulation progress.
sleep_fraction = 1.0
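# Worked example with the values above: a core at 5000 ns that picks a partner
# at 3500 ns is ahead by 1500 ns > slack (1000 ns), so it is put to sleep for
# sleep_fraction times the predicted time period.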
# Since memory is emulated to ensure correctness in distributed simulations, we
# must manage a stack for each thread. These parameters control information about
# the stacks that are managed.
[stack]
# Stack Base: This is the start address of the managed stacks
stack_base = 2415919104
# Stack Size per Core: This is the size of each thread stack
stack_size_per_core = 2097152
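# For example, with the values above and general/total_cores = 64, the managed
# stacks span 64 * 2 MB = 128 MB starting at address 2415919104 (0x90000000).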
# The process map is used for multi-machine distributed simulations. Each process
# must have a hostname associated with it; the map below describes the
# mapping between processes and hosts.
[process_map]
process0 = "127.0.0.1"
process1 = "127.0.0.1"
process2 = "127.0.0.1"
process3 = "127.0.0.1"
process4 = "127.0.0.1"
process5 = "127.0.0.1"
process6 = "127.0.0.1"
process7 = "127.0.0.1"
process8 = "127.0.0.1"
process9 = "127.0.0.1"
process10 = "127.0.0.1"
process11 = "127.0.0.1"
process12 = "127.0.0.1"
process13 = "127.0.0.1"
process14 = "127.0.0.1"
process15 = "127.0.0.1"
process16 = "127.0.0.1"
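# Example (hypothetical hostnames): with num_processes = 2, a simulation can be
# split across two machines:
# process0 = "host-a.example.com"
# process1 = "host-b.example.com"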
# This section describes runtime energy and power modeling
[runtime_energy_modeling]
interval = 1000 # In nanoseconds. This is how often energy and power are sampled.
[runtime_energy_modeling/power_trace]
enabled = false # When true, the periodically calculated energy is written to a trace file
[dvfs]
# List of dvfs domains.
# Each domain has the format <frequency (GHz), module1, module2, ...>
# Example: "<1.0, CORE, L1_ICACHE, L1_DCACHE>, <2.0, L2_CACHE, DIRECTORY>, <0.5, NETWORK_USER, NETWORK_MEMORY>"
domains = "<1.0, CORE, L1_ICACHE, L1_DCACHE, L2_CACHE, DIRECTORY, NETWORK_USER, NETWORK_MEMORY>"
# Synchronization delay for communication across asynchronous boundaries. The default value
# is 2 cycles.
synchronization_delay = 2 # in cycles
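# For instance, under the two-domain example above ("<1.0, CORE, L1_ICACHE,
# L1_DCACHE>, <2.0, L2_CACHE, DIRECTORY>, ..."), a request crossing from the
# L1-D cache domain into the L2 cache domain would incur the 2-cycle
# synchronization delay.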
# This section describes parameters for the core model
[tile]
# Format: "tuple_1, tuple_2, ..., tuple_n"
# where tuple_i = <number of cores, core type, L1-I cache config, L1-D cache config, L2 cache config>
# Use 'default' to accept the default values for any parameter
# Default Number of Cores = 'general/total_cores'
# Valid core types are simple, iocoom
# Default Core Type = simple
# L1-I, L1-D and L2 cache configurations
# Default cache configuration = T1
# Note: Earlier, the starting frequency of each tile could also be configured in this list.
# That feature has been removed in favor of configuring the starting frequency of each
# dvfs domain (see [dvfs/domains]).
# If you still need per-tile frequencies, add a software call that changes
# the frequency of the tile at the start of each software thread.
model_list = "<default,iocoom,T1,T1,T1>"
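# Example (hypothetical): a heterogeneous machine with half simple and half
# iocoom cores (core counts assumed to sum to general/total_cores = 64):
# model_list = "<32,simple,T1,T1,T1>, <32,iocoom,T1,T1,T1>"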
[core]
[core/iocoom]
# The core should adhere to the x86 TSO memory consistency model
num_load_queue_entries = 8
num_store_queue_entries = 8
speculative_loads_enabled = true
multiple_outstanding_RFOs_enabled = true
# This section describes the number of cycles for
# various arithmetic instructions.
[core/static_instruction_costs]
generic = 1
mov = 1
ialu = 1
imul = 3
idiv = 18
falu = 3
fmul = 5
fdiv = 6
xmm_ss = 6
xmm_sd = 6
xmm_ps = 6
[branch_predictor]
type = one_bit
mispredict_penalty = 14 # In cycles
size = 1024
# L1-I, L1-D and L2 Caches are in the same clock domain as the core
[l1_icache/T1]
cache_line_size = 64 # In Bytes
cache_size = 16 # In KB
associativity = 4
num_banks = 1
replacement_policy = lru
data_access_time = 1 # In cycles
tags_access_time = 1 # In cycles
perf_model_type = parallel # Options are [parallel,sequential]
track_miss_types = false
[l1_dcache/T1]
cache_line_size = 64 # In Bytes
cache_size = 32 # In KB
associativity = 4
num_banks = 1
replacement_policy = lru
data_access_time = 1 # In cycles
tags_access_time = 1 # In cycles
perf_model_type = parallel # Options are [parallel,sequential]
track_miss_types = false
[l2_cache/T1]
cache_line_size = 64 # In Bytes
cache_size = 512 # In KB
associativity = 8
num_banks = 2
replacement_policy = lru
data_access_time = 8 # In cycles
tags_access_time = 3 # In cycles
perf_model_type = parallel # Options are [parallel,sequential]
track_miss_types = false
[caching_protocol]
type = pr_l1_pr_l2_dram_directory_msi
# Available values are
# 1) pr_l1_pr_l2_dram_directory_msi
# 2) pr_l1_pr_l2_dram_directory_mosi
# 3) pr_l1_sh_l2_msi
# 4) pr_l1_sh_l2_mesi
[l2_directory]
max_hw_sharers = 64 # number of sharers supported in hardware (ignored if directory_type = full_map)
directory_type = full_map # Supported (full_map, limited_broadcast, limited_no_broadcast, ackwise, limitless)
[dram_directory]
total_entries = auto # If auto, then automatically set depending on L2 cache size, else enter a numeric value
associativity = 16
max_hw_sharers = 64 # number of sharers supported in hardware (ignored if directory_type = full_map)
directory_type = full_map # Supported (full_map, limited_broadcast, limited_no_broadcast, ackwise, limitless)
access_time = auto # If auto, then automatically set based on dram directory size, else enter a numeric value (in cycles)
[limitless]
software_trap_penalty = 200
# Number of cycles added to the clock when trapping into software
# (number taken from the Chaiken papers, which explore 25-150 cycle penalties)
[dram]
latency = 100 # In nanoseconds
per_controller_bandwidth = 5 # In GB/s
num_controllers = ALL
# "ALL" denotes that a memory controller is present on every tile(/core). Set num_controllers to a numeric value less than or equal to the number of cores
controller_positions = ""
[dram/queue_model]
enabled = true
type = history_tree
# This describes the various models used for the different networks on the core
[network]
# Valid Network Models :
# 1) magic
# 2) emesh_hop_counter, emesh_hop_by_hop
# 3) atac
user = emesh_hop_counter
memory = emesh_hop_counter
# Enable shared memory shortcut for network models (works only with a single host process)
enable_shared_memory_shortcut = false
# emesh_hop_counter (Electrical Mesh Network)
# - No contention models
# - Just models hop latency and serialization latency
[network/emesh_hop_counter]
flit_width = 64 # In bits
[network/emesh_hop_counter/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of flits per output buffer per port
[network/emesh_hop_counter/link]
delay = 1 # In cycles
type = electrical_repeated
# emesh_hop_by_hop (Electrical Mesh Network)
# - Link Contention Models present
# - Infinite Output Buffering (Finite Output Buffers assumed for power modeling)
[network/emesh_hop_by_hop]
flit_width = 64 # In bits
broadcast_tree_enabled = true # Whether the broadcast tree is enabled
[network/emesh_hop_by_hop/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of flits per output buffer per port
[network/emesh_hop_by_hop/link]
delay = 1 # In cycles
type = electrical_repeated
[network/emesh_hop_by_hop/queue_model]
enabled = true
type = history_tree
# atac (ATAC network model)
# - Link Contention Models present (both optical and electrical)
# - Infinite Output Buffering (Finite Output Buffers assumed for power modeling)
[network/atac]
flit_width = 64
cluster_size = 4 # Number of cores per cluster
receive_network_type = star # [htree, star]
num_receive_networks_per_cluster = 2 # Number of receive networks per cluster
num_optical_access_points_per_cluster = 4 # Number of Optical Access Points per cluster
global_routing_strategy = cluster_based # [cluster_based, distance_based]
# Distance above which unicasts are sent on the ONet. Only applies to the distance_based routing strategy
unicast_distance_threshold = 4
electrical_link_type = electrical_repeated # electrical_repeated
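# For example, with general/total_cores = 64 and cluster_size = 4, the ONet
# connects 16 clusters, each with 4 optical access points and 2 star-type
# receive networks; under distance_based routing, a unicast between tiles more
# than unicast_distance_threshold = 4 hops apart would travel on the ONet.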
[network/atac/enet]
[network/atac/enet/router]
delay = 1 # In cycles (ENet is now modeled similarly to an electrical mesh)
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Finite Buffering assumed for power modeling)
[network/atac/onet]
[network/atac/onet/send_hub]
[network/atac/onet/send_hub/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Finite Buffering assumed for power modeling)
[network/atac/onet/receive_hub]
[network/atac/onet/receive_hub/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Finite Buffering assumed for power modeling)
[network/atac/star_net]
[network/atac/star_net/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Finite Buffering assumed for power modeling)
[network/atac/queue_model]
enabled = true
type = history_tree
[link_model]
# Optical Link Model
[link_model/optical]
# Optical waveguide delay per mm (in nanoseconds)
waveguide_delay_per_mm = 10e-3
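# For example, a 10 mm waveguide contributes 10 * 10e-3 = 0.1 ns of
# propagation delay.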
# Conversion delay from electrical to optical (in cycles)
E-O_conversion_delay = 1
# Conversion delay from optical to electrical (in cycles)
O-E_conversion_delay = 1
# Laser type: Available choices (in increasing degree of optimism):
# standard, throttled
laser_type = throttled
# Laser Modes: Comma separated list containing one or more of [unicast,broadcast]
# Unicast: Send to one reader, Broadcast: Send to all readers
# If laser_type = standard, must be only one of [unicast] or [broadcast] (since laser power is fixed)
# If laser_type = throttled, must be at least one of [unicast,broadcast] (since laser power is variable)
laser_modes = "unicast,broadcast"
# Thermal tuning strategy: Available choices (in increasing degree of optimism):
# full_thermal, thermal_reshuffle, electrical_assist, athermal
ring_tuning_strategy = athermal
# Queue Models
[queue_model/basic]
moving_avg_enabled = true
moving_avg_window_size = 64
moving_avg_type = arithmetic_mean
[queue_model/history_list]
# Uses the analytical model (if enabled) to calculate the delay
# when it cannot be calculated using the history list
max_list_size = 100
analytical_model_enabled = true
interleaving_enabled = true
[queue_model/history_tree]
# Uses the analytical model (if enabled) to calculate the delay
# when it cannot be calculated using the history tree
max_list_size = 100
analytical_model_enabled = true
# Collect time-varying statistics from the simulator
# For tracing to be done
# (1) Set [statistics_trace/enabled] = true
# (2) Set the statistics that you want to measure periodically in [statistics_trace/statistics]
# (3) Use the lax_barrier synchronization model (set [clock_skew_management/scheme] = lax_barrier)
# (4) Use a sampling interval >= [clock_skew_management/lax_barrier/quantum] that is a multiple of it (see the example below)
# Note: cache_line_replication only works with the pr_l1_pr_l2_dram_directory_mosi memory subsystem
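# Example: with [clock_skew_management/lax_barrier/quantum] = 1000, valid
# sampling intervals are 1000, 2000, 3000, ...; the sampling_interval of 10000
# below satisfies this.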
[statistics_trace]
enabled = false
# Comma separated list of statistics for which tracing is done when enabled.
# Choose from [cache_line_replication, network_utilization]
statistics = "cache_line_replication, network_utilization"
# Interval between successive samples of the trace (in nanoseconds)
sampling_interval = 10000
[statistics_trace/network_utilization]
# Comma separated list of networks for which injection rate is traced when enabled
# Choose from [user, memory]
enabled_networks = "memory"