# Configuration file for the Graphite simulator
# This file is organized into sections defined in [] brackets as in [section].
# Sections may be hierarchical with sub-sections split by the '/' character as
# in [section/sub_section].
#
# Values can be "strings", numbers, or true/false; existing values
# should indicate the expected type.
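# For example, [clock_skew_management/lax_barrier] below is the sub-section
# 'lax_barrier' of the section 'clock_skew_management', and holds the numeric
# value 'quantum = 1000'.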
# This section controls various high-level simulation parameters.
[general]
# output_file: This is the name of the file that the statistics
# for each core are written to.
# The directory where this file is placed is dynamically generated.
# The default location is ./results/$(DATE)/
# where $(DATE) format is %Y-%m-%d_%H-%M-%S
# You can use OUTPUT_DIR=<directory> to place the output of the simulation
# in a specific directory
# e.g., make radix_bench_test OUTPUT_DIR=radix
# will place the output in the directory ./results/radix/
# Also, ./results/latest/ is a symbolic link that points to the
# output directory of the most recently started simulation
# Each output directory, in addition to the output_file, also contains
# (a) the configuration file used ('carbon_sim.cfg')
# (b) the command used to run the simulation ('command')
# (c) log files that are produced ('*.log')
# To remove output directories, run 'make clean_output_dirs'
# This command will remove all automatically generated output
# directories of the form ./results/$(DATE)/ and the symbolic link
# ./results/latest/
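# For example, a simulation started on 2013-01-01 at 12:00:00 (hypothetical
# timestamp) would produce:
#   ./results/2013-01-01_12-00-00/sim.out
#   ./results/2013-01-01_12-00-00/carbon_sim.cfg
#   ./results/2013-01-01_12-00-00/command
#   ./results/2013-01-01_12-00-00/*.log
#   ./results/latest -> ./results/2013-01-01_12-00-00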
output_file = "sim.out"
# Total number of cores in the simulation
total_cores = 64
# This defines the number of processes that will be used to
# perform the simulation
num_processes = 1
# These flags are used to disable certain sub-systems of the simulator
enable_core_modeling = true
enable_power_modeling = false
enable_area_modeling = false
enable_shared_mem = true
# Simulator Mode (full, lite)
mode = lite
# Trigger models within the application using CarbonEnableModels() and CarbonDisableModels()
trigger_models_within_application = false
# Technology Node: Used for area and power modeling of caches and network
# McPAT works at (22,32,45,65,90) nm and DSENT works at (11,22,32,45) nm
# Taking the intersection, the allowed values are 22, 32, and 45 (all in nanometers)
technology_node = 45
# Maximum frequency (in GHz)
max_frequency = 2.0
# Global temperature setting (in Kelvin), used for power models in McPAT and DSENT
temperature = 300
# Width of a Tile (in millimeters), used by the network performance and power models
tile_width = 1.0
# This option defines the ports on which the various processes will communicate
# in distributed simulations. Note that several ports will be used above this
# number for each process, thus requiring a port-range to be opened for
# distributed simulations.
[transport]
base_port = 2000
# This section is used to fine-tune the logging information. The logging may
# be disabled for performance runs or enabled for debugging.
[log]
enabled = true
stack_trace = false
disabled_modules = ""
enabled_modules = "memory"
[progress_trace]
enabled = false
interval = 5000
# This section defines the clock skew management schemes. For more information
# on tradeoffs between the different schemes, see the Graphite paper from HPCA 2010.
[clock_skew_management]
# Valid schemes are lax, lax_barrier and lax_p2p
scheme = lax_barrier
# These are the various parameters used for each clock skew management scheme
[clock_skew_management/lax_barrier]
# Lax-Barrier: Synchronize the clocks of all cores after every interval 'quantum'.
# This scheme does not work with message passing applications.
# (Use lax_p2p or lax for message passing applications.)
# Quantum: The time interval between successive barriers (in nanoseconds)
quantum = 1000
[clock_skew_management/lax_p2p]
# Lax-P2P: Each core picks a random core after every interval 'quantum' and synchronizes
# its clock with it. The faster core is forced to wait (i.e., put to sleep)
# if its target time is ahead of the slower core's by more than 'slack' (nanoseconds).
# Quantum: The time interval between successive P2P checks (in nanoseconds)
quantum = 1000
# Slack: The time interval above which the faster core is forced to wait (in nanoseconds)
slack = 1000
# Sleep Fraction: This is the fraction of the predicted time period for which the
# faster core sleeps. The time period is predicted using the rate of simulation progress.
sleep_fraction = 1.0
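# Worked example with the values above: a core at 5000 ns that picks a partner
# at 3500 ns is ahead by 1500 ns > slack (1000 ns), so it is put to sleep for
# sleep_fraction times the predicted time period.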
# Since memory is emulated to ensure correctness in distributed simulations, we
# must manage a stack for each thread. These parameters control information about
# the stacks that are managed.
[stack]
# Stack Base: This is the start address of the managed stacks
stack_base = 2415919104
# Stack Size per Core: This is the size of each thread stack
stack_size_per_core = 2097152
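# For example, with the values above and general/total_cores = 64, the managed
# stacks span 64 * 2 MB = 128 MB starting at address 2415919104 (0x90000000).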
# The process map is used for multi-machine distributed simulations. Each process
# must have a hostname associated with it; the map below describes the
# mapping between processes and hosts.
[process_map]
process0 = "127.0.0.1"
process1 = "127.0.0.1"
process2 = "127.0.0.1"
process3 = "127.0.0.1"
process4 = "127.0.0.1"
process5 = "127.0.0.1"
process6 = "127.0.0.1"
process7 = "127.0.0.1"
process8 = "127.0.0.1"
process9 = "127.0.0.1"
process10 = "127.0.0.1"
process11 = "127.0.0.1"
process12 = "127.0.0.1"
process13 = "127.0.0.1"
process14 = "127.0.0.1"
process15 = "127.0.0.1"
process16 = "127.0.0.1"
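# Example (hypothetical hostnames): with num_processes = 2, a simulation can be
# split across two machines:
# process0 = "host-a.example.com"
# process1 = "host-b.example.com"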
# This section describes runtime energy and power modeling
[runtime_energy_modeling]
interval = 1000 # In nanoseconds. This is how often energy and power are sampled.
[runtime_energy_modeling/power_trace]
enabled = false # When true, the periodically calculated energy is written to a trace file
[dvfs]
# List of dvfs domains.
# Each domain has the format <frequency (GHz), module1, module2, ...>
# Example: "<1.0, CORE, L1_ICACHE, L1_DCACHE>, <2.0, L2_CACHE, DIRECTORY>, <0.5, NETWORK_USER, NETWORK_MEMORY>"
domains = "<1.0, CORE, L1_ICACHE, L1_DCACHE, L2_CACHE, DIRECTORY, NETWORK_USER, NETWORK_MEMORY>"
# Synchronization delay for communication across asynchronous boundaries. The default value
# is 2 cycles.
synchronization_delay = 2 # in cycles
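# For instance, under the two-domain example above ("<1.0, CORE, L1_ICACHE,
# L1_DCACHE>, <2.0, L2_CACHE, DIRECTORY>, ..."), a request crossing from the
# L1-D cache domain into the L2 cache domain would incur the 2-cycle
# synchronization delay.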
# This section describes parameters for the core model
[tile]
# Format: "tuple_1, tuple_2, ..., tuple_n"
# where tuple_i = <number of cores, core type, L1-I cache config, L1-D cache config, L2 cache config>
# Use 'default' to accept the default values for any parameter
# Default Number of Cores = 'general/total_cores'
# Valid core types are simple, iocoom
# Default Core Type = simple
# L1-I, L1-D and L2 cache configurations
# Default cache configuration = T1
# Note: Earlier, the starting frequency of each tile could also be configured in this list.
# That feature has been removed in favor of configuring the starting frequency of each
# dvfs domain (see [dvfs/domains]).
# If you still need per-tile frequencies, add a software call that changes
# the frequency of the tile at the start of each software thread.
model_list = "<default,iocoom,T1,T1,T1>"
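# Example (hypothetical): a heterogeneous machine with half simple and half
# iocoom cores (core counts assumed to sum to general/total_cores = 64):
# model_list = "<32,simple,T1,T1,T1>, <32,iocoom,T1,T1,T1>"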
[core]
[core/iocoom]
# The core should adhere to the x86 TSO memory consistency model
num_load_queue_entries = 8
num_store_queue_entries = 8
speculative_loads_enabled = true
multiple_outstanding_RFOs_enabled = true
# This section describes the number of cycles for
# various arithmetic instructions.
[core/static_instruction_costs]
generic = 1
mov = 1
ialu = 1
imul = 3
idiv = 18
falu = 3
fmul = 5
fdiv = 6
xmm_ss = 6
xmm_sd = 6
xmm_ps = 6
[branch_predictor]
type = one_bit
mispredict_penalty = 14 # In cycles
size = 1024
# L1-I, L1-D and L2 Caches are in the same clock domain as the core
[l1_icache/T1]
cache_line_size = 64 # In Bytes
cache_size = 16 # In KB
associativity = 4
num_banks = 1
replacement_policy = lru
data_access_time = 1 # In cycles
tags_access_time = 1 # In cycles
perf_model_type = parallel # Options are [parallel,sequential]
track_miss_types = false
[l1_dcache/T1]
cache_line_size = 64 # In Bytes
cache_size = 32 # In KB
associativity = 4
num_banks = 1
replacement_policy = lru
data_access_time = 1 # In cycles
tags_access_time = 1 # In cycles
perf_model_type = parallel # Options are [parallel,sequential]
track_miss_types = false
[l2_cache/T1]
cache_line_size = 64 # In Bytes
cache_size = 512 # In KB
associativity = 8
num_banks = 2
replacement_policy = lru
data_access_time = 8 # In cycles
tags_access_time = 3 # In cycles
perf_model_type = parallel # Options are [parallel,sequential]
track_miss_types = false
[caching_protocol]
type = pr_l1_pr_l2_dram_directory_msi
# Available values are
# 1) pr_l1_pr_l2_dram_directory_msi
# 2) pr_l1_pr_l2_dram_directory_mosi
# 3) pr_l1_sh_l2_msi
# 4) pr_l1_sh_l2_mesi
[l2_directory]
max_hw_sharers = 64 # number of sharers supported in hardware (ignored if directory_type = full_map)
directory_type = full_map # Supported (full_map, limited_broadcast, limited_no_broadcast, ackwise, limitless)
[dram_directory]
total_entries = auto # If auto, then automatically set depending on L2 cache size, else enter a numeric value
associativity = 16
max_hw_sharers = 64 # number of sharers supported in hardware (ignored if directory_type = full_map)
directory_type = full_map # Supported (full_map, limited_broadcast, limited_no_broadcast, ackwise, limitless)
access_time = auto # If auto, then automatically set based on dram directory size, else enter a numeric value (in cycles)
[limitless]
software_trap_penalty = 200
# Number of cycles added to the clock when trapping into software
# (number taken from the Chaiken papers, which explore 25-150 cycle penalties)
[dram]
latency = 100 # In nanoseconds
per_controller_bandwidth = 5 # In GB/s
num_controllers = ALL
# "ALL" denotes that a memory controller is present on every tile(/core). Set num_controllers to a numeric value less than or equal to the number of cores
controller_positions = ""
[dram/queue_model]
enabled = true
type = history_tree
# This describes the various models used for the different networks on the core
[network]
# Valid Network Models :
# 1) magic
# 2) emesh_hop_counter, emesh_hop_by_hop
# 3) atac
user = emesh_hop_counter
memory = emesh_hop_counter
# Enable shared memory shortcut for network models (works only with a single host process)
enable_shared_memory_shortcut = false
# emesh_hop_counter (Electrical Mesh Network)
# - No contention models
# - Just models hop latency and serialization latency
[network/emesh_hop_counter]
flit_width = 64 # In bits
[network/emesh_hop_counter/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of flits per output buffer per port
[network/emesh_hop_counter/link]
delay = 1 # In cycles
type = electrical_repeated
# emesh_hop_by_hop (Electrical Mesh Network)
# - Link Contention Models present
# - Infinite Output Buffering (Finite Output Buffers assumed for power modeling)
[network/emesh_hop_by_hop]
flit_width = 64 # In bits
broadcast_tree_enabled = true # Whether the broadcast tree is enabled
[network/emesh_hop_by_hop/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of flits per output buffer per port
[network/emesh_hop_by_hop/link]
delay = 1 # In cycles
type = electrical_repeated
[network/emesh_hop_by_hop/queue_model]
enabled = true
type = history_tree
# atac (ATAC network model)
# - Link Contention Models present (both optical and electrical)
# - Infinite Output Buffering (Finite Output Buffers assumed for power modeling)
[network/atac]
flit_width = 64
cluster_size = 4 # Number of cores per cluster
receive_network_type = star # [htree, star]
num_receive_networks_per_cluster = 2 # Number of receive networks per cluster
num_optical_access_points_per_cluster = 4 # Number of Optical Access Points per cluster
global_routing_strategy = cluster_based # [cluster_based, distance_based]
# Distance above which unicasts are sent on the ONet. Only applies to the distance_based routing strategy
unicast_distance_threshold = 4
electrical_link_type = electrical_repeated # electrical_repeated
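# For example, with general/total_cores = 64 and cluster_size = 4, the ONet
# connects 16 clusters, each with 4 optical access points and 2 star-type
# receive networks; under distance_based routing, a unicast between tiles more
# than unicast_distance_threshold = 4 hops apart would travel on the ONet.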
[network/atac/enet]
[network/atac/enet/router]
delay = 1 # In cycles (ENet is now modeled similarly to an electrical mesh)
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Finite Buffering assumed for power modeling)
[network/atac/onet]
[network/atac/onet/send_hub]
[network/atac/onet/send_hub/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Finite Buffering assumed for power modeling)
[network/atac/onet/receive_hub]
[network/atac/onet/receive_hub/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Finite Buffering assumed for power modeling)
[network/atac/star_net]
[network/atac/star_net/router]
delay = 1 # In cycles
num_flits_per_port_buffer = 4 # Number of Buffer flits per port (Finite Buffering assumed for power modeling)
[network/atac/queue_model]
enabled = true
type = history_tree
[link_model]
# Optical Link Model
[link_model/optical]
# Optical waveguide delay per mm (in nanoseconds)
waveguide_delay_per_mm = 10e-3
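# For example, a 10 mm waveguide contributes 10 * 10e-3 = 0.1 ns of
# propagation delay.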
# Conversion delay from electrical to optical (in cycles)
E-O_conversion_delay = 1
# Conversion delay from optical to electrical (in cycles)
O-E_conversion_delay = 1
# Laser type: Available choices (in increasing degree of optimism):
# standard, throttled
laser_type = throttled
# Laser Modes: Comma separated list containing one or more of [unicast,broadcast]
# Unicast: Send to one reader, Broadcast: Send to all readers
# If laser_type = standard, must be only one of [unicast] or [broadcast] (since laser power is fixed)
# If laser_type = throttled, must be at least one of [unicast,broadcast] (since laser power is variable)
laser_modes = "unicast,broadcast"
# Thermal tuning strategy: Available choices (in increasing degree of optimism):
# full_thermal, thermal_reshuffle, electrical_assist, athermal
ring_tuning_strategy = athermal
# Queue Models
[queue_model/basic]
moving_avg_enabled = true
moving_avg_window_size = 64
moving_avg_type = arithmetic_mean
[queue_model/history_list]
# Uses the analytical model (if enabled) to calculate the delay
# when it cannot be calculated using the history list
max_list_size = 100
analytical_model_enabled = true
interleaving_enabled = true
[queue_model/history_tree]
# Uses the analytical model (if enabled) to calculate the delay
# when it cannot be calculated using the history tree
max_list_size = 100
analytical_model_enabled = true
# Collect time-varying statistics from the simulator
# For tracing to be done
# (1) Set [statistics_trace/enabled] = true
# (2) Set the statistics that you want to measure periodically in [statistics_trace/statistics]
# (3) Use the lax_barrier synchronization model (set [clock_skew_management/scheme] = lax_barrier)
# (4) Use a sampling interval >= [clock_skew_management/lax_barrier/quantum] that is a multiple of it (see the example below)
# Note: cache_line_replication only works with the pr_l1_pr_l2_dram_directory_mosi memory subsystem
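# Example: with [clock_skew_management/lax_barrier/quantum] = 1000, valid
# sampling intervals are 1000, 2000, 3000, ...; the sampling_interval of 10000
# below satisfies this.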
[statistics_trace]
enabled = false
# Comma separated list of statistics for which tracing is done when enabled.
# Choose from [cache_line_replication, network_utilization]
statistics = "cache_line_replication, network_utilization"
# Interval between successive samples of the trace (in nanoseconds)
sampling_interval = 10000
[statistics_trace/network_utilization]
# Comma separated list of networks for which injection rate is traced when enabled
# Choose from [user, memory]
enabled_networks = "memory"