#!/usr/bin/env python2.7
"""
Benchmark, running many fresh processes.
usage: krun.py <config_file.krun>
"""
import argparse, locale, logging, os, sys, traceback
from logging import debug, info, warn
import krun.util as util
from krun.config import Config
from krun.platform import detect_platform
from krun.results import Results
from krun.scheduler import ExecutionScheduler, ManifestManager
from krun import ABS_TIME_FORMAT
from krun.mail import Mailer
HERE = os.path.abspath(os.getcwd())
DIR = os.path.abspath(os.path.dirname(__file__))

CONSOLE_FORMATTER = PLAIN_FORMATTER = logging.Formatter(
    '[%(asctime)s: %(levelname)s] %(message)s',
    ABS_TIME_FORMAT)
try:
    import colorlog
    CONSOLE_FORMATTER = colorlog.ColoredFormatter(
        "%(log_color)s[%(asctime)s %(levelname)s] %(message)s%(reset)s",
        ABS_TIME_FORMAT)
except ImportError:
    pass


def usage(parser):
    parser.print_help()
    sys.exit(1)


def sanity_checks(config, platform):
    debug("Running sanity checks")
    vms_that_will_run = []

    # check all necessary benchmark files exist
    for bench, bench_param in config.BENCHMARKS.items():
        for vm_name, vm_info in config.VMS.items():
            for variant in vm_info["variants"]:
                entry_point = config.VARIANTS[variant]
                key = "%s:%s:%s" % (bench, vm_name, variant)
                debug("Sanity check files for '%s'" % key)

                if config.should_skip(key):
                    continue  # won't execute, so no check needed

                vm_info["vm_def"].check_benchmark_files(bench, entry_point)
                vms_that_will_run.append(vm_name)

    # per-VM sanity checks
    for vm_name, vm_info in config.VMS.items():
        if vm_name not in vms_that_will_run:
            # User's SKIP config directive may mean a defined VM never runs.
            # This may be deliberate, e.g. the user does not yet have it built.
            # In this case, sanity checks can't run for this VM, so skip them.
            debug("VM '%s' is not used, not sanity checking." % vm_name)
        else:
            debug("Running VM sanity check for '%s'" % vm_name)
            vm_info["vm_def"].sanity_checks()

    # platform specific sanity checks
    debug("Running platform sanity checks")
    platform.sanity_checks()


def create_arg_parser():
    """Create a parser to process command-line options.
    """
    parser = argparse.ArgumentParser(description="Benchmark, running many fresh processes.")
    parser.add_argument("--debug", "-g", action="store", default='INFO',
                        dest="debug_level", required=False,
                        help=("Debug level used by logger. Must be one of: " +
                              "DEBUG, INFO, WARN, ERROR, CRITICAL"))
    parser.add_argument("--dump-audit", action="store_const",
                        dest="dump", const="audit", required=False,
                        help=("Print the audit section of a Krun " +
                              "results file to STDOUT"))
    parser.add_argument("--dump-config", action="store_const",
                        dest="dump", const="config", required=False,
                        help=("Print the config section of a Krun " +
                              "results file to STDOUT"))
    parser.add_argument("--dump-reboots", action="store_const",
                        dest="dump", const="reboots", required=False,
                        help=("Print the reboots section of a Krun " +
                              "results file to STDOUT"))
    parser.add_argument("--dump-etas", action="store_const",
                        dest="dump", const="eta_estimates", required=False,
                        help=("Print the eta_estimates section of a Krun " +
                              "results file to STDOUT"))
    parser.add_argument("--dump-temps", action="store_const",
                        dest="dump", const="starting_temperatures",
                        required=False,
                        help=("Print the starting_temperatures section of " +
                              "a Krun results file to STDOUT"))
    parser.add_argument("--dump-data", action="store_const",
                        dest="dump", const="data", required=False,
                        help=("Print the data section of " +
                              "a Krun results file to STDOUT"))
    parser.add_argument("--info", action="store_true",
                        help=("Print session info for the specified "
                              "config file and exit"))
    parser.add_argument("--hardware-reboots", action="store_true", default=False,
                        help=("Reboot physical hardware before each benchmark "
                              "execution. Off by default."))
    parser.add_argument("--daemonise", "-D", action="store_true",
                        default=False, help="Daemonise Krun")
    # Developer switches
    parser.add_argument("--quick", action="store_true", default=False,
                        help="No delays. For development only.")
    parser.add_argument("--no-user-change", action="store_true", default=False,
                        help=("Do not change to the benchmark user. "
                              "For development only."))
    parser.add_argument("--dry-run", "-d", action="store_true", default=False,
                        help=("Don't really run benchmarks. "
                              "For development only."))
    parser.add_argument("--no-pstate-check", action="store_true", default=False,
                        help=("Don't check Intel P-states are disabled in the "
                              "Linux kernel. For development only."))
    parser.add_argument("--no-tickless-check", action="store_true", default=False,
                        help=("Don't check if the Linux kernel is tickless. "
                              "For development only."))
    filename_help = ("Krun configuration or results file. FILENAME should" +
                     " be a configuration file when running benchmarks " +
                     "(e.g. experiment.krun) and a results file " +
                     "(e.g. experiment_results.json.bz2) when calling " +
                     "krun with --dump-config, --dump-audit, " +
                     "--dump-reboots, --dump-etas, --dump-temps, or " +
                     "--dump-data")
    parser.add_argument("filename", action="store",  # Required by default.
                        metavar="FILENAME",
                        help=filename_help)
    return parser


def main(parser):
    args = parser.parse_args()

    if args.dump is not None:
        if not args.filename.endswith(".json.bz2"):
            usage(parser)
        else:
            Results.ok_to_instantiate = True
            results = Results(None, None, results_file=args.filename)
            text = results.dump(args.dump)

            # String data read in from JSON are unicode objects. This matters
            # for us as some data in the audit includes unicode characters.
            # If it does, a simple print no longer suffices if the system
            # locale is (e.g.) ASCII. In this case print will raise an
            # exception. The correct thing to do is to encode() the unicode to
            # the system locale.
            print(text.encode(locale.getpreferredencoding()))
            sys.exit(0)

    if not args.filename.endswith(".krun"):
        usage(parser)

    try:
        if os.stat(args.filename).st_size <= 0:
            util.fatal('Krun configuration file %s is empty.' % args.filename)
    except OSError:
        util.fatal('Krun configuration file %s does not exist.' % args.filename)

    config = Config(args.filename)

    if args.info:
        # Info mode doesn't run the experiment.
        # Just prints some metrics and exits.
        util.print_session_info(config)
        return

    manifest_filename = ManifestManager.get_filename(config)
    on_first_invocation = not (os.path.isfile(manifest_filename) and
                               os.stat(manifest_filename).st_size > 0)

    log_file = config.log_filename()
    if on_first_invocation and os.path.exists(log_file):
        util.fatal("Log file '%s' already exists. "
                   "Move the file away before running Krun." % log_file)

    attach_log_file(config, not on_first_invocation)
    debug("Krun invoked with arguments: %s" % sys.argv)

    mail_recipients = config.MAIL_TO
    if type(mail_recipients) is not list:
        util.fatal("MAIL_TO config should be a list")

    mailer = Mailer(mail_recipients, max_mails=config.MAX_MAILS)

    try:
        inner_main(mailer, on_first_invocation, config, args)
    except Exception as exn:
        error_info = sys.exc_info()
        subject = "Fatal Krun Exception"
        lines = ["Fatal Krun error: %s\n" % str(error_info[1])]
        for frame in traceback.format_tb(error_info[2]):
            lines.append(frame)
        msg = "".join(lines)
        util.log_and_mail(mailer, debug, subject, msg, bypass_limiter=True)
        raise exn
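
# For orientation when reading inner_main() and sanity_checks(): the Config
# object is built from the user's .krun file, and the attributes used below
# (MAIL_TO, MAX_MAILS, TEMP_READ_PAUSE, PRE_EXECUTION_CMDS, POST_EXECUTION_CMDS,
# VMS, BENCHMARKS, VARIANTS) are expected to be defined in it. A minimal,
# illustrative sketch only -- the VM definition class, its import path and the
# concrete values are assumptions, not something this script defines:
#
#   from krun.vm_defs import PythonVMDef      # assumed import path
#
#   MAIL_TO = []                              # status-mail recipients (must be a list)
#   MAX_MAILS = 2
#   TEMP_READ_PAUSE = 60                      # secs to cool before temperature readings
#   PRE_EXECUTION_CMDS = []
#   POST_EXECUTION_CMDS = []
#   VARIANTS = {"default-python": ...}        # variant name -> benchmark entry point
#   VMS = {
#       "CPython": {
#           "vm_def": PythonVMDef("/usr/bin/python2.7"),
#           "variants": ["default-python"],
#       },
#   }
#   BENCHMARKS = {"fib": 1000}                # benchmark name -> parameter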


def inner_main(mailer, on_first_invocation, config, args):
    out_file = config.results_filename()
    out_file_exists = os.path.exists(out_file)

    instr_dir = util.get_instr_json_dir(config)
    instr_dir_exists = os.path.exists(instr_dir)

    envlog_dir = util.get_envlog_dir(config)
    envlog_dir_exists = os.path.exists(envlog_dir)

    if out_file_exists and not os.path.isfile(out_file):
        util.fatal(
            "Output file '%s' exists but is not a regular file" % out_file)

    if out_file_exists and on_first_invocation:
        util.fatal("Output results file '%s' already exists. "
                   "Move the file away before running Krun." % out_file)

    if instr_dir_exists and on_first_invocation:
        util.fatal("Instrumentation dir '%s' exists." % instr_dir)

    if envlog_dir_exists and on_first_invocation:
        util.fatal("Env log dir '%s' exists." % envlog_dir)

    if not out_file_exists and not on_first_invocation:
        util.fatal("No results file to resume. Expected '%s'" % out_file)

    # Initialise platform instance and assign to VM defs.
    # This needs to be done early, so VM sanity checks can run.
    platform = detect_platform(mailer, config)
    platform.quick_mode = args.quick
    platform.no_user_change = args.no_user_change
    platform.no_tickless_check = args.no_tickless_check
    platform.no_pstate_check = args.no_pstate_check
    platform.hardware_reboots = args.hardware_reboots

    # Create the instrumentation directory if required
    if on_first_invocation:
        # We only want to make a dir if >=1 VM is in instrumentation mode.
        for vm in config.VMS.itervalues():
            if vm['vm_def'].instrument:
                util.make_instr_dir(config)
                break

    debug("Checking platform preliminaries")
    platform.check_preliminaries()

    # Make a bit of noise if this is a virtualised environment
    if platform.is_virtual():
        warn("This appears to be a virtualised host. The results will be "
             "flawed. Use bare-metal for reliable results!")

    platform.collect_audit()

    # At this point the config file is OK, and on-disk state is consistent,
    # so let's daemonise (if requested).
    if args.daemonise:
        util.daemonise()

    if not on_first_invocation:
        # The output file must exist, due to the checks above.
        assert out_file_exists

        debug("Using pre-recorded initial temperature readings")
        manifest = ManifestManager(config, platform)

        platform_temps = {}
        for sensor, tup in manifest.starting_temperatures.iteritems():
            platform_temps[sensor] = tup[1]
        platform.starting_temperatures = platform_temps
    else:
        manifest = ManifestManager(config, platform, new_file=True)
        if manifest.num_execs_left == 0:
            # No executions, or all skipped
            util.fatal("Empty schedule!")

        try:
            info(("Wait %s secs to allow system to cool prior to "
                  "collecting initial temperature readings") %
                 config.TEMP_READ_PAUSE)

            # This part is wrapped in hooks, so that if daemons or networking
            # are taken down for process executions, then the initial
            # temperature reading gets the same treatment.
            util.run_shell_cmd_list(config.PRE_EXECUTION_CMDS)
            platform.sleep(config.TEMP_READ_PAUSE)

            debug("Taking fresh initial temperature readings")
            platform.starting_temperatures = platform.take_temperature_readings()
            manifest.set_starting_temperatures(platform.starting_temperatures)

            # Write out an empty results file. After the initial reboot Krun
            # will expect this to exist.
            Results.ok_to_instantiate = True
            results = Results(config, platform)
            results.write_to_file()
        except:
            raise
        finally:
            util.run_shell_cmd_list(config.POST_EXECUTION_CMDS)

        log_path = config.log_filename(resume=False)
        util.log_and_mail(mailer, debug,
                          "Benchmarking started",
                          "Benchmarking started.\nLogging to %s" %
                          log_path, bypass_limiter=True)
        util.reboot(manifest, platform)

    # Assign platform to VM defs -- needs to happen early for sanity checks
    util.assign_platform(config, platform)
    sanity_checks(config, platform)

    # Build the job queue -- each job is one process execution
    sched = ExecutionScheduler(config,
                               mailer,
                               platform,
                               dry_run=args.dry_run)
    sched.run()


def setup_logging(parser):
    # Colours help to distinguish benchmark stderr from messages printed
    # by the runner. We also print warnings and errors in red so that it
    # is quite impossible to miss them.
    args = parser.parse_args()

    # We default to the "INFO" level; the user can change this by passing a
    # different argument to --debug on the command line.
    level_str = args.debug_level.upper()
    if level_str not in ("DEBUG", "INFO", "WARN", "ERROR", "CRITICAL"):
        util.fatal("Bad debug level: %s" % level_str)

    level = getattr(logging, level_str)
    logging.root.setLevel(level)
    stream = logging.StreamHandler()
    stream.setLevel(level)
    stream.setFormatter(CONSOLE_FORMATTER)
    logging.root.handlers = [stream]


def attach_log_file(config, resume):
    log_filename = config.log_filename(resume)
    mode = 'a' if resume else 'w'
    fh = logging.FileHandler(log_filename, mode=mode)
    fh.setFormatter(PLAIN_FORMATTER)
    logging.root.addHandler(fh)
    debug("Attached log file: %s" % log_filename)


if __name__ == "__main__":
    parser = create_arg_parser()
    setup_logging(parser)
    debug("Krun starting...")
    debug("arguments: %s" % " ".join(sys.argv[1:]))

    # All fatal exceptions (FatalKrunError, AssertionError, ...) end up here.
    # Some are caught deeper in the stack (in an attempt to recover from
    # various failures), but they are always re-raised.
    main(parser)