#!/usr/bin/env python
##
## Manages all stages of the HTTPS dashboard pipeline:
## 1) Gather HAR files for sites of interest.
## 2) Generate profiles based on the HAR files.
##
import os
import sys
import argparse
import logging
import subprocess
import tempfile
import shutil
import time
import datetime
import json
import glob
import smtplib
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email.MIMEText import MIMEText
from email.Utils import COMMASPACE, formatdate
from email import Encoders
from logging import handlers
import thumbnailer
import purge
# tools
ALEXA_URL_FETCHER = './alexa_top_sites.py'
URL_PREPARER = './prepare_url_list.py'
HAR_GENERATOR = './web-profiler/tools/har_generator.py'
SCREENSHOT_GENERATOR = './web-profiler/tools/screenshot_generator.py'
PROFILER = './profiler.py'
RSYNC = '/usr/bin/env rsync'
RESULT_CHECKER = './check_results.py'
def setup_logging():
    """Configure the root logger: rotating file log plus email-on-error.

    Reads the module globals `conf` (for paths and SMTP settings) and
    `args` (for --quiet/--verbose verbosity selection).
    """
    # Make sure the log directory exists before attaching file handlers.
    log_dir = os.path.join(conf['PREFIX'], 'logs')
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    log_format = "%(levelname) -10s %(asctime)s %(module)s:%(lineno) -7s %(message)s"

    # --quiet wins over --verbose if both are given.
    if args.quiet:
        level = logging.WARNING
    elif args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO

    root_logger = logging.getLogger('')
    root_logger.setLevel(level)

    # Log to a rotating file, capped at 10 MB with three backups.
    file_handler = handlers.RotatingFileHandler(
        conf['MANAGER_LOG'], maxBytes=10*1024*1024, backupCount=3)
    file_handler.setFormatter(logging.Formatter(fmt=log_format))
    file_handler.setLevel(level)
    root_logger.addHandler(file_handler)

    # Email on ERROR and above.
    # NOTE(review): eval() executes arbitrary code from the SMTP conf file;
    # the file must be trusted (ast.literal_eval would be safer if the conf
    # is a pure literal -- verify before switching).
    with open(conf['SMTP_CONF'], 'r') as f:
        smtp_conf = eval(f.read())
    email_handler = handlers.SMTPHandler(
        smtp_conf['server'], '[email protected]',
        ['[email protected]'], 'HTTPS Dashboard Error',
        credentials=smtp_conf['credentials'], secure=())
    email_handler.setFormatter(logging.Formatter(fmt=log_format))
    email_handler.setLevel(logging.ERROR)
    root_logger.addHandler(email_handler)
def load_conf(conf_file):
    """Load the manager configuration from conf_file.

    The file is expected to contain a Python dict literal. Every key named
    in conf['PATHS_TO_PREFIX'] is rewritten to an absolute path rooted at
    conf['PREFIX'].

    Returns the config dict, or None if the file could not be read/parsed.
    """
    conf = None
    try:
        # SECURITY: eval() executes arbitrary code from the conf file; the
        # file must be trusted. (ast.literal_eval would be safer if configs
        # are pure literals -- verify before switching.)
        with open(conf_file, 'r') as f:
            conf = eval(f.read())
        # Anchor the configured relative paths under the install prefix.
        for path in conf['PATHS_TO_PREFIX']:
            conf[path] = os.path.join(conf['PREFIX'], conf[path])
    except Exception:
        # Was a bare 'except:', which also swallowed SystemExit and
        # KeyboardInterrupt; narrow it so Ctrl-C still works.
        logging.exception('Error reading configuration: %s', conf_file)
    return conf
def send_mail(send_from, send_to, subject, text, server, credentials, files=None):
    """Send `text` as an email over SSL SMTP, with optional attachments.

    send_to: list of recipient addresses.
    server: SMTP server hostname passed to smtplib.SMTP_SSL.
    credentials: (user, password) sequence unpacked into SMTP.login().
    files: optional list of file paths to attach (base64-encoded).
    """
    # Avoid the mutable-default-argument pitfall (was files=[]).
    if files is None:
        files = []
    assert isinstance(send_to, list)
    assert isinstance(files, list)
    msg = MIMEMultipart()
    msg['From'] = send_from
    msg['To'] = COMMASPACE.join(send_to)
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = subject
    msg.attach(MIMEText(text))
    for path in files:
        part = MIMEBase('application', "octet-stream")
        # Was open(f, "rb").read() with no close -- leaked the file handle.
        with open(path, "rb") as attachment:
            part.set_payload(attachment.read())
        Encoders.encode_base64(part)
        part.add_header('Content-Disposition',
                        'attachment; filename="%s"' % os.path.basename(path))
        msg.attach(part)
    smtp = smtplib.SMTP_SSL(server)
    try:
        smtp.login(*credentials)
        smtp.sendmail(send_from, send_to, msg.as_string())
    finally:
        # Was only closed on success; make sure the socket is released even
        # if login/sendmail raises.
        smtp.close()
class TimeLog(object):
    """Records tagged timestamps and formats an elapsed-time report."""

    def __init__(self, format='%H:%M:%S'):
        # list of tuples: (tag, timestamp), in insertion order
        self._times = []
        # strftime format used when rendering each timestamp
        self._format = format

    def record_time(self, tag):
        """Record the current wall-clock time under the given tag."""
        self._times.append((tag, datetime.datetime.now()))

    def __str__(self):
        # Guard against an empty log: the original indexed _times[-1]
        # unconditionally and raised IndexError when nothing was recorded.
        if not self._times:
            return ''
        string = ''
        last_timestamp = None
        for tag, timestamp in self._times:
            # Show the delta from the previous entry, when there is one.
            difference = ''
            if last_timestamp:
                difference = '\t(%s)' % (timestamp - last_timestamp)
            string += '%s\t%s%s\n' % (tag, timestamp.strftime(self._format), difference)
            last_timestamp = timestamp
        string += '\nTOTAL TIME ELAPSED: %s' % (self._times[-1][1] - self._times[0][1])
        return string

    def __repr__(self):
        return self.__str__()
def main():
    """Run one full dashboard crawl.

    Pipeline: set up temp/output directories, build the URL list (from Alexa
    if none configured), then per user agent capture HARs, archive them,
    capture screenshots, copy result pickles, generate profiles, and build
    thumbnails. Afterwards update the main manifest, purge old data, rsync
    everything to the web server, and email a crawl summary.

    Relies on the module-global `conf` dict loaded in __main__. Each stage
    is wrapped in its own try/except so one failure does not abort the rest.
    """
    logging.info('=============== MANAGER LAUNCHED ===============')
    timelog = TimeLog()
    timelog.record_time('Manager launched')
    ##
    ## Prepare temp directories
    ##
    try:
        if not conf['TEMPDIR']:
            conf['TEMPDIR'] = os.path.join(tempfile.gettempdir(), 'https-dashboard')
        # Start from a clean slate: any leftover temp dir is from a prior run.
        if os.path.exists(conf['TEMPDIR']):
            logging.info('Removing existing temp directory')
            shutil.rmtree(conf['TEMPDIR'])
        os.makedirs(conf['TEMPDIR'])
        # make a subdir for each user agent
        for user_agent_tag in conf['USER_AGENTS']:
            os.makedirs(os.path.join(conf['TEMPDIR'], user_agent_tag))
        logging.info('Set up temp directory: %s', conf['TEMPDIR'])
    except:
        logging.exception('Error preparing temp directory')
        sys.exit(-1)
    timelog.record_time('Set up temp directory')
    ##
    ## Prepare output directories
    ##
    today = None
    try:
        # make new directory in outdir named with today's date
        today = datetime.datetime.now().strftime('%Y-%m-%d')
        conf['OUT_SUBDIR'] = os.path.abspath(os.path.join(conf['OUTDIR'], today))
        # If a second crawl runs on the same day, move the earlier output
        # aside (timestamped) rather than overwriting it.
        if os.path.exists(conf['OUT_SUBDIR']):
            backup_dir = '%s_backup-%s' %\
                (conf['OUT_SUBDIR'], datetime.datetime.now().strftime('%H-%M-%S'))
            logging.warn('Subdirectory for today already exists; moving to: %s', backup_dir)
            shutil.move(conf['OUT_SUBDIR'], backup_dir)
        os.makedirs(conf['OUT_SUBDIR'])
        # make a subdir for each user agent
        for user_agent_tag in conf['USER_AGENTS']:
            os.makedirs(os.path.join(conf['OUT_SUBDIR'], user_agent_tag))
        # write manifest file
        # TODO: add crawl date
        manifest = {
            'user-agents': conf['USER_AGENTS'],
        }
        manifest_file = os.path.join(conf['OUT_SUBDIR'], 'crawl-manifest.json')
        with open(manifest_file, 'w') as f:
            json.dump(manifest, f)
        logging.info('Set up output subdirectory: %s', conf['OUT_SUBDIR'])
    except:
        logging.exception('Error preparing output directory')
        sys.exit(-1)
    timelog.record_time('Set up output directory')
    ##
    ## Prepare URL list
    ##
    # Only build a URL list when none was supplied via the configuration.
    if not conf['URL_FILE']:
        try:
            alexa_url_list = '%s/alexa_url_list.txt' % conf['TEMPDIR']
            prepared_url_list = '%s/prepared_url_list.txt' % conf['TEMPDIR']
            # Get top 500 Alexa URLs
            # TODO: top 500
            # NOTE(review): shell=True with interpolated paths is shell-injection
            # prone; paths here come from trusted conf, but see the TODOs.
            alexa_cmd = '%s -n 500 > %s' % (ALEXA_URL_FETCHER, alexa_url_list)
            logging.info('Getting Alexa URLs: %s' % alexa_cmd)
            subprocess.check_call(alexa_cmd, shell=True) # TODO: careful!
            # Make a file with the HTTP and HTTPS versions of those URLs
            prepare_cmd = '%s %s %s' %\
                (URL_PREPARER, alexa_url_list, prepared_url_list)
            logging.info('Preparing URL list: %s' % prepare_cmd)
            subprocess.check_call(prepare_cmd.split())
            conf['URL_FILE'] = prepared_url_list
        except:
            logging.exception('Error preparing URL list')
            sys.exit(-1)
        timelog.record_time('Prepared URL list')
    ##
    ## Generate profiles
    ##
    for user_agent_tag in conf['USER_AGENTS']:
        logging.info('Generating profiles for user agent: %s' % user_agent_tag)
        uagent_tmpdir = os.path.join(conf['TEMPDIR'], user_agent_tag)
        uagent_outdir = os.path.join(conf['OUT_SUBDIR'], user_agent_tag)
        ##
        ## STAGE ONE: Capture HARs for the URLs
        ##
        try:
            har_cmd = '%s -f %s -o %s -g %s -t %s -v' %\
                (HAR_GENERATOR, conf['URL_FILE'], uagent_tmpdir,\
                conf['HAR_GENERATOR_LOG'], conf['HAR_GENERATOR_STDOUT'])
            # Pass a custom user-agent string only when one is configured.
            if conf['USER_AGENTS'][user_agent_tag]['string']:
                har_cmd += ' -u "%s"' % conf['USER_AGENTS'][user_agent_tag]['string']
            logging.debug('Running HAR genrator: %s', har_cmd)
            with open(conf['HAR_GENERATOR_STDOUT'], 'a') as f:
                subprocess.check_call(har_cmd, stdout=f,\
                    stderr=subprocess.STDOUT, shell=True) # TODO: careful!
        except:
            logging.exception('Error capturing HARs for user agent %s', user_agent_tag)
            # TODO: mark error?
        timelog.record_time('%s: HARs' % user_agent_tag)
        ##
        ## STAGE ONE-2: Save compressed copy of the HARs
        ##
        try:
            tarball_path = os.path.join(conf['HAR_ARCHIVE_DIR'], '%s_%s.tgz' %\
                (today, user_agent_tag))
            # cd into the temp dir so the tarball holds bare *.har names.
            tar_cmd = '(cd %s && tar -czf %s *.har)' % (uagent_tmpdir, tarball_path)
            logging.debug('Making tarball of HARs: %s', tar_cmd)
            subprocess.check_call(tar_cmd, shell=True) # TODO: careful!
        except:
            logging.exception('Error saving compressed HARs to archive.')
        ##
        ## STAGE TWO: Capture screenshots for the URLs
        ##
        try:
            screenshot_cmd = '%s -f %s -o %s -g %s -v' %\
                (SCREENSHOT_GENERATOR, conf['URL_FILE'], uagent_tmpdir, conf['SCREENSHOT_GENERATOR_LOG'])
            if conf['USER_AGENTS'][user_agent_tag]['string']:
                screenshot_cmd += ' -u "%s"' % conf['USER_AGENTS'][user_agent_tag]['string']
            logging.debug('Running screenshot genrator: %s', screenshot_cmd)
            with open(conf['SCREENSHOT_GENERATOR_STDOUT'], 'a') as f:
                subprocess.check_call(screenshot_cmd, stdout=f,\
                    stderr=subprocess.STDOUT, shell=True) # TODO: careful!
        except:
            logging.exception('Error capturing screenshots for user agent %s', user_agent_tag)
            # TODO: mark error?
        timelog.record_time('%s: screenshots' % user_agent_tag)
        ##
        ## STAGE TWO-2: Copy pickled results from tmpdir to outdir
        ##
        try:
            for pickle_file in glob.glob(uagent_tmpdir + '/*.pickle'):
                shutil.copy(pickle_file, uagent_outdir)
        except:
            logging.exception('Error copying pickled results.')
        ##
        ## STAGE THREE: Generate profiles
        ##
        try:
            profiler_cmd = '%s -d %s -o %s -g %s -v' %\
                (PROFILER, uagent_tmpdir, uagent_outdir, conf['PROFILER_LOG'])
            logging.debug('Running profiler: %s', profiler_cmd)
            with open(conf['PROFILER_STDOUT'], 'a') as f:
                subprocess.check_call(profiler_cmd.split(), stdout=f,\
                    stderr=subprocess.STDOUT)
        except:
            logging.exception('Error profiling user agent %s', user_agent_tag)
            # TODO: mark error?
        timelog.record_time('%s: profiles' % user_agent_tag)
        ##
        ## STAGE FOUR: Prepare image thumbnails
        ##
        try:
            screenshot_dir = os.path.join(uagent_outdir, 'site_screenshots')
            thumbnailer.process_image_dir(screenshot_dir)
        except:
            logging.exception('Error processing thumbnails for user agent %s', user_agent_tag)
            # TODO: mark error?
        timelog.record_time('%s: thumbnails' % user_agent_tag)
    ##
    ## If successful, update main manifest
    ##
    # TODO: only update if everything was OK?
    try:
        # read the current manifest
        main_manifest_file = os.path.join(conf['OUTDIR'], 'main-manifest.json')
        if os.path.exists(main_manifest_file):
            with open(main_manifest_file, 'r') as f:
                main_manifest = json.load(f)
            if 'dates' not in main_manifest:
                main_manifest['dates'] = []
        else:
            main_manifest = {'dates': []}
        # add today's crawl
        if today not in main_manifest['dates']:
            main_manifest['dates'].append(today)
        # keep dates newest-first for the dashboard UI
        main_manifest['dates'] = sorted(main_manifest['dates'], reverse=True)
        with open(main_manifest_file, 'w') as f:
            json.dump(main_manifest, f)
    except:
        logging.exception('Error saving main manifest')
    ##
    ## Purge old crawl data
    ##
    # keep only the last week of screenshots
    try:
        logging.info('Purging old screenshots')
        purge.purge_screenshots(conf['OUTDIR'], 7)
    except:
        logging.exception('Error purging old screenshots')
    # keep the last week of HAR tarballs plus one per week before that
    try:
        logging.info('Purging old HAR archives')
        purge.purge_hars(conf['HAR_ARCHIVE_DIR'], 7, 7)
    except:
        logging.exception('Error purging old HAR archives')
    # delete stdout files larger than 10 MB
    try:
        logging.info('Purging large stdout logs')
        purge.purge_logs(os.path.join(conf['PREFIX'], 'logs'), 10*1024*1024)
    except:
        logging.exception('Error purging large logs')
    ##
    ## Copy files to web server
    ##
    try:
        # make an rsync exclude file
        rsync_exclude_path = os.path.join(conf['TEMPDIR'], 'rsync.exclude')
        with open(rsync_exclude_path, 'w') as f:
            for entry in conf['RSYNC_EXCLUDE']:
                f.write('%s\n' % entry)
        # TODO: remove
        # renew kerberos ticket
        #subprocess.check_call('kinit -R'.split())
        # sync files
        rsync_cmd = '%s -avz --no-g --delete --delete-excluded --exclude-from=%s %s %s' %\
            (RSYNC, rsync_exclude_path, conf['OUTDIR'], conf['WEB_SERVER_DIR'])
        logging.debug('Running rsync: %s', rsync_cmd)
        subprocess.check_call(rsync_cmd.split())
    except:
        logging.exception('Error copying profiles to web server')
    timelog.record_time('Uploaded to AFS')
    ##
    ## Send summary email
    ##
    try:
        # generate HAR summary file
        har_summary_path = os.path.join(conf['TEMPDIR'], 'har_summary.txt')
        checker_cmd = '%s %s -f %s > %s'\
            % (RESULT_CHECKER, conf['OUTDIR'], 'har_generator_results.pickle',\
            har_summary_path)
        logging.debug('Running checker for HAR files: %s', checker_cmd)
        subprocess.check_call(checker_cmd, shell=True) # TODO: careful!
        # generate screenshot summary file
        screenshot_summary_path = os.path.join(conf['TEMPDIR'], 'screenshot_summary.txt')
        checker_cmd = '%s %s -f %s > %s'\
            % (RESULT_CHECKER, conf['OUTDIR'],\
            'screenshot_generator_results.pickle', screenshot_summary_path)
        logging.debug('Running checker for screenshot files: %s', checker_cmd)
        subprocess.check_call(checker_cmd, shell=True) # TODO: careful!
        # email summary files
        # NOTE(review): eval() of the SMTP conf executes arbitrary code;
        # the conf file must be trusted.
        smtp_conf = None
        with open(conf['SMTP_CONF'], 'r') as f:
            smtp_conf = eval(f.read())
        send_mail('[email protected]',\
            ['[email protected]'],\
            'HTTPS Dashboard Crawl Summary',\
            '%s\n\n' % timelog,\
            smtp_conf['server'],\
            smtp_conf['credentials'],\
            files=[har_summary_path, screenshot_summary_path])
    except:
        logging.exception('Error sending summary email.')
    ##
    ## Delete temp directories
    ##
    #shutil.rmtree(conf['TEMPDIR'])
    logging.info('Done.')
if __name__ == "__main__":
    # set up command line args
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter,\
        description='Manage all the stages of site crawling for HTTPS Dashboard.')
    parser.add_argument('-c', '--config', default='./default.conf', help='Manager configuration file')
    parser.add_argument('-q', '--quiet', action='store_true', default=False, help='only print errors')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='print debug info. --quiet wins if both are present')
    #parser.add_argument('-g', '--logfile', default=None, help='Path for log file.')
    # NOTE: `args` and `conf` are module-level globals read directly by
    # setup_logging() and main().
    args = parser.parse_args()
    # load conf
    conf = load_conf(args.config)
    # set up logging (must follow load_conf: handlers use paths from conf)
    setup_logging()
    main()