dump.py
#!/usr/bin/env python3
import re
import urllib.error
import urllib.parse
import urllib.request
from bs4 import BeautifulSoup
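# Note: BeautifulSoup and the html5lib parser used below are third-party
# packages (pip install beautifulsoup4 html5lib)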
# Some configurable things
DEFAULT_ENTRY_URL = 'http://kirjoitusalusta.fi/hacklab'
ETHERPAD_BASE = 'http://kirjoitusalusta.fi/'
# How to extract the pad name from the pad URL
PAD_NAME_RE = re.compile('^%s(?P<padname>[^/]+).*$' % ETHERPAD_BASE.replace('http://', 'https?://'))
# How to form the HTML export URL; takes a single %s where the pad name goes
PAD_EXPORT_URL_FORMAT = 'http://kirjoitusalusta.fi/ep/pad/export/%s/latest?format=html'
# By default, recurse to any and all URLs under this same etherpad instance
PAD_RECURSE_HREF_RE = re.compile('^%s' % ETHERPAD_BASE.replace('http://', 'https?://'))
# Again takes a single %s, which is the pad name
PAD_EXPORT_FILENAME = '%s.html'
# Pad URL and export URL example (as a reminder):
# http://kirjoitusalusta.fi/hacklab
# http://kirjoitusalusta.fi/ep/pad/export/hacklab/latest?format=html
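# Quick check of PAD_NAME_RE against the example above (a doctest-style
# sketch; 'hacklab' is the pad name the pattern should extract):
#   >>> PAD_NAME_RE.search('http://kirjoitusalusta.fi/hacklab').group('padname')
#   'hacklab'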
class jobmanager:
    def __init__(self, entry_url):
        self.job_queue = []
        self.seen_urls = set()
        self.fetcher = fetcher(ETHERPAD_BASE, self)
        # Seed the queue via add_to_queue so the entry URL's http/https
        # variants are marked as seen just like every other URL
        self.add_to_queue(entry_url)

    def add_to_queue(self, url):
        """Add given URL to the job queue unless it has been added there already"""
        if url in self.seen_urls:
            return False
        self.job_queue.append(url)
        # Mark both the http and https variants as seen so the same pad
        # is not fetched twice under different schemes
        self.seen_urls.add(url)
        self.seen_urls.add(url.replace('http://', 'https://'))
        self.seen_urls.add(url.replace('https://', 'http://'))
        return True

    def run(self):
        while self.job_queue:
            url = self.job_queue.pop(0)
            self.fetcher.fetch(url)
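# Example usage (a sketch; assumes the pad above is reachable): crawling
# from a single entry URL just means seeding the queue and letting run()
# drain it:
#   jm = jobmanager('http://kirjoitusalusta.fi/hacklab')
#   jm.run()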
class fetcher:
    """Takes a pad URL (under ETHERPAD_BASE), dumps it as HTML, then follows any links to the same etherpad server so those pads get dumped as well"""
    def __init__(self, base, queuemanager):
        self.base = base
        # This is just a reference so we can push jobs to the queue
        self.queuemanager = queuemanager

    def fetch(self, pad_url):
        """Fetch a single HTML export of a pad, add any other pads found in it to the job queue and rewrite their links to point to local files"""
        m = PAD_NAME_RE.search(pad_url)
        if not m:
            return False
        pad_name = m.group('padname')
        htmlfile = PAD_EXPORT_FILENAME % pad_name
        export_url = PAD_EXPORT_URL_FORMAT % pad_name
        print("Fetching %s" % export_url)
        try:
            fp = urllib.request.urlopen(export_url)
        except urllib.error.URLError as e:
            print("Failed to fetch %s: %s" % (export_url, e))
            return False
        soup = BeautifulSoup(fp, "html5lib")
        # Find all links that point back to this same etherpad instance
        recurse_links = soup.find_all('a', href=PAD_RECURSE_HREF_RE)
        for tag in recurse_links:
            # Double-check that the URL is a sane pad URL
            new_pad_url = tag['href']
            m2 = PAD_NAME_RE.search(new_pad_url)
            if not m2:
                continue
            new_pad_name = m2.group('padname')
            new_htmlfile = PAD_EXPORT_FILENAME % new_pad_name
            # Add to the processing list
            self.queuemanager.add_to_queue(new_pad_url)
            # Rewrite the link to point to the local file right away
            # (doing it afterwards would be slightly safer, but is just
            # too much work for now)
            tag['href'] = './%s' % new_htmlfile
        # Dump the soup to a file
        with open(htmlfile, 'wb') as f:
            f.write(soup.prettify().encode('utf-8'))
        return True
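# Net effect of fetch(): http://kirjoitusalusta.fi/hacklab is saved as
# ./hacklab.html, with any in-page pad links rewritten to the matching
# ./<padname>.html files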
if __name__ == '__main__':
    # TODO: allow specifying the entry URL on the command line?
    # Ugly, but so is keeping a separate shell script just to wrap these
    # git commands around this Python program..
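    # A minimal sketch for the TODO above (an assumption, not current
    # behavior): fall back to DEFAULT_ENTRY_URL when no argument is given:
    #   import sys
    #   entry_url = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_ENTRY_URL
    #   jm = jobmanager(entry_url)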
    import os
    os.system('git pull')
    jm = jobmanager(DEFAULT_ENTRY_URL)
    jm.run()
    os.system('git add *.html')
    os.system("git commit -m 'automatic commit from the dump.py script'")
    os.system('git push')
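# Typical invocation (assumes the current working directory is the git
# checkout that should receive the exported HTML files):
#   $ ./dump.py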