-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrevue_to_hugo.py
171 lines (127 loc) · 4.39 KB
/
revue_to_hugo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import html
import sys
import textwrap
from urllib import request
from bs4 import BeautifulSoup
import html2text
# CSS classes used by Revue
# The first class of an element selects its markdown conversion in
# transform_element(). "revue-blockquote", "header-text" and "revue-p" are
# looked up as-is in the downloaded page; "img", "ul" and "ol" are marker
# classes that revue_to_md() assigns itself, so that images and lists can be
# selected by class like everything else.
CLS_BLOCKQUOTE = "revue-blockquote"
CLS_H2 = "header-text"
CLS_IMG = "img"
CLS_P = "revue-p"
CLS_UL = "ul"
CLS_OL = "ol"
def _one_sentence_per_line(text):
    """Break *text* so that every sentence starts on its own line.

    A sentence end is ``.``, ``!`` or ``?`` followed by a single space,
    optionally with a closing markdown bold marker (``**``) between the
    punctuation and the space. That space is replaced by a newline.
    """
    # Same substitutions, in the same order: bold-terminated endings first,
    # then the plain punctuation marks.
    for ending in (".**", "!**", "?**", ".", "!", "?"):
        text = text.replace(f"{ending} ", f"{ending}\n")
    return text
def transform_element(html_element):
    """Render one Revue HTML element as markdown.

    The element's first CSS class decides the conversion: blockquote, header,
    image, paragraph, unordered list or ordered list. An element whose text is
    empty produces an empty string, unless it is an image.

    Returns the stripped markdown followed by a blank line (two newlines).

    Raises ValueError when a non-empty element's first class is not one of
    the handled classes.
    """
    kind = html_element["class"][0]

    # Images are the only elements allowed to have no text: what they render
    # comes from their attributes.
    if kind != CLS_IMG and html_element.text == "":
        return ""

    to_markdown = html2text.HTML2Text()
    to_markdown.body_width = 0

    if kind == CLS_H2:
        markdown = "## " + html_element.text.strip()
    elif kind == CLS_IMG:
        src = html_element.attrs["src"]
        caption = html_element.attrs["alt"].strip()
        markdown = f"![{caption}]({src})\n"
        # The alt text doubles as an italic caption below the image.
        if caption:
            markdown += f"\n_{caption}_"
    elif kind == CLS_BLOCKQUOTE:
        markdown = to_markdown.handle(str(html_element)).strip()
        # One sentence per line, each continuation line staying in the quote.
        for mark in (".", "!", "?"):
            markdown = markdown.replace(f"{mark} ", f"{mark}\n> ")
    elif kind == CLS_P:
        markdown = _one_sentence_per_line(to_markdown.handle(str(html_element)))
    elif kind == CLS_UL:
        # Remove indent.
        # NOTE(review): the pattern is reproduced exactly as it appears in the
        # whitespace-flattened source; confirm the number of leading spaces.
        markdown = to_markdown.handle(str(html_element)).replace(" * ", "* ")
    elif kind == CLS_OL:
        # Remove indent by trimming every line individually.
        rendered = to_markdown.handle(str(html_element))
        markdown = "\n".join(line.strip() for line in rendered.split("\n"))
    else:
        raise ValueError("Unimplemented class")

    return markdown.strip() + "\n\n"
def load_issue(issue_id, base_url="https://revue.dynamicallytyped.com", timeout=30):
    """Download an issue and return its HTML contents.

    Args:
        issue_id: Issue ID as it appears in the issue URL after ``0-``.
        base_url: Root URL of the Revue site, without a trailing slash.
        timeout: Seconds to wait on blocking network operations before
            giving up, so a stalled server cannot hang the script forever.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
    """
    url = f"{base_url}/issues/0-{issue_id}"
    # Send an explicit User-Agent header instead of urllib's default one.
    req = request.Request(
        url, headers={"User-Agent": "Totally a real browser and not a bot, yep"}
    )
    # The context manager closes the response (and its underlying connection)
    # even if reading or decoding raises.
    with request.urlopen(req, timeout=timeout) as response:
        return response.read().decode("utf-8")
def revue_to_md(issue_id):
    """Download a Revue issue and convert it into a Hugo markdown page.

    Returns a ``(number, page)`` tuple: ``number`` is the part of the page
    title before the first ``:`` with ``#`` characters stripped from both
    ends (a string), and ``page`` is the Hugo front matter followed by the
    markdown body.
    """
    soup = BeautifulSoup(load_issue(issue_id), "html.parser")

    # Clean content to make it ready for the transformer.
    # html2text needs semantic HTML for blockquotes, so rename those tags.
    for quote in soup.find_all(class_=CLS_BLOCKQUOTE):
        quote.name = "blockquote"

    # Content is selected purely by class, so lists get a marker class.
    for tag_name, marker in (("ul", CLS_UL), ("ol", CLS_OL)):
        for list_tag in soup.find_all(name=tag_name):
            list_tag.attrs["class"] = [marker]

    # Same for 600-wide images, except the one whose alt text is
    # "Dynamically Typed" (presumably the newsletter's own banner -- confirm).
    for image in soup.find_all("img", width="600"):
        if image.attrs["alt"] != "Dynamically Typed":
            image.attrs["class"] = [CLS_IMG]

    # Extract relevant content.
    wanted = [CLS_BLOCKQUOTE, CLS_H2, CLS_IMG, CLS_P, CLS_UL, CLS_OL]
    elements = soup.find_all(class_=lambda cls: cls in wanted)

    # Page metadata.
    title = html.escape(soup.title.text.partition("|")[0])
    date = soup.find("time").attrs["datetime"].partition("T")[0]
    canonical = soup.find("link", {"rel": "canonical"})
    # Keep only what follows the last ".com" of the canonical URL.
    revue_link = canonical.get("href").rpartition(".com")[2]
    number = title.partition(":")[0].strip("#")

    # Transform content.
    body = "".join(map(transform_element, elements)).strip()

    # Add Hugo info.
    # NOTE(review): the relative indentation inside this literal is reproduced
    # from a whitespace-flattened source; confirm the "- " list item indent.
    front_matter = textwrap.dedent(
        f"""
        ---
        title: "{title}"
        date: {date}
        number: {number}
        aliases:
        - {revue_link}
        ---
        """
    ).strip()

    return number, front_matter + "\n\n" + body
if __name__ == "__main__":
    # Command-line entry point: convert one issue and record its ID.
    if len(sys.argv) != 2:
        # sys.exit() is always available; the bare exit() helper is only
        # injected by the site module and is meant for interactive use.
        sys.exit("Usage: `python revue_to_hugo.py issue_id` (6-digit ID in the URL)")
    issue_id = sys.argv[1]
    issue_number, markdown = revue_to_md(issue_id)
    # Write new issue; the file name is the issue number zero-padded to 3 digits.
    # An explicit encoding keeps non-ASCII text intact regardless of the
    # platform's default encoding.
    issue_path = f"website/content/issues/{int(issue_number):03}.md"
    with open(issue_path, "w", encoding="utf-8") as out_file:
        out_file.write(markdown)
    # Append issue ID to list of IDs
    with open("issue_ids.txt", "a", encoding="utf-8") as ids_file:
        ids_file.write(issue_id + "\n")