-
Notifications
You must be signed in to change notification settings - Fork 5
/
extract_dictation.py
executable file
·402 lines (358 loc) · 14 KB
/
extract_dictation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
#!/usr/bin/env python3
"""Extract a mindmap from a dictated text file using ad-hoc conventions."""
# Module metadata.
__author__ = "Joseph Reagle"
__copyright__ = "Copyright (C) 2009-2023 Joseph Reagle"
__license__ = "GPLv3"  # GNU General Public License, version 3
__version__ = "1.0"
import argparse # http://docs.python.org/dev/library/argparse.html
import logging as log
import re
import subprocess
import sys
import time
from pathlib import Path # https://docs.python.org/3/library/pathlib.html
from typing import TextIO
from biblio.fields import (
BIB_FIELDS, # dict of field to its shortcut
BIB_SHORTCUTS, # dict of shortcuts to a field
)
from utils.web import yasn_publish
# The current user's home directory.
# NOTE(review): not referenced anywhere in the visible part of this file.
HOME = Path.home()
# Opening boilerplate of the generated Freeplane map.  It opens the <map>
# element and the root "reading" node and declares the named styles (author,
# title, cite, annotation, quote, paraphrase) that the generated nodes refer
# to through STYLE_REF.  The root node and the map are deliberately left
# open: create_mm() appends the content nodes and then writes the closing
# tags.
# NOTE(review): lines starting with "--" are written with STYLE_REF="default",
# for which no stylenode is declared here.
MINDMAP_PREAMBLE = """<map version="freeplane 1.5.9">
<node TEXT="reading" FOLDED="false" ID="ID_327818409">
<font SIZE="18"/>
<hook NAME="MapStyle">
<map_styles>
<stylenode LOCALIZED_TEXT="styles.root_node" STYLE="oval"
UNIFORM_SHAPE="true" VGAP_QUANTITY="24.0 pt">
<font SIZE="24"/>
<stylenode LOCALIZED_TEXT="styles.user-defined"
POSITION="right" STYLE="bubble">
<stylenode TEXT="author" COLOR="#338800"/>
<stylenode TEXT="title" COLOR="#090f6b"/>
<stylenode TEXT="cite" COLOR="#ff33b8"/>
<stylenode TEXT="annotation" COLOR="#999999"/>
<stylenode TEXT="quote" COLOR="#166799"/>
<stylenode TEXT="paraphrase" COLOR="#8b12d6"/>
</stylenode>
</stylenode>
</map_styles>
</hook>
"""
# CL_CO = {'annotation': '#999999', 'author': '#338800', 'title': '#090f6b',
# 'cite': '#ff33b8', 'author': '#338800',
# 'quote': '#166799', 'paraphrase': '#8b12d6',
# 'default': '#000000', None: None}
# CO_CL = dict([(label, color) for color, label in list(CL_CO.items())])
def clean(text):
    """Clean text and encode it for use inside an XML attribute value.

    Leading and trailing commas, spaces, form feeds and newlines are
    stripped.  XML-special characters are replaced by their predefined
    entities, curly double quotes become ``&quot;``, curly single quotes
    become a plain apostrophe, and en dashes become `` -- ``.

    Args:
        text: the raw string to clean.

    Returns:
        The cleaned string, safe to place between double quotes in an XML
        attribute.
    """
    # TODO: Maybe make use of b.smart_punctuation_to_ascii() and
    # utils_web.escape_XML()
    text = text.strip(", \f\r\n")
    # Order matters: "&" must be escaped first, otherwise the ampersands
    # introduced by the later entity replacements would be escaped again.
    REPLACEMENTS = [
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("\N{APOSTROPHE}", "&apos;"),
        ("\N{QUOTATION MARK}", "&quot;"),
        ("\N{LEFT DOUBLE QUOTATION MARK}", "&quot;"),
        ("\N{RIGHT DOUBLE QUOTATION MARK}", "&quot;"),
        # Curly single quotes are normalized after the apostrophe rule above,
        # so they end up as a literal apostrophe (valid inside a
        # double-quoted attribute).
        ("\N{LEFT SINGLE QUOTATION MARK}", "\N{APOSTROPHE}"),
        ("\N{RIGHT SINGLE QUOTATION MARK}", "\N{APOSTROPHE}"),
        # The spaced form goes first so that " – " does not grow extra spaces.
        (" \N{EN DASH} ", " -- "),
        ("\N{EN DASH}", " -- "),
    ]
    for old, new in REPLACEMENTS:
        text = text.replace(old, new)
    return text
def get_date():
    """Return the current local date as a compact ``YYYYMMDD`` string."""
    return time.strftime("%Y%m%d", time.localtime())
def _close_nodes(mm_fd: TextIO, *levels_open: bool) -> None:
    """Write one closing ``</node>`` tag for every level flag that is true."""
    for is_open in levels_open:
        if is_open:
            mm_fd.write(""" </node>\n""")


def build_mm_from_txt(
    mm_fd: TextIO,
    line: str,
    started: bool,
    in_part: bool,
    in_chapter: bool,
    in_section: bool,
    in_subsection: bool,
    entry: dict,
) -> tuple[bool, bool, bool, bool, bool, dict]:
    """Translate one line of dictated notes into mindmap XML.

    The line is classified by how it starts:

    * ``author = ...`` begins a new bibliographic entry.  The line is split
      into ``token = value`` pairs; author and title nodes are opened and a
      ``cite`` node holding the remaining fields is written.
    * ``summary.`` writes an ``annotation`` node and stores the text in
      ``entry["summary"]``.
    * ``part``, ``chapter``, ``section`` and ``subsection`` open a nested
      ``quote`` node, first closing every open node of the same or a lower
      level.
    * ``--`` writes a ``default`` node.
    * anything else is a note, optionally prefixed by a page number or range
      (arabic or roman); it is written as a ``paraphrase`` node, or as a
      ``quote`` node when it starts or ends with ``excerpt.``.

    Blank lines are ignored.

    Args:
        mm_fd: open text stream the XML is written to.
        line: the line to process.
        started: whether an entry is currently open.
        in_part: whether a part node is currently open.
        in_chapter: whether a chapter node is currently open.
        in_section: whether a section node is currently open.
        in_subsection: whether a subsection node is currently open.
        entry: bibliographic fields of the current entry.

    Returns:
        The updated ``(started, in_part, in_chapter, in_section,
        in_subsection, entry)``.  ``entry`` is a fresh dict when a new entry
        replaced a previous one, so callers must use the returned value.

    Raises:
        Exception: if a citation token is neither a key of BIB_SHORTCUTS nor
            of BIB_FIELDS.
    """
    if line in ("", "\r", "\n"):
        return started, in_part, in_chapter, in_section, in_subsection, entry

    if line.lower().startswith("author ="):
        # and re.match('([^=]+ = (?=[^=]+)){2,}', line, re.I)
        if started:  # close the previous entry before opening a new one
            _close_nodes(mm_fd, in_subsection, in_section, in_chapter, in_part)
            in_subsection = in_section = in_chapter = in_part = False
            mm_fd.write("""</node>\n</node>\n""")
            # Start from a clean slate so that fields of the previous entry
            # (url, keywords, summary, ...) do not leak into this one.
            entry = {"keyword": []}
        started = True

        # should space be optional '(\w+) ?='
        # With one capture group, re.split() yields
        # [prefix, token, value, token, value, ...]; drop the prefix.
        cites = re.split(r"(\w+) =", line)[1:]
        for token, value in zip(cites[::2], cites[1::2], strict=True):
            # Tokens are matched case-insensitively, like the "author =" test.
            token = token.lower()
            log.info(f"{token=}, {value=}")
            if token == "keyword":
                log.info(f"{entry=}")
                entry.setdefault("keyword", []).append(value.strip())
            else:
                entry[token] = value.strip()

        if "author" not in entry:
            entry["author"] = "Unknown"
        if "title" not in entry:
            entry["title"] = "Untitled"
        if "subtitle" in entry:
            entry["title"] += ": " + entry["subtitle"]
            del entry["subtitle"]

        author = clean(entry["author"].title())
        title = clean(entry["title"])
        mm_fd.write(f"""<node STYLE_REF="author" TEXT="{author}" POSITION="RIGHT">\n""")
        if "url" in entry:  # write title with hyperlink
            url = clean(entry["url"])
            mm_fd.write(f""" <node STYLE_REF="title" LINK="{url}" TEXT="{title}">\n""")
        else:  # write plain title
            mm_fd.write(f""" <node STYLE_REF="title" TEXT="{title}">\n""")

        # Everything that is not author/title/url goes into the cite node as
        # space-separated "shortcut=value" pairs.
        citation = ""
        for token, value in sorted(entry.items()):
            if token == "keyword":
                if value:  # an empty list would produce a dangling " kw="
                    citation += " kw=" + " kw=".join(value)
                continue
            if token in ("author", "title", "url"):
                continue
            if token in BIB_SHORTCUTS:  # already a shortcut
                short = token.lower()
            elif token.lower() in BIB_FIELDS:  # full field name
                short = BIB_FIELDS[token.lower()]
            else:
                raise Exception(f"{token=} not in BIB_FIELDS")
            citation += f" {short}={value}"
        citation += f" r={get_date()}"
        mm_fd.write(f""" <node STYLE_REF="cite" TEXT="{clean(citation)}"/>\n""")

    elif summary := re.match(r"summary\.(.*)", line, re.I):
        entry["summary"] = summary.group(1)
        mm_fd.write(
            f""" <node STYLE_REF="annotation" TEXT="{clean(entry["summary"])}"/>\n"""
        )

    # NOTE(review): the structural patterns below match any line that merely
    # starts with the word (e.g. "particularly ..."); confirm this is the
    # intended dictation convention.
    elif re.match("part.*", line, re.I):
        # A new part ends every open level, whether or not a part was open.
        _close_nodes(mm_fd, in_subsection, in_section, in_chapter, in_part)
        in_subsection = in_section = in_chapter = False
        mm_fd.write(f""" <node STYLE_REF="quote" TEXT="{clean(line)}">\n""")
        in_part = True

    elif re.match("chapter.*", line, re.I):
        _close_nodes(mm_fd, in_subsection, in_section, in_chapter)
        in_subsection = in_section = False
        mm_fd.write(f""" <node STYLE_REF="quote" TEXT="{clean(line)}">\n""")
        in_chapter = True

    elif re.match("section.*", line, re.I):
        _close_nodes(mm_fd, in_subsection, in_section)
        in_subsection = False
        # Skip the 9-character "section. " prefix.
        mm_fd.write(f""" <node STYLE_REF="quote" TEXT="{clean(line[9:])}">\n""")
        in_section = True

    elif re.match("subsection.*", line, re.I):
        _close_nodes(mm_fd, in_subsection)
        # Skip the 12-character "subsection. " prefix.
        mm_fd.write(f""" <node STYLE_REF="quote" TEXT="{clean(line[12:])}">\n""")
        in_subsection = True

    elif re.match("(--.*)", line, re.I):
        mm_fd.write(f""" <node STYLE_REF="default" TEXT="{clean(line)}"/>\n""")

    else:
        node_color = "paraphrase"
        line_text = line
        line_no = ""
        # DIGIT_CHARS = '[\dcdilmxv]' # arabic and roman numbers
        # Groups: 1 = first page, 2 = optional "-last page", 3 = the text,
        # 4 = optional trailing "-page".
        PAGE_NUM_PAT = r"^([\dcdilmxv]+)(\-[\dcdilmxv]+)? (.*?)(-[\dcdilmxv]+)?$"
        matches = re.match(PAGE_NUM_PAT, line, re.I)
        if matches:
            page_parts = (matches.group(1), matches.group(2), matches.group(4))
            # lower case roman numbers
            line_no = "".join(part for part in page_parts if part).lower()
            line_text = matches.group(3).strip()
        # "excerpt." at either end marks a direct quote; remove the marker
        # together with the whitespace that separates it from the text.
        if line_text.startswith("excerpt."):
            node_color = "quote"
            line_text = line_text.removeprefix("excerpt.").lstrip()
        if line_text.strip().endswith("excerpt."):
            node_color = "quote"
            line_text = line_text.strip().removesuffix("excerpt.").rstrip()
        note = clean(" ".join((line_no, line_text)))
        mm_fd.write(f""" <node STYLE_REF="{node_color}" TEXT="{note}"/>\n""")

    return started, in_part, in_chapter, in_section, in_subsection, entry
def create_mm(args: argparse.Namespace, text: str, mm_file_name: Path) -> None:
    """Write the mindmap for the dictated notes in `text` to `mm_file_name`.

    The preamble and a "Readings" node are written first, every line of
    `text` is then passed to build_mm_from_txt(), and finally all nodes that
    are still open are closed.  If `args.publish` is set, the last entry is
    handed to yasn_publish().

    Args:
        args: parsed command line; only `args.publish` is read here.
        text: the complete content of the dictation file.
        mm_file_name: path of the mindmap file to (over)write.

    Raises:
        SystemExit: with status 1 if a line raises KeyError while being
            converted; the error and the offending line are printed first.
        KeyError: when publishing and the entry lacks a summary, title or
            url.
    """
    import traceback

    with mm_file_name.open("w", encoding="utf-8", errors="replace") as mm_fd:
        # a bibliographic entry for yasn_publish; there might not be any
        # keywords, so the list always exists
        entry = {"keyword": []}
        started = in_part = in_chapter = in_section = in_subsection = False

        mm_fd.write(f"""{MINDMAP_PREAMBLE}\n<node TEXT="Readings">\n""")

        # Count from 1 so that reported numbers match what an editor shows.
        for line_number, line in enumerate(text.split("\n"), start=1):
            line = line.strip()
            try:
                (
                    started,
                    in_part,
                    in_chapter,
                    in_section,
                    in_subsection,
                    entry,
                ) = build_mm_from_txt(
                    mm_fd,
                    line,
                    started,
                    in_part,
                    in_chapter,
                    in_section,
                    in_subsection,
                    entry,
                )
            except KeyError as err:
                print(err)
                # print_tb() writes the traceback itself and returns None,
                # so its result must not be passed to print().
                traceback.print_tb(err.__traceback__)
                print("\n", line_number, line)
                sys.exit(1)  # non-zero: the conversion failed

        # close the last subsection, section, chapter and part
        for is_open in (in_subsection, in_section, in_chapter, in_part):
            if is_open:
                mm_fd.write("""</node>""")
        if started:
            # close the title and author nodes of the last entry; without an
            # entry there is nothing to close and the XML would be unbalanced
            mm_fd.write("""</node>\n</node>\n""")
        mm_fd.write("""</node>\n""")  # close the "Readings" node
        mm_fd.write("""</node>\n</map>\n""")  # close the document

    log.info(f"{entry=}")
    if args.publish:
        yasn_publish(
            entry["summary"],
            entry["title"],
            None,
            entry["url"],
            " ".join(entry["keyword"]),
        )
def process_args(argv: list[str]) -> argparse.Namespace:
    """Parse the command line and configure logging.

    Args:
        argv: the command-line arguments, without the program name.

    Returns:
        The parsed namespace with `file_names` (list of Path), `publish`,
        `log_to_file` and `verbose`.
    """
    # https://docs.python.org/3/library/argparse.html
    arg_parser = argparse.ArgumentParser(
        description="""Convert dictated notes to mindmap.
        `author` must be first in citation pairs, e.g., "author = ...
        """
    )

    # positional arguments
    arg_parser.add_argument("file_names", nargs="+", type=Path, metavar="FILE_NAMES")

    # optional arguments
    arg_parser.add_argument(
        "-p",
        "--publish",
        action="store_true",
        default=False,
        help="publish to social networks",
    )
    arg_parser.add_argument(
        "-L",
        "--log-to-file",
        action="store_true",
        default=False,
        # Name the file that is actually written below.
        help="log to file extract-dictation.log",
    )
    arg_parser.add_argument(
        "-V",
        "--verbose",
        action="count",
        default=0,
        help="increase verbosity from critical through error, warning, info, and debug",
    )
    # NOTE(review): this reports "0.1" while the module's __version__ is
    # "1.0"; confirm which one is current.
    arg_parser.add_argument("--version", action="version", version="0.1")
    args = arg_parser.parse_args(argv)

    # Each -V lowers the threshold by one level; stop at DEBUG so that extra
    # flags cannot push the level to NOTSET (0) or below.
    log_level = max(log.DEBUG, log.CRITICAL - args.verbose * 10)
    LOG_FORMAT = "%(levelname).4s %(funcName).10s:%(lineno)-4d| %(message)s"
    if args.log_to_file:
        log.basicConfig(
            filename="extract-dictation.log",
            filemode="w",
            level=log_level,
            format=LOG_FORMAT,
        )
    else:
        log.basicConfig(level=log_level, format=LOG_FORMAT)

    return args
if __name__ == "__main__":
    # Parse the command line; this also sets up logging.
    args = process_args(sys.argv[1:])
    log.info(f"{args=}")

    # Convert each dictation file into a sibling ".mm" file and open it.
    for dictation_path in args.file_names:
        # "utf-8-sig" drops a leading byte-order mark, if there is one.
        dictation = dictation_path.read_text(encoding="utf-8-sig")
        mindmap_path = dictation_path.with_suffix(".mm")
        create_mm(args, dictation, mindmap_path)
        open_command = ["open", "-a", "Freeplane.app", mindmap_path]
        subprocess.call(open_command)