Skip to content

Commit 5e239ed

Browse files
author
Kalle Westerling
committed
Expand in-code documentation to ease potential contributions
Contributes to #72
1 parent 121f2a3 commit 5e239ed

6 files changed: +196 −46 lines changed

src/alto2txt/extract_publications_text.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,12 @@ def main():
9696
parser = ArgumentParser(
9797
description="Converts XML publications to plaintext articles"
9898
)
99-
parser.add_argument("xml_in_dir", help="Input directory with XML publications")
100-
parser.add_argument("txt_out_dir", help="Output directory for plaintext articles")
99+
parser.add_argument(
100+
"xml_in_dir", help="Input directory with XML publications"
101+
)
102+
parser.add_argument(
103+
"txt_out_dir", help="Output directory for plaintext articles"
104+
)
101105
parser.add_argument(
102106
"-p",
103107
"--process-type",

src/alto2txt/logging_utils.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import logging
66

77

8-
def configure_logging(log_file):
8+
def configure_logging(log_file: str) -> None:
99
"""
1010
Configure console and file logging.
1111
@@ -17,7 +17,9 @@ def configure_logging(log_file):
1717
formatter = logging.Formatter(format)
1818

1919
logging.basicConfig(level=logging.INFO, format=format)
20+
2021
file_logger = logging.FileHandler(log_file)
2122
file_logger.setLevel(logging.INFO)
2223
file_logger.setFormatter(formatter)
24+
2325
logging.getLogger().addHandler(file_logger)

src/alto2txt/multiprocess_xml_to_text.py

+43-6
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,12 @@
1818

1919

2020
def publication_to_text(
21-
publications_dir, publication, txt_out_dir, log_file, downsample=1
22-
):
21+
publications_dir: str,
22+
publication: str,
23+
txt_out_dir: str,
24+
log_file: str,
25+
downsample: int = 1,
26+
) -> None:
2327
"""
2428
Converts issues of an XML publication to plaintext articles and
2529
generates minimal metadata.
@@ -38,20 +42,35 @@ def publication_to_text(
3842
:param downsample: Downsample, converting every Nth issue only
3943
:type downsample: int
4044
"""
41-
# This function will run in a separate process so reconfigure
42-
# logging.
45+
# This function will run in a separate process so reconfigure logging.
4346
configure_logging(log_file)
47+
48+
# Load a set of XSLT files
4449
xslts = xml.load_xslts()
50+
51+
# Set up the publication_dir
4552
publication_dir = os.path.join(publications_dir, publication)
53+
54+
# Check if publication_dir is not a directory
4655
if not os.path.isdir(publication_dir):
4756
logger.warning("Unexpected file: %s", publication_dir)
57+
# TODO: Should this "return" here as well?
58+
# (see spark_xml_to_text.publication_to_text)
59+
60+
# Construct a path to the output directory
4861
publication_txt_out_dir = os.path.join(txt_out_dir, publication)
62+
63+
# Convert the XML files in the publication directory to plaintext articles
64+
# using the XSLT files and saves the resulting plaintext articles in the
65+
# output directory
4966
xml_to_text.publication_to_text(
5067
publication_dir, publication_txt_out_dir, xslts, downsample
5168
)
5269

5370

54-
def publications_to_text(publications_dir, txt_out_dir, log_file, downsample=1):
71+
def publications_to_text(
72+
publications_dir: str, txt_out_dir: str, log_file: str, downsample: int = 1
73+
) -> None:
5574
"""
5675
Converts XML publications to plaintext articles and generates
5776
minimal metadata.
@@ -97,19 +116,37 @@ def publications_to_text(publications_dir, txt_out_dir, log_file, downsample=1):
97116
:type downsample: int
98117
"""
99118
logger.info("Processing: %s", publications_dir)
119+
120+
# Get publications from list of files in publications_dir
100121
publications = os.listdir(publications_dir)
122+
123+
# Set pool size
101124
pool_size = min(multiprocessing.cpu_count(), len(publications))
125+
126+
# Log info
102127
logger.info(
103128
"Publications: %d CPUs: %d Process pool size: %d",
104129
len(publications),
105130
multiprocessing.cpu_count(),
106131
pool_size,
107132
)
133+
134+
# Set up pool for multiprocessing
108135
pool = Pool(pool_size)
136+
137+
# Add publication_to_text to pool asynchronously
109138
for publication in os.listdir(publications_dir):
110139
pool.apply_async(
111140
publication_to_text,
112-
args=(publications_dir, publication, txt_out_dir, log_file, downsample),
141+
args=(
142+
publications_dir,
143+
publication,
144+
txt_out_dir,
145+
log_file,
146+
downsample,
147+
),
113148
)
149+
150+
# Run the multiprocessing and close
114151
pool.close()
115152
pool.join()

src/alto2txt/spark_xml_to_text.py

+33-6
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,12 @@
2121

2222

2323
def publication_to_text(
24-
publications_dir, publication, txt_out_dir, log_file, downsample=1
25-
):
24+
publications_dir: str,
25+
publication: str,
26+
txt_out_dir: str,
27+
log_file: str,
28+
downsample: int = 1,
29+
) -> None:
2630
"""
2731
Converts issues of an XML publication to plaintext articles and
2832
generates minimal metadata.
@@ -41,23 +45,38 @@ def publication_to_text(
4145
:param downsample: Downsample, converting every Nth issue only
4246
:type downsample: int
4347
"""
44-
# This function will run on Spark worker node so reconfigure
45-
# logging.
48+
# This function will run on Spark worker node so reconfigure logging.
4649
configure_logging(log_file)
50+
51+
# Load a set of XSLT files
4752
xslts = xml.load_xslts()
53+
54+
# Set up the publication_dir
4855
publication_dir = os.path.join(publications_dir, publication)
56+
57+
# Check if publication_dir is not a directory
4958
if not os.path.isdir(publication_dir):
5059
logger.warning("Unexpected file: %s", publication_dir)
5160
return
61+
62+
# Construct a path to the output directory
5263
publication_txt_out_dir = os.path.join(txt_out_dir, publication)
64+
65+
# Convert the XML files in the publication directory to plaintext articles
66+
# using the XSLT files and saves the resulting plaintext articles in the
67+
# output directory
5368
xml_to_text.publication_to_text(
5469
publication_dir, publication_txt_out_dir, xslts, downsample
5570
)
5671

5772

5873
def publications_to_text(
59-
publications_dir, txt_out_dir, log_file, num_cores=1, downsample=1
60-
):
74+
publications_dir: str,
75+
txt_out_dir: str,
76+
log_file: str,
77+
num_cores: int = 1,
78+
downsample: int = 1,
79+
) -> None:
6180
"""
6281
Converts XML publications to plaintext articles and generates
6382
minimal metadata.
@@ -105,13 +124,21 @@ def publications_to_text(
105124
:type downsample: int
106125
"""
107126
logger.info("Processing: %s", publications_dir)
127+
128+
# Get publications from list of files in publications_dir
108129
publications = os.listdir(publications_dir)
130+
131+
# Set up Spark + its context
109132
conf = SparkConf()
110133
conf.setAppName(__name__)
111134
conf.set("spark.cores.max", num_cores)
112135
context = SparkContext(conf=conf)
136+
137+
# Parallelize the publications
113138
rdd_publications = context.parallelize(publications, num_cores)
114139
rdd_publications = context.parallelize(publications)
140+
141+
# Map and run the publication_to_text for each publication
115142
rdd_publications.map(
116143
lambda publication: publication_to_text(
117144
publications_dir, publication, txt_out_dir, log_file, downsample

src/alto2txt/xml.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,11 @@ def load_xslts():
9191
:rtype: dict(str: lxml.etree.XSLT)
9292
"""
9393
xsl_transforms = {}
94+
9495
for xslt_name in [METS_18_XSLT, METS_13_XSLT, BLN_XSLT, UKP_XSLT]:
9596
xslt_file = get_path(xslts, xslt_name)
9697
xsl_transforms[xslt_name] = etree.XSLT(etree.parse(xslt_file))
98+
9799
return xsl_transforms
98100

99101

@@ -110,6 +112,7 @@ def get_xml(filename):
110112
document_tree = None
111113
parser = etree.XMLParser()
112114
document_tree = etree.parse(f, parser)
115+
113116
return document_tree
114117

115118

@@ -143,15 +146,19 @@ def get_xml_metadata(document_tree):
143146
# Convert schema_locations from "namespaceURI schemaURI ..."
144147
# to dictionary with namespaceURI:schemaURI
145148
uris = schema_locations.split(" ")
146-
schema_locations = {uris[i]: uris[i + 1] for i in range(0, len(uris), 2)}
149+
schema_locations = {
150+
uris[i]: uris[i + 1] for i in range(0, len(uris), 2)
151+
}
147152
else:
148153
schema_locations = {}
154+
149155
metadata = {}
150156
metadata[XML_ROOT] = root_element_tag
151157
metadata[XML_DOCTYPE] = doctype
152158
metadata[XML_NS] = namespaces
153159
metadata[XML_NO_NS_SCHEMA_LOCATION] = no_ns_schema_location
154160
metadata[XML_SCHEMA_LOCATIONS] = schema_locations
161+
155162
return metadata
156163

157164

0 commit comments

Comments (0)