Skip to content

Commit 5e239ed

Browse files
author
Kalle Westerling
committed
Expand in-code documentation to ease potential contributions
Contributes to #72
1 parent 121f2a3 commit 5e239ed

6 files changed: +196 −46 lines changed

src/alto2txt/extract_publications_text.py

+6-2
Original file line numberDiff line numberDiff line change
@@ -96,8 +96,12 @@ def main():
9696
parser = ArgumentParser(
9797
description="Converts XML publications to plaintext articles"
9898
)
99-
parser.add_argument("xml_in_dir", help="Input directory with XML publications")
100-
parser.add_argument("txt_out_dir", help="Output directory for plaintext articles")
99+
parser.add_argument(
100+
"xml_in_dir", help="Input directory with XML publications"
101+
)
102+
parser.add_argument(
103+
"txt_out_dir", help="Output directory for plaintext articles"
104+
)
101105
parser.add_argument(
102106
"-p",
103107
"--process-type",

src/alto2txt/logging_utils.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import logging
66

77

8-
def configure_logging(log_file):
8+
def configure_logging(log_file: str) -> None:
99
"""
1010
Configure console and file logging.
1111
@@ -17,7 +17,9 @@ def configure_logging(log_file):
1717
formatter = logging.Formatter(format)
1818

1919
logging.basicConfig(level=logging.INFO, format=format)
20+
2021
file_logger = logging.FileHandler(log_file)
2122
file_logger.setLevel(logging.INFO)
2223
file_logger.setFormatter(formatter)
24+
2325
logging.getLogger().addHandler(file_logger)

src/alto2txt/multiprocess_xml_to_text.py

+43-6
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,12 @@
1818

1919

2020
def publication_to_text(
21-
publications_dir, publication, txt_out_dir, log_file, downsample=1
22-
):
21+
publications_dir: str,
22+
publication: str,
23+
txt_out_dir: str,
24+
log_file: str,
25+
downsample: int = 1,
26+
) -> None:
2327
"""
2428
Converts issues of an XML publication to plaintext articles and
2529
generates minimal metadata.
@@ -38,20 +42,35 @@ def publication_to_text(
3842
:param downsample: Downsample, converting every Nth issue only
3943
:type downsample: int
4044
"""
41-
# This function will run in a separate process so reconfigure
42-
# logging.
45+
# This function will run in a separate process so reconfigure logging.
4346
configure_logging(log_file)
47+
48+
# Load a set of XSLT files
4449
xslts = xml.load_xslts()
50+
51+
# Set up the publication_dir
4552
publication_dir = os.path.join(publications_dir, publication)
53+
54+
# Check if publication_dir is not a directory
4655
if not os.path.isdir(publication_dir):
4756
logger.warning("Unexpected file: %s", publication_dir)
57+
# TODO: Should this "return" here as well?
58+
# (see spark_xml_to_text.publication_to_text)
59+
60+
# Construct a path to the output directory
4861
publication_txt_out_dir = os.path.join(txt_out_dir, publication)
62+
63+
# Convert the XML files in the publication directory to plaintext articles
64+
# using the XSLT files and saves the resulting plaintext articles in the
65+
# output directory
4966
xml_to_text.publication_to_text(
5067
publication_dir, publication_txt_out_dir, xslts, downsample
5168
)
5269

5370

54-
def publications_to_text(publications_dir, txt_out_dir, log_file, downsample=1):
71+
def publications_to_text(
72+
publications_dir: str, txt_out_dir: str, log_file: str, downsample: int = 1
73+
) -> None:
5574
"""
5675
Converts XML publications to plaintext articles and generates
5776
minimal metadata.
@@ -97,19 +116,37 @@ def publications_to_text(publications_dir, txt_out_dir, log_file, downsample=1):
97116
:type downsample: int
98117
"""
99118
logger.info("Processing: %s", publications_dir)
119+
120+
# Get publications from list of files in publications_dir
100121
publications = os.listdir(publications_dir)
122+
123+
# Set pool size
101124
pool_size = min(multiprocessing.cpu_count(), len(publications))
125+
126+
# Log info
102127
logger.info(
103128
"Publications: %d CPUs: %d Process pool size: %d",
104129
len(publications),
105130
multiprocessing.cpu_count(),
106131
pool_size,
107132
)
133+
134+
# Set up pool for multiprocessing
108135
pool = Pool(pool_size)
136+
137+
# Add publication_to_text to pool asynchronously
109138
for publication in os.listdir(publications_dir):
110139
pool.apply_async(
111140
publication_to_text,
112-
args=(publications_dir, publication, txt_out_dir, log_file, downsample),
141+
args=(
142+
publications_dir,
143+
publication,
144+
txt_out_dir,
145+
log_file,
146+
downsample,
147+
),
113148
)
149+
150+
# Run the multiprocessing and close
114151
pool.close()
115152
pool.join()

src/alto2txt/spark_xml_to_text.py

+33-6
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,12 @@
2121

2222

2323
def publication_to_text(
24-
publications_dir, publication, txt_out_dir, log_file, downsample=1
25-
):
24+
publications_dir: str,
25+
publication: str,
26+
txt_out_dir: str,
27+
log_file: str,
28+
downsample: int = 1,
29+
) -> None:
2630
"""
2731
Converts issues of an XML publication to plaintext articles and
2832
generates minimal metadata.
@@ -41,23 +45,38 @@ def publication_to_text(
4145
:param downsample: Downsample, converting every Nth issue only
4246
:type downsample: int
4347
"""
44-
# This function will run on Spark worker node so reconfigure
45-
# logging.
48+
# This function will run on Spark worker node so reconfigure logging.
4649
configure_logging(log_file)
50+
51+
# Load a set of XSLT files
4752
xslts = xml.load_xslts()
53+
54+
# Set up the publication_dir
4855
publication_dir = os.path.join(publications_dir, publication)
56+
57+
# Check if publication_dir is not a directory
4958
if not os.path.isdir(publication_dir):
5059
logger.warning("Unexpected file: %s", publication_dir)
5160
return
61+
62+
# Construct a path to the output directory
5263
publication_txt_out_dir = os.path.join(txt_out_dir, publication)
64+
65+
# Convert the XML files in the publication directory to plaintext articles
66+
# using the XSLT files and saves the resulting plaintext articles in the
67+
# output directory
5368
xml_to_text.publication_to_text(
5469
publication_dir, publication_txt_out_dir, xslts, downsample
5570
)
5671

5772

5873
def publications_to_text(
59-
publications_dir, txt_out_dir, log_file, num_cores=1, downsample=1
60-
):
74+
publications_dir: str,
75+
txt_out_dir: str,
76+
log_file: str,
77+
num_cores: int = 1,
78+
downsample: int = 1,
79+
) -> None:
6180
"""
6281
Converts XML publications to plaintext articles and generates
6382
minimal metadata.
@@ -105,13 +124,21 @@ def publications_to_text(
105124
:type downsample: int
106125
"""
107126
logger.info("Processing: %s", publications_dir)
127+
128+
# Get publications from list of files in publications_dir
108129
publications = os.listdir(publications_dir)
130+
131+
# Set up Spark + its context
109132
conf = SparkConf()
110133
conf.setAppName(__name__)
111134
conf.set("spark.cores.max", num_cores)
112135
context = SparkContext(conf=conf)
136+
137+
# Parallelize the publications
113138
rdd_publications = context.parallelize(publications, num_cores)
114139
rdd_publications = context.parallelize(publications)
140+
141+
# Map and run the publication_to_text for each publication
115142
rdd_publications.map(
116143
lambda publication: publication_to_text(
117144
publications_dir, publication, txt_out_dir, log_file, downsample

src/alto2txt/xml.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -91,9 +91,11 @@ def load_xslts():
9191
:rtype: dict(str: lxml.etree.XSLT)
9292
"""
9393
xsl_transforms = {}
94+
9495
for xslt_name in [METS_18_XSLT, METS_13_XSLT, BLN_XSLT, UKP_XSLT]:
9596
xslt_file = get_path(xslts, xslt_name)
9697
xsl_transforms[xslt_name] = etree.XSLT(etree.parse(xslt_file))
98+
9799
return xsl_transforms
98100

99101

@@ -110,6 +112,7 @@ def get_xml(filename):
110112
document_tree = None
111113
parser = etree.XMLParser()
112114
document_tree = etree.parse(f, parser)
115+
113116
return document_tree
114117

115118

@@ -143,15 +146,19 @@ def get_xml_metadata(document_tree):
143146
# Convert schema_locations from "namespaceURI schemaURI ..."
144147
# to dictionary with namespaceURI:schemaURI
145148
uris = schema_locations.split(" ")
146-
schema_locations = {uris[i]: uris[i + 1] for i in range(0, len(uris), 2)}
149+
schema_locations = {
150+
uris[i]: uris[i + 1] for i in range(0, len(uris), 2)
151+
}
147152
else:
148153
schema_locations = {}
154+
149155
metadata = {}
150156
metadata[XML_ROOT] = root_element_tag
151157
metadata[XML_DOCTYPE] = doctype
152158
metadata[XML_NS] = namespaces
153159
metadata[XML_NO_NS_SCHEMA_LOCATION] = no_ns_schema_location
154160
metadata[XML_SCHEMA_LOCATIONS] = schema_locations
161+
155162
return metadata
156163

157164

0 commit comments

Comments (0)