18
18
19
19
20
20
def publication_to_text (
21
- publications_dir , publication , txt_out_dir , log_file , downsample = 1
22
- ):
21
+ publications_dir : str ,
22
+ publication : str ,
23
+ txt_out_dir : str ,
24
+ log_file : str ,
25
+ downsample : int = 1 ,
26
+ ) -> None :
23
27
"""
24
28
Converts issues of an XML publication to plaintext articles and
25
29
generates minimal metadata.
@@ -38,20 +42,35 @@ def publication_to_text(
38
42
:param downsample: Downsample, converting every Nth issue only
39
43
:type downsample: int
40
44
"""
41
- # This function will run in a separate process so reconfigure
42
- # logging.
45
+ # This function will run in a separate process so reconfigure logging.
43
46
configure_logging (log_file )
47
+
48
+ # Load a set of XSLT files
44
49
xslts = xml .load_xslts ()
50
+
51
+ # Set up the publication_dir
45
52
publication_dir = os .path .join (publications_dir , publication )
53
+
54
+ # Check if publication_dir is not a directory
46
55
if not os .path .isdir (publication_dir ):
47
56
logger .warning ("Unexpected file: %s" , publication_dir )
57
+ # TODO: Should this "return" here as well?
58
+ # (see spark_xml_to_text.publication_to_text)
59
+
60
+ # Construct a path to the output directory
48
61
publication_txt_out_dir = os .path .join (txt_out_dir , publication )
62
+
63
+ # Convert the XML files in the publication directory to plaintext articles
64
+ # using the XSLT files and save the resulting plaintext articles in the
65
+ # output directory
49
66
xml_to_text .publication_to_text (
50
67
publication_dir , publication_txt_out_dir , xslts , downsample
51
68
)
52
69
53
70
54
- def publications_to_text (publications_dir , txt_out_dir , log_file , downsample = 1 ):
71
+ def publications_to_text (
72
+ publications_dir : str , txt_out_dir : str , log_file : str , downsample : int = 1
73
+ ) -> None :
55
74
"""
56
75
Converts XML publications to plaintext articles and generates
57
76
minimal metadata.
@@ -97,19 +116,37 @@ def publications_to_text(publications_dir, txt_out_dir, log_file, downsample=1):
97
116
:type downsample: int
98
117
"""
99
118
logger .info ("Processing: %s" , publications_dir )
119
+
120
+ # Get publications from list of files in publications_dir
100
121
publications = os .listdir (publications_dir )
122
+
123
+ # Set pool size
101
124
pool_size = min (multiprocessing .cpu_count (), len (publications ))
125
+
126
+ # Log the publication count, CPU count and resulting pool size
102
127
logger .info (
103
128
"Publications: %d CPUs: %d Process pool size: %d" ,
104
129
len (publications ),
105
130
multiprocessing .cpu_count (),
106
131
pool_size ,
107
132
)
133
+
134
+ # Set up pool for multiprocessing
108
135
pool = Pool (pool_size )
136
+
137
+ # Add publication_to_text to pool asynchronously
109
138
for publication in os .listdir (publications_dir ):
110
139
pool .apply_async (
111
140
publication_to_text ,
112
- args = (publications_dir , publication , txt_out_dir , log_file , downsample ),
141
+ args = (
142
+ publications_dir ,
143
+ publication ,
144
+ txt_out_dir ,
145
+ log_file ,
146
+ downsample ,
147
+ ),
113
148
)
149
+
150
+ # Stop accepting new tasks, then wait for all worker processes to finish
114
151
pool .close ()
115
152
pool .join ()
0 commit comments