-
Notifications
You must be signed in to change notification settings - Fork 0
/
grabber4.py
63 lines (53 loc) · 1.81 KB
/
grabber4.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import os, glob, string
from lxml import etree
def _clean_field(value):
    """Return *value* as a single-line string that is safe inside a quoted field.

    ``None`` becomes an empty string. Any further character replacements
    belong here so that every column is treated the same way.
    """
    if value is None:
        return ""
    # A newline would split the record over several lines, and a bare double
    # quote would end the field early, so newlines become spaces and quotes
    # are doubled (the usual CSV escape).
    return value.strip().replace('\n', ' ').replace('"', '""')


def write_line(f, year, issue, title, lidnr, alinea):
    """Write one record to *f* as a single line of five double-quoted fields.

    Each value is stripped of surrounding whitespace, has embedded newlines
    replaced by spaces and embedded double quotes doubled, and is then
    written as ``"year", "issue", "title", "lidnr", "alinea"`` followed by a
    newline.

    Args:
        f: Writable file-like object; only ``f.write`` is used.
        year, issue, title, lidnr, alinea: Field values (strings or ``None``).
    """
    fields = [_clean_field(v) for v in (year, issue, title, lidnr, alinea)]
    f.write('"{0}", "{1}", "{2}", "{3}", "{4}"\n'.format(*fields))
def _node_text(node):
    """Return the text of an lxml element as a native ``str`` ('' if it has none)."""
    text = node.text or ""
    # NOTE(review): on Python 2 lxml may hand back ``unicode``; it is encoded
    # to UTF-8 there so it can be written to the plain output file. A value
    # that already is ``str`` is passed through unchanged.
    if not isinstance(text, str):
        text = text.encode('utf-8')
    return text


def parse_stadsblad(fname):
    """Extract the 'lid' entries of one Staatsblad XML file into ``<fname>.txt``.

    The output file gets one line per ``lid`` element found below each
    ``wlid`` element, written through ``write_line`` with the columns
    year, issue, title, lid number and alinea text.

    Args:
        fname: Path of the XML file to parse; the output path is this name
            with ``.txt`` appended.

    Raises:
        IndexError: If the document has no ``stb`` element, the first ``stb``
            has fewer than three children, or a ``wlid`` has no ``al`` child.
        Whatever ``etree.parse`` raises for an unreadable or malformed file.
    """
    print("Now processing file {0}".format(fname))
    dom = etree.parse(fname)
    # Open the output once, up front; opening it inside the loops would
    # truncate it on every iteration.
    with open("{0}.txt".format(fname), "w+") as f:
        # Year and issue are taken from the 2nd and 3rd child of the first <stb>.
        stb = dom.xpath('//stb')[0]
        year = _node_text(stb[1])
        issue = _node_text(stb[2])
        for wlid in dom.xpath('//wlid'):
            # The first direct <al> child of the wlid serves as the title.
            title = _node_text(wlid.xpath('al')[0])
            # './/lid' is relative to this wlid; a bare '//lid' would match
            # every lid in the whole document for every wlid.
            for lid in wlid.xpath('.//lid'):
                # Reset per lid so a lid without <nr> or <al> does not inherit
                # the values of the previous one.
                lidnr = ""
                alinea = ""
                for child in lid:
                    if child.tag == 'nr':
                        lidnr = _node_text(child)
                    elif child.tag == 'al':
                        # NOTE(review): with several <al> children only the
                        # last one is kept - confirm that is intended.
                        alinea = _node_text(child)
                # NOTE(review): the original indentation was lost; this assumes
                # one output row per lid - confirm against the intended output.
                write_line(f, year, issue, title, lidnr, alinea)
def main(data_dir="data/", pattern="stb-1995-295.xml"):
    """Run ``parse_stadsblad`` on every file in *data_dir* matching *pattern*.

    The process working directory is changed to *data_dir*, so the output
    files are created next to the inputs. A file that fails to parse is
    reported together with the actual error and the remaining files are
    still processed.

    Args:
        data_dir: Directory that holds the XML files.
        pattern: ``glob`` pattern, relative to *data_dir*, selecting the files.
    """
    try:
        os.chdir(data_dir)
    except OSError as ex:
        print("(!!!) Cannot enter data directory {0!r}: {1}".format(data_dir, ex))
        return
    for fname in glob.glob(pattern):
        try:
            parse_stadsblad(fname)
        except Exception as ex:
            # Best-effort batch run: report what actually failed instead of
            # only guessing, then carry on with the next file.
            print("(!!!) Something went wrong, possibly a file with the wrong format.")
            print("      {0}: {1}: {2}".format(fname, type(ex).__name__, ex))
        else:
            print("=" * 80)
# Script entry point: run the batch only when this file is executed directly
# (not when it is imported), then print a closing message.
if __name__ == "__main__":
    main()
    print("My job here is done. Goodbye!")