-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmain.py
executable file
·153 lines (125 loc) · 4.77 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import argparse
import re
# URL template for a language's "simple_browse" page on the IDS site; the
# single "{}" placeholder is filled with the language id via str.format().
IDS_URL = (
    "http://lingweb.eva.mpg.de/cgi-bin/ids/ids.pl?"
    "com=simple_browse&lg_id={}"
)
# Matches "<text> (<note>)" from the start of the string.  The first group is
# greedy, so it captures everything up to the last "(" that is still followed
# by a ")" -- including any whitespace in front of that bracket.
EXTRA_BRACKET_INFO = re.compile(r'(.*)\s?\((.*)\)')


def process_word(word):
    """Split a dictionary cell entry into its bracketed note and its word.

    If *word* contains a trailing "(...)" group, the text inside the brackets
    is returned as the comment and the text in front of it as the word; any
    text after the closing bracket is discarded.  Whitespace inside the word
    is replaced by "<b/>" tags, one tag per whitespace character.

    Args:
        word: Raw text of a single word/phrase.

    Returns:
        A ``(comment, word)`` tuple.  ``comment`` is "" when there is no
        bracketed note; ``word`` may be "" when the entry was only a note.
    """
    com = ""
    commenting = EXTRA_BRACKET_INFO.match(word)
    if commenting:
        word, com = commenting.groups()
    # The greedy first group keeps the blank that separates the word from its
    # bracket; without stripping, "foo (bar)" would become "foo<b/>".
    word = word.strip()
    word = re.sub(r'\s', '<b/>', word)
    return (com, word)
# Language codes whose word column uses a dedicated element id instead of the
# default "#data_1".  The ids are looked up verbatim in the fetched page.
_LANGUAGE_SELECTORS = {
    'rus': '#russian',
    'por': '#portugese',
    'fra': '#french',
    'spa': '#spanish',
}

# "trans" that is not the tail of "intrans".
_TRANSITIVE = re.compile(r'(?<!in)trans')


def _fetch_soup(lg_id):
    """Download the IDS browse page for *lg_id* and return it parsed."""
    url = IDS_URL.format(lg_id)
    print('Sending request to '+url)
    # requests never times out by default; bound the wait and fail loudly on
    # an HTTP error instead of parsing an error page.
    page = requests.get(url, timeout=60)
    page.raise_for_status()
    page.encoding = 'utf-8'
    print('Fetched resource.')
    return BeautifulSoup(page.text, 'lxml')


def _selector_for(lang_code):
    """Return the CSS selector of the word column for *lang_code*."""
    return _LANGUAGE_SELECTORS.get(lang_code, '#data_1')


def _grammar_tags(english_text):
    """Build the "<s n=.../>" tag string from markers in the English gloss."""
    tags = ""
    if 'noun' in english_text:
        tags += '<s n="n"/>'
    if 'vb' in english_text:
        tags += '<s n="vblex"/>'
    # A plain substring test would also fire on "intrans".
    if _TRANSITIVE.search(english_text):
        tags += '<s n="tv"/>'
    if 'intrans' in english_text:
        tags += '<s n="iv"/>'
    return tags


def _usable_words(cell_text):
    """Split one table cell into processed ``(comment, word)`` pairs.

    Alternatives are separated by "/", "," or ";".  Empty alternatives, the
    "--" placeholder and entries whose word part is empty are dropped.
    """
    pieces = cell_text.strip().replace('/', ';').replace(',', ';').split(';')
    usable = []
    for piece in pieces:
        piece = piece.strip()
        if not piece or piece == '--':
            continue
        comment, word = process_word(piece)
        if word:
            usable.append((comment, word))
    return usable


def scrape_words(lg_id, lg_id2):
    """Scrape two IDS language pages and pair up their words row by row.

    Both pages are fetched, the text of the first ``<a>`` element of each is
    taken as that page's language code, and the word cells of the two pages
    are walked in lockstep together with the "#english" cells.  Every
    alternative of the left cell is combined with every alternative of the
    right cell.

    Args:
        lg_id: IDS language id of the left language.
        lg_id2: IDS language id of the right language.

    Returns:
        ``(data, lg1, lg2)`` where ``data`` is a list of
        ``(left_word, right_word, comment)`` tuples (words carry the grammar
        tags derived from the English gloss) and ``lg1``/``lg2`` are the
        language codes read from the pages.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            response.
        IndexError: If a page contains no ``<a>`` element.
    """
    soupl = _fetch_soup(lg_id)
    soupr = _fetch_soup(lg_id2)
    lg1 = soupl.select('a')[0].text
    lg2 = soupr.select('a')[0].text

    data = []
    # NOTE: the English glosses are always read from the right-hand page.
    tables = zip(soupl.select(_selector_for(lg1)),
                 soupr.select(_selector_for(lg2)),
                 soupr.select("#english"))
    for lword_out, rword_out, english in tables:
        cells = zip(lword_out.select('tr > td'),
                    rword_out.select('tr > td'),
                    english.select('tr > td'))
        for lword, rword, eng_word in cells:
            grammar_info = _grammar_tags(eng_word.text)
            # Real lists, not map() iterators: the right-hand alternatives
            # are iterated once per left-hand alternative.
            left_words = _usable_words(lword.text)
            right_words = _usable_words(rword.text)
            for comment1, p1 in left_words:
                for comment2, p2 in right_words:
                    data.append((p1+grammar_info,
                                 p2+grammar_info,
                                 comment1+comment2))
    return (data, lg1, lg2)
def bidix_output(data):
    """Render word pairs as the text of a bilingual-dictionary XML document.

    Each ``(key, value, comment)`` triple becomes one ``<e>`` entry with
    *key* on the left and *value* on the right.  Pairs in which either side
    still contains "(" are written inside an XML comment ("gave up on")
    rather than as a live entry.  A non-empty *comment* is appended to the
    line as an XML comment.

    Args:
        data: Iterable of ``(key, value, comment)`` string triples.

    Returns:
        The complete XML document as one string.
    """
    def comment_safe(text):
        # XML does not allow "--" inside a comment; separate adjacent hyphens
        # so the generated file stays well-formed.
        return re.sub(r'-(?=-)', '- ', text)

    entry = " <e><p><l>{}</l> <r>{}</r></p></e>"
    # Collect the pieces and join once instead of growing a string in a loop.
    parts = [
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<!-- Data scraped from Intercontinental Dictionary Series -->\n'
        '<dictionary>\n'
        '<section id="main" type="standard">\n'
    ]
    for key, value, comment in data:
        if '(' in key or '(' in value:
            parts.append(" <!-- gave up on: ")
            parts.append(comment_safe(entry.format(key, value)))
            parts.append(" -->")
        else:
            parts.append(entry.format(key, value))
        if comment:
            parts.append(" <!-- {} --> \n".format(comment_safe(comment)))
        else:
            parts.append("\n")
    parts.append("\n</section>\n</dictionary>\n")
    return "".join(parts)
def main():
    """Command-line entry point: scrape two languages and write the bidix.

    Takes the two language ids as positional arguments.  The output goes to
    the path given with ``--file``/``-f``, or to
    ``apertium-<lang1>-<lang2>.dix`` built from the language codes returned
    by ``scrape_words``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "lg_id1", help="Language ID of the first language (left language)",
        type=int
    )
    parser.add_argument(
        "lg_id2", help="Language ID of the second language (right language)",
        type=int)
    parser.add_argument("--file", "-f", help="Output file name for bidix",
                        default=None)
    args = parser.parse_args()

    d_result, lgc1, lgc2 = scrape_words(args.lg_id1, args.lg_id2)
    file_name = args.file or "apertium-{}-{}.dix".format(lgc1, lgc2)
    print("Writing to {}".format(file_name))
    # The XML header declares UTF-8, so write UTF-8 regardless of the
    # platform's default locale encoding.
    with open(file_name, 'w', encoding='utf-8') as fhandle:
        fhandle.write(bidix_output(d_result))


if __name__ == '__main__':
    main()