forked from thomasboevith/ddo-cli
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathddo.py
executable file
·442 lines (371 loc) · 15.7 KB
/
ddo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import docopt
import itertools
import logging
import os
import re
import requests
import sys
import time
import urllib
# Program version; interpolated into the usage text below and handed to
# docopt (see the __main__ block) for the --version option.
version = '0.1'
# Module docstring doubling as the docopt interface description: it is
# formatted with the script's base name and the version, then passed to
# docopt.docopt() to parse the command line.
# NOTE(review): docopt only treats indented lines following "Usage:" as usage
# patterns -- confirm the layout of this text matches what docopt expects.
__doc__ = """
ddo.py {version} --- look up words in Den Danske Ordbog
A command-line interface for looking up words in the Danish online dictionary
Den Danske Ordbog which can be found at http://ordnet.dk/ddo
Usage:
{filename} [-S] [-s ...] [-v ...] [-i] <word>
{filename} (-h | --help)
{filename} --version
Options:
-S Very short output (same as -ssss)
-s Short output (add up to four s'es for shorter output).
-i Print word and its inflections only.
-h, --help Show this screen.
--version Show version.
-v Print info (-vv for printing lots of info (debug)).
Examples:
{filename} behændig
Copyright (C) 2016 Thomas Boevith
License: GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>.
This is free software: you are free to change and redistribute it. There is NO
WARRANTY, to the extent permitted by law.
""".format(filename=os.path.basename(__file__), version=version)
class Word:
    """A dictionary headword together with the senses found for it.

    Creating an instance immediately downloads the word page and every
    sense page linked from it (see download()).
    """

    def __init__(self, word, senses=None, numsenses=0):
        """Store the word and fetch its senses.

        word      -- the headword this object represents
        senses    -- optional list that parsed Sense objects are appended
                     to; a new list is created when omitted
        numsenses -- initial number of senses (overwritten by download())
        """
        self.word = word
        # A fresh list per instance: a mutable default argument would be
        # shared by every Word created without an explicit `senses`.
        self.senses = [] if senses is None else senses
        self.numsenses = numsenses
        self.download()

    def download(self):
        """Retrieves dictionary word entry from a word page.

        Each sense page linked from the word page is fetched and parsed;
        every sense that parses is printed and appended to self.senses.
        Exits the program with status 1 when no word page is available.
        """
        # NOTE(review): the lookup reads the global docopt `args` rather
        # than self.word; switching to self.word requires every caller to
        # pass the actual word to the constructor.
        page = get_page(word=args['<word>'])
        if page is None:
            log.debug('Page is empty')
            sys.exit(1)
        log.debug('Got page for word: %s' % args['<word>'])

        # Get all senses of headword from wordpage
        senseurls = getsenseurls(page)
        self.numsenses = len(senseurls)
        for senseurl in senseurls:
            log.debug('senseurl: %s' % (senseurl))
            sensepage = get_page(url=senseurl)
            sense = get_sense(sensepage, headword=self)
            if sense is not None:
                sense.prettyprint()
                self.senses.append(sense)
class Sense:
    """A single sense (dictionary article) of a headword.

    Instances are plain records filled in by the page parser; prettyprint()
    writes them to standard output.
    """

    # Shared counter handing out consecutive ids, starting at 0.
    _id_counter = itertools.count()

    @staticmethod
    def newid():
        """Return the next unused sense id."""
        # next() instead of the .next method so this is not tied to the
        # Python 2 iterator protocol.
        return next(Sense._id_counter)

    def __init__(self, headword=None, sense=None, sensenum=None,
                 sensenumstring=None, senseurl=None, comment=None,
                 pronounciation=None, inflection=None, part_of_speech=None,
                 meanings=None, etymology=None, synonyms=None,
                 examples=None, wordformations=None):
        """Store the given fields and assign a fresh id.

        headword is the owning object (its `numsenses` attribute is read by
        prettyprint()); every other argument is stored unchanged under the
        attribute of the same name.
        """
        self.headword = headword
        self.id = Sense.newid()
        self.sense = sense
        self.sensenum = sensenum
        self.sensenumstring = sensenumstring
        self.senseurl = senseurl
        self.comment = comment
        self.pronounciation = pronounciation
        self.inflection = inflection
        self.part_of_speech = part_of_speech
        self.meanings = meanings
        self.etymology = etymology
        # These two were accepted but silently dropped before.
        self.synonyms = synonyms
        self.examples = examples
        self.wordformations = wordformations

    def prettyprint(self):
        """Prints word sense to standard out.

        Output is controlled by the global docopt `args`:
        -i  prints only the word and its inflected forms;
        -s  (a count) cuts the output short: above 3 only the headline is
            printed, above 2 the comment and inflection are added, above 1
            the etymology as well; otherwise meanings and word formations
            follow.
        Nothing is printed when the sense has no headword text.
        """
        if self.sense is None:
            return

        if args['-i']:
            printu(self.sense)
            if self.inflection is None:
                # No inflection block on the page: the word itself is all
                # there is to print.
                return
            for part in self.inflection.split(','):
                part = part.strip()
                if not part:
                    # Skip empty pieces (e.g. from a trailing comma).
                    continue
                if 'eller' in part:
                    for alternative in part.split('eller'):
                        printu("%s%s" % (self.sense,
                                         alternative.strip().strip('-')))
                elif part.startswith('-'):
                    # A leading dash marks a suffix to attach to the word.
                    printu("%s%s" % (self.sense, part.strip('-')))
                else:
                    printu(part)
            return

        # Headline: word, sense number, pronunciation and part of speech
        # on one line.
        printun(self.sense)
        if self.sensenumstring is not None:
            printun(self.sensenumstring)
        elif self.sensenum is not None:
            printun(self.sensenum)
        if self.pronounciation is not None:
            printun(self.pronounciation)
        if self.part_of_speech is not None:
            printun(self.part_of_speech)
        print
        if args['-s'] > 3:
            return

        if self.comment is not None:
            printu(self.comment)
        if self.inflection is not None:
            print('Bøjning:'),
            printu(self.inflection)
        if args['-s'] > 2:
            print
            return

        if self.etymology is not None:
            printun('Oprindelse:')
            printu(self.etymology)
        if args['-s'] > 1:
            print
            return

        # Truthiness covers both an empty list and an unset (None) value.
        if self.meanings:
            print
            printu('Betydninger:')
            last = len(self.meanings) - 1
            for i, meaning in enumerate(self.meanings):
                meaning_id = meaning.get('id')
                if meaning_id is not None:
                    if len(meaning_id) == 1:
                        printun(str(meaning_id[0]) + '.')
                    elif len(meaning_id) == 2:
                        # NOTE(review): the second component is mapped to a
                        # letter with 0 -> 'a'; confirm that matches the
                        # numbering used in the page ids.
                        printun(str(meaning_id[0]) + '.'
                                + chr(int(meaning_id[1]) + ord('a')))
                    else:
                        printun('.'.join(meaning_id))
                if meaning['topic'] is not None:
                    printun(meaning['topic'].upper())
                printu(meaning['meaning'])
                if meaning['onyms'] is not None:
                    for onym in meaning['onyms']:
                        printu(onym)
                if i < last:
                    print

        if self.wordformations:
            print
            printu('Orddannelser:')
            for formation in self.wordformations:
                printu(formation)

        # Separator line between senses, but not after the last one.
        if (self.headword is not None
                and self.id < self.headword.numsenses - 1):
            printu('-' * 79)
def gettext(soupfind):
    """Get the text of an element, stripping off white space.

    soupfind -- an object offering get_text() (such as the result of a
                BeautifulSoup search), or None when nothing was found

    Returns the stripped text, or None when soupfind is None or its text
    cannot be extracted.
    """
    if soupfind is None:
        return None
    try:
        return soupfind.get_text().strip()
    except Exception:
        # Best effort: anything that cannot yield text counts as "no text".
        # Unlike a bare except this lets SystemExit and KeyboardInterrupt
        # propagate.
        return None
def supernumeral(num, encoding='utf-8'):
    """Convert integer to superscript integer.

    num      -- an integer, or anything int() accepts
    encoding -- 'utf-8' selects superscript digit characters; any other
                value falls back to the plain number in parentheses

    Returns the superscript string, e.g. 12 -> '¹²', or '(12)' for a
    non-UTF-8 encoding.
    """
    if encoding != 'utf-8':
        return '(' + str(num) + ')'

    superscripts = ['⁰', '¹', '²', '³', '⁴', '⁵', '⁶', '⁷', '⁸', '⁹']
    value = int(num)
    digits = ''.join(superscripts[int(digit)] for digit in str(abs(value)))
    # The sign is handled separately: feeding '-' to int() used to raise
    # ValueError for negative numbers.
    if value < 0:
        return '⁻' + digits
    return digits
def getsenseurls(page):
    """Retrieve all URLS for the senses of a word.

    page -- HTML of a word page

    Returns a list with the latin-1 encoded href of every link inside the
    search result boxes of the page's search panel. Logs an error and
    exits with status 1 when the result panel cannot be found.
    """
    soup = BeautifulSoup(page, 'lxml')

    # Look the two levels up separately so a miss on either is reported
    # instead of surfacing as an AttributeError further down.
    panel = soup.find('dl', {'id': 'search-panel'})
    queryresult = None
    if panel is not None:
        queryresult = panel.find('dd', class_='portletResult')
    if queryresult is None:
        log.error('Could not get query results (error: %s)'
                  % 'search result panel not found')
        sys.exit(1)

    senseurls = []
    for resultatbox in queryresult.find_all('div', class_='searchResultBox'):
        for link in resultatbox.find_all('a'):
            href = link.get('href')
            if href is None:
                # Anchor without a target: nothing to fetch.
                continue
            # Convert URL to accepted ASCII encoding
            senseurls.append(href.encode('latin-1'))
    return senseurls
def printu(s):
    """Print s, encoded as UTF-8, followed by a newline."""
    print("%s" % s.encode('utf-8'))
def printun(s):
    """Print s, encoded as UTF-8, without ending the line."""
    # The trailing comma is the Python 2 print-statement form that
    # suppresses the newline, so the next print continues on the same line.
    print("%s" % s.encode('utf-8')),
def get_page(word=None, url=None):
    """Download page for a word using either the word or the complete url.

    word -- the word to look up; used to build the query URL when no url
            is given, and named in the "no results" message
    url  -- complete URL to fetch; takes precedence over word

    Returns the response body when the server answers 200, otherwise None.
    On a 404 the "no results" notice (when word is given), the page's
    warning text and any suggested words found on the page are printed.

    Raises ValueError when neither word nor url is given.
    """
    if url is None:
        if word is None:
            raise ValueError('get_page() needs either a word or a url')
        url = 'http://ordnet.dk/ddo/ordbog?query=' + word
    url = urllib.quote(url, safe=',.:=&/?:')

    try:
        # Without a timeout requests waits indefinitely for a stalled server.
        r = requests.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        log.error('Could not retrieve %s (error: %s)' % (url, e))
        return None

    status_code = r.status_code
    content = r.content
    if status_code == 200:
        log.debug('status code: %s OK' % status_code)
        return content

    if status_code != 404:
        log.debug('request status_code: %s:' % status_code)
        return None

    if word is not None:
        print('Ingen resultater i Den Danske Ordbog for: %s' % word)
    log.debug('status code: %s Not Found' % status_code)

    # The 404 page may carry a warning and a list of similar words.
    soup = BeautifulSoup(content, 'lxml')
    subwarning = gettext(soup.find('div', class_="subWarning"))
    if subwarning is not None:
        print(subwarning),
    try:
        # Drop the "show more" entries so only real suggestions remain.
        for tag in soup.find_all('li', class_='visFlere'):
            tag.replaceWith('')
        for suggestion in soup.find('div',
                                    class_='nomatch').find('ul',
                {'id': 'more-alike-list-long'}).find_all('a'):
            print(gettext(suggestion)),
    except AttributeError:
        # A find() returned None: the page has no suggestion list.
        return None
    return None
# Ids of the boxes that hold a numbered meaning, e.g. "betydning-1-2".
_MEANING_ID_RE = re.compile(r'^betydning-[-0-9]+$')


def _set_span_text(node, css_class, text):
    """Replace the content of every span of css_class below node by text."""
    for span in node.find_all('span', class_=css_class):
        span.string = text


def _underline_links(node):
    """Surround every link below node with underscores."""
    for link in node.find_all('a'):
        link.insert_before('_')
        link.insert_after('_')


def _labelled_list(box, dividers):
    """Return 'LABEL: items' for a box with a label and an inline list.

    dividers is a sequence of (css class, replacement text) pairs applied
    to the inline list first. Returns None when the box lacks the label
    or the list.
    """
    label = gettext(box.find('span', class_='stempel'))
    items = box.find('span', class_='inlineList')
    if label is None or items is None:
        return None
    for css_class, text in dividers:
        _set_span_text(items, css_class, text)
    itemtext = gettext(items)
    if itemtext is None:
        return None
    return label.upper() + ': ' + itemtext


def _parse_meaning(block):
    """Build the meaning dict for one top-level meaning block."""
    # All keys exist up front so consumers never hit a missing key.
    meaning = {'id': None, 'topic': None, 'meaning': '', 'onyms': []}
    onyms = meaning['onyms']
    for box in block.find_all('div', class_='definitionBox'):
        _set_span_text(box, 'dividerStroke', ' * ')

        topic = box.find('span', class_='stempelNoBorder')
        if topic is not None:
            meaning['topic'] = gettext(topic)

        box_id = box.attrs.get('id')
        if box_id is not None and _MEANING_ID_RE.match(box_id):
            # Drop the "betydning-" prefix, keep the numeric components.
            meaning['id'] = box_id[len('betydning-'):].split('-')
            definition = box.find('span', class_='definition')
            if definition is not None:
                meaning['meaning'] = definition.get_text().strip()

        classes = box.attrs.get('class', [])
        if 'onym' in classes:
            _underline_links(box)
            entry = _labelled_list(box, (('dividerSmall', '|'),))
            if entry is not None:
                onyms.append(entry)
        if 'details' in classes:
            entry = _labelled_list(box, (('dividerSmall', '|'),))
            if entry is not None:
                onyms.append(entry)

        for citat in box.find_all('div', class_='rc-box-citater-body'):
            # Replace each source reference by a plain "-- " marker.
            for source in citat.find_all('span', class_='kilde'):
                source.insert_after('-- ')
                source.extract()
            onyms.append(citat.get_text().strip())
    return meaning


def get_sense(sensepage, headword, senseurl=None):
    """Extract elements of a word sense by parsing the HTML page.

    sensepage -- HTML of a sense page, or None when the download failed
    headword  -- object stored as the headword of the new Sense
    senseurl  -- optional URL of the page, used only in error messages

    Returns a filled-in Sense, or None (after logging an error) when the
    page is missing or does not contain a recognisable article.
    """
    # What to name in error messages: the URL when the caller supplied
    # one, otherwise whatever the headword object knows about itself.
    label = senseurl
    if label is None:
        label = getattr(headword, 'word', None)

    if sensepage is None:
        log.error('Page is empty: %s' % label)
        return None

    soup = BeautifulSoup(sensepage, 'lxml')
    s = Sense(headword=headword)
    artikel = soup.find('div', class_='artikel')
    if artikel is None:
        log.error('Could not retrieve artikel for: %s' % label)
        return None

    # Headline: word, sense number and part of speech.
    top = artikel.find('div', class_='definitionBoxTop')
    match = None
    if top is not None:
        match = top.find('span', class_='match')
    if match is None:
        log.error('Could not find headword in artikel for: %s' % label)
        return None
    s.sense = match.find(text=True, recursive=False)
    s.sensenum = gettext(match.find(text=False, recursive=False))
    s.part_of_speech = gettext(top.find('span',
                                        class_='tekstmedium allow-glossing'))

    # Inflection
    box = artikel.find('div', {'id': 'id-boj'})
    if box is not None:
        inflection = box.find('span', class_='tekstmedium allow-glossing')
        if inflection is not None:
            _set_span_text(inflection, 'dividerDouble', '||')
            s.inflection = inflection.get_text()

    # Comment
    box = artikel.find('div', class_='definitionBox')
    if box is not None:
        comment = box.find('span', class_='tekst')
        if comment is not None:
            s.comment = comment.get_text()

    # Pronunciation
    box = artikel.find('div', {'id': 'id-udt'})
    if box is not None:
        pronounciation = box.find('span', class_='lydskrift')
        if pronounciation is not None:
            s.pronounciation = pronounciation.get_text().strip()

    # Etymology
    box = artikel.find('div', {'id': 'id-ety'})
    if box is not None:
        _underline_links(box)
        etymology = box.find('span', class_='tekstmedium allow-glossing')
        if etymology is not None:
            for span in etymology.find_all('span', class_='ordform'):
                # get_text() rather than .string: .string is None as soon
                # as the span has more than one child, e.g. after the
                # underscores were inserted around a link inside it.
                span.string = '/' + span.get_text() + '/'
            _set_span_text(etymology, 'dividerDot', ' * ')
            s.etymology = etymology.get_text().strip()

    # Meanings
    s.meanings = []
    meanings = artikel.find('div', {'id': 'content-betydninger'})
    if meanings is not None:
        for block in meanings.find_all('div', class_='definitionIndent',
                                       recursive=False):
            s.meanings.append(_parse_meaning(block))

    # Word formations
    s.wordformations = []
    wordformations = artikel.find('div', {'id': 'content-orddannelser'})
    if wordformations is not None:
        for box in wordformations.find_all('div', class_='definitionBox'):
            for link in box.find_all('a'):
                link.insert_before('_')
                # Ensure white space after links if necessary. Compared
                # directly instead of via str(), which may not cope with
                # non-ASCII text under Python 2.
                first = link.next
                following = first.next if first is not None else None
                if following == ' ':
                    link.insert_after('_')
                else:
                    link.insert_after('_ ')
            entry = _labelled_list(box, (('dividerSmall', '|'),
                                         ('dividerDouble', '||')))
            if entry is not None:
                s.wordformations.append(entry)
    return s
if __name__ == '__main__':
    # Script entry point: parse the command line, configure logging and
    # look the word up. `args` and `log` are module-level on purpose; the
    # functions and classes above read them as globals.
    start_time = time.time()
    args = docopt.docopt(__doc__, version=str(version))

    log = logging.getLogger(os.path.basename(__file__))
    formatstr = '%(asctime)-15s %(name)-17s %(levelname)-5s %(message)s'
    # -v is a count: none -> warnings only, one -> info, two or more -> debug.
    if args['-v'] >= 2:
        logging.basicConfig(level=logging.DEBUG, format=formatstr)
    elif args['-v'] == 1:
        logging.basicConfig(level=logging.INFO, format=formatstr)
    else:
        logging.basicConfig(level=logging.WARNING, format=formatstr)

    # -S is shorthand for the shortest output level.
    if args['-S']:
        args['-s'] = 4

    log.debug('%s started' % os.path.basename(__file__))
    log.debug('docopt args=%s' % str(args).replace('\n', ''))
    log.info('Looking up: %s' % args['<word>'])

    # Pass the parsed word itself, not the literal '<word>' placeholder, so
    # the Word object records what was actually looked up.
    word = Word(args['<word>'])

    log.debug('Processing time={0:.2f} s'.format(time.time() - start_time))
    log.debug('%s ended' % os.path.basename(__file__))