forked from udieckmann/Kielipankki-utilities
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathlibylesentence.py
115 lines (88 loc) · 3.55 KB
/
libylesentence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# For YLE SV material, meant to be adaptable to other materials.
# ?normalspaced(s) = normalization of whitespace
# sentences(s) = segmentation of a paragraph into sentences
import re
def sentences(para, *, info=False):
    '''Partition an assumed paragraph (a string) into heuristically
    somewhat sentence-like parts, yielding them in order.

    This will fail, but hopefully not often, and then some of the
    time the failure is due to the failure of the assumption that the
    input is a paragraph, which is not a failure of this function at
    all. Garbage in, garbage out.

    Assume all internal whitespace is normalized to one space already,
    and that initial and final whitespace is normalized away. (Some
    internal newlines may have been meaningful and are gone at this
    stage. Investigating that would be expensive.)

    para -- the paragraph text
    info -- keyword-only; when true, the segmentation is also printed
            with logsegments for inspection

    Every yielded part except the last ends where an accepted
    sentence_boundary match ends, so it includes the text of that
    match. The parts concatenate back to para. An empty paragraph
    yields nothing (and logs nothing).
    '''

    # An empty paragraph contains no sentence at all, not one empty
    # sentence.
    if not para:
        return

    # .finditer returns match objects. Each accepted match is the tail
    # of a non-final sentence of the paragraph; candidates that isbad
    # rejects are dropped.
    ends = [end
            for end in sentence_boundary.finditer(para)
            if not isbad(end)]

    if info:
        logsegments(para, ends)

    # Yield each sentence that ends at an end.
    start = 0
    for end in ends:
        yield para[start:end.end()]
        start = end.end()

    # Yield the sentence that does not end at an end: the remainder
    # after the last boundary, or the whole paragraph when there was
    # no boundary.
    yield para[start:]
def logsegments(para, ends):
    '''Print a trace of how para is cut at the match objects in ends.

    For each match, the text from the previous cut up to the start of
    the match is printed followed by '|', then the matched text itself
    followed by '||'. The text after the last match (the whole para
    when ends is empty) is printed followed by '||'. Every line is
    prefixed with '## ', and a bare '##' line closes the trace.
    '''
    cursor = 0
    for boundary in ends:
        tail_begin, tail_end = boundary.span()
        print('## ', para[cursor:tail_begin], '|', sep = '')
        print('## ', para[tail_begin:tail_end], '||', sep = '')
        cursor = tail_end

    # With no matches the cursor is still 0, so this is the whole para.
    print('## ', para[cursor:], '||', sep = '')
    print('##')
# Compiled pattern for candidate sentence boundaries inside a
# paragraph. It is compiled with re.VERBOSE, so the '#' lines inside
# the pattern are regex comments and whitespace outside character
# classes is ignored; the spaces inside [...] are literal.
#
# A match (the "tail" of a sentence) is: a run of non-space
# characters, any number of spaces, double quotes or closing
# parentheses, one or more of . ? !, again any number of spaces,
# double quotes or closing parentheses, and finally one space.
#
# The match is only accepted when a lookahead (not part of the match)
# finds any number of hyphens, opening parentheses, double quotes or
# spaces followed by one of A-Z, Å, Ä, Ö or a digit - a possible head
# of the following sentence.
sentence_boundary = re.compile(R'''
# One or more of .?! optionally followed by any ")
# then space
# then optionally any "( then an alpanumeric not in lower case.
# Is that good enough?
# Not quite - preceding part must also be somewhat substantial.
# Maybe following too? And it must not be Mr. Etc that is split.
#
# Oops, forgot - (hyphen, when outside a word) - added.
# Also, might there be some way to make use of newlines within
# paragraphs in this corpus? (Ouch. No!)
# Make the pattern match a tail of a sentence, with a lookahead that
# matches a possible head of the following sentence. Further filtering
# can be implemented by testing what the tail is, and what actually
# precedes and follows it in the paragraph.
\S+ [ ")]* [.?!]+ [ ")]*
[ ]
(?= [-(" ]* [A-ZÅÄÖ0-9] )
# Should add the other quotation mark. - What about colons? No!
# Also those __ and **.
''', re.VERBOSE)
# sentence_boundary also matches "W. " in "George W. Bush", but there
# is no sentence boundary there. This pattern describes such a tail:
# one upper-case letter (A-Z, Å, Ä or Ö), a full stop and a space.
# isbad applies it with .fullmatch, so it only rejects a candidate
# whose whole tail is such an initial. This is a heuristic: a sentence
# that really ends in a single capital letter is rejected too.
initial_letter = re.compile('[A-ZÅÄÖ][.] ')
def isbad(mo):
    '''Return True if a candidate sentence-boundary match object is a
    bad-looking boundary after all, False otherwise.

    mo -- a match object whose matched text is the candidate tail of a
          sentence, including the space that ends it

    The candidate is bad when its tail is exactly one of a few known
    abbreviations, or when the whole tail is a single initial letter
    followed by a full stop and a space (see initial_letter).
    '''

    tail = mo.group()

    # Those trailing spaces are a bit of a subtlety: the tail includes
    # the space that ends the match, so the listed forms must include
    # it, too.
    if tail in { 't.ex. ', 'bl.a. ', 'Bl.a. ', 's.k. ', 'f.d. ',
                 'p.g.a. ', 'fr.o.m. ',
                 'St. ',
                 'kl. ', 'Kl. ', }:
        return True

    # An initial as in "George W. Bush"; should also look at what
    # follows?
    return initial_letter.fullmatch(tail) is not None