-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path__init__.py
201 lines (159 loc) · 5.93 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# -*- coding: utf-8 -*-
# @Author: Tasdik Rahman
# @Date: 2016-03-12
# @Last Modified by: Tasdik Rahman
# @Last Modified time: 2016-03-25 20:00:56
# @MIT License
# @http://tasdikrahman.me
# @https://github.com/prodicus
"""
Tries to remove strings obtained from OCR engines which are garbage.
An implementation of the paper
'Automatic Removal of “Garbage Strings” in OCR Text: An Implementation'
- by Kazem Taghva , Tom Nartker , Allen Condit , Julie Borsack
References
==========
[1] http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.81.8901
"""
# Package metadata: author, contact address, version, distribution title,
# license name and copyright notice.
__author__ = "Tasdik Rahman"
# NOTE(review): the value below looks like an email-obfuscation placeholder
# rather than a real address -- confirm against the upstream package.
__email__ = "[email protected]"
__version__ = "0.0.1"
__title__ = 'pyRmgarbage'
__license__ = 'MIT'
__copyright__ = 'Copyright 2016 Tasdik Rahman'
import re
class Rmgarbage(object):
    """
    Rule-based detector for garbage strings produced by OCR engines.

    Each rule is exposed as its own predicate method taking a single token
    and returning True when the token looks like garbage. ``is_garbage``
    runs the rules in the fixed order L, A, R, V, P, C and reports the
    first one that fires.
    """

    # Rule L: tokens longer than this many characters are garbage.
    _MAX_LENGTH = 40

    # Rule A: matches runs of non-word characters plus the underscore
    # (the underscore must be listed explicitly because it is a word char).
    _NON_ALNUM_RE = re.compile(r'[\W_]+')

    # Rule A: minimum alphanumeric ratio for very short tokens.
    _ALNUM_THRESHOLDS = {
        1: 0,     # single chars can be non-alphanumeric
        2: 0,     # so can doublets
        3: 0.32,  # at least one of three should be alnum
        4: 0.24,  # at least one of four should be alnum
        5: 0.39,  # at least two of five should be alnum
    }
    # Rule A: minimum alphanumeric ratio for every other length.
    _DEFAULT_ALNUM_THRESHOLD = 0.5

    # Rule R: any character (except newline) repeated four times in a row.
    _FOUR_IDENTICAL_RE = re.compile(r'(.)\1{3}')

    _VOWELS = 'aeiouAEIOU'

    def __init__(self):
        pass

    def too_long(self, string):
        """
        Rule L
        ======
        A string of more than 40 characters is garbage.

        :param string: string to be tested
        :returns: either True or False
        """
        return len(string) > self._MAX_LENGTH

    def bad_alnum_ratio(self, string):
        """
        Rule A
        ======
        If a string's ratio of alphanumeric characters to total characters
        is below the threshold for its length (50% for strings longer than
        five characters, lower for shorter ones), the string is garbage.
        An empty string is always reported as garbage.

        :param string: string to be tested
        :returns: either True or False
        """
        total = len(string)
        if total == 0:  # avoid division by zero
            return True

        threshold = self._ALNUM_THRESHOLDS.get(
            total, self._DEFAULT_ALNUM_THRESHOLD)
        alnum_count = len(self._NON_ALNUM_RE.sub('', string))
        return float(alnum_count) / total < threshold

    def consecutive_four_identical(self, string):
        """
        Rule R
        ======
        If a string has 4 identical characters in a row, it is garbage.

        :param string: string to be tested
        :returns: either True or False
        """
        return self._FOUR_IDENTICAL_RE.search(string) is not None

    def bad_consonant_vowel_ratio(self, string):
        """
        Rule V
        ======
        Looks only at the alphabetic characters of the string and counts
        vowels and consonants. If both are present and the vowel/consonant
        ratio is below 0.1 or above 10, the string is garbage. If only one
        kind is present, a length threshold applies: more than 7 consonants
        with no vowel, or more than 4 vowels with no consonant, is garbage.

        :param string: string to be tested
        :returns: either True or False
        """
        # Materialise the letters: on Python 3 ``filter`` returns a one-shot
        # iterator, which cannot be counted after it has been iterated.
        letters = [char for char in string if char.isalpha()]
        vowel_count = sum(1 for char in letters if char in self._VOWELS)
        consonant_count = len(letters) - vowel_count

        if consonant_count > 0 and vowel_count > 0:
            ratio = float(vowel_count) / consonant_count
            return ratio < 0.1 or ratio > 10
        if vowel_count == 0:
            # vowel-less words up to the length of 'rhythms' are tolerated
            return consonant_count > len('rhythms')
        # consonant_count == 0 here; all-vowel tokens up to the length of
        # 'IEEE' are tolerated
        return vowel_count > len('IEEE')

    def has_two_distinct_puncts_inside(self, string):
        """
        Rule P
        ======
        Strip off the first character and the last two characters of the
        string. If the remainder contains two distinct non-alphanumeric
        characters, the string is garbage.

        Customisation
        =============
        The last TWO characters are stripped (rather than one) because
        false positives included strings ending with ').' and similar.

        :param string: string to be tested
        :returns: either True or False
        """
        inner_puncts = set(char for char in string[1:-2]
                           if not char.isalnum())
        return len(inner_puncts) > 1

    def has_uppercase_within_lowercase(self, string):
        """
        Rule C
        ======
        If a string begins and ends with a lowercase letter and contains an
        uppercase letter anywhere in between, it is garbage.

        Customisation
        =============
        An uppercase letter directly preceded by a full stop, a hyphen or
        an apostrophe is ignored (false positive on "needed.The").
        NOTE(review): earlier documentation also listed forward-slash as an
        exclusion, but the check has never included '/' -- confirm intent.

        :param string: string to be tested
        :returns: either True or False
        """
        if not (string and string[0].islower() and string[-1].islower()):
            return False

        string_middle = string[1:-1]
        for index, char in enumerate(string_middle):
            if not char.isupper():
                continue
            follows_separator = (
                index > 0 and string_middle[index - 1] in ".-'")
            if not follows_separator:
                return True
        return False

    def is_garbage(self, string):
        """
        Runs the string through every rule, in the order L, A, R, V, P, C,
        and reports the first rule that matches.

        :param string: string to be tested
        :returns: the one-letter name of the first matching rule
            ('L', 'A', 'R', 'V', 'P' or 'C'), or False when no rule matches
        """
        # Order matters: the first rule that fires decides the result.
        rules = (
            ('L', self.too_long),
            ('A', self.bad_alnum_ratio),
            ('R', self.consecutive_four_identical),
            ('V', self.bad_consonant_vowel_ratio),
            ('P', self.has_two_distinct_puncts_inside),
            ('C', self.has_uppercase_within_lowercase),
        )
        for rule_name, rule in rules:
            if rule(string):
                return rule_name
        return False