Skip to content

Commit

Permalink
Hotfix/long word fix (#58)
Browse files Browse the repository at this point in the history
* handle really long words without doing all the work
* update changelog
  • Loading branch information
barrust authored Nov 25, 2019
1 parent d27baf5 commit 9e298a5
Show file tree
Hide file tree
Showing 4 changed files with 58 additions and 7 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,15 @@
# pyspellchecker

## Version 0.5.3
* Handle memory issues when trying to correct or find candidates for extremely long words

## Version 0.5.2
* Ensure input is encoded correctly; resolves [#53](https://github.com/barrust/pyspellchecker/issues/53)

## Version 0.5.1
* Handle windows encoding issues [#48](https://github.com/barrust/pyspellchecker/issues/48)
* Deterministic order to corrections [#47](https://github.com/barrust/pyspellchecker/issues/47)

## Version 0.5.0
* Add tokenizer to the Spell object
* Add Support for local dictionaries to be case sensitive
Expand Down
2 changes: 1 addition & 1 deletion spellchecker/info.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
__maintainer__ = "Tyler Barrus"
__email__ = "[email protected]"
__license__ = "MIT"
__version__ = "0.5.2"
__version__ = "0.5.3"
__credits__ = ["Peter Norvig"]
__url__ = "https://github.com/barrust/pyspellchecker"
__bugtrack_url__ = "{0}/issues".format(__url__)
28 changes: 23 additions & 5 deletions spellchecker/spellchecker.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,10 @@ def candidates(self, word):
word = ENSURE_UNICODE(word)
if self.known([word]): # short-cut if word is correct already
return {word}

if not self._check_if_should_check(word):
return {word}

# get edit distance 1...
res = [x for x in self.edit_distance_1(word)]
tmp = self.known(res)
Expand Down Expand Up @@ -186,7 +190,7 @@ def known(self, words):
w
for w in tmp
if w in self._word_frequency.dictionary
or not self._check_if_should_check(w)
and self._check_if_should_check(w)
)

def unknown(self, words):
Expand Down Expand Up @@ -215,7 +219,7 @@ def edit_distance_1(self, word):
Returns:
set: The set of strings that are edit distance one from the \
provided word """
word = ENSURE_UNICODE(word).lower()
word = ENSURE_UNICODE(word).lower() if not self._case_sensitive else ENSURE_UNICODE(word)
if self._check_if_should_check(word) is False:
return {word}
letters = self._word_frequency.letters
Expand All @@ -235,7 +239,7 @@ def edit_distance_2(self, word):
Returns:
set: The set of strings that are edit distance two from the \
provided word """
word = ENSURE_UNICODE(word).lower()
word = ENSURE_UNICODE(word).lower() if not self._case_sensitive else ENSURE_UNICODE(word)
return [
e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1)
]
Expand All @@ -257,10 +261,11 @@ def __edit_distance_alt(self, words):
]
return [e2 for e1 in tmp for e2 in self.edit_distance_1(e1)]

@staticmethod
def _check_if_should_check(word):
def _check_if_should_check(self, word):
if len(word) == 1 and word in string.punctuation:
return False
if len(word) > self._word_frequency.longest_word_length + 3: # magic number to allow removal of up to 2 letters.
return False
try: # check if it is a number (int, float, etc)
float(word)
return False
Expand All @@ -281,6 +286,7 @@ class WordFrequency(object):
"_letters",
"_tokenizer",
"_case_sensitive",
"_longest_word_length"
]

def __init__(self, tokenizer=None, case_sensitive=False):
Expand All @@ -289,6 +295,7 @@ def __init__(self, tokenizer=None, case_sensitive=False):
self._unique_words = 0
self._letters = set()
self._case_sensitive = case_sensitive
self._longest_word_length = 0

self._tokenizer = _parse_into_words
if tokenizer is not None:
Expand Down Expand Up @@ -351,6 +358,14 @@ def letters(self):
Not settable """
return self._letters

@property
def longest_word_length(self):
""" int: The longest word length in the dictionary
Note:
Not settable """
return self._longest_word_length

def tokenize(self, text):
""" Tokenize the provided string object into individual words
Expand Down Expand Up @@ -486,8 +501,11 @@ def remove_by_threshold(self, threshold=5):

def _update_dictionary(self):
""" Update the word frequency object """
self._longest_word_length = 0
self._total_words = sum(self._dictionary.values())
self._unique_words = len(self._dictionary.keys())
self._letters = set()
for key in self._dictionary:
if len(key) > self._longest_word_length:
self._longest_word_length = len(key)
self._letters.update(key)
25 changes: 24 additions & 1 deletion tests/spellchecker_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,8 @@ def test_word_known(self):
self.assertEqual(spell.known(['sherlock']), {'sherlock'})
self.assertEqual(spell.known(['holmes']), {'holmes'})
self.assertEqual(spell.known(['known']), {'known'})
self.assertEqual(spell.known(['-']), {'-'})

self.assertEqual(spell.known(['-']), set())
self.assertEqual(spell.known(['foobar']), set())
self.assertEqual(spell.known(['ths']), set())
self.assertEqual(spell.known(['ergos']), set())
Expand Down Expand Up @@ -281,6 +281,29 @@ def test_capitalization_when_case_sensitive_defaults_to_false(self):
self.assertEqual(spell.candidates('BB'), {'bob', 'bab'})
self.assertEqual(spell.correction('BB'), 'bob')

def test_large_words(self):
''' test checking for words that are clearly larger than the largest dictionary word '''
spell = SpellChecker(language=None, distance=2)
spell.word_frequency.add('Bob')

words = ['Bb', 'bb', 'BB']
self.assertEqual(spell.unknown(words), {'bb'})

known_words = ['BOB', 'bOb']
self.assertEqual(spell.known(known_words), {'bob'})

self.assertEqual(spell.correction('bobs'), 'bob')
self.assertEqual(spell.correction('bobb'), 'bob')
self.assertEqual(spell.correction('bobby'), 'bob')
self.assertEqual(spell.word_frequency.longest_word_length, 3)
self.assertEqual(spell.correction('bobbys'), 'bobbys')

def test_extremely_large_words(self):
''' test when a word is just extremely large '''
spell = SpellChecker()
horrible_word = 'thisisnotarealisticwordthisisnotarealisticwordthisisnotarealisticwordthisisnotarealisticword'
self.assertEqual(spell.correction(horrible_word), horrible_word)

def test_capitalization_when_case_sensitive_true(self):
''' test that capitalization affects comparisons '''
spell = SpellChecker(language=None, case_sensitive=True)
Expand Down

0 comments on commit 9e298a5

Please sign in to comment.