diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..763d6a3 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,16 @@ +language: python +python: + - "2.7" + - "3.4" + - "3.5" + - "3.6" + +install: + - pip install -r requirements/requirements-dev.txt + +script: + - coverage run --source=spellchecker setup.py test + +# commands to run after the tests successfully complete +after_success: + - coveralls diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..2dd0e09 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,10 @@ +# pyspellchecker + +## Version 0.1.0 +* Move word frequency to its own class +* Add basic tests +* Readme documentation + +## Version 0.0.1 +* Initial release using code from Peter Norvig +* Initial release to pypi diff --git a/README.md b/README.md index 38db615..f2e23c5 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,92 @@ # pyspellchecker -Pure Python Spell Checking based on https://norvig.com/spell-correct.html + +Pure Python Spell Checking based on +[Peter Norvig's](https://norvig.com/spell-correct.html) blog post on setting up +a simple spell checking algorithm. + +It uses a [Levenshtein Distance](https://en.wikipedia.org/wiki/Levenshtein_distance) +algorithm to find permutations within an edit distance of 2 from the +original word. It then compares all permutations (insertions, deletions, +replacements, and transpositions) to known words in a word frequency list. +Those words that are found more often in the frequency list are `more likely` +the correct results. + + +## Installation + +The easiest method to install is using pip: + +``` bash +pip install pyspellchecker +``` + +To install from source: +``` bash +git clone https://github.com/barrust/pyspellchecker.git +cd pyspellchecker +python setup.py install +``` + +As always, I highly recommend using the [Pipenv](https://github.com/pypa/pipenv) +package to help manage dependencies! 
+ +## Quickstart + +After installation, using pyspellchecker should be fairly straightforward: + +``` python +from spellchecker import SpellChecker + + +spell = SpellChecker() + +# find those words that may be misspelled +misspelled = spell.unknown(['something', 'is', 'hapenning', 'here']) + +for word in misspelled: + # Get the one `most likely` answer + print(spell.correction(word)) + + # Get a list of `likely` options + print(spell.candidates(word)) +``` + +If the Word Frequency list is not to your liking, you can add additional text +to generate a more appropriate list for your use case. + +``` python +from spellchecker import SpellChecker + +spell = SpellChecker() # loads default word frequency list +spell.word_frequency.load_text_file('./my_free_text_doc.txt') + +# if I just want to make sure some words are not flagged as misspelled +spell.word_frequency.load_words(['microsoft', 'apple', 'google']) +spell.known(['microsoft', 'google']) # will return both now! +``` + +More work in storing and loading word frequency lists is planned; stay tuned. 
+ +## Additional Methods +Online documentation is planned; until then, the SpellChecker methods are +listed here: + +`correction(word)`: Returns the most probable result for the misspelled word + +`candidates(word)`: Returns a set of possible candidates for the misspelled +word + +`known([words])`: Returns those words that are in the word frequency list + +`unknown([words])`: Returns those words that are not in the frequency list + +`word_probability(word)`: The frequency of the given word out of all words in +the frequency list + +#### The following are less likely to be needed by the user but are available: + +`edit_distance_1(word)`: Returns a set of all strings at a Levenshtein Distance +of one + +`edit_distance_2(word)`: Returns a set of all strings at a Levenshtein Distance +of two diff --git a/requirements/requirements-dev.txt b/requirements/requirements-dev.txt new file mode 100644 index 0000000..02db141 --- /dev/null +++ b/requirements/requirements-dev.txt @@ -0,0 +1,6 @@ +# needed for testing purposes +pycodestyle +isort +astroid +pylint +coveralls diff --git a/spellchecker/__init__.py b/spellchecker/__init__.py index 751914e..0d4185e 100644 --- a/spellchecker/__init__.py +++ b/spellchecker/__init__.py @@ -1,7 +1,7 @@ ''' SpellChecker Module ''' -from . spellchecker import SpellChecker +from . spellchecker import SpellChecker, WordFrequency from . 
info import (__author__, __maintainer__, __email__, __license__, __version__, __credits__, __url__, __bugtrack_url__) -__all__ = ['SpellChecker'] +__all__ = ['SpellChecker', 'WordFrequency'] diff --git a/spellchecker/info.py b/spellchecker/info.py index e7de5e1..1ed85d8 100644 --- a/spellchecker/info.py +++ b/spellchecker/info.py @@ -5,7 +5,7 @@ __maintainer__ = 'Tyler Barrus' __email__ = 'barrust@gmail.com' __license__ = 'MIT' -__version__ = '0.0.1' +__version__ = '0.1.0' __credits__ = ['Peter Norvig'] __url__ = 'https://github.com/barrust/pyspellchecker' __bugtrack_url__ = '{0}/issues'.format(__url__) diff --git a/spellchecker/spellchecker.py b/spellchecker/spellchecker.py index 8a5d6aa..8928ad5 100644 --- a/spellchecker/spellchecker.py +++ b/spellchecker/spellchecker.py @@ -1,6 +1,6 @@ ''' SpellChecker Module; simple, intuitive spell checker based on the post by Peter Norvig. See: https://norvig.com/spell-correct.html ''' -from __future__ import absolute_import +from __future__ import absolute_import, division import os import re @@ -43,7 +43,7 @@ def candidates(self, word): self.known(self.edit_distance_2(word)) or [word]) def known(self, words): - "The subset of `words` that appear in the dictionary of WORDS." + "The subset of `words` that appear in the dictionary of words." 
return set(w for w in words if w in self.word_frequency.dictionary) def unknown(self, words): diff --git a/tests/spellchecker_test.py b/tests/spellchecker_test.py new file mode 100644 index 0000000..5502e55 --- /dev/null +++ b/tests/spellchecker_test.py @@ -0,0 +1,59 @@ +# -*- coding: utf-8 -*- +''' +Unittest class +''' +import unittest + +from spellchecker import SpellChecker + + +class TestSpellChecker(unittest.TestCase): + def test_correction(self): + ''' test spell checker corrections ''' + spell = SpellChecker() + self.assertEqual(spell.correction('ths'), 'the') + self.assertEqual(spell.correction('ergo'), 'ergot') + self.assertEqual(spell.correction('this'), 'this') + + def test_candidates(self): + ''' test spell checker candidates ''' + spell = SpellChecker() + self.assertEqual(spell.candidates('ths'), {'tis', 'tss', 'th', 'thus', 'the', 'this', 'thy'}) + self.assertEqual(spell.candidates('the'), {'the'}) + + def test_words(self): + spell = SpellChecker() + self.assertEqual(spell.words('This is a test of this'), ['this', 'is', 'a', 'test', 'of', 'this']) + + def test_word_frequency(self): + spell = SpellChecker() + # if the default load changes so will this... + self.assertEqual(spell.word_frequency.dictionary['the'], 79809) + + def test_word_probability(self): + spell = SpellChecker() + # if the default load changes so will this... 
+ self.assertEqual(spell.word_probability('the'), 0.07154004401278254) + + def test_word_known(self): + ''' test if the word is a `known` word or not ''' + spell = SpellChecker() + self.assertEqual(spell.known(['this']), {'this'}) + self.assertEqual(spell.known(['sherlock']), {'sherlock'}) + self.assertEqual(spell.known(['holmes']), {'holmes'}) + self.assertEqual(spell.known(['known']), {'known'}) + + self.assertEqual(spell.known(['foobar']), set()) + self.assertEqual(spell.known(['ths']), set()) + self.assertEqual(spell.known(['ergo']), set()) + + def test_unknown_words(self): + spell = SpellChecker() + self.assertEqual(spell.unknown(['this']), set()) + self.assertEqual(spell.unknown(['sherlock']), set()) + self.assertEqual(spell.unknown(['holmes']), set()) + self.assertEqual(spell.unknown(['known']), set()) + + self.assertEqual(spell.unknown(['foobar']), {'foobar'}) + self.assertEqual(spell.unknown(['ths']), {'ths'}) + self.assertEqual(spell.unknown(['ergo']), {'ergo'})