
Commit

Merge pull request #59 from neomatrix369/indicate-ease-of-reading-of-text

High-level feature: Indicate ease of reading of text
neomatrix369 authored Dec 13, 2020
2 parents b3f9734 + 3d0ff42 commit 2a8894b
Showing 16 changed files with 615 additions and 164 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -118,5 +118,17 @@ Fixes issue #57 via PR https://github.com/neomatrix369/nlp_profiler/pull/58

---

### GitHub branch `indicate-ease-of-reading-of-text` High-level feature: Indicate ease of reading of text

Just like the spelling and grammar checks, this adds a high-level feature indicating whether a block of text is easy to read or not, based on the textstat library's flesch_reading_ease().

It nominally returns values between 0 and 100, although in practice scores can fall below 0 or go above 100, depending on how hard or easy the text is.
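A minimal sketch of the underlying call (assuming `textstat` is installed; the sample sentences are only illustrative):

```python
# Score two sentences with textstat's Flesch reading-ease metric.
from textstat import flesch_reading_ease

print(flesch_reading_ease("The cat sat on the mat."))
# high score => easy to read

print(flesch_reading_ease(
    "Notwithstanding the aforementioned stipulations, the undersigned "
    "hereby irrevocably relinquishes all antecedent entitlements."))
# low (possibly negative) score => hard to read
```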

[4919a51](https://github.com/neomatrix369/nlp_profiler/commit/4919a51) [@neomatrix369](https://github.com/neomatrix369) _Sun Dec 13 18:36:42 2020 +0000_

---




Return to [README.md](README.md)
3 changes: 2 additions & 1 deletion README.md
@@ -9,6 +9,7 @@
[![PyPI version](https://badge.fury.io/py/nlp-profiler.svg)](https://badge.fury.io/py/nlp-profiler)
[![Python versions](https://img.shields.io/pypi/pyversions/nlp_profiler.svg)](https://pypi.org/project/nlp_profiler/)
[![PyPi stats](https://img.shields.io/pypi/dm/nlp_profiler.svg?label=pypi%20downloads&logo=PyPI&logoColor=white)](https://pypistats.org/packages/nlp_profiler)
[![Downloads](https://static.pepy.tech/personalized-badge/nlp-profiler?period=total&units=international_system&left_color=black&right_color=orange&left_text=Downloads)](https://pepy.tech/project/nlp-profiler)


A simple NLP library that allows profiling datasets with one or more text columns.
@@ -40,7 +41,7 @@ In short: Think of it as using the `pandas.describe()` function or running [Pand

- Pass in a Pandas dataframe series as the input parameter.
- You get back a new dataframe with various features about the parsed text per row.
- High-level: sentiment analysis, objectivity/subjectivity analysis, spelling quality check, grammar quality check, etc...
- High-level: sentiment analysis, objectivity/subjectivity analysis, spelling quality check, grammar quality check, ease of readability check, etc...
- Low-level/granular: number of characters in the sentence, number of words, number of emojis, etc...
- Descriptive statistics can then be drawn from the numerical data in the resulting dataframe using `pandas.describe()`, as sketched below.
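A hedged usage sketch (the column name `text` and the sample rows are illustrative, not from this repository):

```python
# Profile a text column and summarise the resulting numeric features.
import pandas as pd
from nlp_profiler.core import apply_text_profiling

df = pd.DataFrame({'text': [
    "This is a short, simple sentence.",
    "Ths sentnce hs severl speling mistaks and reads badly.",
]})

profiled = apply_text_profiling(df, 'text')  # new dataframe with feature columns per row
print(profiled.describe())                   # descriptive statistics over the numeric columns
```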

6 changes: 6 additions & 0 deletions nlp_profiler/constants.py
@@ -6,6 +6,7 @@
HIGH_LEVEL_OPTION = 'high_level'
GRAMMAR_CHECK_OPTION = 'grammar_check'
SPELLING_CHECK_OPTION = 'spelling_check'
EASE_OF_READING_CHECK_OPTION = 'ease_of_reading_check'
PARALLELISATION_METHOD_OPTION = 'parallelisation_method'
NOT_APPLICABLE = "N/A"

@@ -30,6 +31,11 @@
SENTIMENT_SUBJECTIVITY_COL = 'sentiment_subjectivity'
SENTIMENT_SUBJECTIVITY_SUMMARISED_COL = 'sentiment_subjectivity_summarised'

## Ease of reading check
EASE_OF_READING_SCORE_COL = 'ease_of_reading_score'
EASE_OF_READING_COL = 'ease_of_reading_quality'
EASE_OF_READING_SUMMARISED_COL = 'ease_of_reading_summarised'

# ---
# Granular
DATES_COUNT_COL = 'dates_count'
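These constants name the three columns the new check adds to the profiled dataframe. A small sketch of selecting them, reusing the hypothetical `profiled` dataframe from the README example above:

```python
# The three new output columns named by the constants above.
print(profiled[[
    'ease_of_reading_score',       # raw Flesch reading-ease score
    'ease_of_reading_quality',     # "Very Easy" ... "Very Confusing"
    'ease_of_reading_summarised',  # "Easy", "Standard", "Difficult" or "Confusing"
]].head())
```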
8 changes: 6 additions & 2 deletions nlp_profiler/core.py
@@ -20,14 +20,16 @@

from nlp_profiler.constants import \
PARALLELISATION_METHOD_OPTION, DEFAULT_PARALLEL_METHOD, GRANULAR_OPTION, HIGH_LEVEL_OPTION, \
GRAMMAR_CHECK_OPTION, SPELLING_CHECK_OPTION
GRAMMAR_CHECK_OPTION, SPELLING_CHECK_OPTION, EASE_OF_READING_CHECK_OPTION
from nlp_profiler.generate_features import get_progress_bar
from nlp_profiler.granular_features import apply_granular_features
from nlp_profiler.high_level_features import apply_high_level_features
from nlp_profiler.high_level_features.grammar_quality_check \
import apply_grammar_check
from nlp_profiler.high_level_features.spelling_quality_check \
import apply_spelling_check
from nlp_profiler.high_level_features.ease_of_reading_check \
import apply_ease_of_reading_check


def apply_text_profiling(dataframe: pd.DataFrame,
@@ -41,6 +43,7 @@ def apply_text_profiling(dataframe: pd.DataFrame,
GRANULAR_OPTION: True,
        GRAMMAR_CHECK_OPTION: False,  # default: False, as it is a slow process, but it can be enabled
        SPELLING_CHECK_OPTION: True,  # default: True, a slightly slow process, but it can be disabled
EASE_OF_READING_CHECK_OPTION: True,
PARALLELISATION_METHOD_OPTION: DEFAULT_PARALLEL_METHOD
}

@@ -51,7 +54,8 @@
(GRANULAR_OPTION, "Granular features", apply_granular_features),
(HIGH_LEVEL_OPTION, "High-level features", apply_high_level_features),
(GRAMMAR_CHECK_OPTION, "Grammar checks", apply_grammar_check),
(SPELLING_CHECK_OPTION, "Spelling checks", apply_spelling_check)
(SPELLING_CHECK_OPTION, "Spelling checks", apply_spelling_check),
(EASE_OF_READING_CHECK_OPTION, "Ease of reading check", apply_ease_of_reading_check)
]

for index, item in enumerate(actions_mappings.copy()):
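For context, a hedged sketch of switching the new check off from the public API. It assumes the defaults above can be overridden through a `params` dict argument, keyed by the `EASE_OF_READING_CHECK_OPTION` constant (`'ease_of_reading_check'`):

```python
# Opt out of the ease-of-reading check while keeping the other defaults.
from nlp_profiler.core import apply_text_profiling

profiled = apply_text_profiling(df, 'text',
                                params={'ease_of_reading_check': False})
```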
68 changes: 68 additions & 0 deletions nlp_profiler/high_level_features/ease_of_reading_check.py
@@ -0,0 +1,68 @@
from textstat import flesch_reading_ease
import pandas as pd
import math

from nlp_profiler.constants import NOT_APPLICABLE, NaN, DEFAULT_PARALLEL_METHOD, \
EASE_OF_READING_SCORE_COL, EASE_OF_READING_COL, EASE_OF_READING_SUMMARISED_COL
from nlp_profiler.generate_features import generate_features


def apply_ease_of_reading_check(heading: str,
new_dataframe: pd.DataFrame,
text_column: dict,
parallelisation_method: str = DEFAULT_PARALLEL_METHOD):
ease_of_reading_steps = [
(EASE_OF_READING_SCORE_COL, text_column, ease_of_reading_score),
(EASE_OF_READING_COL, EASE_OF_READING_SCORE_COL, ease_of_reading),
(EASE_OF_READING_SUMMARISED_COL, EASE_OF_READING_COL, ease_of_reading_summarised),
]
generate_features(
heading, ease_of_reading_steps,
new_dataframe, parallelisation_method
)

ease_of_reading_to_summarised_words_mapping = {
"Very Easy": "Easy",
"Easy": "Easy",
"Fairly Easy": "Easy",
"Standard": "Standard",
"Fairly Difficult": "Difficult",
"Difficult": "Difficult" ,
"Very Confusing": "Confusing"
}


def ease_of_reading_summarised(text: str) -> str:
    # Collapse the fine-grained reading-ease labels into coarser buckets.
    return ease_of_reading_to_summarised_words_mapping.get(text, NOT_APPLICABLE)


def ease_of_reading_score(text: str) -> float:
if (not isinstance(text, str)) or (len(text.strip()) == 0):
return NaN

return float(flesch_reading_ease(text))

# Score bands follow the Flesch reading-ease scale:
# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
ease_of_reading_to_words_mapping = [
    # [label, inclusive lower bound, upper bound]
    ["Very Easy", 90, 100],
    ["Easy", 80, 90],
    ["Fairly Easy", 70, 80],
    ["Standard", 60, 70],
    ["Fairly Difficult", 50, 60],
    ["Difficult", 30, 50],
    ["Very Confusing", 0, 30]
]


def ease_of_reading(score: float) -> str:
    if math.isnan(score):
        return NOT_APPLICABLE

    score = float(score)
    for label, lower, upper in ease_of_reading_to_words_mapping:  # pragma: no cover
        # pragma: no cover => early termination leads to loss of test coverage info
        if ((score <= 0) and (lower == 0)) or \
                ((score >= 100) and (upper == 100)):
            return label
        # contiguous half-open bands so that fractional scores (e.g. 89.5)
        # cannot fall between two slabs
        if lower <= score < upper:
            return label
    return NOT_APPLICABLE  # defensive fallback; the bands cover all real scores
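A small worked sketch of the three chained steps above (the exact score depends on the textstat version; for this sentence it is typically in the mid-90s):

```python
# The three steps applied by hand to a single text value.
text = "The quick brown fox jumps over the lazy dog."
score = ease_of_reading_score(text)            # ~94.0
quality = ease_of_reading(score)               # "Very Easy" (score >= 90)
summary = ease_of_reading_summarised(quality)  # "Easy"
```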
