
Commit

Merge pull request #59 from neomatrix369/indicate-ease-of-reading-of-text

High-level feature: Indicate ease of reading of text
neomatrix369 authored Dec 13, 2020
2 parents b3f9734 + 3d0ff42 commit 2a8894b
Showing 16 changed files with 615 additions and 164 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
@@ -118,5 +118,17 @@ Fixes issue #57 via PR https://github.com/neomatrix369/nlp_profiler/pull/58

---

### GitHub branch `indicate-ease-of-reading-of-text` High-level feature: Indicate ease of reading of text

Just like the spelling and grammar checks, this adds a high-level feature indicating whether a block of text is easy to read or not, based on the textstat library's flesch_reading_ease().

It nominally returns values between 0 and 100, although in practice scores can fall below 0 or go above 100, depending on how hard or easy the text is.
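A minimal sketch of the underlying call (assuming `textstat` is installed; the sample sentences are only illustrative):

```python
# Score two sentences with textstat's Flesch reading-ease metric.
from textstat import flesch_reading_ease

print(flesch_reading_ease("The cat sat on the mat."))
# high score => easy to read

print(flesch_reading_ease(
    "Notwithstanding the aforementioned stipulations, the undersigned "
    "hereby irrevocably relinquishes all antecedent entitlements."))
# low (possibly negative) score => hard to read
```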

[4919a51](https://github.com/neomatrix369/nlp_profiler/commit/4919a51) [@neomatrix369](https://github.com/neomatrix369) _Sun Dec 13 18:36:42 2020 +0000_

---




Return to [README.md](README.md)
3 changes: 2 additions & 1 deletion README.md
@@ -9,6 +9,7 @@
[![PyPI version](https://badge.fury.io/py/nlp-profiler.svg)](https://badge.fury.io/py/nlp-profiler)
[![Python versions](https://img.shields.io/pypi/pyversions/nlp_profiler.svg)](https://pypi.org/project/nlp_profiler/)
[![PyPi stats](https://img.shields.io/pypi/dm/nlp_profiler.svg?label=pypi%20downloads&logo=PyPI&logoColor=white)](https://pypistats.org/packages/nlp_profiler)
[![Downloads](https://static.pepy.tech/personalized-badge/nlp-profiler?period=total&units=international_system&left_color=black&right_color=orange&left_text=Downloads)](https://pepy.tech/project/nlp-profiler)


A simple NLP library that allows profiling datasets with one or more text columns.
@@ -40,7 +41,7 @@ In short: Think of it as using the `pandas.describe()` function or running [Pand

- Pass in a Pandas dataframe series as the input parameter.
- You get back a new dataframe with various features about the parsed text per row.
- High-level: sentiment analysis, objectivity/subjectivity analysis, spelling quality check, grammar quality check, etc...
- High-level: sentiment analysis, objectivity/subjectivity analysis, spelling quality check, grammar quality check, ease of readability check, etc...
- Low-level/granular: number of characters in the sentence, number of words, number of emojis, etc...
- Descriptive statistics can then be drawn from the numerical data in the resulting dataframe using `pandas.describe()`, as sketched below.
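A hedged usage sketch (the column name `text` and the sample rows are illustrative, not from this repository):

```python
# Profile a text column and summarise the resulting numeric features.
import pandas as pd
from nlp_profiler.core import apply_text_profiling

df = pd.DataFrame({'text': [
    "This is a short, simple sentence.",
    "Ths sentnce hs severl speling mistaks and reads badly.",
]})

profiled = apply_text_profiling(df, 'text')  # new dataframe with feature columns per row
print(profiled.describe())                   # descriptive statistics over the numeric columns
```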

6 changes: 6 additions & 0 deletions nlp_profiler/constants.py
@@ -6,6 +6,7 @@
HIGH_LEVEL_OPTION = 'high_level'
GRAMMAR_CHECK_OPTION = 'grammar_check'
SPELLING_CHECK_OPTION = 'spelling_check'
EASE_OF_READING_CHECK_OPTION = 'ease_of_reading_check'
PARALLELISATION_METHOD_OPTION = 'parallelisation_method'
NOT_APPLICABLE = "N/A"

@@ -30,6 +31,11 @@
SENTIMENT_SUBJECTIVITY_COL = 'sentiment_subjectivity'
SENTIMENT_SUBJECTIVITY_SUMMARISED_COL = 'sentiment_subjectivity_summarised'

## Ease of reading check
EASE_OF_READING_SCORE_COL = 'ease_of_reading_score'
EASE_OF_READING_COL = 'ease_of_reading_quality'
EASE_OF_READING_SUMMARISED_COL = 'ease_of_reading_summarised'

# ---
# Granular
DATES_COUNT_COL = 'dates_count'
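These constants name the three columns the new check adds to the profiled dataframe. A small sketch of selecting them, reusing the hypothetical `profiled` dataframe from the README example above:

```python
# The three new output columns named by the constants above.
print(profiled[[
    'ease_of_reading_score',       # raw Flesch reading-ease score
    'ease_of_reading_quality',     # "Very Easy" ... "Very Confusing"
    'ease_of_reading_summarised',  # "Easy", "Standard", "Difficult" or "Confusing"
]].head())
```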
8 changes: 6 additions & 2 deletions nlp_profiler/core.py
@@ -20,14 +20,16 @@

from nlp_profiler.constants import \
PARALLELISATION_METHOD_OPTION, DEFAULT_PARALLEL_METHOD, GRANULAR_OPTION, HIGH_LEVEL_OPTION, \
GRAMMAR_CHECK_OPTION, SPELLING_CHECK_OPTION
GRAMMAR_CHECK_OPTION, SPELLING_CHECK_OPTION, EASE_OF_READING_CHECK_OPTION
from nlp_profiler.generate_features import get_progress_bar
from nlp_profiler.granular_features import apply_granular_features
from nlp_profiler.high_level_features import apply_high_level_features
from nlp_profiler.high_level_features.grammar_quality_check \
import apply_grammar_check
from nlp_profiler.high_level_features.spelling_quality_check \
import apply_spelling_check
from nlp_profiler.high_level_features.ease_of_reading_check \
import apply_ease_of_reading_check


def apply_text_profiling(dataframe: pd.DataFrame,
@@ -41,6 +43,7 @@ def apply_text_profiling(dataframe: pd.DataFrame,
GRANULAR_OPTION: True,
        GRAMMAR_CHECK_OPTION: False,  # default: False, as it is a slow process, but it can be enabled
        SPELLING_CHECK_OPTION: True,  # default: True, a slightly slow process, but it can be disabled
EASE_OF_READING_CHECK_OPTION: True,
PARALLELISATION_METHOD_OPTION: DEFAULT_PARALLEL_METHOD
}

@@ -51,7 +54,8 @@
(GRANULAR_OPTION, "Granular features", apply_granular_features),
(HIGH_LEVEL_OPTION, "High-level features", apply_high_level_features),
(GRAMMAR_CHECK_OPTION, "Grammar checks", apply_grammar_check),
(SPELLING_CHECK_OPTION, "Spelling checks", apply_spelling_check)
(SPELLING_CHECK_OPTION, "Spelling checks", apply_spelling_check),
(EASE_OF_READING_CHECK_OPTION, "Ease of reading check", apply_ease_of_reading_check)
]

for index, item in enumerate(actions_mappings.copy()):
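For context, a hedged sketch of switching the new check off from the public API. It assumes the defaults above can be overridden through a `params` dict argument, keyed by the `EASE_OF_READING_CHECK_OPTION` constant (`'ease_of_reading_check'`):

```python
# Opt out of the ease-of-reading check while keeping the other defaults.
from nlp_profiler.core import apply_text_profiling

profiled = apply_text_profiling(df, 'text',
                                params={'ease_of_reading_check': False})
```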
68 changes: 68 additions & 0 deletions nlp_profiler/high_level_features/ease_of_reading_check.py
@@ -0,0 +1,68 @@
from textstat import flesch_reading_ease
import pandas as pd
import math

from nlp_profiler.constants import NOT_APPLICABLE, NaN, DEFAULT_PARALLEL_METHOD, \
EASE_OF_READING_SCORE_COL, EASE_OF_READING_COL, EASE_OF_READING_SUMMARISED_COL
from nlp_profiler.generate_features import generate_features


def apply_ease_of_reading_check(heading: str,
new_dataframe: pd.DataFrame,
text_column: dict,
parallelisation_method: str = DEFAULT_PARALLEL_METHOD):
ease_of_reading_steps = [
(EASE_OF_READING_SCORE_COL, text_column, ease_of_reading_score),
(EASE_OF_READING_COL, EASE_OF_READING_SCORE_COL, ease_of_reading),
(EASE_OF_READING_SUMMARISED_COL, EASE_OF_READING_COL, ease_of_reading_summarised),
]
generate_features(
heading, ease_of_reading_steps,
new_dataframe, parallelisation_method
)

ease_of_reading_to_summarised_words_mapping = {
"Very Easy": "Easy",
"Easy": "Easy",
"Fairly Easy": "Easy",
"Standard": "Standard",
"Fairly Difficult": "Difficult",
"Difficult": "Difficult" ,
"Very Confusing": "Confusing"
}


def ease_of_reading_summarised(text: str) -> str:
    # Collapse the fine-grained reading-ease labels into coarser buckets.
    return ease_of_reading_to_summarised_words_mapping.get(text, NOT_APPLICABLE)


def ease_of_reading_score(text: str) -> float:
if (not isinstance(text, str)) or (len(text.strip()) == 0):
return NaN

return float(flesch_reading_ease(text))

# Score bands follow the Flesch reading-ease scale:
# https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
ease_of_reading_to_words_mapping = [
    # [label, inclusive lower bound, upper bound]
    ["Very Easy", 90, 100],
    ["Easy", 80, 90],
    ["Fairly Easy", 70, 80],
    ["Standard", 60, 70],
    ["Fairly Difficult", 50, 60],
    ["Difficult", 30, 50],
    ["Very Confusing", 0, 30]
]


def ease_of_reading(score: float) -> str:
    if math.isnan(score):
        return NOT_APPLICABLE

    score = float(score)
    for label, lower, upper in ease_of_reading_to_words_mapping:  # pragma: no cover
        # pragma: no cover => early termination leads to loss of test coverage info
        if ((score <= 0) and (lower == 0)) or \
                ((score >= 100) and (upper == 100)):
            return label
        # contiguous half-open bands so that fractional scores (e.g. 89.5)
        # cannot fall between two slabs
        if lower <= score < upper:
            return label
    return NOT_APPLICABLE  # defensive fallback; the bands cover all real scores
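A small worked sketch of the three chained steps above (the exact score depends on the textstat version; for this sentence it is typically in the mid-90s):

```python
# The three steps applied by hand to a single text value.
text = "The quick brown fox jumps over the lazy dog."
score = ease_of_reading_score(text)            # ~94.0
quality = ease_of_reading(score)               # "Very Easy" (score >= 90)
summary = ease_of_reading_summarised(quality)  # "Easy"
```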
