[FIX] LightCleaner class

* remove `@pytest.mark.xfail` * refactor `main.py` * replace `concurrent.futures` with `np.vectorize`.
jzsmoreno · Apr 22, 2024 · acbb8be · acbb8be
1 parent ae05cb0
commit acbb8be
Show file tree

Hide file tree

Showing 7 changed files with 101 additions and 79 deletions.
diff --git a/pydbsmgr/VERSION b/pydbsmgr/VERSION
@@ -1 +1 @@
-0.9.3
+0.9.4
diff --git a/pydbsmgr/lightest.py b/pydbsmgr/lightest.py
@@ -1,4 +1,3 @@
-import concurrent.futures
 from functools import partial
 
 from pydbsmgr.main import *
@@ -42,15 +41,19 @@ def process_dates(x: str, format_type: str, auxiliary_type: str) -> str:
         try:
             x = str(pd.to_datetime(x, format=format_type, errors="raise"))[:10]
         except:
-            if auxiliary_type != None:
+            if auxiliary_type is not None:
                 x = str(pd.to_datetime(x, format=auxiliary_type, errors="ignore"))[:10]
+            else:
+                raise ValueError("Date value does not match the expected format.")
     else:
         if str(x).find(":") != -1:
             try:
                 x = str(pd.to_datetime(x[:8], format=format_type, errors="raise"))[:10]
             except:
-                if auxiliary_type != None:
+                if auxiliary_type is not None:
                     x = str(pd.to_datetime(x[:8], format=auxiliary_type, errors="ignore"))[:10]
+                else:
+                    raise ValueError("Date value does not match the expected format.")
     return x
 
 
@@ -73,31 +76,18 @@ def clean_frame(
     ) -> DataFrame:
         """`DataFrame` cleaning main function
 
+        Parameters
+        ----------
+        - sample_frac (`float`): The fraction of rows to use for date type inference. Default is 0.1 i.e., 10%.
+        - fast_execution (`bool`): If `False`, additionally applies `clean` to every value (via `np.vectorize`) for extra text cleanup. Default is `True`.
+
         Keyword Arguments:
         ----------
-        - fix_unicode: (`bool`): By default it is set to `True`.
-        - to_ascii: (`bool`): By default it is set to `True`.
-        - lower: (`bool`): By default it is set to `True`.
-        - normalize_whitespace: (`bool`): By default it is set to `True`.
-        - no_line_breaks: (`bool`): By default it is set to `False`.
-        - strip_lines: (`bool`): By default it is set to `True`.
-        - keep_two_line_breaks: (`bool`): By default it is set to `False`.
-        - no_urls: (`bool`): By default it is set to `False`.
-        - no_emails: (`bool`): By default it is set to `False`.
-        - no_phone_numbers: (`bool`): By default it is set to `False`.
-        - no_numbers: (`bool`): By default it is set to `False`.
-        - no_digits: (`bool`): By default it is set to `False`.
-        - no_currency_symbols: (`bool`): By default it is set to `False`.
-        - no_punct: (`bool`): By default it is set to `False`.
         - no_emoji: (`bool`): By default it is set to `False`.
-        - replace_with_url: (`str`): For example, the following `<URL>`.
-        - replace_with_email: (`str`): For example, the following `<EMAIL>`.
-        - replace_with_phone_number: (`str`): For example, the following `<PHONE>`.
-        - replace_with_number: (`str`): For example, the following `<NUMBER>`.
-        - replace_with_digit: (`str`): For example, the following `0`.
-        - replace_with_currency_symbol: (`str`): For example, the following `<CUR>`.
-        - replace_with_punct: (`str`): = For example, the following `""`.
-        - lang: (`str`): = By default it is set to `en`.
+        If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`.
+        - title_mode: (`bool`): By default it is set to `True`.
+        If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`.
+        By default, converts everything to `title`.
         """
         table = (self.df).copy()
         cols = table.columns
@@ -120,50 +110,68 @@ def clean_frame(
                         two_date_formats,
                     )
                     if auxiliary_type != None:
-                        format_type = auxiliary_type
+                        try:
+                            format_type = auxiliary_type
+                            partial_dates = partial(
+                                process_dates,
+                                format_type=format_type,
+                                auxiliary_type=None,
+                            )
+                            vpartial_dates = np.vectorize(partial_dates)
+                            table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
+                        except:
+                            format_type = main_type
+                            partial_dates = partial(
+                                process_dates,
+                                format_type=format_type,
+                                auxiliary_type=None,
+                            )
+                            vpartial_dates = np.vectorize(partial_dates)
+                            table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
                     else:
                         format_type = main_type
-                    with concurrent.futures.ThreadPoolExecutor() as executor:
                         partial_dates = partial(
                             process_dates,
                             format_type=format_type,
                             auxiliary_type=None,
                         )
-                        table[cols[column_index]] = list(
-                            executor.map(partial_dates, table[cols[column_index]])
-                        )
-                        table[cols[column_index]] = list(
-                            executor.map(coerce_datetime, table[cols[column_index]])
-                        )
+                        vpartial_dates = np.vectorize(partial_dates)
+                        table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
+                    vcoerce_datetime = np.vectorize(coerce_datetime)
+                    table[cols[column_index]] = vcoerce_datetime(table[cols[column_index]])
                     table[cols[column_index]] = pd.to_datetime(
                         table[cols[column_index]], format="%Y%m%d", errors="coerce"
                     ).dt.normalize()
                 else:
-                    if fast_execution == False:
-                        partial_clean = partial(clean, **kwargs)
-                        with concurrent.futures.ThreadPoolExecutor() as executor:
-                            table[cols[column_index]] = list(
-                                executor.map(partial_clean, table[cols[column_index]])
-                            )
-                            table[cols[column_index]] = list(
-                                executor.map(remove_char, table[cols[column_index]])
-                            )
-                            try:
-                                table[cols[column_index]] = list(
-                                    executor.map(
-                                        lambda text: text.title() if text is not None else text,
-                                        table[cols[column_index]],
-                                    )
-                                )
-                            except AttributeError as e:
-                                warning_type = "UserWarning"
-                                msg = (
-                                    "It was not possible to perform the cleaning, the column {%s} is duplicated. "
-                                    % cols[column_index]
-                                )
-                                msg += "Error: {%s}" % e
-                                print(f"{warning_type}: {msg}")
-                                sys.exit("Perform correction manually")
+                    try:
+                        table[cols[column_index]] = (
+                            table[cols[column_index]]
+                            .replace(np.nan, "")
+                            .astype(str)
+                            .str.normalize("NFKD")
+                            .str.encode("ascii", errors="ignore")
+                            .str.decode("ascii")
+                            .str.title()
+                        )
+                    except AttributeError as e:
+                        warning_type = "UserWarning"
+                        msg = (
+                            "It was not possible to perform the cleaning, the column {%s} is duplicated. "
+                            % cols[column_index]
+                        )
+                        msg += "Error: {%s}" % e
+                        print(f"{warning_type}: {msg}")
+                        sys.exit("Perform correction manually")
+                    if not fast_execution:
+                        no_emoji = kwargs["no_emoji"] if "no_emoji" in kwargs else False
+                        title_mode = kwargs["title_mode"] if "title_mode" in kwargs else True
+                        partial_clean = partial(
+                            clean,
+                            no_emoji=no_emoji,
+                            title_mode=title_mode,
+                        )
+                        vpartial_clean = np.vectorize(partial_clean)
+                        table[cols[column_index]] = vpartial_clean(table[cols[column_index]])
 
         table = self._remove_duplicate_columns(table)
         self.df = table.copy()

diff --git a/pydbsmgr/main.py b/pydbsmgr/main.py
@@ -8,7 +8,6 @@
 
 import numpy as np
 import pandas as pd
-from cleantext import clean
 from IPython.display import clear_output
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.base import Index
@@ -32,6 +31,8 @@ def get_date_format(input_string: str) -> str:
     formats = ["%Y%m%d", "%Y%d%m", "%d%m%Y", "%m%d%Y", "dayfirst", "monthfirst"]
     for format, regex in enumerate(regex_formats):
         if re.search(regex, str(input_string)):
+            if formats[format] == formats[2] and int((input_string[3:5]).replace("0", "")) > 12:
+                return formats[3]
             return formats[format]
 
     return ""
@@ -63,7 +64,12 @@ def remove_numeric_char(input_string: str) -> str:
     return re.sub(r"\d", "", input_string)
 
 
-def clean_names(dirty_string: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str:
+def clean(
+    dirty_string: str,
+    pattern: str = r"[a-zA-Zñáéíóú[email protected]]+\b",
+    no_emoji: bool = False,
+    title_mode: bool = False,
+) -> str:
     """
     Receive a string and clean it of special characters
 
@@ -79,14 +85,27 @@ def clean_names(dirty_string: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -
     result : `str`
         clean character string
     """
-    result = re.findall(pattern, str(dirty_string).replace("_", ""))
-    if len(result) > 0:
-        result = "_".join(result)
+    if no_emoji:
+        emoji_pattern = re.compile(
+            "["
+            "\U0001F600-\U0001F64F"
+            "\U0001F300-\U0001F5FF"
+            "\U0001F680-\U0001F6FF"
+            "\U0001F1E0-\U0001F1FF"
+            "]+",
+            flags=re.UNICODE,
+        )
+        dirty_string = emoji_pattern.sub(r"", dirty_string)
+    dirty_string = dirty_string.lower()
+    words = dirty_string.split()
+    processed_words = ["".join(re.findall(pattern, word)) for word in words]
+    result = " ".join(processed_words)
+    # Remove any leading/trailing spaces left behind when a word is filtered down to nothing
+    result = result.strip()
+    if title_mode:
+        return result.title()
     else:
-        pattern = r"[a-zA-Z]+"
-        result = re.findall(pattern, str(dirty_string).replace("_", ""))
-        result = "_".join(result)
-    return result
+        return result
 
 
 def clean_transform_helper(

diff --git a/pydbsmgr/utils/tools/tools.py b/pydbsmgr/utils/tools/tools.py
@@ -97,8 +97,8 @@ def get_frame(self, **kwargs) -> DataFrame:
     def _process_columns(self, surrounding: bool = True) -> DataFrame:
         df = (self.df).copy()
         df.columns = df.columns.str.lower()
-        df.columns = df.columns.str.replace(".", "")
-        df.columns = df.columns.str.replace(",", "")
+        df.columns = df.columns.str.replace(".", "", regex=False)
+        df.columns = df.columns.str.replace(",", "", regex=False)
         df.columns = df.columns.str.replace(r"[^a-zA-Z0-9ñáéíóú_]", "_", regex=True)
 
         df.columns = df.columns.str.replace("_+", "_", regex=True)

diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,5 @@
 numpy<2.0.0
 pandas
-clean-text
 missingno
 pyodbc
 ipython

diff --git a/test/conftest.py b/test/conftest.py
@@ -14,8 +14,8 @@ def _get_extraction_date() -> Callable:
 
 
 @pytest.fixture()
-def _clean_names() -> Callable:
-    return clean_names
+def _clean() -> Callable:
+    return clean
 
 
 @pytest.fixture()

diff --git a/test/test_functions.py b/test/test_functions.py
@@ -5,14 +5,13 @@
 import numpy as np
 import pandas as pd
 import pytest
-from cleantext import clean
 from pandas.core.frame import DataFrame
 from pandas.core.indexes.base import Index
 from pandas.core.series import Series
 
 
-def test_clean_names(_clean_names):
-    assert _clean_names("#tes$ting") == "tes_ting"
+def test_clean(_clean):
+    assert _clean("#Tes$ting method*") == "testing method"
 
 
 def test_clean_transform(_clean_transform):
@@ -49,9 +48,6 @@ def test_columns_dtypes(columns_dtypes_with_data):
     assert data_types[1] == "datetime64[ns]"
 
 
-@pytest.mark.xfail(
-    reason="Due to the use of 'concurrent.futures' you have this error. Try to run it again."
-)
 def test_lightest(lightest_with_data):
     fecha, first_date, anther_date, third_date = lightest_with_data
     comparison = ["1974-09-10", "1973-01-06", "1975-01-18", "2020-08-25"]