From acbb8be5bd885ab704f325dcbb9446b4ed4262a8 Mon Sep 17 00:00:00 2001 From: jzsmoreno <42299052+jzsmoreno@users.noreply.github.com> Date: Mon, 22 Apr 2024 15:46:32 -0600 Subject: [PATCH] [FIX] `LightCleaner` class * remove `@pytest.mark.xfail` * `main.py` refactoring * change on `concurrent.futures` to `np.vectorize`. --- pydbsmgr/VERSION | 2 +- pydbsmgr/lightest.py | 124 ++++++++++++++++++---------------- pydbsmgr/main.py | 37 +++++++--- pydbsmgr/utils/tools/tools.py | 4 +- requirements.txt | 1 - test/conftest.py | 4 +- test/test_functions.py | 8 +-- 7 files changed, 101 insertions(+), 79 deletions(-) diff --git a/pydbsmgr/VERSION b/pydbsmgr/VERSION index b3ec163..2bd77c7 100644 --- a/pydbsmgr/VERSION +++ b/pydbsmgr/VERSION @@ -1 +1 @@ -0.9.3 \ No newline at end of file +0.9.4 \ No newline at end of file diff --git a/pydbsmgr/lightest.py b/pydbsmgr/lightest.py index bf0e698..8079ce6 100644 --- a/pydbsmgr/lightest.py +++ b/pydbsmgr/lightest.py @@ -1,4 +1,3 @@ -import concurrent.futures from functools import partial from pydbsmgr.main import * @@ -42,15 +41,19 @@ def process_dates(x: str, format_type: str, auxiliary_type: str) -> str: try: x = str(pd.to_datetime(x, format=format_type, errors="raise"))[:10] except: - if auxiliary_type != None: + if auxiliary_type is not None: x = str(pd.to_datetime(x, format=auxiliary_type, errors="ignore"))[:10] + else: + raise ValueError("Date value does not match the expected format.") else: if str(x).find(":") != -1: try: x = str(pd.to_datetime(x[:8], format=format_type, errors="raise"))[:10] except: - if auxiliary_type != None: + if auxiliary_type is not None: x = str(pd.to_datetime(x[:8], format=auxiliary_type, errors="ignore"))[:10] + else: + raise ValueError("Date value does not match the expected format.") return x @@ -73,31 +76,18 @@ def clean_frame( ) -> DataFrame: """`DataFrame` cleaning main function + Parameters + ---------- + - sample_frac (`float`): The fraction of rows to use for date type inference. Default is 0.1 i.e., 10%. + - fast_execution (`bool`): If `False` use `applymap` pandas for extra text cleanup. Default is `True`. + Keyword Arguments: ---------- - - fix_unicode: (`bool`): By default it is set to `True`. - - to_ascii: (`bool`): By default it is set to `True`. - - lower: (`bool`): By default it is set to `True`. - - normalize_whitespace: (`bool`): By default it is set to `True`. - - no_line_breaks: (`bool`): By default it is set to `False`. - - strip_lines: (`bool`): By default it is set to `True`. - - keep_two_line_breaks: (`bool`): By default it is set to `False`. - - no_urls: (`bool`): By default it is set to `False`. - - no_emails: (`bool`): By default it is set to `False`. - - no_phone_numbers: (`bool`): By default it is set to `False`. - - no_numbers: (`bool`): By default it is set to `False`. - - no_digits: (`bool`): By default it is set to `False`. - - no_currency_symbols: (`bool`): By default it is set to `False`. - - no_punct: (`bool`): By default it is set to `False`. - no_emoji: (`bool`): By default it is set to `False`. - - replace_with_url: (`str`): For example, the following ``. - - replace_with_email: (`str`): For example, the following ``. - - replace_with_phone_number: (`str`): For example, the following ``. - - replace_with_number: (`str`): For example, the following ``. - - replace_with_digit: (`str`): For example, the following `0`. - - replace_with_currency_symbol: (`str`): For example, the following ``. - - replace_with_punct: (`str`): = For example, the following `""`. - - lang: (`str`): = By default it is set to `en`. + If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`. + - title_mode: (`bool`): By default it is set to `True`. + If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`. + By default, converts everything to `title`. """ table = (self.df).copy() cols = table.columns @@ -120,50 +110,68 @@ def clean_frame( two_date_formats, ) if auxiliary_type != None: - format_type = auxiliary_type + try: + format_type = auxiliary_type + partial_dates = partial( + process_dates, + format_type=format_type, + auxiliary_type=None, + ) + vpartial_dates = np.vectorize(partial_dates) + table[cols[column_index]] = vpartial_dates(table[cols[column_index]]) + except: + format_type = main_type + partial_dates = partial( + process_dates, + format_type=format_type, + auxiliary_type=None, + ) + vpartial_dates = np.vectorize(partial_dates) + table[cols[column_index]] = vpartial_dates(table[cols[column_index]]) else: format_type = main_type - with concurrent.futures.ThreadPoolExecutor() as executor: partial_dates = partial( process_dates, format_type=format_type, auxiliary_type=None, ) - table[cols[column_index]] = list( - executor.map(partial_dates, table[cols[column_index]]) - ) - table[cols[column_index]] = list( - executor.map(coerce_datetime, table[cols[column_index]]) - ) + vpartial_dates = np.vectorize(partial_dates) + table[cols[column_index]] = vpartial_dates(table[cols[column_index]]) + vcoerce_datetime = np.vectorize(coerce_datetime) + table[cols[column_index]] = vcoerce_datetime(table[cols[column_index]]) table[cols[column_index]] = pd.to_datetime( table[cols[column_index]], format="%Y%m%d", errors="coerce" ).dt.normalize() else: - if fast_execution == False: - partial_clean = partial(clean, **kwargs) - with concurrent.futures.ThreadPoolExecutor() as executor: - table[cols[column_index]] = list( - executor.map(partial_clean, table[cols[column_index]]) - ) - table[cols[column_index]] = list( - executor.map(remove_char, table[cols[column_index]]) - ) - try: - table[cols[column_index]] = list( - executor.map( - lambda text: text.title() if text is not None else text, - table[cols[column_index]], - ) - ) - except AttributeError as e: - warning_type = "UserWarning" - msg = ( - "It was not possible to perform the cleaning, the column {%s} is duplicated. " - % cols[column_index] - ) - msg += "Error: {%s}" % e - print(f"{warning_type}: {msg}") - sys.exit("Perform correction manually") + try: + table[cols[column_index]] = ( + table[cols[column_index]] + .replace(np.nan, "") + .astype(str) + .str.normalize("NFKD") + .str.encode("ascii", errors="ignore") + .str.decode("ascii") + .str.title() + ) + except AttributeError as e: + warning_type = "UserWarning" + msg = ( + "It was not possible to perform the cleaning, the column {%s} is duplicated. " + % cols[column_index] + ) + msg += "Error: {%s}" % e + print(f"{warning_type}: {msg}") + sys.exit("Perform correction manually") + if not fast_execution: + no_emoji = kwargs["no_emoji"] if "no_emoji" in kwargs else False + title_mode = kwargs["title_mode"] if "title_mode" in kwargs else True + partial_clean = partial( + clean, + no_emoji=no_emoji, + title_mode=title_mode, + ) + vpartial_clean = np.vectorize(partial_clean) + table[cols[column_index]] = vpartial_clean(table[cols[column_index]]) table = self._remove_duplicate_columns(table) self.df = table.copy() diff --git a/pydbsmgr/main.py b/pydbsmgr/main.py index d05fa4d..75b2385 100644 --- a/pydbsmgr/main.py +++ b/pydbsmgr/main.py @@ -8,7 +8,6 @@ import numpy as np import pandas as pd -from cleantext import clean from IPython.display import clear_output from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index @@ -32,6 +31,8 @@ def get_date_format(input_string: str) -> str: formats = ["%Y%m%d", "%Y%d%m", "%d%m%Y", "%m%d%Y", "dayfirst", "monthfirst"] for format, regex in enumerate(regex_formats): if re.search(regex, str(input_string)): + if formats[format] == formats[2] and int((input_string[3:5]).replace("0", "")) > 12: + return formats[3] return formats[format] return "" @@ -63,7 +64,12 @@ def remove_numeric_char(input_string: str) -> str: return re.sub(r"\d", "", input_string) -def clean_names(dirty_string: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str: +def clean( + dirty_string: str, + pattern: str = r"[a-zA-Zñáéíóú_@.0-9]+\b", + no_emoji: bool = False, + title_mode: bool = False, +) -> str: """ Receive a string and clean it of special characters @@ -79,14 +85,27 @@ def clean_names(dirty_string: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") - result : `str` clean character string """ - result = re.findall(pattern, str(dirty_string).replace("_", "")) - if len(result) > 0: - result = "_".join(result) + if no_emoji: + emoji_pattern = re.compile( + "[" + "\U0001F600-\U0001F64F" + "\U0001F300-\U0001F5FF" + "\U0001F680-\U0001F6FF" + "\U0001F1E0-\U0001F1FF" + "]+", + flags=re.UNICODE, + ) + dirty_string = emoji_pattern.sub(r"", dirty_string) + dirty_string = dirty_string.lower() + words = dirty_string.split() + processed_words = ["".join(re.findall(pattern, word)) for word in words] + result = " ".join(processed_words) + # Remove any extra spaces that were introduced by + result = result.strip() + if title_mode: + return result.title() else: - pattern = r"[a-zA-Z]+" - result = re.findall(pattern, str(dirty_string).replace("_", "")) - result = "_".join(result) - return result + return result def clean_transform_helper( diff --git a/pydbsmgr/utils/tools/tools.py b/pydbsmgr/utils/tools/tools.py index 74f25c8..9e5411c 100644 --- a/pydbsmgr/utils/tools/tools.py +++ b/pydbsmgr/utils/tools/tools.py @@ -97,8 +97,8 @@ def get_frame(self, **kwargs) -> DataFrame: def _process_columns(self, surrounding: bool = True) -> DataFrame: df = (self.df).copy() df.columns = df.columns.str.lower() - df.columns = df.columns.str.replace(".", "") - df.columns = df.columns.str.replace(",", "") + df.columns = df.columns.str.replace(".", "", regex=False) + df.columns = df.columns.str.replace(",", "", regex=False) df.columns = df.columns.str.replace(r"[^a-zA-Z0-9ñáéíóú_]", "_", regex=True) df.columns = df.columns.str.replace("_+", "_", regex=True) diff --git a/requirements.txt b/requirements.txt index 2aedb15..e054531 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ numpy<2.0.0 pandas -clean-text missingno pyodbc ipython diff --git a/test/conftest.py b/test/conftest.py index 5ba4383..b167e0d 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -14,8 +14,8 @@ def _get_extraction_date() -> Callable: @pytest.fixture() -def _clean_names() -> Callable: - return clean_names +def _clean() -> Callable: + return clean @pytest.fixture() diff --git a/test/test_functions.py b/test/test_functions.py index 6669ce4..b552128 100644 --- a/test/test_functions.py +++ b/test/test_functions.py @@ -5,14 +5,13 @@ import numpy as np import pandas as pd import pytest -from cleantext import clean from pandas.core.frame import DataFrame from pandas.core.indexes.base import Index from pandas.core.series import Series -def test_clean_names(_clean_names): - assert _clean_names("#tes$ting") == "tes_ting" +def test_clean(_clean): + assert _clean("#Tes$ting method*") == "testing method" def test_clean_transform(_clean_transform): @@ -49,9 +48,6 @@ def test_columns_dtypes(columns_dtypes_with_data): assert data_types[1] == "datetime64[ns]" -@pytest.mark.xfail( - reason="Due to the use of 'concurrent.futures' you have this error. Try to run it again." -) def test_lightest(lightest_with_data): fecha, first_date, anther_date, third_date = lightest_with_data comparison = ["1974-09-10", "1973-01-06", "1975-01-18", "2020-08-25"]