Skip to content

Commit

Permalink
[FIX] LightCleaner class
Browse files Browse the repository at this point in the history
* remove `@pytest.mark.xfail`
* `main.py` refactoring
* change from `concurrent.futures` to `np.vectorize`.
  • Loading branch information
jzsmoreno committed Apr 22, 2024
1 parent ae05cb0 commit acbb8be
Show file tree
Hide file tree
Showing 7 changed files with 101 additions and 79 deletions.
2 changes: 1 addition & 1 deletion pydbsmgr/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.3
0.9.4
124 changes: 66 additions & 58 deletions pydbsmgr/lightest.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import concurrent.futures
from functools import partial

from pydbsmgr.main import *
Expand Down Expand Up @@ -42,15 +41,19 @@ def process_dates(x: str, format_type: str, auxiliary_type: str) -> str:
try:
x = str(pd.to_datetime(x, format=format_type, errors="raise"))[:10]
except:
if auxiliary_type != None:
if auxiliary_type is not None:
x = str(pd.to_datetime(x, format=auxiliary_type, errors="ignore"))[:10]
else:
raise ValueError("Date value does not match the expected format.")
else:
if str(x).find(":") != -1:
try:
x = str(pd.to_datetime(x[:8], format=format_type, errors="raise"))[:10]
except:
if auxiliary_type != None:
if auxiliary_type is not None:
x = str(pd.to_datetime(x[:8], format=auxiliary_type, errors="ignore"))[:10]
else:
raise ValueError("Date value does not match the expected format.")
return x


Expand All @@ -73,31 +76,18 @@ def clean_frame(
) -> DataFrame:
"""`DataFrame` cleaning main function
Parameters
----------
- sample_frac (`float`): The fraction of rows to use for date type inference. Default is 0.1 i.e., 10%.
- fast_execution (`bool`): If `False` use `applymap` pandas for extra text cleanup. Default is `True`.
Keyword Arguments:
----------
- fix_unicode: (`bool`): By default it is set to `True`.
- to_ascii: (`bool`): By default it is set to `True`.
- lower: (`bool`): By default it is set to `True`.
- normalize_whitespace: (`bool`): By default it is set to `True`.
- no_line_breaks: (`bool`): By default it is set to `False`.
- strip_lines: (`bool`): By default it is set to `True`.
- keep_two_line_breaks: (`bool`): By default it is set to `False`.
- no_urls: (`bool`): By default it is set to `False`.
- no_emails: (`bool`): By default it is set to `False`.
- no_phone_numbers: (`bool`): By default it is set to `False`.
- no_numbers: (`bool`): By default it is set to `False`.
- no_digits: (`bool`): By default it is set to `False`.
- no_currency_symbols: (`bool`): By default it is set to `False`.
- no_punct: (`bool`): By default it is set to `False`.
- no_emoji: (`bool`): By default it is set to `False`.
- replace_with_url: (`str`): For example, the following `<URL>`.
- replace_with_email: (`str`): For example, the following `<EMAIL>`.
- replace_with_phone_number: (`str`): For example, the following `<PHONE>`.
- replace_with_number: (`str`): For example, the following `<NUMBER>`.
- replace_with_digit: (`str`): For example, the following `0`.
- replace_with_currency_symbol: (`str`): For example, the following `<CUR>`.
- replace_with_punct: (`str`): = For example, the following `""`.
- lang: (`str`): = By default it is set to `en`.
If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`.
- title_mode: (`bool`): By default it is set to `True`.
If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`.
By default, converts everything to `title`.
"""
table = (self.df).copy()
cols = table.columns
Expand All @@ -120,50 +110,68 @@ def clean_frame(
two_date_formats,
)
if auxiliary_type != None:
format_type = auxiliary_type
try:
format_type = auxiliary_type
partial_dates = partial(
process_dates,
format_type=format_type,
auxiliary_type=None,
)
vpartial_dates = np.vectorize(partial_dates)
table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
except:
format_type = main_type
partial_dates = partial(
process_dates,
format_type=format_type,
auxiliary_type=None,
)
vpartial_dates = np.vectorize(partial_dates)
table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
else:
format_type = main_type
with concurrent.futures.ThreadPoolExecutor() as executor:
partial_dates = partial(
process_dates,
format_type=format_type,
auxiliary_type=None,
)
table[cols[column_index]] = list(
executor.map(partial_dates, table[cols[column_index]])
)
table[cols[column_index]] = list(
executor.map(coerce_datetime, table[cols[column_index]])
)
vpartial_dates = np.vectorize(partial_dates)
table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
vcoerce_datetime = np.vectorize(coerce_datetime)
table[cols[column_index]] = vcoerce_datetime(table[cols[column_index]])
table[cols[column_index]] = pd.to_datetime(
table[cols[column_index]], format="%Y%m%d", errors="coerce"
).dt.normalize()
else:
if fast_execution == False:
partial_clean = partial(clean, **kwargs)
with concurrent.futures.ThreadPoolExecutor() as executor:
table[cols[column_index]] = list(
executor.map(partial_clean, table[cols[column_index]])
)
table[cols[column_index]] = list(
executor.map(remove_char, table[cols[column_index]])
)
try:
table[cols[column_index]] = list(
executor.map(
lambda text: text.title() if text is not None else text,
table[cols[column_index]],
)
)
except AttributeError as e:
warning_type = "UserWarning"
msg = (
"It was not possible to perform the cleaning, the column {%s} is duplicated. "
% cols[column_index]
)
msg += "Error: {%s}" % e
print(f"{warning_type}: {msg}")
sys.exit("Perform correction manually")
try:
table[cols[column_index]] = (
table[cols[column_index]]
.replace(np.nan, "")
.astype(str)
.str.normalize("NFKD")
.str.encode("ascii", errors="ignore")
.str.decode("ascii")
.str.title()
)
except AttributeError as e:
warning_type = "UserWarning"
msg = (
"It was not possible to perform the cleaning, the column {%s} is duplicated. "
% cols[column_index]
)
msg += "Error: {%s}" % e
print(f"{warning_type}: {msg}")
sys.exit("Perform correction manually")
if not fast_execution:
no_emoji = kwargs["no_emoji"] if "no_emoji" in kwargs else False
title_mode = kwargs["title_mode"] if "title_mode" in kwargs else True
partial_clean = partial(
clean,
no_emoji=no_emoji,
title_mode=title_mode,
)
vpartial_clean = np.vectorize(partial_clean)
table[cols[column_index]] = vpartial_clean(table[cols[column_index]])

table = self._remove_duplicate_columns(table)
self.df = table.copy()
Expand Down
37 changes: 28 additions & 9 deletions pydbsmgr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@

import numpy as np
import pandas as pd
from cleantext import clean
from IPython.display import clear_output
from pandas.core.frame import DataFrame
from pandas.core.indexes.base import Index
Expand All @@ -32,6 +31,8 @@ def get_date_format(input_string: str) -> str:
formats = ["%Y%m%d", "%Y%d%m", "%d%m%Y", "%m%d%Y", "dayfirst", "monthfirst"]
for format, regex in enumerate(regex_formats):
if re.search(regex, str(input_string)):
if formats[format] == formats[2] and int((input_string[3:5]).replace("0", "")) > 12:
return formats[3]
return formats[format]

return ""
Expand Down Expand Up @@ -63,7 +64,12 @@ def remove_numeric_char(input_string: str) -> str:
return re.sub(r"\d", "", input_string)


def clean_names(dirty_string: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str:
def clean(
dirty_string: str,
pattern: str = r"[a-zA-Zñáéíóú[email protected]]+\b",
no_emoji: bool = False,
title_mode: bool = False,
) -> str:
"""
Receive a string and clean it of special characters
Expand All @@ -79,14 +85,27 @@ def clean_names(dirty_string: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -
result : `str`
clean character string
"""
result = re.findall(pattern, str(dirty_string).replace("_", ""))
if len(result) > 0:
result = "_".join(result)
if no_emoji:
emoji_pattern = re.compile(
"["
"\U0001F600-\U0001F64F"
"\U0001F300-\U0001F5FF"
"\U0001F680-\U0001F6FF"
"\U0001F1E0-\U0001F1FF"
"]+",
flags=re.UNICODE,
)
dirty_string = emoji_pattern.sub(r"", dirty_string)
dirty_string = dirty_string.lower()
words = dirty_string.split()
processed_words = ["".join(re.findall(pattern, word)) for word in words]
result = " ".join(processed_words)
# Remove any leading or trailing spaces left over after the per-word character filtering
result = result.strip()
if title_mode:
return result.title()
else:
pattern = r"[a-zA-Z]+"
result = re.findall(pattern, str(dirty_string).replace("_", ""))
result = "_".join(result)
return result
return result


def clean_transform_helper(
Expand Down
4 changes: 2 additions & 2 deletions pydbsmgr/utils/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,8 @@ def get_frame(self, **kwargs) -> DataFrame:
def _process_columns(self, surrounding: bool = True) -> DataFrame:
df = (self.df).copy()
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(".", "")
df.columns = df.columns.str.replace(",", "")
df.columns = df.columns.str.replace(".", "", regex=False)
df.columns = df.columns.str.replace(",", "", regex=False)
df.columns = df.columns.str.replace(r"[^a-zA-Z0-9ñáéíóú_]", "_", regex=True)

df.columns = df.columns.str.replace("_+", "_", regex=True)
Expand Down
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
numpy<2.0.0
pandas
clean-text
missingno
pyodbc
ipython
Expand Down
4 changes: 2 additions & 2 deletions test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ def _get_extraction_date() -> Callable:


@pytest.fixture()
def _clean_names() -> Callable:
return clean_names
def _clean() -> Callable:
return clean


@pytest.fixture()
Expand Down
8 changes: 2 additions & 6 deletions test/test_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,13 @@
import numpy as np
import pandas as pd
import pytest
from cleantext import clean
from pandas.core.frame import DataFrame
from pandas.core.indexes.base import Index
from pandas.core.series import Series


def test_clean_names(_clean_names):
assert _clean_names("#tes$ting") == "tes_ting"
def test_clean(_clean):
assert _clean("#Tes$ting method*") == "testing method"


def test_clean_transform(_clean_transform):
Expand Down Expand Up @@ -49,9 +48,6 @@ def test_columns_dtypes(columns_dtypes_with_data):
assert data_types[1] == "datetime64[ns]"


@pytest.mark.xfail(
reason="Due to the use of 'concurrent.futures' you have this error. Try to run it again."
)
def test_lightest(lightest_with_data):
fecha, first_date, anther_date, third_date = lightest_with_data
comparison = ["1974-09-10", "1973-01-06", "1975-01-18", "2020-08-25"]
Expand Down

0 comments on commit acbb8be

Please sign in to comment.