Enhancements added to LightCleaner
jzsmoreno committed Dec 5, 2024
1 parent c34fc86 commit 188b81d
Showing 4 changed files with 125 additions and 150 deletions.
246 changes: 111 additions & 135 deletions pydbsmgr/lightest.py
@@ -1,80 +1,81 @@
+import logging
+import sys
 from functools import partial
 
-from pydbsmgr.main import *
-from pydbsmgr.utils.tools import coerce_datetime, most_repeated_item
+import numpy as np
+import pandas as pd
+
+from pydbsmgr.main import check_if_contains_dates, clean, get_date_format
+from pydbsmgr.utils.tools import most_repeated_item
 
-def process_dates(x: str, format_type: str, auxiliary_type: str, errors: str = "ignore") -> str:
-    """Auxiliary function in date type string processing
+logging.basicConfig(level=logging.WARNING)
 
-    Parameters
-    ----------
-    x : `str`
-        character of type date.
-
-    Returns
-    ----------
-    x : `str`
-        character after processing with format `YYYY-MM-DD`.
-    """
-    # performing data type conversion
+
+def process_dates(
+    x: str, format_type: str, auxiliary_type: str = None, errors: str = "ignore"
+) -> str:
+    """Auxiliary function in date type string processing."""
     x = str(x)
     if format_type in ["dayfirst", "monthfirst"] and len(x) < 10:
+        # split by "/" or "-"
+        separator = "/" if "/" in x else "-"
+        parts = x.split(separator)
         if format_type == "dayfirst":
-            dmy = x.split("/") if "/" in x else x.split("-")
-            day = dmy[0] if len(dmy[0]) == 2 else "0" + dmy[0]
-            month = dmy[1] if len(dmy[1]) == 2 else "0" + dmy[1]
-            year = dmy[-1]
+            day, month, year = parts[0], parts[1], parts[-1]
         elif format_type == "monthfirst":
-            mdy = x.split("/") if "/" in x else x.split("-")
-            month = mdy[0] if len(mdy[0]) == 2 else "0" + mdy[0]
-            day = mdy[1] if len(mdy[1]) == 2 else "0" + mdy[1]
-            year = mdy[-1]
+            month, day, year = parts[0], parts[1], parts[-1]
 
-        return str(pd.to_datetime(f"{year}{month}{day}", format="%Y%m%d", errors="raise"))[:10]
-
-    x = x.replace("/", "")
-    x = x.replace("-", "")
-
-    if len(x) == 8:
+        day = f"{int(day):02d}"
+        month = f"{int(month):02d}"
         try:
-            x = str(pd.to_datetime(x, format=format_type, errors="raise"))[:10]
-        except:
-            if auxiliary_type is not None:
-                x = str(pd.to_datetime(x, format=auxiliary_type, errors="ignore"))[:10]
+            date = pd.to_datetime(f"{year}{month}{day}", format="%Y%m%d", errors="coerce")
+        except ValueError:
+            if auxiliary_type:
+                date = pd.to_datetime(x, format=auxiliary_type, errors="coerce")
+            elif errors == "raise":
+                raise ValueError("Date value does not match the expected format.")
     else:
-        if str(x).find(":") != -1:
+        x = x.replace("/", "").replace("-", "")
 
         if len(x) == 8:
             try:
-                x = str(pd.to_datetime(x[:8], format=format_type, errors="raise"))[:10]
-            except:
-                if auxiliary_type is not None:
-                    x = str(pd.to_datetime(x[:8], format=auxiliary_type, errors="ignore"))[:10]
+                date = pd.to_datetime(x, format=format_type, errors="coerce")
+            except ValueError:
+                if auxiliary_type:
+                    date = pd.to_datetime(x, format=auxiliary_type, errors="coerce")
+                elif errors == "raise":
+                    raise ValueError("Date value does not match the expected format.")
-    return x
+        else:
+            try:
+                date = pd.to_datetime(x[:8], format=format_type, errors="coerce")
+            except ValueError:
+                if auxiliary_type:
+                    date = pd.to_datetime(x[:8], format=auxiliary_type, errors="coerce")
+                elif errors == "raise":
+                    raise ValueError("Date value does not match the expected format.")
+
+    if not pd.isnull(date):
+        return date.strftime("%Y-%m-%d")
+    else:
+        return x  # Return original string if no valid date is found
 
 
 class LightCleaner:
-    """Performs a light cleaning on the table"""
+    """Performs a light cleaning on the table."""
 
     # Increase memory efficiency
     __slots__ = ["df", "dict_dtypes"]
 
-    def __init__(self, df_: DataFrame):
+    def __init__(self, df_: pd.DataFrame):
         self.df = df_.copy()
-        self.dict_dtypes = dict(zip(["float", "int", "str"], ["float64", "int64", "object"]))
+        self.dict_dtypes = {"float": "float64", "int": "int64", "str": "object"}
 
     def clean_frame(
         self,
         sample_frac: float = 0.1,
         fast_execution: bool = True,
         two_date_formats: bool = True,
         **kwargs,
-    ) -> DataFrame:
-        """`DataFrame` cleaning main function
+    ) -> pd.DataFrame:
+        """DataFrame cleaning main function
         Parameters
         ----------
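For reference, a hand trace (not part of the commit) of the short day-first branch introduced above: it splits on the separator, zero-pads the day and month, and reassembles a %Y%m%d string for pandas. The sample value "9/8/2020" is purely illustrative.

import pandas as pd

# Trace of the new day-first branch for a short date string.
parts = "9/8/2020".split("/")                          # ['9', '8', '2020']
day, month, year = parts[0], parts[1], parts[-1]
day, month = f"{int(day):02d}", f"{int(month):02d}"    # zero-pad to "09" and "08"
date = pd.to_datetime(f"{year}{month}{day}", format="%Y%m%d", errors="coerce")
print(date.strftime("%Y-%m-%d"))                       # 2020-08-09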
@@ -90,10 +91,11 @@ def clean_frame(
         title_mode : `bool`
             By default it is set to `True`. If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`. By default, converts everything to `title`.
         """
-        table = (self.df).copy()
+        table = self.df.copy()
         cols = table.columns
-        table_sample = table.sample(frac=sample_frac)
-        errors = kwargs["errors"] if "errors" in kwargs else "ignore"
+        table_sample = table.sample(frac=sample_frac, replace=False)
+        errors = kwargs.get("errors", "ignore")
+
         for column_index, datatype in enumerate(table.dtypes):
             if datatype == "object":
                 datetype_column = (
@@ -111,42 +113,33 @@
                     ),
                     two_date_formats,
                 )
-                if auxiliary_type != None:
-                    try:
-                        format_type = auxiliary_type
-                        partial_dates = partial(
-                            process_dates,
-                            format_type=format_type,
-                            auxiliary_type=None,
-                            errors=errors,
-                        )
-                        vpartial_dates = np.vectorize(partial_dates)
-                        table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
-                    except:
-                        format_type = main_type
-                        partial_dates = partial(
-                            process_dates,
-                            format_type=format_type,
-                            auxiliary_type=None,
-                            errors=errors,
-                        )
-                        vpartial_dates = np.vectorize(partial_dates)
-                        table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
-                else:
-                    format_type = main_type
+
+                format_type = auxiliary_type or main_type
+                try:
                     partial_dates = partial(
                         process_dates,
                         format_type=format_type,
                         auxiliary_type=None,
                         errors=errors,
                     )
                     vpartial_dates = np.vectorize(partial_dates)
-                    table[cols[column_index]] = vpartial_dates(table[cols[column_index]])
-                vcoerce_datetime = np.vectorize(coerce_datetime)
-                table[cols[column_index]] = vcoerce_datetime(table[cols[column_index]])
-                table[cols[column_index]] = pd.to_datetime(
-                    table[cols[column_index]], format="%Y%m%d", errors="coerce"
-                ).dt.normalize()
+
+                    table[cols[column_index]] = pd.to_datetime(
+                        vpartial_dates(table[cols[column_index]]),
+                        format="%Y-%m-%d",
+                        errors="coerce",
+                    ).normalize()
+                except:
+                    partial_dates = partial(
+                        process_dates, format_type=main_type, auxiliary_type=None, errors=errors
+                    )
+                    vpartial_dates = np.vectorize(partial_dates)
+
+                    table[cols[column_index]] = pd.to_datetime(
+                        vpartial_dates(table[cols[column_index]]),
+                        format="%Y-%m-%d",
+                        errors="coerce",
+                    ).normalize()
             else:
                 try:
                     table[cols[column_index]] = (
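The hunk above keeps the partial + np.vectorize pattern for applying process_dates element-wise to a column. A self-contained sketch of that pattern on a toy function (not repository code):

from functools import partial

import numpy as np

def shout(text: str, suffix: str = "!") -> str:
    return text.upper() + suffix

# Freeze keyword arguments, then apply the callable element-wise over an array.
vshout = np.vectorize(partial(shout, suffix="?"))
print(vshout(np.array(["a", "b"])))  # ['A?' 'B?']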
@@ -159,73 +152,56 @@
                         .str.title()
                     )
                 except AttributeError as e:
-                    warning_type = "UserWarning"
-                    msg = (
-                        "It was not possible to perform the cleaning, the column {%s} is duplicated. "
-                        % cols[column_index]
-                    )
-                    msg += "Error: {%s}" % e
-                    print(f"{warning_type}: {msg}")
+                    msg = f"It was not possible to perform the cleaning, the column {cols[column_index]} is duplicated. Error: {e}"
+                    logging.warning(msg)
                     sys.exit("Perform correction manually")
 
             if not fast_execution:
-                no_emoji = kwargs["no_emoji"] if "no_emoji" in kwargs else False
-                title_mode = kwargs["title_mode"] if "title_mode" in kwargs else True
-                partial_clean = partial(
-                    clean,
-                    no_emoji=no_emoji,
-                    title_mode=title_mode,
-                )
+                no_emoji = kwargs.get("no_emoji", False)
+                title_mode = kwargs.get("title_mode", True)
+                partial_clean = partial(clean, no_emoji=no_emoji, title_mode=title_mode)
                 vpartial_clean = np.vectorize(partial_clean)
                 table[cols[column_index]] = vpartial_clean(table[cols[column_index]])
 
         table = self._remove_duplicate_columns(table)
         self.df = table.copy()
         return self.df
 
-    def _correct_float(self, value, datatype):
-        """float correction function"""
-        val_type = type(value).__name__
-        if self.dict_dtypes[val_type] != datatype:
-            try:
-                return float(value)
-            except:
-                return np.nan
-        else:
-            return value
-
-    def _correct_int(self, value, datatype):
-        """integer correction function"""
-        val_type = type(value).__name__
-        if self.dict_dtypes[val_type] != datatype:
-            try:
-                return int(value)
-            except:
-                return np.nan
-        else:
-            return value
-
-    def _correct_str(self, value, datatype):
-        """character correction function"""
+    def _correct_type(self, value, datatype):
+        """General type correction function."""
         val_type = type(value).__name__
         if self.dict_dtypes[val_type] != datatype:
             try:
-                return str(value)
-            except:
-                return ""
-        else:
-            return value
-
-    def _remove_duplicate_columns(self, df: DataFrame) -> DataFrame:
-        """Function that removes duplicate columns based on column name"""
-        # Drop duplicate columns
-        # df = df.T.drop_duplicates().T
-        # df = df.loc[:,~df.columns.duplicated()]
-        seen_columns = set()
-        unique_columns = []
-
-        for col in df.columns:
-            if col not in seen_columns:
-                unique_columns.append(col)
-                seen_columns.add(col)
-
-        return df[unique_columns]
+                return {"float": float, "int": int, "str": str}[datatype](value)
+            except ValueError:
+                return np.nan if datatype in ["float", "int"] else ""
+        return value
+
+    def _remove_duplicate_columns(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Remove duplicate columns based on column name."""
+        seen = set()
+        unique_cols = [col for col in df.columns if not (col in seen or seen.add(col))]
+        return df[unique_cols]
+
+
+if __name__ == "__main__":
+    # Example usage
+    df = pd.DataFrame(
+        {
+            "index": ["0", "1", "2", "3", "4"],
+            "fecha": ["10/09/1974", "06/01/1973", "18/01/1975", "25/08/2020", " fecha_no_valida"],
+            "first_date": [
+                "09/10/1974",
+                "01/06/1973",
+                "01/18/1975",
+                "08/25/2020",
+                " fecha_no_valida",
+            ],
+            "another_date": ["9/10/1974", "1/6/1973", "1/18/1975", "8/25/2020", " fecha_no_valida"],
+            "third_date": ["10/9/1974", "6/1/1973", "18/1/1975", "25/8/2020", " fecha_no_valida"],
+        }
+    )
+
+    handler = LightCleaner(df)
+    df = handler.clean_frame(sample_frac=1.0, fast_execution=False, errors="raise")
+    breakpoint()
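The new _remove_duplicate_columns relies on set.add returning None to keep only the first occurrence of each column name. A minimal standalone sketch of the idiom:

seen = set()
names = ["a", "b", "a", "c", "b"]
# "n in seen or seen.add(n)" is falsy on the first occurrence (and records it),
# and truthy on every repeat, so repeats are filtered out.
unique = [n for n in names if not (n in seen or seen.add(n))]
print(unique)  # ['a', 'b', 'c']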
8 changes: 0 additions & 8 deletions pydbsmgr/utils/tools.py
@@ -111,14 +111,6 @@ def _process_columns(self, surrounding: bool = True) -> DataFrame:
         return df
 
 
-def coerce_datetime(x: str) -> datetime64:
-    try:
-        x = x.replace("-", "")
-        return pd.to_datetime(x, format="%Y%m%d")
-    except:
-        return np.datetime64("NaT")
-
-
 class ControllerFeatures:
     def __init__(self, _container_client):
         self._container_client = _container_client
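The deleted coerce_datetime helper (strip dashes, parse with %Y%m%d, fall back to NaT) is covered by calling pandas directly with errors="coerce", which is the approach the updated lightest.py takes. A rough equivalent, for reference:

import pandas as pd

print(pd.to_datetime("2020-08-25".replace("-", ""), format="%Y%m%d"))  # 2020-08-25 00:00:00
print(pd.to_datetime("not a date", format="%Y%m%d", errors="coerce"))  # NaT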
17 changes: 12 additions & 5 deletions test/conftest.py
@@ -64,13 +64,20 @@ def lightest_with_data() -> Callable:
     """Passes a test dataframe to the class"""
     df = pd.DataFrame(
         {
-            "index": ["0", "1", "2", "3"],
-            "fecha": ["10/09/1974", "06/01/1973", "18/01/1975", "25/08/2020"],
-            "first_date": ["09/10/1974", "01/06/1973", "01/18/1975", "08/25/2020"],
-            "another_date": ["9/10/1974", "1/6/1973", "1/18/1975", "8/25/2020"],
-            "third_date": ["10/9/1974", "6/1/1973", "18/1/1975", "25/8/2020"],
+            "index": ["0", "1", "2", "3", "4"],
+            "fecha": ["10/09/1974", "06/01/1973", "18/01/1975", "25/08/2020", " fecha_no_valida"],
+            "first_date": [
+                "09/10/1974",
+                "01/06/1973",
+                "01/18/1975",
+                "08/25/2020",
+                " fecha_no_valida",
+            ],
+            "another_date": ["9/10/1974", "1/6/1973", "1/18/1975", "8/25/2020", " fecha_no_valida"],
+            "third_date": ["10/9/1974", "6/1/1973", "18/1/1975", "25/8/2020", " fecha_no_valida"],
         }
     )
 
     handler = LightCleaner(df)
     df = handler.clean_frame(sample_frac=1.0, fast_execution=False, errors="raise")
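The added " fecha_no_valida" rows exercise the coercion path: an unparseable value becomes NaT, whose string form is "NaT", which appears to be why the expected lists in test_functions.py below gain that entry. A quick check:

import pandas as pd

value = pd.to_datetime(" fecha_no_valida", errors="coerce")
print(value)       # NaT
print(str(value))  # NaT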
4 changes: 2 additions & 2 deletions test/test_functions.py
@@ -45,12 +45,12 @@ def test_correct_nan(_correct_nan):
 def test_columns_dtypes(columns_dtypes_with_data):
     df = columns_dtypes_with_data.correct(sample_frac=0.33)
     data_types = df.dtypes
-    assert data_types[1] == "datetime64[ns]"
+    assert data_types.iloc[1] == "datetime64[ns]"
 
 
 def test_lightest(lightest_with_data):
     fecha, first_date, anther_date, third_date = lightest_with_data
-    comparison = ["1974-09-10", "1973-01-06", "1975-01-18", "2020-08-25"]
+    comparison = ["1974-09-10", "1973-01-06", "1975-01-18", "2020-08-25", "NaT"]
     assert fecha == comparison
     assert first_date == comparison
     assert anther_date == comparison
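Switching from data_types[1] to data_types.iloc[1] makes the positional lookup explicit; with a label index, plain integer keys rely on a fallback that recent pandas versions deprecate. A small illustration (not repository code):

import pandas as pd

s = pd.Series({"a": 1, "b": 2})
print(s.iloc[1])  # 2 -- positional access, independent of the index labels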
