Skip to content

Commit

Permalink
Update lightest.py
Browse files (browse the repository at this point in the history)
* Improved date logic
* Added corrections
  • Loading branch information
jzsmoreno committed Nov 13, 2023
1 parent: f78fd83; commit: 501ceeb
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 70 deletions.
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,7 @@ dmypy.json
cython_debug/

# Ignore test_<files>.py for testing
/test_utilities.py
/_utilities.py
/test_logs.txt
/test_logsbook.csv
*.html
Expand Down
76 changes: 38 additions & 38 deletions pydbsmgr/lightest.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import concurrent.futures

from pydbsmgr.main import *
from pydbsmgr.utils.tools import coerce_datetime


def process_dates(x: str) -> str:
Expand All @@ -25,7 +26,9 @@ def process_dates(x: str) -> str:
else:
if str(x).find(":") != -1:
x = convert_date(x[:8])
if len(x) == 8:
if str(x).isdigit():
x = str(pd.to_datetime(x, format="%Y%d%m", errors="ignore"))[:10]
if str(x).isdigit():
try:
x = str(pd.to_datetime(x, format="%Y%d%m", errors="ignore"))[:10]
except:
Expand All @@ -47,50 +50,47 @@ def clean_frame(self, sample_frac: float = 0.1, fast_execution: bool = True) ->
table_sample = table.sample(frac=sample_frac)
for column_index, datatype in enumerate(table.dtypes):
if datatype == "object":
x = (table_sample[cols[column_index]].values)[0]
datetype_column = False
if isinstance(x, str):
if (
x == ""
or x.find("/") != -1
or x.find("-") != -1
or x == np.datetime64("NaT")
):
datetype_column = (
(table_sample[cols[column_index]].apply(check_if_contains_dates))
.isin([True])
.any()
datetype_column = (
(table_sample[cols[column_index]].apply(check_if_contains_dates))
.isin([True])
.any()
)
if datetype_column:
with concurrent.futures.ThreadPoolExecutor() as executor:
table[cols[column_index]] = list(
executor.map(process_dates, table[cols[column_index]])
)
table[cols[column_index]] = list(
executor.map(coerce_datetime, table[cols[column_index]])
)
if not (x.find("//") or x.find("\\")) != -1 and datetype_column:
table[cols[column_index]] = pd.to_datetime(
table[cols[column_index]], format="%Y%m%d", errors="coerce"
)
else:
if fast_execution == False:
with concurrent.futures.ThreadPoolExecutor() as executor:
table[cols[column_index]] = list(
executor.map(process_dates, table[cols[column_index]])
executor.map(clean, table[cols[column_index]])
)
table[cols[column_index]] = list(
executor.map(remove_char, table[cols[column_index]])
)
else:
if fast_execution == False:
with concurrent.futures.ThreadPoolExecutor() as executor:
try:
table[cols[column_index]] = list(
executor.map(clean, table[cols[column_index]])
executor.map(
lambda text: text.title() if text is not None else text,
table[cols[column_index]],
)
)
table[cols[column_index]] = list(
executor.map(remove_char, table[cols[column_index]])
except AttributeError as e:
warning_type = "UserWarning"
msg = (
"It was not possible to perform the cleaning, the column {%s} is duplicated. "
% cols[column_index]
)
try:
table[cols[column_index]] = list(
executor.map(
lambda text: text.title() if text is not None else text,
table[cols[column_index]],
)
)
except AttributeError as e:
warning_type = "UserWarning"
msg = (
"It was not possible to perform the cleaning, the column {%s} is duplicated. "
% cols[column_index]
)
msg += "Error: {%s}" % e
print(f"{warning_type}: {msg}")
sys.exit("Perform correction manually")
msg += "Error: {%s}" % e
print(f"{warning_type}: {msg}")
sys.exit("Perform correction manually")

table = self._remove_duplicate_columns(table)
self.df = table.copy()
Expand Down
44 changes: 14 additions & 30 deletions pydbsmgr/utils/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,38 +284,22 @@ def _check_datetime(self, sample_frac: float) -> None:
for column_index, datatype in enumerate(df_.dtypes):
col = cols[column_index]
if datatype == "object":
x = (df_sample[col].values)[0]
datetype_column = False
if isinstance(x, str):
if (
x == ""
or x.find("/") != -1
or x.find("-") != -1
or x == np.datetime64("NaT")
):
datetype_column = (
(df_sample[col].apply(check_if_contains_dates)).isin([True]).any()
)
if not (x.find("//") or x.find("\\")) != -1 and datetype_column:
try:
with concurrent.futures.ThreadPoolExecutor() as executor:
df_[col] = list(
executor.map(lambda date: date.replace("-", ""), df_[col])
)
df_[col] = pd.to_datetime(df_[col], format="%Y%m%d").dt.normalize()
print(
f"Successfully transformed the '{col}' column into datetime64[ns]."
)
except:
with concurrent.futures.ThreadPoolExecutor() as executor:
df_[col] = list(executor.map(coerce_datetime, df_[col]))
df_[col] = pd.to_datetime(df_[col], format="%Y%m%d", errors="coerce")
print(
f"Successfully transformed the '{col}' column into datetime64[ns]."
datetype_column = (df_sample[col].apply(check_if_contains_dates)).isin([True]).any()
if datetype_column:
try:
with concurrent.futures.ThreadPoolExecutor() as executor:
df_[col] = list(
executor.map(lambda date: date.replace("-", ""), df_[col])
)
elif datatype == "datetime64[us]" or datatype == "datetime64[ns]":
df_[col] = pd.to_datetime(df_[col], format="%Y%m%d")
print(f"Successfully transformed the '{col}' column into datetime64[ns].")
except:
with concurrent.futures.ThreadPoolExecutor() as executor:
df_[col] = list(executor.map(coerce_datetime, df_[col]))
df_[col] = pd.to_datetime(df_[col], format="%Y%m%d", errors="coerce")
print(f"Successfully transformed the '{col}' column into datetime64[ns].")
elif datatype == "datetime64[us]":
df_[col] = df_[col].astype("datetime64[ns]")
df_[col] = df_[col].dt.normalize()
print(f"Successfully transformed the '{col}' column into datetime64[ns].")

self.df = df_
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setuptools.setup(
name="pydbsmgr",
version="0.6.7",
version="0.6.8",
author="J. A. Moreno-Guerra",
author_email="[email protected]",
description="Testing installation of Package",
Expand Down

0 comments on commit 501ceeb

Please sign in to comment.