diff --git a/pydbsmgr/lightest.py b/pydbsmgr/lightest.py index a5dddd0..ebd5952 100644 --- a/pydbsmgr/lightest.py +++ b/pydbsmgr/lightest.py @@ -17,10 +17,20 @@ def clean_frame(self) -> DataFrame: for column_index, datatype in enumerate(table.dtypes): if datatype == "object" or datatype == "datetime64[ns]": x = (table[cols[column_index]].values)[0] + datetype_column = True if isinstance(x, str): - if (x.find("/") != -1 or x.find("-")) != -1 and not ( - x.find("//") or x.find("\\") - ) != -1: + if ( + x == "" + or x.find("/") != -1 + or x.find("-") != -1 + or x == np.datetime64("NaT") + ): + datetype_column = ( + (table[cols[column_index]].apply(check_if_contains_dates)) + .isin([True]) + .any() + ) + if not (x.find("//") or x.find("\\")) != -1 and datetype_column: with concurrent.futures.ThreadPoolExecutor() as executor: table[cols[column_index]] = list( executor.map(clean_and_convert_to, table[cols[column_index]]) diff --git a/pydbsmgr/main.py b/pydbsmgr/main.py index 5adf159..f24c617 100644 --- a/pydbsmgr/main.py +++ b/pydbsmgr/main.py @@ -22,6 +22,17 @@ ######################################################################################## +def check_if_contains_dates(input_string: str) -> bool: + """Check if a string contains date.""" + if input_string == "": + return False + else: + if re.search(r"\d{4}(-|/)\d{1,2}(-|/)\d{1,2}", str(input_string)): + return True + else: + return False + + def remove_numeric_char(input_string: str) -> str: """Remove all numeric characters from a string. diff --git a/pydbsmgr/utils/tools.py b/pydbsmgr/utils/tools.py index aeefbeb..940b9e5 100644 --- a/pydbsmgr/utils/tools.py +++ b/pydbsmgr/utils/tools.py @@ -15,7 +15,7 @@ from pandas.errors import IntCastingNaNError from pyarrow import Table -from pydbsmgr.main import is_number_regex +from pydbsmgr.main import is_number_regex, check_if_contains_dates class ColumnsCheck: @@ -280,10 +280,18 @@ def _check_datetime(self) -> None: col = cols[column_index] if datatype == "object": x = (df_[col].values)[0] + datetype_column = True if isinstance(x, str): - if (x.find("/") != -1 or x.find("-")) != -1 and not ( - x.find("//") or x.find("\\") - ) != -1: + if ( + x == "" + or x.find("/") != -1 + or x.find("-") != -1 + or x == np.datetime64("NaT") + ): + datetype_column = ( + (df_[col].apply(check_if_contains_dates)).isin([True]).any() + ) + if not (x.find("//") or x.find("\\")) != -1 and datetype_column: try: with concurrent.futures.ThreadPoolExecutor() as executor: df_[col] = list( diff --git a/setup.py b/setup.py index d85b17d..5f4cb31 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="pydbsmgr", - version="0.5.7", + version="0.5.8", author="J. A. Moreno-Guerra", author_email="jzs.gm27@gmail.com", description="Testing installation of Package",