Update lightest.py

* Improved date logic
* Added corrections
jzsmoreno · Nov 13, 2023 · 501ceeb
1 parent f78fd83
commit 501ceeb
Show file tree

Hide file tree

Showing 4 changed files with 54 additions and 70 deletions.
diff --git a/.gitignore b/.gitignore
@@ -153,7 +153,7 @@ dmypy.json
 cython_debug/
 
 # Ignore test_<files>.py for testing 
-/test_utilities.py
+/_utilities.py
 /test_logs.txt
 /test_logsbook.csv
 *.html

diff --git a/pydbsmgr/lightest.py b/pydbsmgr/lightest.py
@@ -1,6 +1,7 @@
 import concurrent.futures
 
 from pydbsmgr.main import *
+from pydbsmgr.utils.tools import coerce_datetime
 
 
 def process_dates(x: str) -> str:
@@ -25,7 +26,9 @@ def process_dates(x: str) -> str:
     else:
         if str(x).find(":") != -1:
             x = convert_date(x[:8])
-    if len(x) == 8:
+            if str(x).isdigit():
+                x = str(pd.to_datetime(x, format="%Y%d%m", errors="ignore"))[:10]
+    if str(x).isdigit():
         try:
             x = str(pd.to_datetime(x, format="%Y%d%m", errors="ignore"))[:10]
         except:
@@ -47,50 +50,47 @@ def clean_frame(self, sample_frac: float = 0.1, fast_execution: bool = True) ->
         table_sample = table.sample(frac=sample_frac)
         for column_index, datatype in enumerate(table.dtypes):
             if datatype == "object":
-                x = (table_sample[cols[column_index]].values)[0]
-                datetype_column = False
-                if isinstance(x, str):
-                    if (
-                        x == ""
-                        or x.find("/") != -1
-                        or x.find("-") != -1
-                        or x == np.datetime64("NaT")
-                    ):
-                        datetype_column = (
-                            (table_sample[cols[column_index]].apply(check_if_contains_dates))
-                            .isin([True])
-                            .any()
+                datetype_column = (
+                    (table_sample[cols[column_index]].apply(check_if_contains_dates))
+                    .isin([True])
+                    .any()
+                )
+                if datetype_column:
+                    with concurrent.futures.ThreadPoolExecutor() as executor:
+                        table[cols[column_index]] = list(
+                            executor.map(process_dates, table[cols[column_index]])
+                        )
+                        table[cols[column_index]] = list(
+                            executor.map(coerce_datetime, table[cols[column_index]])
                         )
-                    if not (x.find("//") or x.find("\\")) != -1 and datetype_column:
+                    table[cols[column_index]] = pd.to_datetime(
+                        table[cols[column_index]], format="%Y%m%d", errors="coerce"
+                    )
+                else:
+                    if fast_execution == False:
                         with concurrent.futures.ThreadPoolExecutor() as executor:
                             table[cols[column_index]] = list(
-                                executor.map(process_dates, table[cols[column_index]])
+                                executor.map(clean, table[cols[column_index]])
+                            )
+                            table[cols[column_index]] = list(
+                                executor.map(remove_char, table[cols[column_index]])
                             )
-                    else:
-                        if fast_execution == False:
-                            with concurrent.futures.ThreadPoolExecutor() as executor:
+                            try:
                                 table[cols[column_index]] = list(
-                                    executor.map(clean, table[cols[column_index]])
+                                    executor.map(
+                                        lambda text: text.title() if text is not None else text,
+                                        table[cols[column_index]],
+                                    )
                                 )
-                                table[cols[column_index]] = list(
-                                    executor.map(remove_char, table[cols[column_index]])
+                            except AttributeError as e:
+                                warning_type = "UserWarning"
+                                msg = (
+                                    "It was not possible to perform the cleaning, the column {%s} is duplicated. "
+                                    % cols[column_index]
                                 )
-                                try:
-                                    table[cols[column_index]] = list(
-                                        executor.map(
-                                            lambda text: text.title() if text is not None else text,
-                                            table[cols[column_index]],
-                                        )
-                                    )
-                                except AttributeError as e:
-                                    warning_type = "UserWarning"
-                                    msg = (
-                                        "It was not possible to perform the cleaning, the column {%s} is duplicated. "
-                                        % cols[column_index]
-                                    )
-                                    msg += "Error: {%s}" % e
-                                    print(f"{warning_type}: {msg}")
-                                    sys.exit("Perform correction manually")
+                                msg += "Error: {%s}" % e
+                                print(f"{warning_type}: {msg}")
+                                sys.exit("Perform correction manually")
 
         table = self._remove_duplicate_columns(table)
         self.df = table.copy()

diff --git a/pydbsmgr/utils/tools.py b/pydbsmgr/utils/tools.py
@@ -284,38 +284,22 @@ def _check_datetime(self, sample_frac: float) -> None:
         for column_index, datatype in enumerate(df_.dtypes):
             col = cols[column_index]
             if datatype == "object":
-                x = (df_sample[col].values)[0]
-                datetype_column = False
-                if isinstance(x, str):
-                    if (
-                        x == ""
-                        or x.find("/") != -1
-                        or x.find("-") != -1
-                        or x == np.datetime64("NaT")
-                    ):
-                        datetype_column = (
-                            (df_sample[col].apply(check_if_contains_dates)).isin([True]).any()
-                        )
-                    if not (x.find("//") or x.find("\\")) != -1 and datetype_column:
-                        try:
-                            with concurrent.futures.ThreadPoolExecutor() as executor:
-                                df_[col] = list(
-                                    executor.map(lambda date: date.replace("-", ""), df_[col])
-                                )
-                            df_[col] = pd.to_datetime(df_[col], format="%Y%m%d").dt.normalize()
-                            print(
-                                f"Successfully transformed the '{col}' column into datetime64[ns]."
-                            )
-                        except:
-                            with concurrent.futures.ThreadPoolExecutor() as executor:
-                                df_[col] = list(executor.map(coerce_datetime, df_[col]))
-                            df_[col] = pd.to_datetime(df_[col], format="%Y%m%d", errors="coerce")
-                            print(
-                                f"Successfully transformed the '{col}' column into datetime64[ns]."
+                datetype_column = (df_sample[col].apply(check_if_contains_dates)).isin([True]).any()
+                if datetype_column:
+                    try:
+                        with concurrent.futures.ThreadPoolExecutor() as executor:
+                            df_[col] = list(
+                                executor.map(lambda date: date.replace("-", ""), df_[col])
                             )
-            elif datatype == "datetime64[us]" or datatype == "datetime64[ns]":
+                        df_[col] = pd.to_datetime(df_[col], format="%Y%m%d")
+                        print(f"Successfully transformed the '{col}' column into datetime64[ns].")
+                    except:
+                        with concurrent.futures.ThreadPoolExecutor() as executor:
+                            df_[col] = list(executor.map(coerce_datetime, df_[col]))
+                        df_[col] = pd.to_datetime(df_[col], format="%Y%m%d", errors="coerce")
+                        print(f"Successfully transformed the '{col}' column into datetime64[ns].")
+            elif datatype == "datetime64[us]":
                 df_[col] = df_[col].astype("datetime64[ns]")
-                df_[col] = df_[col].dt.normalize()
                 print(f"Successfully transformed the '{col}' column into datetime64[ns].")
 
         self.df = df_

diff --git a/setup.py b/setup.py
@@ -5,7 +5,7 @@
 
 setuptools.setup(
     name="pydbsmgr",
-    version="0.6.7",
+    version="0.6.8",
     author="J. A. Moreno-Guerra",
     author_email="[email protected]",
     description="Testing installation of Package",