[FIX] some column redundancy check

jzsmoreno · Apr 9, 2024 · ce903fb · ce903fb
1 parent c740411
commit ce903fb
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 74 deletions.
diff --git a/pydbsmgr/main.py b/pydbsmgr/main.py
@@ -213,18 +213,12 @@ def convert_date(date_string: str) -> str:
         The date string in the proper format `YYYY-MM-DD`.
     """
     try:
-        proper_date = str(pd.to_datetime(date_string, format="%Y%m%d", errors="raise"))[
-            :10
-        ]
+        proper_date = str(pd.to_datetime(date_string, format="%Y%m%d", errors="raise"))[:10]
     except:
         try:
-            proper_date = str(
-                pd.to_datetime(date_string, format="%d%m%Y", errors="raise")
-            )[:10]
+            proper_date = str(pd.to_datetime(date_string, format="%d%m%Y", errors="raise"))[:10]
         except:
-            proper_date = str(
-                pd.to_datetime(date_string, format="%Y%m%d", errors="ignore")
-            )[:10]
+            proper_date = str(pd.to_datetime(date_string, format="%Y%m%d", errors="ignore"))[:10]
     return proper_date
 
 
@@ -285,9 +279,7 @@ def clean_and_convert_to(x: str) -> str:
     x = remove_char(x)
     try:
         x, find_ = check_if_isemail(x)
-        if (x.find("/") != -1 or x.find("-")) != -1 and not (
-            x.find("//") or x.find("\\")
-        ) != -1:
+        if (x.find("/") != -1 or x.find("-")) != -1 and not (x.find("//") or x.find("\\")) != -1:
             x_ = x.replace("/", "")
             x_ = x_.replace("-", "")
 
@@ -317,9 +309,7 @@ def clean_and_convert_to(x: str) -> str:
                     x = " ".join(x.split())
                     x = x.title()
     except:
-        print(
-            f"No transformation has been performed, the character will be returned as it came."
-        )
+        print(f"No transformation has been performed, the character will be returned as it came.")
         None
     return x
 
@@ -366,18 +356,14 @@ def check_dtypes(dataframe: DataFrame, datatypes: Series) -> DataFrame:
             dataframe[cols[column_index]] = dataframe[cols[column_index]].apply(
                 clean_and_convert_to
             )
-            dataframe[cols[column_index]] = dataframe[cols[column_index]].apply(
-                correct_nan
-            )
+            dataframe[cols[column_index]] = dataframe[cols[column_index]].apply(correct_nan)
             try:
-                dataframe[cols[column_index]] = dataframe[cols[column_index]].map(
-                    str.strip
-                )
+                dataframe[cols[column_index]] = dataframe[cols[column_index]].map(str.strip)
             except:
                 try:
-                    dataframe[cols[column_index]] = dataframe[
-                        cols[column_index]
-                    ].astype("datetime64[ns]")
+                    dataframe[cols[column_index]] = dataframe[cols[column_index]].astype(
+                        "datetime64[ns]"
+                    )
                 except:
                     warning_type = "UserWarning"
                     msg = (
@@ -410,9 +396,7 @@ def create_yaml_from_df(
     df_info, df = check_values(df_, df_name="df_name", sheet_name="sheet_name")
 
     df_info["data type"] = [str(_type) for _type in df_info["data type"].to_list()]
-    df_info["sql name"] = [
-        col_name.replace(" ", "_") for col_name in df_info["column name"]
-    ]
+    df_info["sql name"] = [col_name.replace(" ", "_") for col_name in df_info["column name"]]
 
     data = {}
     for col_name, data_type, sql_name in zip(
@@ -428,9 +412,7 @@ def create_yaml_from_df(
         file.write(yaml_data)
 
 
-def create_yaml_tree(
-    yaml_name: str, df_info: DataFrame, dabase_name: str = "database"
-) -> None:
+def create_yaml_tree(yaml_name: str, df_info: DataFrame, dabase_name: str = "database") -> None:
     """
     Function that creates a `yaml` configuration file for database data type validation.
 
@@ -602,9 +584,7 @@ def check_values(
         lambda x: "{:.2%}".format(float(x))
     )
     info["# rows"] = info["# rows"].apply(lambda x: "{:,}".format(int(x)))
-    info["# missing rows"] = info["# missing rows"].apply(
-        lambda x: "{:,}".format(int(x))
-    )
+    info["# missing rows"] = info["# missing rows"].apply(lambda x: "{:,}".format(int(x)))
     info["unique values"] = info["unique values"].apply(lambda x: "{:,}".format(int(x)))
 
     return info, df
@@ -656,9 +636,7 @@ def check_for_list(
             drop_empty_cols=drop_empty_cols,
         )
         dataframes.append(df)
-        logger.info(
-            f"DataFrame '{dfs_names[j]}' has been added to the list of dataframes"
-        )
+        logger.info(f"DataFrame '{dfs_names[j]}' has been added to the list of dataframes")
         df_sheet_files_info = pd.concat([df_sheet_files_info, info])
     df_sheet_files_info.to_html(report_name, index=False, encoding=encoding)
     logger.info(f"A report has been created under the name '{report_name}'")
@@ -743,7 +721,5 @@ def clearConsole():
     with open("./Detail of the report/docs.txt", "w") as f:
         for line in docs:
             f.write(line[0] + "\t" + line[1] + "\t" + line[2] + "\n")
-    df_sheet_files_info.to_html(
-        "report-health-checker.html", index=False, encoding="latin1"
-    )
+    df_sheet_files_info.to_html("report-health-checker.html", index=False, encoding="latin1")
     print("***process completed***")
diff --git a/pydbsmgr/utils/tools/tools.py b/pydbsmgr/utils/tools/tools.py
@@ -59,49 +59,20 @@ def __init__(self, df: DataFrame):
 
     def get_frame(self) -> DataFrame:
         self.df = self._process_columns()
-        self.df = self._check_reserved_words()
         return self.df
 
-    def _process_columns(self) -> DataFrame:
+    def _process_columns(self, surrounding: bool = False) -> DataFrame:
         df = (self.df).copy()
         df.columns = df.columns.str.lower()
         df.columns = df.columns.str.replace(".", "")
         df.columns = df.columns.str.replace(",", "")
-        df.columns = df.columns.str.replace("__", "_")
-        new_cols = []
-        for col in df.columns:
-            res = any(chr.isdigit() for chr in col)
-            if res:
-                col = "[" + col + "]"
-            else:
-                col = re.sub("[^a-zA-Z0-9ñáéíóú_]", "_", col)
-            new_cols.append(col)
+        df.columns = df.columns.str.replace(r"[^a-zA-Z0-9ñáéíóú_]", "_", regex=True)
 
-        df.columns = new_cols
-        return df
+        df.columns = df.columns.str.replace("_+", "_", regex=True)
+        df.columns = df.columns.str.strip().str.strip("_")
+        if surrounding:
+            df.columns = [f"[{col}]" for col in df.columns]
 
-    def _check_reserved_words(self) -> DataFrame:
-        df = (self.df).copy()
-        new_cols = []
-        for col in df.columns:
-            # SQL reserved words
-            reserved_words = [
-                "update",
-                "insert",
-                "delete",
-                "create",
-                "drop",
-                "truncate",
-                "into",
-                "from",
-                "where",
-                "group",
-                "view",
-            ]
-            if col in reserved_words:
-                col = "[" + col + "]"
-            new_cols.append(col)
-        df.columns = new_cols
         return df