Update tools.py

jzsmoreno · Dec 6, 2024 · e1fecd4 · e1fecd4
1 parent 2685f77
commit e1fecd4
Show file tree

Hide file tree

Showing 3 changed files with 59 additions and 67 deletions.
diff --git a/merge_by_lev/__init__.py b/merge_by_lev/__init__.py
@@ -1,7 +1 @@
-"""
-Authors: J. A. Moreno-Guerra
-Last modification: 04/09/2024
-Corresponding author: [email protected] 
-"""
 
-from merge_by_lev.main import *
diff --git a/merge_by_lev/tools.py b/merge_by_lev/tools.py
@@ -2,45 +2,44 @@
 
 import numpy as np
 import pandas as pd
-from pandas.core.frame import DataFrame
 
 
-class QualityAssesment:
+class QualityAssessment:
     """Class containing the methods to generate the quality report of a set of tables."""
 
-    def __init__(self, dfs: List[DataFrame]):
+    def __init__(self, dfs: List[pd.DataFrame]):
         self.dfs = dfs
 
     def get_report(
         self,
         df_names: List[str],
         report_name: str = "./report-health-checker.html",
         encoding: str = "latin1",
-    ) -> DataFrame:
-        """Function that returns the report generated
+    ) -> pd.DataFrame:
+        """Function that returns the report generated.
 
         Parameters
         ----------
         df_names : `List[str]`
-            list of dataframes, which are the tables.
+            List of names for the dataframes.
         report_name : `str`, optional
-            name and path to be used to save the report. By default is set to ./report-health-checker.html.
+            Name and path to be used to save the report. Defaults to "./report-health-checker.html".
         encoding : `str`, optional
-            type of report encoding. By default is set to latin1.
+            Encoding type of the report. Defaults to "latin1".
 
         Returns
         -------
-        DataFrame :
-            `DataFrame` of the generated report.
+        pd.DataFrame :
+            DataFrame containing the generated report.
         """
         df_sheet_files_info = self._iterative_evaluation(df_names)
         df_sheet_files_info.to_html(report_name, index=False, encoding=encoding)
 
         self.report = df_sheet_files_info
         return df_sheet_files_info
 
-    def _iterative_evaluation(self, df_names: List[str]) -> DataFrame:
-        """Function that iterates over the set of tables to build the report
+    def _iterative_evaluation(self, df_names: List[str]) -> pd.DataFrame:
+        """Function that iterates over the set of tables to build the report.
 
         Parameters
         ----------
@@ -49,87 +48,87 @@ def _iterative_evaluation(self, df_names: List[str]) -> DataFrame:
 
         Returns
         -------
-        `DataFrame` :
+        pd.DataFrame :
             Report generated from the set of tables.
         """
-        df_sheet_files_info = pd.DataFrame()
+        rows = []
         for i, df in enumerate(self.dfs):
-            info = []
+            nrows_total = len(df)
+            null_counts = df.isnull().sum()
+            unique_counts = df.nunique()
+
             for col in df.columns:
-                nrows = df.isnull().sum()[col] + df[col].count()
-                nrows_missing = df.isnull().sum()[col]
-                percentage = nrows_missing / nrows
+                nrows_missing = null_counts[col]
+                percentage_missing = nrows_missing / nrows_total
                 datatype = df.dtypes[col]
-                unique_vals = len(df[col].unique())
-                info.append(
+                unique_vals = unique_counts[col]
+
+                rows.append(
                     [
                         col,
                         datatype,
                         df_names[i],
-                        nrows,
+                        nrows_total,
                         nrows_missing,
-                        percentage,
+                        percentage_missing,
                         unique_vals,
                     ]
                 )
-            info = np.array(info).reshape((-1, 7))
-            info = pd.DataFrame(
-                info,
-                columns=[
-                    "column name",
-                    "data type",
-                    "database name",
-                    "# rows",
-                    "# missing rows",
-                    "# missing rows (percentage)",
-                    "unique values",
-                ],
-            )
-            info["# missing rows (percentage)"] = info["# missing rows (percentage)"].apply(
-                lambda x: "{:.2%}".format(float(x))
-            )
-            info["# rows"] = info["# rows"].apply(lambda x: "{:,}".format(int(x)))
-            info["# missing rows"] = info["# missing rows"].apply(lambda x: "{:,}".format(int(x)))
-            info["unique values"] = info["unique values"].apply(lambda x: "{:,}".format(int(x)))
-            df_sheet_files_info = pd.concat([df_sheet_files_info, info])
 
-        return df_sheet_files_info
+        info_df = pd.DataFrame(
+            rows,
+            columns=[
+                "column name",
+                "data type",
+                "database name",
+                "# rows",
+                "# missing rows",
+                "# missing rows (percentage)",
+                "unique values",
+            ],
+        )
+
+        # Apply formatting
+        info_df["# missing rows (percentage)"] = info_df["# missing rows (percentage)"].apply(
+            lambda x: "{:.2%}".format(x)
+        )
+        info_df["# rows"] = info_df["# rows"].apply(lambda x: f"{x:,}")
+        info_df["# missing rows"] = info_df["# missing rows"].apply(lambda x: f"{x:,}")
+        info_df["unique values"] = info_df["unique values"].apply(lambda x: f"{x:,}")
+
+        return info_df
 
 
 def check_empty_df(
-    dfs: List[DataFrame], names: List[str], num_cols: int = 2
-) -> Tuple[List[DataFrame], List[str]]:
-    """Check if the `DataFrame` is empty or not
+    dfs: List[pd.DataFrame], names: List[str], num_cols: int = 2
+) -> Tuple[List[pd.DataFrame], List[str]]:
+    """Check if the `DataFrame` is empty or not.
 
     Parameters
     ----------
-    dfs : `List[DataFrame]`
+    dfs : `List[pd.DataFrame]`
         List of dataframes to iterate over.
     names : `List[str]`
         List of DataFrame names.
-    num_cols : `int`
-        Minimum number of columns of a DataFrame. By default is set to 2.
+    num_cols : `int`, optional
+        Minimum number of columns of a DataFrame. Defaults to 2.
 
     Returns
     -------
-    `Tuple[List[DataFrame], List[str]]` :
+    Tuple[List[pd.DataFrame], List[str]] :
         Verified dataframes and names.
     """
-    new_dfs = []
-    new_names = []
-    for i, df in enumerate(dfs):
-        if len(df.columns) > num_cols:
-            if len(df) > num_cols:
-                new_dfs.append(df)
-                new_names.append(names[i])
-
-    return new_dfs, new_names
+    return [
+        (df, name)
+        for df, name in zip(dfs, names)
+        if len(df.columns) > num_cols and len(df) > num_cols
+    ]
 
 
 if __name__ == "__main__":
     # Create a DataFrame
     data = {"Name": ["John", "Alice", "Bob"], "Age": [25, 30, 35]}
     df = pd.DataFrame(data)
     table_name = "test_table"
-    handler = QualityAssesment([df])
+    handler = QualityAssessment([df])
     handler.get_report([table_name])
diff --git a/requirements.txt b/requirements.txt
@@ -1,5 +1,4 @@
 numpy<2.0.0
 pandas
 tabulate
-pydbsmgr
 pyarrow>=17.0.0