Skip to content

Commit

Permalink
Update tools.py
Browse files Browse the repository at this point in the history
  • Loading branch information
jzsmoreno committed Dec 6, 2024
1 parent 2685f77 commit e1fecd4
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 67 deletions.
6 changes: 0 additions & 6 deletions merge_by_lev/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1 @@
"""
Authors: J. A. Moreno-Guerra
Last modification: 04/09/2024
Corresponding author: [email protected]
"""

from merge_by_lev.main import *
119 changes: 59 additions & 60 deletions merge_by_lev/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,45 +2,44 @@

import numpy as np
import pandas as pd
from pandas.core.frame import DataFrame


class QualityAssesment:
class QualityAssessment:
"""Class containing the methods to generate the quality report of a set of tables."""

def __init__(self, dfs: List[DataFrame]):
def __init__(self, dfs: List[pd.DataFrame]):
self.dfs = dfs

def get_report(
self,
df_names: List[str],
report_name: str = "./report-health-checker.html",
encoding: str = "latin1",
) -> DataFrame:
"""Function that returns the report generated
) -> pd.DataFrame:
"""Function that returns the report generated.
Parameters
----------
df_names : `List[str]`
list of dataframes, which are the tables.
List of names for the dataframes.
report_name : `str`, optional
name and path to be used to save the report. By default is set to ./report-health-checker.html.
Name and path to be used to save the report. Defaults to "./report-health-checker.html".
encoding : `str`, optional
type of report encoding. By default is set to latin1.
Encoding type of the report. Defaults to "latin1".
Returns
-------
DataFrame :
`DataFrame` of the generated report.
pd.DataFrame :
DataFrame containing the generated report.
"""
df_sheet_files_info = self._iterative_evaluation(df_names)
df_sheet_files_info.to_html(report_name, index=False, encoding=encoding)

self.report = df_sheet_files_info
return df_sheet_files_info

def _iterative_evaluation(self, df_names: List[str]) -> DataFrame:
"""Function that iterates over the set of tables to build the report
def _iterative_evaluation(self, df_names: List[str]) -> pd.DataFrame:
"""Function that iterates over the set of tables to build the report.
Parameters
----------
Expand All @@ -49,87 +48,87 @@ def _iterative_evaluation(self, df_names: List[str]) -> DataFrame:
Returns
-------
`DataFrame` :
pd.DataFrame :
Report generated from the set of tables.
"""
df_sheet_files_info = pd.DataFrame()
rows = []
for i, df in enumerate(self.dfs):
info = []
nrows_total = len(df)
null_counts = df.isnull().sum()
unique_counts = df.nunique()

for col in df.columns:
nrows = df.isnull().sum()[col] + df[col].count()
nrows_missing = df.isnull().sum()[col]
percentage = nrows_missing / nrows
nrows_missing = null_counts[col]
percentage_missing = nrows_missing / nrows_total
datatype = df.dtypes[col]
unique_vals = len(df[col].unique())
info.append(
unique_vals = unique_counts[col]

rows.append(
[
col,
datatype,
df_names[i],
nrows,
nrows_total,
nrows_missing,
percentage,
percentage_missing,
unique_vals,
]
)
info = np.array(info).reshape((-1, 7))
info = pd.DataFrame(
info,
columns=[
"column name",
"data type",
"database name",
"# rows",
"# missing rows",
"# missing rows (percentage)",
"unique values",
],
)
info["# missing rows (percentage)"] = info["# missing rows (percentage)"].apply(
lambda x: "{:.2%}".format(float(x))
)
info["# rows"] = info["# rows"].apply(lambda x: "{:,}".format(int(x)))
info["# missing rows"] = info["# missing rows"].apply(lambda x: "{:,}".format(int(x)))
info["unique values"] = info["unique values"].apply(lambda x: "{:,}".format(int(x)))
df_sheet_files_info = pd.concat([df_sheet_files_info, info])

return df_sheet_files_info
info_df = pd.DataFrame(
rows,
columns=[
"column name",
"data type",
"database name",
"# rows",
"# missing rows",
"# missing rows (percentage)",
"unique values",
],
)

# Apply formatting
info_df["# missing rows (percentage)"] = info_df["# missing rows (percentage)"].apply(
lambda x: "{:.2%}".format(x)
)
info_df["# rows"] = info_df["# rows"].apply(lambda x: f"{x:,}")
info_df["# missing rows"] = info_df["# missing rows"].apply(lambda x: f"{x:,}")
info_df["unique values"] = info_df["unique values"].apply(lambda x: f"{x:,}")

return info_df


def check_empty_df(
dfs: List[DataFrame], names: List[str], num_cols: int = 2
) -> Tuple[List[DataFrame], List[str]]:
"""Check if the `DataFrame` is empty or not
dfs: List[pd.DataFrame], names: List[str], num_cols: int = 2
) -> Tuple[List[pd.DataFrame], List[str]]:
"""Check if the `DataFrame` is empty or not.
Parameters
----------
dfs : `List[DataFrame]`
dfs : `List[pd.DataFrame]`
List of dataframes to iterate over.
names : `List[str]`
List of DataFrame names.
num_cols : `int`
Minimum number of columns of a DataFrame. By default is set to 2.
num_cols : `int`, optional
Minimum number of columns of a DataFrame. Defaults to 2.
Returns
-------
`Tuple[List[DataFrame], List[str]]` :
Tuple[List[pd.DataFrame], List[str]] :
Verified dataframes and names.
"""
new_dfs = []
new_names = []
for i, df in enumerate(dfs):
if len(df.columns) > num_cols:
if len(df) > num_cols:
new_dfs.append(df)
new_names.append(names[i])

return new_dfs, new_names
return [
(df, name)
for df, name in zip(dfs, names)
if len(df.columns) > num_cols and len(df) > num_cols
]


if __name__ == "__main__":
# Create a DataFrame
data = {"Name": ["John", "Alice", "Bob"], "Age": [25, 30, 35]}
df = pd.DataFrame(data)
table_name = "test_table"
handler = QualityAssesment([df])
handler = QualityAssessment([df])
handler.get_report([table_name])
1 change: 0 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
numpy<2.0.0
pandas
tabulate
pydbsmgr
pyarrow>=17.0.0

0 comments on commit e1fecd4

Please sign in to comment.