Skip to content

Commit

Permalink
[FIX] some column redundancy check
Browse files Browse the repository at this point in the history
  • Loading branch information
BubuDavid committed Apr 9, 2024
1 parent c740411 commit ce903fb
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 74 deletions.
54 changes: 15 additions & 39 deletions pydbsmgr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -213,18 +213,12 @@ def convert_date(date_string: str) -> str:
The date string in the proper format `YYYY-MM-DD`.
"""
try:
proper_date = str(pd.to_datetime(date_string, format="%Y%m%d", errors="raise"))[
:10
]
proper_date = str(pd.to_datetime(date_string, format="%Y%m%d", errors="raise"))[:10]
except:
try:
proper_date = str(
pd.to_datetime(date_string, format="%d%m%Y", errors="raise")
)[:10]
proper_date = str(pd.to_datetime(date_string, format="%d%m%Y", errors="raise"))[:10]
except:
proper_date = str(
pd.to_datetime(date_string, format="%Y%m%d", errors="ignore")
)[:10]
proper_date = str(pd.to_datetime(date_string, format="%Y%m%d", errors="ignore"))[:10]
return proper_date


Expand Down Expand Up @@ -285,9 +279,7 @@ def clean_and_convert_to(x: str) -> str:
x = remove_char(x)
try:
x, find_ = check_if_isemail(x)
if (x.find("/") != -1 or x.find("-")) != -1 and not (
x.find("//") or x.find("\\")
) != -1:
if (x.find("/") != -1 or x.find("-")) != -1 and not (x.find("//") or x.find("\\")) != -1:
x_ = x.replace("/", "")
x_ = x_.replace("-", "")

Expand Down Expand Up @@ -317,9 +309,7 @@ def clean_and_convert_to(x: str) -> str:
x = " ".join(x.split())
x = x.title()
except:
print(
f"No transformation has been performed, the character will be returned as it came."
)
print(f"No transformation has been performed, the character will be returned as it came.")
None
return x

Expand Down Expand Up @@ -366,18 +356,14 @@ def check_dtypes(dataframe: DataFrame, datatypes: Series) -> DataFrame:
dataframe[cols[column_index]] = dataframe[cols[column_index]].apply(
clean_and_convert_to
)
dataframe[cols[column_index]] = dataframe[cols[column_index]].apply(
correct_nan
)
dataframe[cols[column_index]] = dataframe[cols[column_index]].apply(correct_nan)
try:
dataframe[cols[column_index]] = dataframe[cols[column_index]].map(
str.strip
)
dataframe[cols[column_index]] = dataframe[cols[column_index]].map(str.strip)
except:
try:
dataframe[cols[column_index]] = dataframe[
cols[column_index]
].astype("datetime64[ns]")
dataframe[cols[column_index]] = dataframe[cols[column_index]].astype(
"datetime64[ns]"
)
except:
warning_type = "UserWarning"
msg = (
Expand Down Expand Up @@ -410,9 +396,7 @@ def create_yaml_from_df(
df_info, df = check_values(df_, df_name="df_name", sheet_name="sheet_name")

df_info["data type"] = [str(_type) for _type in df_info["data type"].to_list()]
df_info["sql name"] = [
col_name.replace(" ", "_") for col_name in df_info["column name"]
]
df_info["sql name"] = [col_name.replace(" ", "_") for col_name in df_info["column name"]]

data = {}
for col_name, data_type, sql_name in zip(
Expand All @@ -428,9 +412,7 @@ def create_yaml_from_df(
file.write(yaml_data)


def create_yaml_tree(
yaml_name: str, df_info: DataFrame, dabase_name: str = "database"
) -> None:
def create_yaml_tree(yaml_name: str, df_info: DataFrame, dabase_name: str = "database") -> None:
"""
Function that creates a `yaml` configuration file for database data type validation.
Expand Down Expand Up @@ -602,9 +584,7 @@ def check_values(
lambda x: "{:.2%}".format(float(x))
)
info["# rows"] = info["# rows"].apply(lambda x: "{:,}".format(int(x)))
info["# missing rows"] = info["# missing rows"].apply(
lambda x: "{:,}".format(int(x))
)
info["# missing rows"] = info["# missing rows"].apply(lambda x: "{:,}".format(int(x)))
info["unique values"] = info["unique values"].apply(lambda x: "{:,}".format(int(x)))

return info, df
Expand Down Expand Up @@ -656,9 +636,7 @@ def check_for_list(
drop_empty_cols=drop_empty_cols,
)
dataframes.append(df)
logger.info(
f"DataFrame '{dfs_names[j]}' has been added to the list of dataframes"
)
logger.info(f"DataFrame '{dfs_names[j]}' has been added to the list of dataframes")
df_sheet_files_info = pd.concat([df_sheet_files_info, info])
df_sheet_files_info.to_html(report_name, index=False, encoding=encoding)
logger.info(f"A report has been created under the name '{report_name}'")
Expand Down Expand Up @@ -743,7 +721,5 @@ def clearConsole():
with open("./Detail of the report/docs.txt", "w") as f:
for line in docs:
f.write(line[0] + "\t" + line[1] + "\t" + line[2] + "\n")
df_sheet_files_info.to_html(
"report-health-checker.html", index=False, encoding="latin1"
)
df_sheet_files_info.to_html("report-health-checker.html", index=False, encoding="latin1")
print("***process completed***")
41 changes: 6 additions & 35 deletions pydbsmgr/utils/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,49 +59,20 @@ def __init__(self, df: DataFrame):

def get_frame(self) -> DataFrame:
    """Run the column clean-up steps and return the resulting frame.

    Each step is called in order and its return value is stored back on
    ``self.df`` before the next step runs, so the second step sees the
    output of the first.

    Returns
    -------
    DataFrame
        The value of ``self.df`` after both steps have been applied.
    """
    for step in (self._process_columns, self._check_reserved_words):
        # Every step reads ``self.df`` itself, so it must be reassigned
        # between calls rather than chained.
        self.df = step()
    return self.df

def _process_columns(self, surrounding: bool = False) -> DataFrame:
    """Return a copy of ``self.df`` with normalised column names.

    The names are lower-cased, literal dots and commas are removed, every
    character outside ``a-z A-Z 0-9 ñ á é í ó ú _`` is replaced by an
    underscore, runs of underscores are collapsed to one, and leading or
    trailing underscores are stripped.

    Parameters
    ----------
    surrounding : bool, optional
        When ``True`` each resulting name is wrapped in square brackets
        (``name`` -> ``[name]``). Defaults to ``False``.

    Returns
    -------
    DataFrame
        A copy of ``self.df`` with the new column names; ``self.df`` itself
        is not modified.
    """
    df = (self.df).copy()
    df.columns = df.columns.str.lower()
    # regex=False is explicit on purpose: with a regex-enabled default "."
    # would match every character and blank out the names.
    df.columns = df.columns.str.replace(".", "", regex=False)
    df.columns = df.columns.str.replace(",", "", regex=False)
    df.columns = df.columns.str.replace(r"[^a-zA-Z0-9ñáéíóú_]", "_", regex=True)
    # Collapse any run of underscores (this also covers the "__" case).
    df.columns = df.columns.str.replace("_+", "_", regex=True)
    # Stay on the .str accessor: an Index has no .strip() of its own.
    # Whitespace was already turned into "_" above, so only "_" needs trimming.
    df.columns = df.columns.str.strip("_")
    if surrounding:
        df.columns = [f"[{col}]" for col in df.columns]

    return df

def _check_reserved_words(self) -> DataFrame:
    """Return a copy of ``self.df`` with reserved-word columns bracketed.

    Any column whose name is exactly one of the SQL words listed below is
    renamed from ``name`` to ``[name]``; every other column keeps its name.

    Returns
    -------
    DataFrame
        A copy of ``self.df`` with the adjusted column names; ``self.df``
        itself is not modified.
    """
    # SQL reserved words
    sql_keywords = (
        "update",
        "insert",
        "delete",
        "create",
        "drop",
        "truncate",
        "into",
        "from",
        "where",
        "group",
        "view",
    )
    frame = (self.df).copy()
    frame.columns = [
        "[" + name + "]" if name in sql_keywords else name for name in frame.columns
    ]
    return frame


Expand Down

0 comments on commit ce903fb

Please sign in to comment.