Skip to content

Commit

Permalink
develop (#24)
Browse files Browse the repository at this point in the history
* [FIX] some column redundancy check

* Update `setup.py` install_requires

* Update `azure_sdk.py`

* [FIX] encoding of `get_excel_csv` method

---------

Co-authored-by: BubuDavid <[email protected]>
  • Loading branch information
jzsmoreno and BubuDavid authored Apr 10, 2024
1 parent d5b10ec commit 735f4cd
Show file tree
Hide file tree
Showing 5 changed files with 29 additions and 66 deletions.
2 changes: 1 addition & 1 deletion pydbsmgr/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.9.1
0.9.2
24 changes: 13 additions & 11 deletions pydbsmgr/utils/azure_sdk.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,14 @@
"""Define azure storage utilities"""

import gzip
import os
import re
from io import BytesIO, StringIO
from typing import List, Tuple

import pandas as pd
import pyarrow.parquet as pq
from azure.storage.blob import BlobPrefix, BlobServiceClient
from dotenv import load_dotenv
from pandas import ExcelFile, read_csv, read_excel, read_parquet, read_table
from pandas import read_csv, read_excel
from pandas.core.frame import DataFrame

from pydbsmgr.utils.tools import ControllerFeatures
Expand Down Expand Up @@ -111,7 +109,7 @@ def upload_parquet(
raise ValueError(f"{format_type} not supported")

def get_excel_csv(
self, directory_name: str, regex: str, manual_mode: bool = False, encoding: str = "utf-8"
self, directory_name: str, regex: str, manual_mode: bool = False
) -> Tuple[List[DataFrame], List[str]]:
"""Perform reading of `.xlsx` and `.csv` files in container-directory"""
dataframes = list()
Expand All @@ -129,24 +127,28 @@ def get_excel_csv(
container=self.container_name, blob=file["name"]
)
blob_data = blob_client.download_blob().readall()
print("File name : ", file["name"].split("/")[-1])

blob_data_str = StringIO(str(blob_data, encoding))
print("File name : ", file["name"].split("/")[-1])

if file["name"].endswith(".csv"):
# blob_data_str = StringIO(str(blob_data, encoding))
try:
blob_data_str = blob_data.decode("utf-8")
except UnicodeDecodeError:
blob_data_str = blob_data.decode("latin-1")
df_name = str(file["name"]).replace(".csv", "").split("/")[-1]
dataframe_names.append(df_name)
df = read_csv(blob_data_str, index_col=None, low_memory=False)
df = read_csv(StringIO(blob_data_str), index_col=None, low_memory=False)
dataframes.append(df)
elif file["name"].endswith(".xlsx"):
xls_buffer = ExcelFile(blob_data)
for sheet_name in xls_buffer.sheet_names:
xls_buffer = BytesIO(blob_data)
all_sheets = read_excel(xls_buffer, sheet_name=None, index_col=None)
for sheet_name, df in all_sheets.items():
df_name = (
str(file["name"]).replace(".xlsx", "").split("/")[-1] + "-" + sheet_name
)
dataframe_names.append(df_name)
df = read_excel(xls_buffer, sheet_name=sheet_name, index_col=None)
dataframes.append(df)
dataframes.append(df.reset_index(drop=True))

return dataframes, dataframe_names

Expand Down
41 changes: 6 additions & 35 deletions pydbsmgr/utils/tools/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,49 +92,20 @@ def __init__(self, df: DataFrame):

def get_frame(self) -> DataFrame:
    """Return ``self.df`` after cleaning column names and escaping SQL reserved words.

    Runs the two normalization passes in order — column-name cleanup first,
    then reserved-word bracketing — storing the result back on ``self.df``
    after each pass.
    """
    # Each pass reads the current self.df and returns a transformed copy,
    # so the ordering of this pipeline matters.
    for transform in (self._process_columns, self._check_reserved_words):
        self.df = transform()
    return self.df

def _process_columns(self) -> DataFrame:
def _process_columns(self, surrounding: bool = True) -> DataFrame:
df = (self.df).copy()
df.columns = df.columns.str.lower()
df.columns = df.columns.str.replace(".", "")
df.columns = df.columns.str.replace(",", "")
df.columns = df.columns.str.replace("__", "_")
new_cols = []
for col in df.columns:
res = any(chr.isdigit() for chr in col)
if res:
col = "[" + col + "]"
else:
col = re.sub("[^a-zA-Z0-9ñáéíóú_]", "_", col)
new_cols.append(col)
df.columns = df.columns.str.replace(r"[^a-zA-Z0-9ñáéíóú_]", "_", regex=True)

df.columns = new_cols
return df
df.columns = df.columns.str.replace("_+", "_", regex=True)
df.columns = df.columns.str.strip().strip("_")
if surrounding:
df.columns = [f"[{col}]" for col in df.columns]

def _check_reserved_words(self) -> DataFrame:
df = (self.df).copy()
new_cols = []
for col in df.columns:
# SQL reserved words
reserved_words = [
"update",
"insert",
"delete",
"create",
"drop",
"truncate",
"into",
"from",
"where",
"group",
"view",
]
if col in reserved_words:
col = "[" + col + "]"
new_cols.append(col)
df.columns = new_cols
return df


Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
numpy
numpy<2.0.0
pandas
clean-text
missingno
Expand Down
26 changes: 8 additions & 18 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,13 @@
from pathlib import Path

import setuptools
from pip._internal.req import parse_requirements

# Parse the requirements.txt file
requirements = parse_requirements("requirements.txt", session="hack")

# Get the list of requirements as strings
install_requires = [str(req.requirement) for req in requirements]

with open("README.md", "r", encoding="utf-8") as fh:
long_description = fh.read()
Expand All @@ -27,24 +34,7 @@
project_urls={"Bug Tracker": "https://github.com/jzsmoreno/pydbsmgr"},
license="MIT",
packages=setuptools.find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
install_requires=[
"numpy<2.0.0",
"pandas",
"clean-text",
"missingno",
"pyodbc",
"ipython",
"SQLAlchemy",
"pyyaml",
"azure-storage-blob==12.16.0",
"python-dotenv==1.0.0",
"openpyxl==3.1.2",
"pyarrow",
"fastparquet",
"loguru",
"psutil",
"Unidecode",
],
install_requires=install_requires,
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down

0 comments on commit 735f4cd

Please sign in to comment.