Patch: pydbsmgr 0.9.1 -> 0.9.2

Summary of what the hunks below do:
  * pydbsmgr/utils/azure_sdk.py: removes the gzip / pandas-as-pd / ExcelFile /
    read_parquet / read_table imports; get_excel_csv now decodes CSV blobs as
    UTF-8 with a latin-1 fallback and reads all Excel sheets with a single
    read_excel(sheet_name=None) call.
  * pydbsmgr/utils/tools/tools.py: _process_columns normalises column names with
    vectorised str.replace calls and, when `surrounding` is true, wraps every
    name in [brackets]; _check_reserved_words is removed.
  * requirements.txt / setup.py: pins numpy<2.0.0 and builds install_requires
    from requirements.txt.

Review fix applied to this patch:
  In _process_columns the added line `df.columns.str.strip().strip("_")` called
  .strip() directly on the Index returned by .str.strip(), which raises
  AttributeError. It now goes through the .str accessor a second time:
  `df.columns.str.strip().str.strip("_")`.

NOTE(review): get_excel_csv drops its `encoding` keyword; callers that pass it
  will raise TypeError - confirm this is intended.
NOTE(review): setup.py imports pip._internal, which pip documents as not being
  a public API - confirm it is importable in the build environment.
NOTE(review): the copy of this patch under review had its whitespace collapsed.
  Indentation and blank context lines were reconstructed from Python syntax and
  the hunk line counts; verify against the repository before applying.

diff --git a/pydbsmgr/VERSION b/pydbsmgr/VERSION
index f514a2f..f76f913 100644
--- a/pydbsmgr/VERSION
+++ b/pydbsmgr/VERSION
@@ -1 +1 @@
-0.9.1
\ No newline at end of file
+0.9.2
\ No newline at end of file
diff --git a/pydbsmgr/utils/azure_sdk.py b/pydbsmgr/utils/azure_sdk.py
index a782827..b93c749 100644
--- a/pydbsmgr/utils/azure_sdk.py
+++ b/pydbsmgr/utils/azure_sdk.py
@@ -1,16 +1,14 @@
 """Define azure storage utilities"""
 
-import gzip
 import os
 import re
 from io import BytesIO, StringIO
 from typing import List, Tuple
 
-import pandas as pd
 import pyarrow.parquet as pq
 from azure.storage.blob import BlobPrefix, BlobServiceClient
 from dotenv import load_dotenv
-from pandas import ExcelFile, read_csv, read_excel, read_parquet, read_table
+from pandas import read_csv, read_excel
 from pandas.core.frame import DataFrame
 
 from pydbsmgr.utils.tools import ControllerFeatures
@@ -111,7 +109,7 @@ def upload_parquet(
                 raise ValueError(f"{format_type} not supported")
 
     def get_excel_csv(
-        self, directory_name: str, regex: str, manual_mode: bool = False, encoding: str = "utf-8"
+        self, directory_name: str, regex: str, manual_mode: bool = False
     ) -> Tuple[List[DataFrame], List[str]]:
         """Perform reading of `.xlsx` and `.csv` files in container-directory"""
         dataframes = list()
@@ -129,24 +127,28 @@ def get_excel_csv(
                 container=self.container_name, blob=file["name"]
             )
             blob_data = blob_client.download_blob().readall()
-            print("File name : ", file["name"].split("/")[-1])
-            blob_data_str = StringIO(str(blob_data, encoding))
 
+            print("File name : ", file["name"].split("/")[-1])
 
             if file["name"].endswith(".csv"):
+                # blob_data_str = StringIO(str(blob_data, encoding))
+                try:
+                    blob_data_str = blob_data.decode("utf-8")
+                except UnicodeDecodeError:
+                    blob_data_str = blob_data.decode("latin-1")
                 df_name = str(file["name"]).replace(".csv", "").split("/")[-1]
                 dataframe_names.append(df_name)
-                df = read_csv(blob_data_str, index_col=None, low_memory=False)
+                df = read_csv(StringIO(blob_data_str), index_col=None, low_memory=False)
                 dataframes.append(df)
             elif file["name"].endswith(".xlsx"):
-                xls_buffer = ExcelFile(blob_data)
-                for sheet_name in xls_buffer.sheet_names:
+                xls_buffer = BytesIO(blob_data)
+                all_sheets = read_excel(xls_buffer, sheet_name=None, index_col=None)
+                for sheet_name, df in all_sheets.items():
                     df_name = (
                         str(file["name"]).replace(".xlsx", "").split("/")[-1] + "-" + sheet_name
                     )
                     dataframe_names.append(df_name)
-                    df = read_excel(xls_buffer, sheet_name=sheet_name, index_col=None)
-                    dataframes.append(df)
+                    dataframes.append(df.reset_index(drop=True))
 
         return dataframes, dataframe_names
 
diff --git a/pydbsmgr/utils/tools/tools.py b/pydbsmgr/utils/tools/tools.py
index 7c4a67a..260bed7 100644
--- a/pydbsmgr/utils/tools/tools.py
+++ b/pydbsmgr/utils/tools/tools.py
@@ -92,49 +92,20 @@ def __init__(self, df: DataFrame):
 
     def get_frame(self) -> DataFrame:
         self.df = self._process_columns()
-        self.df = self._check_reserved_words()
         return self.df
 
-    def _process_columns(self) -> DataFrame:
+    def _process_columns(self, surrounding: bool = True) -> DataFrame:
         df = (self.df).copy()
         df.columns = df.columns.str.lower()
         df.columns = df.columns.str.replace(".", "")
         df.columns = df.columns.str.replace(",", "")
-        df.columns = df.columns.str.replace("__", "_")
-        new_cols = []
-        for col in df.columns:
-            res = any(chr.isdigit() for chr in col)
-            if res:
-                col = "[" + col + "]"
-            else:
-                col = re.sub("[^a-zA-Z0-9ñáéíóú_]", "_", col)
-            new_cols.append(col)
+        df.columns = df.columns.str.replace(r"[^a-zA-Z0-9ñáéíóú_]", "_", regex=True)
 
-        df.columns = new_cols
-        return df
+        df.columns = df.columns.str.replace("_+", "_", regex=True)
+        df.columns = df.columns.str.strip().str.strip("_")
+        if surrounding:
+            df.columns = [f"[{col}]" for col in df.columns]
 
-    def _check_reserved_words(self) -> DataFrame:
-        df = (self.df).copy()
-        new_cols = []
-        for col in df.columns:
-            # SQL reserved words
-            reserved_words = [
-                "update",
-                "insert",
-                "delete",
-                "create",
-                "drop",
-                "truncate",
-                "into",
-                "from",
-                "where",
-                "group",
-                "view",
-            ]
-            if col in reserved_words:
-                col = "[" + col + "]"
-            new_cols.append(col)
-        df.columns = new_cols
         return df
 
 
diff --git a/requirements.txt b/requirements.txt
index 385fae6..2aedb15 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-numpy
+numpy<2.0.0
 pandas
 clean-text
 missingno
diff --git a/setup.py b/setup.py
index 2ea00d2..91afabd 100644
--- a/setup.py
+++ b/setup.py
@@ -1,6 +1,13 @@
 from pathlib import Path
 
 import setuptools
+from pip._internal.req import parse_requirements
+
+# Parse the requirements.txt file
+requirements = parse_requirements("requirements.txt", session="hack")
+
+# Get the list of requirements as strings
+install_requires = [str(req.requirement) for req in requirements]
 
 with open("README.md", "r", encoding="utf-8") as fh:
     long_description = fh.read()
@@ -27,24 +34,7 @@
     project_urls={"Bug Tracker": "https://github.com/jzsmoreno/pydbsmgr"},
     license="MIT",
     packages=setuptools.find_packages(exclude=["*.tests", "*.tests.*", "tests.*", "tests"]),
-    install_requires=[
-        "numpy<2.0.0",
-        "pandas",
-        "clean-text",
-        "missingno",
-        "pyodbc",
-        "ipython",
-        "SQLAlchemy",
-        "pyyaml",
-        "azure-storage-blob==12.16.0",
-        "python-dotenv==1.0.0",
-        "openpyxl==3.1.2",
-        "pyarrow",
-        "fastparquet",
-        "loguru",
-        "psutil",
-        "Unidecode",
-    ],
+    install_requires=install_requires,
     classifiers=[
         "Programming Language :: Python :: 3",
         "License :: OSI Approved :: MIT License",