Skip to content

Commit

Permalink
Update schema_config.py
Browse files Browse the repository at this point in the history
  • Loading branch information
jzsmoreno committed Apr 10, 2024
1 parent d967dfd commit 0e90ea8
Show file tree
Hide file tree
Showing 5 changed files with 64 additions and 21 deletions.
1 change: 1 addition & 0 deletions merge_by_lev/VERSION
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
0.2.8
2 changes: 1 addition & 1 deletion merge_by_lev/__init__.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
"""
Authors: J. A. Moreno-Guerra
Last modification: 05/23/2022
Last modification: 04/09/2024
Corresponding author: [email protected]
"""

Expand Down
55 changes: 43 additions & 12 deletions merge_by_lev/schema_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,30 +8,32 @@
from pydbsmgr.main import *
from pydbsmgr.main import DataFrame
from pydbsmgr.utils.azure_sdk import *
import pandas as pd


class StandardColumns:
"""Allows generic column renaming and creates a `.json` file with the equivalent names.
Allows columns containing questions to be transformed according to `SQL` standards."""
"""Allows columns to be transformed according to `SQL` standards
or creates a `.json` file with the obfuscated columns."""

def __init__(self, df: DataFrame) -> None:
    """Store a copy of `df` (made with `df.copy()`) on the instance as `self.df`.

    Args:
        df (`DataFrame`): frame whose columns will later be renamed.
    """
    self.df = df.copy()

def get_frame(
self,
json_name: str = "output.json",
write_to_cloud: bool = True,
write_to_cloud: bool = False,
connection_string: str = "",
container_name: str = "",
overwrite: bool = True,
encoding: str = "utf-8",
get_standard: bool = True,
**kwargs,
) -> DataFrame:
"""Returns the `DataFrame` with the obfuscated columns.
"""Returns the `DataFrame` with the obfuscated columns or SQL standard format.
Args:
json_name (`str`, optional): name of the dictionary `.json` file. By default it is set to `output.json`.
write_to_cloud (`bool`, optional): boolean variable to write to an Azure storage account. By default it is set to `True`.
write_to_cloud (`bool`, optional): boolean variable to write to an Azure storage account. By default it is set to `False`.
connection_string (`str`, optional): the connection string to storage account. By default it is set to "".
container_name (`str`, optional): Azure container name. By default it is set to "".
overwrite (`bool`, optional): boolean variable that indicates whether to overwrite. By default it is set to `True`.
Expand All @@ -40,27 +42,56 @@ def get_frame(
Returns:
`DataFrame`: `DataFrame` with changed columns
Keyword Arguments:
----------
- snake_case (`bool`, optional): If true - transforms column names into snake
case otherwise camel case will be used. Default is `True`.
- sort (`bool`, optional): If true - sorts columns by their names in alphabetical order.
Default is `False`.
- surrounding (`bool`, optional): If true - removes brackets from column names before transformation.
Default is `True`.
"""
self._generate_dict(encoding)
self._writer(json_name, write_to_cloud, connection_string, container_name, overwrite)
if get_standard:
df_renamed = self._sql_standards()
df_renamed = self._sql_standards(**kwargs)
else:
df_renamed = (self.df).rename(columns=self.obfuscator)
return df_renamed

def _sql_standards(
    self, snake_case: bool = True, sort: bool = False, surrounding: bool = True
) -> DataFrame:
    """Transforms all column names into SQL standard format.

    The names are lower-cased, runs of underscores become word separators,
    every word is capitalised and the separators are removed (camel case).
    Optionally the result is converted to snake case, and every name is
    finally passed through `_truncate`.

    Args:
        snake_case (`bool`, optional): If true - transforms column names into snake
            case, otherwise camel case will be used. Default is `True`.
        sort (`bool`, optional): If true - reorders the columns with
            `_sort_columns_by_length`. Default is `False`.
            NOTE(review): that helper is not visible here; its name suggests an
            ordering by name length rather than alphabetical - confirm.
        surrounding (`bool`, optional): If true - removes one pair of enclosing square
            brackets from the names that have it before the transformation, and wraps
            every resulting name in `[...]` afterwards. Default is `True`.

    Returns:
        `DataFrame`: `DataFrame` with transformed columns.
    """
    df = (self.df).copy()
    if surrounding:
        # Only peel off brackets that are really there: slicing every name with
        # [1:-1] would silently eat the first and last character of plain names.
        df.columns = [
            col[1:-1]
            if isinstance(col, str)
            and len(col) >= 2
            and col.startswith("[")
            and col.endswith("]")
            else col
            for col in df.columns
        ]
    df.columns = df.columns.str.lower()
    # One or more underscores act as a single word separator.
    df.columns = df.columns.str.replace(r"_+", " ", regex=True)
    df.columns = df.columns.str.title()
    df.columns = df.columns.str.strip()
    df.columns = df.columns.str.replace(" ", "")
    df.columns = df.columns.str.replace("\n", "_")
    if snake_case:
        df.columns = [self._camel_to_snake(col) for col in df.columns]
    df.columns = [self._truncate(col) for col in df.columns]
    if sort:
        df = self._sort_columns_by_length(df)
    if surrounding:
        df.columns = [f"[{col}]" for col in df.columns]
    return df

def _truncate(self, column_name: str) -> str:
Expand Down Expand Up @@ -327,5 +358,5 @@ def get_table(self) -> Table:
schema = data_handler.get_schema()
table = data_handler.get_table()
column_handler = StandardColumns(df)
df = column_handler.get_frame(write_to_cloud=False)
df = column_handler.get_frame(surrounding=False)
breakpoint()
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
numpy
pandas
tabulate
git+https://github.com/jzsmoreno/pydbsmgr
pydbsmgr
pyarrow
25 changes: 18 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,27 @@
from pathlib import Path

import setuptools
from pip._internal.req import parse_requirements

# Resolve every file relative to this script so the build does not depend on
# the directory it is launched from.
ROOT_DIR = Path(__file__).resolve().parent
PACKAGE_DIR = ROOT_DIR / "merge_by_lev"

# Read requirements.txt with plain file I/O. pip exposes no supported Python API,
# so `pip._internal.req.parse_requirements` may change or disappear between pip
# releases; the `pip._internal` import above is no longer needed by this section.
with open(ROOT_DIR / "requirements.txt", "r", encoding="utf-8") as req_file:
    requirements = [line.strip() for line in req_file]

# Keep only requirement specifiers: skip blank lines, comment lines and
# pip options such as `-r other.txt`, `-e .` or `--index-url ...`.
install_requires = [
    req for req in requirements if req and not req.startswith(("#", "-"))
]

with open(ROOT_DIR / "README.md", "r", encoding="utf-8") as fh:
    long_description = fh.read()

# The package version lives in a single VERSION file inside the package.
about = {}
with open(PACKAGE_DIR / "VERSION", encoding="utf-8") as f:
    _version = f.read().strip()
about["__version__"] = _version

setuptools.setup(
name="merge_by_lev",
version="0.2.7",
version=about["__version__"],
author="J. A. Moreno-Guerra",
author_email="[email protected]",
description="Testing installation of Package",
Expand All @@ -15,12 +31,7 @@
project_urls={"Bug Tracker": "https://github.com/jzsmoreno/merge_by_lev"},
license="MIT",
packages=["merge_by_lev"],
install_requires=[
"numpy",
"pandas",
"tabulate",
"pyarrow",
],
install_requires=install_requires,
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
Expand Down

0 comments on commit 0e90ea8

Please sign in to comment.