Update documentation (#25)
jzsmoreno authored May 10, 2024
1 parent 8c0763c commit 1da7d92
Showing 10 changed files with 159 additions and 103 deletions.
5 changes: 5 additions & 0 deletions .github/CODEOWNERS
@@ -0,0 +1,5 @@
# This is a CODEOWNERS file
# Each line specifies a file pattern followed by one or more GitHub usernames or team names

# Owners for the entire repository
* @jzsmoreno
24 changes: 17 additions & 7 deletions .github/workflows/generate-docs.yml
@@ -3,7 +3,12 @@ name: Auto-documentation Generation
on:
push:
branches:
- main
- 'main'
paths:
- 'pydbsmgr/**'

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:

jobs:
generate-docs:
@@ -24,6 +29,17 @@ jobs:
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
pip install pdoc3
- name: Set up Git
env:
GITHUB_TOKEN: ${{ secrets.TOKEN }}
GITHUB_NAME: ${{ secrets.NAME }}
GITHUB_EMAIL: ${{ secrets.EMAIL }}
run: |
git config user.email "${GITHUB_EMAIL}"
git config user.name "${GITHUB_NAME}"
git config credential.helper "store --file=.git/credentials"
echo "https://${{ secrets.TOKEN }}@github.com/${{ github.repository }}" > .git/credentials
- name: Remove existing documentation files
run: rm -rf docs/*

@@ -43,13 +59,7 @@ jobs:
rm -rf docs/${{ steps.get_package_name.outputs.name }}
- name: Commit documentation changes
env:
GITHUB_TOKEN: ${{ secrets.TOKEN }}
GITHUB_ACTOR: ${{ github.actor }}
GITHUB_EMAIL: [email protected]
run: |
git config user.email "${GITHUB_EMAIL}"
git config user.name "${GITHUB_ACTOR}"
if git status --porcelain | grep .; then
echo "Changes detected, proceeding with workflow steps..."
git add docs/
2 changes: 1 addition & 1 deletion .github/workflows/jekyll-gh-pages.yml
@@ -7,7 +7,7 @@ on:
branches:
- 'main'
paths:
- 'docs/*'
- 'docs/**'

# Allows you to run this workflow manually from the Actions tab
workflow_dispatch:
2 changes: 1 addition & 1 deletion pydbsmgr/VERSION
@@ -1 +1 @@
0.9.5
0.9.6
32 changes: 20 additions & 12 deletions pydbsmgr/health.py
@@ -33,10 +33,12 @@ def __init__(self, _df: DataFrame | List[DataFrame], df_names: str | List[str] =
def fix(self, cols_upper_case: bool = False, drop_empty_cols: bool = True) -> None:
"""Performs the clean of the data and validation
Args:
-----
cols_upper_case (`bool`, optional): Indicates whether to convert column names to uppercase. Defaults to `False`.
drop_empty_cols (`bool`, optional): Variable indicating whether columns with all their values empty should be removed. Defaults to `True`.
Parameters
----------
cols_upper_case : `bool`, `optional`
Indicates whether to convert column names to uppercase. Defaults to `False`.
drop_empty_cols : `bool`, `optional`
Variable indicating whether columns with all their values empty should be removed. Defaults to `True`.
"""
if drop_empty_cols:
for count, df in enumerate(self._dfs):
@@ -60,14 +62,20 @@ def generate_report(
) -> None:
"""Generate a `.html` health check report.
Args:
-----
report_name (`str`, optional): Name of the quality assessment report. Defaults to `./report.html`.
yaml_name (`str`, optional): Indicates the name of the `.yaml` file that will serve as a template for the creation of the SQL table. Defaults to `./output.yaml`.
database_name (`str`, optional): The header of the `.yaml` file. Default value is `database`
directory_name (`str`, optional): Folder in which the reports will be saved. Defaults to `summary`.
concat_vertically: (`bool`, optional), Variable indicating whether the list of dataframes should be vertically concatenated into a single one. Default value is `False`.
encoding (`str`, optional): The encoding of dataframes. Defaults to `utf-8`.
Parameters
----------
report_name : `str`, `optional`
Name of the quality assessment report. Defaults to `./report.html`.
yaml_name : `str`, `optional`
Indicates the name of the `.yaml` file that will serve as a template for the creation of the SQL table. Defaults to `./output.yaml`.
database_name : `str`, `optional`
The header of the `.yaml` file. Defaults to `database`.
directory_name : `str`, `optional`
Folder in which the reports will be saved. Defaults to `summary`.
concat_vertically : `bool`, `optional`
Variable indicating whether the list of dataframes should be vertically concatenated into a single one. Default value is `False`.
encoding : `str`, `optional`
The encoding of dataframes. Defaults to `utf-8`.
"""
self.df_files_info = pd.DataFrame()
self.yaml_name = yaml_name
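For context on the reworked docstrings above, a minimal usage sketch of `fix` and `generate_report`; the `FrameCheck` class name and the sample data are assumptions for illustration, while the parameters come from the docstrings:

```python
# Illustrative sketch only: the class name `FrameCheck` and its import are assumed.
import pandas as pd
from pydbsmgr.health import FrameCheck  # assumed owner of fix() and generate_report()

df = pd.DataFrame({"name": ["Alice", "Bob"], "signup date": ["2024-01-05", None]})
checker = FrameCheck(df, df_names="users")

# Clean and validate: keep column-name casing, drop columns that are entirely empty.
checker.fix(cols_upper_case=False, drop_empty_cols=True)

# Write the HTML quality report and the .yaml template for the SQL table.
checker.generate_report(
    report_name="./report.html",
    yaml_name="./output.yaml",
    database_name="database",
    directory_name="summary",
    concat_vertically=False,
    encoding="utf-8",
)
```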
23 changes: 12 additions & 11 deletions pydbsmgr/lightest.py
@@ -9,13 +9,13 @@ def process_dates(x: str, format_type: str, auxiliary_type: str, errors: str = "
Parameters
----------
x : `str`
character of type date.
x : `str`
character of type date.
Returns
----------
x : `str`
character after processing with format `YYYY-MM-DD`.
x : `str`
character after processing with format `YYYY-MM-DD`.
"""
# performing data type conversion
x = str(x)
@@ -78,16 +78,17 @@ def clean_frame(
Parameters
----------
- sample_frac (`float`): The fraction of rows to use for date type inference. Default is 0.1 i.e., 10%.
- fast_execution (`bool`): If `False` use `applymap` pandas for extra text cleanup. Default is `True`.
sample_frac : `float`
The fraction of rows to use for date-type inference. Defaults to 0.1, i.e., 10%.
fast_execution : `bool`
If `False`, uses pandas `applymap` for extra text cleanup. Defaults to `True`.
Keyword Arguments:
----------
- no_emoji: (`bool`): By default it is set to `False`.
If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`.
- title_mode: (`bool`): By default it is set to `True`.
If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`.
By default, converts everything to `title`.
no_emoji : `bool`
By default it is set to `False`. If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`.
title_mode : `bool`
By default it is set to `True`. If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`. By default, converts everything to `title`.
"""
table = (self.df).copy()
cols = table.columns
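Similarly, a hedged sketch of the `clean_frame` keyword arguments documented above; the `LightCleaner` class name and its constructor are assumptions:

```python
# Illustrative sketch only: `LightCleaner` and its constructor signature are assumed.
import pandas as pd
from pydbsmgr.lightest import LightCleaner  # assumed owner of clean_frame()

df = pd.DataFrame({"fecha": ["01/02/2023", "2023-03-04"], "comentario": ["Hola 😀", " adiós "]})
cleaner = LightCleaner(df, "ventas")  # constructor arguments assumed

cleaner.clean_frame(
    sample_frac=0.1,       # fraction of rows sampled for date-type inference
    fast_execution=False,  # run the extra applymap-based text cleanup
    no_emoji=True,         # strip emojis (honored only when fast_execution=False)
    title_mode=True,       # title-case text; set to False to lowercase instead
)
```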
12 changes: 8 additions & 4 deletions pydbsmgr/main.py
@@ -55,11 +55,15 @@ def check_if_contains_dates(input_string: str) -> bool:
def remove_numeric_char(input_string: str) -> str:
"""Remove all numeric characters from a string.
Args:
input_string (`str`): character string to be cleaned of numeric characters
Parameters
----------
input_string : `str`
character string to be cleaned of numeric characters
Returns:
`str`: clean character string
Returns
-------
`str`
clean character string
"""
return re.sub(r"\d", "", input_string)

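`remove_numeric_char` reduces to the single `re.sub` call shown above, so its behavior is easy to illustrate:

```python
from pydbsmgr.main import remove_numeric_char  # module path taken from the file above

print(remove_numeric_char("order42_item7"))  # -> "order_item"
print(remove_numeric_char("2024 report"))    # -> " report"
```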
8 changes: 4 additions & 4 deletions pydbsmgr/utils/config.py
@@ -6,12 +6,12 @@ def load_config(config_file):
Parameters
----------
config_file : str
config_file : `str`
The path to the configuration file.
Returns
-------
config : ConfigParser
config : `ConfigParser`
A configuration object loaded from file.
"""

@@ -26,12 +26,12 @@ def parse_config(config):
Parameters
----------
config : ConfigParser
config : `ConfigParser`
A configuration object loaded from file.
Returns
-------
parsed_config : dict
parsed_config : `dict`
A dictionary of parsed configuration values.
"""

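A short sketch of the two config helpers as documented above; `settings.ini` is a placeholder path:

```python
from pydbsmgr.utils.config import load_config, parse_config

config = load_config("settings.ini")  # ConfigParser loaded from the given file
settings = parse_config(config)       # dict of parsed configuration values
print(settings)
```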
103 changes: 64 additions & 39 deletions pydbsmgr/utils/sql_functions.py
@@ -28,18 +28,22 @@ def insert_data(
) -> None:
"""Insert data into SQL Server.
Parameters:
Parameters
----------
df (`Dataframe` or `str`): The pandas dataframe that will be inserted into sql server
table_name (`str`): Name of the table in which the data is being inserted
overwrite (`bool`): If `True` it will delete and recreate the table before inserting new data
if `False` it will append the new data onto the end of the existing table
char_length (`int`): Length of varchar fields for text columns
override_length (`bool`): Override length of varchar fields for text columns.
Returns:
----------
`None`
df : `DataFrame` | `str`
The pandas DataFrame that will be inserted into SQL Server.
table_name : `str`
Name of the table in which the data is being inserted
overwrite : `bool`
If `True`, the table is deleted and recreated before inserting the new data; if `False`, the new data is appended to the end of the existing table.
char_length : `int`
Length of varchar fields for text columns.
override_length : `bool`
Override length of varchar fields for text columns.
Returns
-------
`None`
"""

self.file_type = None
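For orientation, a hedged sketch of calling `insert_data` with the parameters documented above; the `SQLDataHandler` class name, its constructor, and the connection string are assumptions not shown in this diff:

```python
# Illustrative sketch only: the class name and connection setup are assumed;
# only the insert_data() parameters come from the docstring above.
import pandas as pd
from pydbsmgr.utils.sql_functions import SQLDataHandler  # assumed class name

handler = SQLDataHandler("<odbc-connection-string>")  # constructor assumed

df = pd.DataFrame({"id": [1, 2], "name": ["Alice", "Bob"]})
handler.insert_data(
    df,
    table_name="customers",
    overwrite=True,         # drop and recreate the table before inserting
    char_length=512,        # varchar length for text columns
    override_length=False,  # do not override the computed varchar length
)
```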
@@ -131,22 +135,33 @@ def bulk_insert_from_csv(
) -> bool:
"""Insert data from csv files in Azure Blob Storage into SQL Server with Bulk command
Parameters:
----------
file_path (`str`): Path to the file in Azure Blob Storage
db_table_name (`str`): Name of the table in which the data is being inserted
sas_str (`str`): SAS string to the storage account
storage_connection_string (`str`): Connection string to the storage account
storage_account (`str`): Name of the storage account
container_name (`str`): Name of the container in which the data is being inserted
credential_name (`str`): Name of the credentials
data_source_name (`str`): Name of the data source
char_length (`int`): Length of varchar fields for text columns
overwrite (`bool`): If `True` it will delete and recreate the table before inserting new data
if `False` it will append the new data onto the end of the existing table
Returns:
Parameters
----------
`bool`: True if the data was inserted successfully
file_path : `str`
Path to the file in Azure Blob Storage
db_table_name : `str`
Name of the table in which the data is being inserted
sas_str : `str`
SAS string to the storage account
storage_connection_string : `str`
Connection string to the storage account
storage_account : `str`
Name of the storage account
container_name : `str`
Name of the container in which the data is being inserted
credential_name : `str`
Name of the credentials
data_source_name : `str`
Name of the data source
char_length : `int`
Length of varchar fields for text columns
overwrite : `bool`
If `True`, the table is deleted and recreated before inserting the new data; if `False`, the new data is appended to the end of the existing table.
Returns
-------
`bool`
True if the data was inserted successfully
"""
# Get all the files in the container or file individually
filter_condition = ""
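A hedged call sketch for `bulk_insert_from_csv` using the parameter names documented above; every Azure and SQL value is a placeholder, and `handler` is the same assumed instance as in the previous sketch:

```python
# Illustrative sketch only: all connection values below are placeholders.
ok = handler.bulk_insert_from_csv(
    file_path="landing/customers.csv",
    db_table_name="customers",
    sas_str="<sas-token>",
    storage_connection_string="<storage-connection-string>",
    storage_account="mystorageaccount",
    container_name="landing",
    credential_name="blob_scoped_credential",
    data_source_name="blob_data_source",
    char_length=512,
    overwrite=True,
)
print(ok)  # True when the data was inserted successfully
```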
@@ -268,13 +283,17 @@ def drop_dropables(
) -> bool:
"""Drop dropable objects
Parameters:
----------
data_source_name (`str`): Name of the data source
masterkey (`bool`): If `True` it will drop the master key
Returns:
Parameters
----------
`Bool`: True if the data was inserted successfully
data_source_name : `str`
Name of the data source
masterkey : `bool`
If `True` it will drop the master key
Returns
-------
`bool`
`True` if the objects were dropped successfully
"""
print("DROPPING EXTERNAL DATA SOURCE")
self._cur.execute(f"DROP EXTERNAL DATA SOURCE {data_source_name}")
@@ -339,14 +358,20 @@ def write_csv_from_parquet(
write_to_csv: bool = True,
) -> None:
"""Write a csv file from parquet files in a container
Parameters:
----------
connection_string (`str`): Connection string to the storage account
container_name (`str`): Name of the container in which the data is being inserted
directory (`str`): Directory in which the parquet files are located
Returns:
Parameters
----------
`bool`: True if the file was created successfully
connection_string : `str`
Connection string to the storage account
container_name : `str`
Name of the container in which the data is being inserted
directory : `str`
Directory in which the parquet files are located
Returns
-------
`bool`
`True` if the file was created successfully
"""
# Write the csv files
if write_to_csv:
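And a last sketch for `write_csv_from_parquet` with the documented parameters; the connection string, container, and directory are placeholders, and `handler` is the same assumed instance:

```python
# Illustrative sketch only: connection string, container, and directory are placeholders.
handler.write_csv_from_parquet(
    connection_string="<storage-connection-string>",
    container_name="processed",
    directory="exports/2024",
    write_to_csv=True,  # defaults to True per the signature shown above
)
```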