From 1da7d92bf24fc957a48d01452275098c068038f3 Mon Sep 17 00:00:00 2001
From: jzsmoreno <42299052+jzsmoreno@users.noreply.github.com>
Date: Thu, 9 May 2024 20:49:45 -0600
Subject: [PATCH] Update documentation (#25)

---
 .github/CODEOWNERS | 5 ++
 .github/workflows/generate-docs.yml | 24 ++++--
 .github/workflows/jekyll-gh-pages.yml | 2 +-
 pydbsmgr/VERSION | 2 +-
 pydbsmgr/health.py | 32 +++++---
 pydbsmgr/lightest.py | 23 +++---
 pydbsmgr/main.py | 12 ++-
 pydbsmgr/utils/config.py | 8 +-
 pydbsmgr/utils/sql_functions.py | 103 ++++++++++++++++----------
 pydbsmgr/utils/tools/tools.py | 51 +++++++------
 10 files changed, 159 insertions(+), 103 deletions(-)
 create mode 100644 .github/CODEOWNERS

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
new file mode 100644
index 0000000..093d866
--- /dev/null
+++ b/.github/CODEOWNERS
@@ -0,0 +1,5 @@
+# This is a CODEOWNERS file
+# Each line specifies a file pattern followed by one or more GitHub usernames or team names
+
+# Owners for the entire repository
+* @jzsmoreno
\ No newline at end of file
diff --git a/.github/workflows/generate-docs.yml b/.github/workflows/generate-docs.yml
index 0b84803..8006842 100644
--- a/.github/workflows/generate-docs.yml
+++ b/.github/workflows/generate-docs.yml
@@ -3,7 +3,12 @@ name: Auto-documentation Generation
 on:
   push:
     branches:
-      - main
+      - 'main'
+    paths:
+      - 'pydbsmgr/**'
+
+  # Allows you to run this workflow manually from the Actions tab
+  workflow_dispatch:
 
 jobs:
   generate-docs:
@@ -24,6 +29,17 @@ jobs:
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
         pip install pdoc3
 
+    - name: Set up Git
+      env:
+        GITHUB_TOKEN: ${{ secrets.TOKEN }}
+        GITHUB_NAME: ${{ secrets.NAME }}
+        GITHUB_EMAIL: ${{ secrets.EMAIL }}
+      run: |
+        git config user.email "${GITHUB_EMAIL}"
+        git config user.name "${GITHUB_NAME}"
+        git config credential.helper "store --file=.git/credentials"
+        echo "https://${{ secrets.TOKEN }}@github.com/${{ github.repository }}" > .git/credentials
+
     - name: Remove existing documentation files
       run: rm -rf docs/*
@@ -43,13 +59,7 @@ jobs:
         rm -rf docs/${{ steps.get_package_name.outputs.name }}
 
     - name: Commit documentation changes
-      env:
-        GITHUB_TOKEN: ${{ secrets.TOKEN }}
-        GITHUB_ACTOR: ${{ github.actor }}
-        GITHUB_EMAIL: action@github.com
       run: |
-        git config user.email "${GITHUB_EMAIL}"
-        git config user.name "${GITHUB_ACTOR}"
         if git status --porcelain | grep .; then
           echo "Changes detected, proceeding with workflow steps..."
           git add docs/
diff --git a/.github/workflows/jekyll-gh-pages.yml b/.github/workflows/jekyll-gh-pages.yml
index 0c2ee7c..63d9abe 100644
--- a/.github/workflows/jekyll-gh-pages.yml
+++ b/.github/workflows/jekyll-gh-pages.yml
@@ -7,7 +7,7 @@ on:
     branches:
       - 'main'
     paths:
-      - 'docs/*'
+      - 'docs/**'
 
   # Allows you to run this workflow manually from the Actions tab
   workflow_dispatch:
diff --git a/pydbsmgr/VERSION b/pydbsmgr/VERSION
index 0383441..9cf0386 100644
--- a/pydbsmgr/VERSION
+++ b/pydbsmgr/VERSION
@@ -1 +1 @@
-0.9.5
\ No newline at end of file
+0.9.6
\ No newline at end of file
diff --git a/pydbsmgr/health.py b/pydbsmgr/health.py
index f1619e6..1c38859 100644
--- a/pydbsmgr/health.py
+++ b/pydbsmgr/health.py
@@ -33,10 +33,12 @@ def __init__(self, _df: DataFrame | List[DataFrame], df_names: str | List[str] =
     def fix(self, cols_upper_case: bool = False, drop_empty_cols: bool = True) -> None:
         """Performs the clean of the data and validation
 
-        Args:
-        -----
-        cols_upper_case (`bool`, optional): Indicates whether to convert column names to uppercase. Defaults to `False`.
-        drop_empty_cols (`bool`, optional): Variable indicating whether columns with all their values empty should be removed. Defaults to `True`.
+        Parameters
+        ----------
+        cols_upper_case : `bool`, `optional`
+            Indicates whether to convert column names to uppercase. Defaults to `False`.
+        drop_empty_cols : `bool`, `optional`
+            Variable indicating whether columns with all their values empty should be removed. Defaults to `True`.
         """
         if drop_empty_cols:
             for count, df in enumerate(self._dfs):
@@ -60,14 +62,20 @@ def generate_report(
     ) -> None:
         """Generate a `.html` health check report.
 
-        Args:
-        -----
-        report_name (`str`, optional): Name of the quality assessment report. Defaults to `./report.html`.
-        yaml_name (`str`, optional): Indicates the name of the `.yaml` file that will serve as a template for the creation of the SQL table. Defaults to `./output.yaml`.
-        database_name (`str`, optional): The header of the `.yaml` file. Default value is `database`
-        directory_name (`str`, optional): Folder in which the reports will be saved. Defaults to `summary`.
-        concat_vertically: (`bool`, optional), Variable indicating whether the list of dataframes should be vertically concatenated into a single one. Default value is `False`.
-        encoding (`str`, optional): The encoding of dataframes. Defaults to `utf-8`.
+        Parameters
+        ----------
+        report_name : `str`, `optional`
+            Name of the quality assessment report. Defaults to `./report.html`.
+        yaml_name : `str`, `optional`
+            Indicates the name of the `.yaml` file that will serve as a template for the creation of the SQL table. Defaults to `./output.yaml`.
+        database_name : `str`, `optional`
+            The header of the `.yaml` file. Default value is `database`
+        directory_name : `str`, `optional`
+            Folder in which the reports will be saved. Defaults to `summary`.
+        concat_vertically : `bool`, `optional`
+            Variable indicating whether the list of dataframes should be vertically concatenated into a single one. Default value is `False`.
+        encoding : `str`, `optional`
+            The encoding of dataframes. Defaults to `utf-8`.
         """
         self.df_files_info = pd.DataFrame()
         self.yaml_name = yaml_name
diff --git a/pydbsmgr/lightest.py b/pydbsmgr/lightest.py
index 3eb8788..2dfddd9 100644
--- a/pydbsmgr/lightest.py
+++ b/pydbsmgr/lightest.py
@@ -9,13 +9,13 @@ def process_dates(x: str, format_type: str, auxiliary_type: str, errors: str = "
     Parameters
     ----------
-    x : `str`
-        character of type date.
+    x : `str`
+        character of type date.
 
     Returns
     ----------
-    x : `str`
-        character after processing with format `YYYY-MM-DD`.
+    x : `str`
+        character after processing with format `YYYY-MM-DD`.
     """
     # performing data type conversion
     x = str(x)
@@ -78,16 +78,17 @@ def clean_frame(
 
         Parameters
         ----------
-        - sample_frac (`float`): The fraction of rows to use for date type inference. Default is 0.1 i.e., 10%.
-        - fast_execution (`bool`): If `False` use `applymap` pandas for extra text cleanup. Default is `True`.
+        sample_frac : `float`
+            The fraction of rows to use for date type inference. Default is 0.1 i.e., 10%.
+        fast_execution : `bool`
+            If `False` use `applymap` pandas for extra text cleanup. Default is `True`.
 
         Keyword Arguments:
         ----------
-        - no_emoji: (`bool`): By default it is set to `False`.
-            If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`.
-        - title_mode: (`bool`): By default it is set to `True`.
-            If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`.
-            By default, converts everything to `title`.
+        no_emoji : `bool`
+            By default it is set to `False`. If `True`, removes all emojis from text data. Works only when `fast_execution` = `False`.
+        title_mode : `bool`
+            By default it is set to `True`. If `False`, converts the text to lowercase. Works only when `fast_execution` = `False`. By default, converts everything to `title`.
         """
         table = (self.df).copy()
         cols = table.columns
diff --git a/pydbsmgr/main.py b/pydbsmgr/main.py
index 75b2385..8401434 100644
--- a/pydbsmgr/main.py
+++ b/pydbsmgr/main.py
@@ -55,11 +55,15 @@ def check_if_contains_dates(input_string: str) -> bool:
 def remove_numeric_char(input_string: str) -> str:
     """Remove all numeric characters from a string.
 
-    Args:
-        input_string (`str`): character string to be cleaned of numeric characters
+    Parameters
+    ----------
+    input_string : `str`
+        character string to be cleaned of numeric characters
 
-    Returns:
-        `str`: clean character string
+    Returns
+    -------
+    `str`
+        clean character string
     """
     return re.sub(r"\d", "", input_string)
diff --git a/pydbsmgr/utils/config.py b/pydbsmgr/utils/config.py
index 42b1064..b0c9959 100644
--- a/pydbsmgr/utils/config.py
+++ b/pydbsmgr/utils/config.py
@@ -6,12 +6,12 @@ def load_config(config_file):
 
     Parameters
     ----------
-    config_file : str
+    config_file : `str`
        The path to the configuration file.
 
    Returns
    -------
-    config : ConfigParser
+    config : `ConfigParser`
        A configuration object loaded from file.
 
    """
@@ -26,12 +26,12 @@ def parse_config(config):
 
    Parameters
    ----------
-    config : ConfigParser
+    config : `ConfigParser`
        A configuration object loaded from file.
 
    Returns
    -------
-    parsed_config : dict
+    parsed_config : `dict`
        A dictionary of parsed configuration values.
 
    """
diff --git a/pydbsmgr/utils/sql_functions.py b/pydbsmgr/utils/sql_functions.py
index 463bc1a..cc8a2d8 100644
--- a/pydbsmgr/utils/sql_functions.py
+++ b/pydbsmgr/utils/sql_functions.py
@@ -28,18 +28,22 @@ def insert_data(
     ) -> None:
         """Insert data into SQL Server.
 
-        Parameters:
+        Parameters
         ----------
-        df (`Dataframe` or `str`): The pandas dataframe that will be inserted into sql server
-        table_name (`str`): Name of the table in which the data is being inserted
-        overwrite (`bool`): If `True` it will delete and recreate the table before inserting new data
-            if `False` it will append the new data onto the end of the existing table
-        char_length (`int`): Length of varchar fields for text columns
-        override_length (`bool`): Override length of varchar fields for text columns.
-
-        Returns:
-        ----------
-        `None`
+        df : `Dataframe` | `str`
+            The pandas dataframe that will be inserted into sql server
+        table_name : `str`
+            Name of the table in which the data is being inserted
+        overwrite : `bool`
+            If `True` it will delete and recreate the table before inserting new data if `False` it will append the new data onto the end of the existing table.
+        char_length : `int`
+            Length of varchar fields for text columns.
+        override_length : `bool`
+            Override length of varchar fields for text columns.
+
+        Returns
+        -------
+        `None`
         """
 
         self.file_type = None
@@ -131,22 +135,33 @@ def bulk_insert_from_csv(
     ) -> bool:
         """Insert data from csv files in Azure Blob Storage into SQL Server with Bulk command
 
-        Parameters:
-        ----------
-        file_path (`str`): Path to the file in Azure Blob Storage
-        db_table_name (`str`): Name of the table in which the data is being inserted
-        sas_str (`str`): SAS string to the storage account
-        storage_connection_string (`str`): Connection string to the storage account
-        storage_account (`str`): Name of the storage account
-        container_name (`str`): Name of the container in which the data is being inserted
-        credential_name (`str`): Name of the credentials
-        data_source_name (`str`): Name of the data source
-        char_length (`int`): Length of varchar fields for text columns
-        overwrite (`bool`): If `True` it will delete and recreate the table before inserting new data
-            if `False` it will append the new data onto the end of the existing table
-        Returns:
+        Parameters
         ----------
+        file_path : `str`
+            Path to the file in Azure Blob Storage
+        db_table_name : `str`
+            Name of the table in which the data is being inserted
+        sas_str : `str`
+            SAS string to the storage account
+        storage_connection_string : `str`
+            Connection string to the storage account
+        storage_account : `str`
+            Name of the storage account
+        container_name : `str`
+            Name of the container in which the data is being inserted
+        credential_name : `str`
+            Name of the credentials
+        data_source_name : `str`
+            Name of the data source
+        char_length : `int`
+            Length of varchar fields for text columns
+        overwrite : `bool`
+            If `True` it will delete and recreate the table before inserting new data if `False` it will append the new data onto the end of the existing table.
+
+        Returns
+        -------
+        `bool`
+            True if the data was inserted successfully
         """
         # Get all the files in the container or file individually
         filter_condition = ""
@@ -268,13 +283,17 @@ def drop_dropables(
     ) -> bool:
         """Drop dropable objects
 
-        Parameters:
-        ----------
-        data_source_name (`str`): Name of the data source
-        masterkey (`bool`): If `True` it will drop the master key
-        Returns:
+        Parameters
         ----------
-        `Bool`: True if the data was inserted successfully
+        data_source_name : `str`
+            Name of the data source
+        masterkey : `bool`
+            If `True` it will drop the master key
+
+        Returns
+        -------
+        `bool`
+            True if the data was inserted successfully
         """
         print("DROPPING EXTERNAL DATA SOURCE")
         self._cur.execute(f"DROP EXTERNAL DATA SOURCE {data_source_name}")
@@ -339,14 +358,20 @@ def write_csv_from_parquet(
         write_to_csv: bool = True,
     ) -> None:
         """Write a csv file from parquet files in a container
-        Parameters:
-        ----------
-        connection_string (`str`): Connection string to the storage account
-        container_name (`str`): Name of the container in which the data is being inserted
-        directory (`str`): Directory in which the parquet files are located
-        Returns:
+
+        Parameters
         ----------
-        `bool`: True if the file was created successfully
+        connection_string : `str`
+            Connection string to the storage account
+        container_name : `str`
+            Name of the container in which the data is being inserted
+        directory : `str`
+            Directory in which the parquet files are located
+
+        Returns
+        -------
+        `bool`
+            `True` if the file was created successfully
         """
         # Write the csv files
         if write_to_csv:
diff --git a/pydbsmgr/utils/tools/tools.py b/pydbsmgr/utils/tools/tools.py
index 9e5411c..f74ffa6 100644
--- a/pydbsmgr/utils/tools/tools.py
+++ b/pydbsmgr/utils/tools/tools.py
@@ -42,14 +42,17 @@ def wrapper(*args, **kwargs):
 def most_repeated_item(items: list, two_most_common: bool = False) -> Tuple[str, str | None]:
     """Returns a `Tuple` with the most common elements of a `list`.
 
-    Args:
+    Parameters
     ----------
-    items (`list`): the `list` containing the items to be evaluated.
-    two_most_common (`bool`, optional): If `False`, returns only one element. Defaults to `False`.
-
-    Returns:
-    ----------
-    Tuple[`str`, `str` | `None`]: The two most common elements.
+    items : `list`
+        the `list` containing the items to be evaluated.
+    two_most_common : `bool`, `optional`
+        If `False`, returns only one element. Defaults to `False`.
+
+    Returns
+    -------
+    Tuple[`str`, `str` | `None`]
+        The two most common elements.
""" # Use Counter to count occurrences of each item in the list counter = Counter(items) @@ -156,7 +159,7 @@ def write_parquet( overwrite: bool = True, upload: bool = True, ) -> None: - """Write dataframes as `parquet` format by converting them first into `bytes`""" + """Write dataframes as `parquet` format by converting them first into `bytes`.""" files = [] format_type = "parquet" files_not_loaded = [] @@ -185,6 +188,7 @@ def write_parquet( def column_coincidence(df1: DataFrame, df2: DataFrame) -> float: + """Return the percentage of coincident columns between two pandas dataframes.""" if not isinstance(df1, pd.DataFrame) or not isinstance(df2, pd.DataFrame): raise ValueError("Both inputs should be pandas DataFrames") @@ -199,6 +203,7 @@ def column_coincidence(df1: DataFrame, df2: DataFrame) -> float: def merge_by_coincidence(df1: DataFrame, df2: DataFrame, tol: float = 0.9) -> DataFrame: + """Merge two pandas dataframes by finding the most similar columns based on their names.""" percentage = column_coincidence(df1, df2) total_columns = set(df1.columns).union(set(df2.columns)) num_col1 = len(df1.columns) @@ -249,14 +254,17 @@ def get_extraction_date( ) -> str: """Allows to extract the date of extraction according to the directory within the storage account. - Args: - ---------- - filename (`str` | List[`str`]): file path inside the storage account - REGEX_PATTERN (`str`, optional): regular expression pattern to extract the date. Defaults to `r"\d{4}-\d{2}-\d{2}"`. - - Returns: + Parameters ---------- - `str`: the date that was extracted if found in the file path. + filename : `str` | List[`str`] + file path inside the storage account + REGEX_PATTERN : `str`, `optional` + regular expression pattern to extract the date. Defaults to `r"\d{4}-\d{2}-\d{2}"`. + + Returns + ------- + `str` + the date that was extracted if found in the file path. """ def sub_extraction_date(filename: str, REGEX_PATTERN: str) -> str: @@ -296,9 +304,7 @@ def get_frame(self) -> DataFrame: return self.df def _check_int_float(self, drop_values: bool = False, drop_rows: bool = False) -> None: - """ - Check and correct the data types of columns in a `DataFrame`. - """ + """Check and correct the data types of columns in a `DataFrame`.""" def check_float(x): if isinstance(x, str): @@ -350,9 +356,7 @@ def check_int(x): self.df = df_ def _check_datetime(self, sample_frac: float) -> None: - """ - Check and convert date-time string columns to datetime objects. - """ + """Check and convert date-time string columns to `datetime` objects.""" df_ = self.df cols = df_.columns df_sample = df_.sample(frac=sample_frac) @@ -382,9 +386,7 @@ def _check_datetime(self, sample_frac: float) -> None: def create_directory(data, parent_path=""): - """ - Creates the directory tree from a yaml file - """ + """Creates the directory tree from a `yaml` file.""" for key, value in data.items(): path = os.path.join(parent_path, key) if isinstance(value, dict): @@ -395,6 +397,7 @@ def create_directory(data, parent_path=""): def create_directories_from_yaml(yaml_file): + """Reads a `yaml` file and creates directories based on its content.""" with open(yaml_file, "r") as file: data = yaml.safe_load(file) create_directory(data)