diff --git a/merge_by_lev/VERSION b/merge_by_lev/VERSION
index d81f1c3..9325c3c 100644
--- a/merge_by_lev/VERSION
+++ b/merge_by_lev/VERSION
@@ -1 +1 @@
-0.2.9
\ No newline at end of file
+0.3.0
\ No newline at end of file
diff --git a/merge_by_lev/main.py b/merge_by_lev/main.py
index ea3430b..68e356a 100644
--- a/merge_by_lev/main.py
+++ b/merge_by_lev/main.py
@@ -18,9 +18,9 @@
 def progressbar(
     it: range, prefix: str = "", size: int = 40, out: TextIOWrapper = sys.stdout
-) -> None:
+) -> None:  # type: ignore
     """
-    Auxiliary function displaying a progress bar
+    Auxiliary function displaying a progress bar.
     """

     count = len(it)
@@ -42,7 +42,7 @@ def show(j):
 def clearConsole() -> None:
     """
-    Auxiliary function that cleans the console
+    Auxiliary function that cleans the console.
     """
     command = "clear"
     if os.name in ("nt", "dos"):
@@ -52,22 +52,29 @@ def clearConsole() -> None:
 def check_cols_to_match(dict_dfs: dict[DataFrame], df_names: List[DataFrame]) -> None:
     """
-    Receives a dictionary of dataframes (dict_dfs) and a list of dataframe names (dfs_names).
-    Then check if the dataframes have the same columns. Print the data frames that do not match
-
-    params:
-        dict_dfs (Dictionary) : Contains the dataframes to be analyzed
-        df_names (`List`) : Contains the keys (names of each dataframe) of the dictionary
-
-    returns:
-        This function returns a summary of the condition of the columns
-
-    example:
-        # dfs -> (`List` of dataframes)
-        # names -> (`List` of names)
-        dict_dfs = {name:df for df, name in zip(dfs, names)}
-        check_cols_to_match(dict_dfs, df_names)
-        >>
+    Receives a dictionary of dataframes (`dict_dfs`) and a list of dataframe names (`df_names`).
+    Then checks whether the dataframes have the same columns and prints the dataframes that do not match.
+
+    Parameters
+    ----------
+    dict_dfs : `dict`
+        Contains the dataframes to be analyzed.
+    df_names : `List`
+        Contains the keys (names of each dataframe) of the dictionary.
+
+    Returns
+    -------
+    `None`. Prints a summary of the condition of the columns.
+
+    Example
+    -------
+    ```
+    dfs -> List[DataFrame]
+    names -> List[str]
+    dict_dfs = {name: df for df, name in zip(dfs, names)}
+    check_cols_to_match(dict_dfs, names)
+    >>
+    ```
     """
     cols_set = set([col for name in df_names for col in dict_dfs[name].columns])
     for name in df_names:
@@ -89,16 +96,23 @@ def rename_cols(df: DataFrame) -> DataFrame:
     Identifying the cases in which there was a renaming of similar columns
     with different information, consolidating them.

-    params:
-        df (`Dataframe`) : The dataframe on which you want to operate
-
-    returns:
-        df (`Dataframe`) : The same df dataframe with the consolidated columns
-
-    example:
-        df_1 = df_1.merge(df_2, how = 'left')
-        df_1 = rename_cols(df_1)
-        >>
+    Parameters
+    ----------
+    df : `DataFrame`
+        The dataframe on which you want to operate.
+
+    Returns
+    -------
+    df : `DataFrame`
+        The same dataframe with the consolidated columns.
+
+    Example
+    -------
+    ```
+    df_1 = df_1.merge(df_2, how='left')
+    df_1 = rename_cols(df_1)
+    >>
+    ```
     """
     cols = []
     for i in df.columns:
@@ -122,17 +136,25 @@ def clean_names(x: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str:
     """
     Receives a string for cleaning to be used in merge_by_similarity function.
-    params:
-        x (String) : Character string to which a regular expression is to be applied
-        pattern (regex) : By default extracts names without numerical characters
-
-    returns:
-        result (String) : The clean text string
-
-    example:
-        x = 'stamp_1'
-        clean_names(x)
-        >> 'stamp'
+    Parameters
+    ----------
+    x : `str`
+        Character string to which a regular expression is to be applied.
+    pattern : `regex`
+        Pattern to apply; by default it extracts names without numerical characters.
+
+    Returns
+    -------
+    result : `str`
+        The clean text string.
+
+    Example
+    -------
+    ```
+    x = 'stamp_1'
+    clean_names(x)
+    >> 'stamp'
+    ```
     """
     result = re.findall(pattern, str(x).replace("_", ""))
     if len(result) > 0:
@@ -147,20 +169,27 @@ def clean_names(x: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str:
 def lev_dist(a: str, b: str) -> int:
     """
     This function will calculate the levenshtein distance between two input
-    strings a and b
-
-    params:
-        a (String) : The first string you want to compare
-        b (String) : The second string you want to compare
-
-    returns:
-        This function will return the distnace between string a and b.
-
-    example:
-        a = 'stamp'
-        b = 'stomp'
-        lev_dist(a,b)
-        >> 1.0
+    strings `a` and `b`.
+
+    Parameters
+    ----------
+    a : `str`
+        The first string you want to compare.
+    b : `str`
+        The second string you want to compare.
+
+    Returns
+    -------
+    The Levenshtein distance between strings `a` and `b`.
+
+    Example
+    -------
+    ```
+    a = 'stamp'
+    b = 'stomp'
+    lev_dist(a, b)
+    >> 1
+    ```
     """

     @lru_cache(None)  # for memorization
     def min_dist(s1, s2):
@@ -183,15 +212,20 @@ def min_dist(s1, s2):
 def cal_cols_similarity(col_list: List[str]) -> ndarray:
     """
-    Calculate in pairs the levenshtein distance of the chars according to their name
+    Calculate the pairwise Levenshtein distance between the column names.

-    params:
-        col_list (`List`) : List with the chars names
+    Parameters
+    ----------
+    col_list : `List`
+        List with the column names.

-    returns:
-        mtx (`np.array`) : Matrix of $n$ x $n$ containing the results for $n$ chars.
+    Returns
+    -------
+    mtx : `np.array`
+        Matrix of $n$ x $n$ containing the results for the $n$ columns.

-    example:
+    Example
+    -------
         cal_cols_similarity(col_list)
         >>
     """
@@ -204,14 +238,19 @@
 def create_table_tabular(df1: DataFrame, df2: DataFrame) -> List[List[str]]:
-    """Create a table for column names from two dataframes
-
-    Args:
-        df1 (`DataFrame`): First dataframe
-        df2 (`DataFrame`): Second dataframe
-
-    Returns:
-        List[List[`str`]]: List of rows for each of the columns of both dataframes
+    """Create a table for column names from two dataframes.
+
+    Parameters
+    ----------
+    df1 : `DataFrame`
+        First dataframe.
+    df2 : `DataFrame`
+        Second dataframe.
+
+    Returns
+    -------
+    List[List[`str`]]
+        List of rows for each of the columns of both dataframes.
     """
     table = []
     col_names_df1 = df1.columns
@@ -227,13 +266,19 @@ def create_table_tabular(df1: DataFrame, df2: DataFrame) -> List[List[str]]:
 def rename_cols_dict(df_name: str, df: DataFrame, cols: list) -> DataFrame:
     """Function that allows to rename a segment of columns of a dataframe from a list as input.

-    Args:
-        df_name (str): Name of dataframe
-        df (DataFrame): Dataframe whose columns names will be changed
-        cols (list): List indicating the names of the columns to be changed
-
-    Returns:
-        DataFrame: Processed dataframe with changed names
+    Parameters
+    ----------
+    df_name : `str`
+        Name of the dataframe.
+    df : `DataFrame`
+        Dataframe whose column names will be changed.
+    cols : `list`
+        List indicating the names of the columns to be changed.
+
+    Returns
+    -------
+    `DataFrame`
+        Processed dataframe with changed names.
     """
     if not cols:
         return df
@@ -262,18 +307,26 @@ def merge_by_similarity(
     stdout: Any = sys.stdout,
 ) -> Tuple[List[DataFrame], List[str], ndarray]:
     """
-    It makes use of the levenshtein distance to calculate
+    It makes use of `lev_dist` to calculate
     a similarity between dataframes according to a list of names
     to concatenate them or make a left join (if merge_mode = `True`).

-    params:
-        df_list (List of Dataframes) : The list of dataframes to be used in the process
-        col_list (List of chars) : The list of dataframe names
-        dist_min (`int`) : Minimum distance to determine that they are equal. By default is set to `2`.
-        match_cols (`int`) : Minimum number of columns to concatenate. By default is set to `2`.
-        merge_mode (Boolean) : If `True`, it seeks to take the largest dataframe and make a left join with those that share columns with each other.
-        manually (Boolean) : If `False` avoids inputs when there are differences in columns. By default is set to `False`.
-        drop_empty (Boolean) : If `True`, identify frames with few columns and rows to be discarded. By default is set to `False`.
+    Parameters
+    ----------
+    df_list : `List[DataFrame]`
+        The list of dataframes to be used in the process.
+    col_list : `List[str]`
+        The list of dataframe names.
+    dist_min : `int`
+        Minimum distance to determine that two names are equal. By default it is set to `2`.
+    match_cols : `int`
+        Minimum number of matching columns required to concatenate. By default it is set to `2`.
+    merge_mode : `bool`
+        If `True`, takes the largest dataframe and makes a left join with those that share columns with it.
+    manually : `bool`
+        If `False`, avoids input prompts when there are differences in columns. By default it is set to `False`.
+    drop_empty : `bool`
+        If `True`, identifies dataframes with few columns and rows and discards them. By default it is set to `False`.
     """
     if drop_empty:
         df_list, col_list = check_empty_df(df_list, col_list)
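Side note for reviewers: the `lev_dist` docstring above now promises `lev_dist('stamp', 'stomp') >> 1`, computed by the memoized recursion hinted at in the context lines (`@lru_cache` over a `min_dist` helper). A minimal, self-contained sketch of that recursion — the index-based helper signature is an assumption, not necessarily the library's exact implementation:

```python
from functools import lru_cache


def lev_dist(a: str, b: str) -> int:
    @lru_cache(None)  # memoize shared subproblems of the recursion
    def min_dist(s1: int, s2: int) -> int:
        if s1 == len(a) or s2 == len(b):
            # One string is exhausted: the cost is the length of the leftover suffix.
            return (len(a) - s1) + (len(b) - s2)
        if a[s1] == b[s2]:
            return min_dist(s1 + 1, s2 + 1)  # characters match, no edit needed
        return 1 + min(
            min_dist(s1, s2 + 1),      # insertion
            min_dist(s1 + 1, s2),      # deletion
            min_dist(s1 + 1, s2 + 1),  # substitution
        )

    return min_dist(0, 0)


assert lev_dist("stamp", "stomp") == 1
```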
diff --git a/merge_by_lev/schema_config.py b/merge_by_lev/schema_config.py
index b64a389..b5817e5 100644
--- a/merge_by_lev/schema_config.py
+++ b/merge_by_lev/schema_config.py
@@ -31,26 +31,36 @@ def get_frame(
     ) -> DataFrame:
         """Returns the `DataFrame` with the obfuscated columns or SQL standard format.

-        Args:
-            json_name (`str`, optional): name of the dictionary `.json` file. By default it is set to `output.json`.
-            write_to_cloud (`bool`, optional): boolean variable to write to an Azure storage account. By default it is set to `False`.
-            connection_string (`str`, optional): the connection string to storage account. By default it is set to "".
-            container_name (`str`, optional): Azure container name. By default it is set to "".
-            overwrite (`bool`, optional): boolean variable that indicates whether to overwrite. By default it is set to `True`.
-            encoding (`str`, optional): file coding. By default it is set to `utf-8`.
-            get_standard (`bool`, optional): instead of obfuscation returns the columns with SQL standards. By default it is set to `True`.
-
-        Returns:
-            `DataFrame`: `DataFrame` with changed columns
-
-        Keyword Arguments:
+        Parameters
         ----------
-        - snake_case (`bool`, optional): If true - transforms column names into snake
-        case otherwise camel case will be used. Default is `True`.
-        - sort (`bool`, optional): If true - sorts columns by their names in alphabetical order.
-        Default is `False`.
-        - surrounding (`bool`, optional): If true - removes brackets from column names before transformation.
-        Default is `True`.
+        json_name : `str`, optional
+            Name of the dictionary `.json` file. By default it is set to `output.json`.
+        write_to_cloud : `bool`, optional
+            Boolean variable to write to an Azure storage account. By default it is set to `False`.
+        connection_string : `str`, optional
+            The connection string to the storage account.
+        container_name : `str`, optional
+            Azure container name.
+        overwrite : `bool`, optional
+            Boolean variable that indicates whether to overwrite. By default it is set to `True`.
+        encoding : `str`, optional
+            File encoding. By default it is set to `utf-8`.
+        get_standard : `bool`, optional
+            If `True`, returns the columns in SQL standard format instead of obfuscating them. By default it is set to `True`.
+
+        Returns
+        -------
+        `DataFrame`:
+            `DataFrame` with changed columns.
+
+        Keyword Arguments
+        -----------------
+        snake_case : `bool`, optional
+            If `True`, transforms column names into snake case; otherwise camel case will be used. Default is `True`.
+        sort : `bool`, optional
+            If `True`, sorts columns by their names in alphabetical order. Default is `False`.
+        surrounding : `bool`, optional
+            If `True`, removes brackets from column names before transformation. Default is `True`.
         """
         self._generate_dict(encoding)
         self._writer(json_name, write_to_cloud, connection_string, container_name, overwrite)
@@ -65,15 +75,19 @@ def _sql_standards(
     ) -> DataFrame:
         """Transforms all column names into SQL standard format.

-        Args:
-            snake_case (`bool`, optional): If true - transforms column names into snake
-            case otherwise camel case will be used. Default is `True`.
-            sort (`bool`, optional): If true - sorts columns by their names in alphabetical order.
-            Default is `False`.
-            surrounding (`bool`, optional): If true - removes brackets from column names before transformation.
-            Default is `True`.
-        Returns:
-            `DataFrame`: `DataFrame` with transformed columns.
+        Parameters
+        ----------
+        snake_case : `bool`, optional
+            If `True`, transforms column names into snake case; otherwise camel case will be used. Default is `True`.
+        sort : `bool`, optional
+            If `True`, sorts columns by their names in alphabetical order. Default is `False`.
+        surrounding : `bool`, optional
+            If `True`, removes brackets from column names before transformation. Default is `True`.
+
+        Returns
+        -------
+        `DataFrame`:
+            `DataFrame` with transformed columns.
         """

         df = (self.df).copy()
@@ -96,14 +110,14 @@ def _sql_standards(
         return df

     def _sort_columns_by_length(self, dataframe: DataFrame) -> DataFrame:
-        # Get the column names and sort them by length
+        """Get the column names and sort them by length."""
         sorted_columns = sorted(dataframe.columns, key=len, reverse=True)
         sorted_dataframe = dataframe[sorted_columns]
         return sorted_dataframe

     def _camel_to_snake(self, column_name: str) -> str:
-        # Use regular expression to convert camelCase/PascalCase to snake_case
+        """Use regular expressions to convert camelCase/PascalCase to snake_case."""
         s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
         return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()
@@ -115,14 +129,20 @@ def _writer(
         container_name: str,
         overwrite: bool,
     ) -> None:
-        """writer of the json file.
-
-        Args:
-            json_name (`str`): name of the dictionary `.json` file.
-            write_to_cloud (`bool`): boolean variable to write to an Azure storage account.
-            connection_string (`str`): the connection string to storage account.
-            container_name (`str`): Azure container name.
-            overwrite (`bool`): boolean variable that indicates whether to overwrite.
+        """Writes the `.json` file.
+
+        Parameters
+        ----------
+        json_name : `str`
+            Name of the dictionary `.json` file.
+        write_to_cloud : `bool`
+            Boolean variable to write to an Azure storage account.
+        connection_string : `str`
+            The connection string to the storage account.
+        container_name : `str`
+            Azure container name.
+        overwrite : `bool`
+            Boolean variable that indicates whether to overwrite.
         """
         if write_to_cloud:
             blob_service_client = BlobServiceClient.from_connection_string(connection_string)
@@ -135,13 +155,17 @@
     def _generate_dict(self, encoding: str) -> dict:
-        """generates the dictionary that renames the columns of the `DataFrame`.
+        """Generates the dictionary that renames the columns of the `DataFrame`.

-        Args:
-            encoding (`str`): file coding.
+        Parameters
+        ----------
+        encoding : `str`
+            File encoding.

-        Returns:
-            `dict`: dictionary to rename columns.
+        Returns
+        -------
+        `dict`:
+            Dictionary to rename columns.
         """
         values = []
         keys = []
@@ -174,14 +198,20 @@ def create_yaml(
     ) -> str:
         """Function that generates the schema of a `DataFrame` in a `.yml` file.

-        Args:
+        Parameters
         ----------
-            dabase_name (`str`, optional): `Dataframe` name. By default it is set to `database`
-            yaml_name (`str`, optional): output name of the `.yml` file. By default it is set to `output.yml`
-            write_to_cloud (`bool`, optional): boolean type variable indicating whether or not to write to the cloud. By default it is set to `False`
-            connection_string (`str`, optional): storage account and container connection string. By default it is set to `""`.
-            container_name (`str`, optional): name of the container inside the storage account. By default it is set to `""`.
-            overwrite (`bool`, optional): boolean variable indicating whether the file is overwritten or not. By default it is set to `True`.
+        dabase_name : `str`, optional
+            `DataFrame` name. By default it is set to `database`.
+        yaml_name : `str`, optional
+            Output name of the `.yml` file. By default it is set to `output.yml`.
+        write_to_cloud : `bool`, optional
+            Boolean type variable indicating whether or not to write to the cloud. By default it is set to `False`.
+        connection_string : `str`, optional
+            Storage account and container connection string.
+        container_name : `str`, optional
+            Name of the container inside the storage account.
+        overwrite : `bool`, optional
+            Boolean variable indicating whether the file is overwritten or not. By default it is set to `True`.
         """
         self.df.columns = [
            c.replace(" ", "_") for c in list(self.df.columns)
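Side note for reviewers: the `_camel_to_snake` change above converts its inline comment into a docstring but leaves the two-pass regex untouched. For reference, this is how those exact substitutions behave in isolation; the standalone function name and sample strings are illustrative only:

```python
import re


def camel_to_snake(column_name: str) -> str:
    # First pass: split an uppercase-then-lowercase run off the token before it.
    s1 = re.sub("(.)([A-Z][a-z]+)", r"\1_\2", column_name)
    # Second pass: split a lowercase letter or digit from a following uppercase letter.
    return re.sub("([a-z0-9])([A-Z])", r"\1_\2", s1).lower()


print(camel_to_snake("CustomerID"))    # customer_id
print(camel_to_snake("orderDate"))     # order_date
print(camel_to_snake("HTTPResponse"))  # http_response
```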
diff --git a/merge_by_lev/tools.py b/merge_by_lev/tools.py
index f0db126..ed9d022 100644
--- a/merge_by_lev/tools.py
+++ b/merge_by_lev/tools.py
@@ -19,13 +19,19 @@ def get_report(
     ) -> DataFrame:
         """Function that returns the report generated

-        Args:
-            df_names (List[str]): `list` of dataframes, which are the tables.
-            report_name (`str`, optional): name and path to be used to save the report. By default is set to `./report-health-checker.html`.
-            encoding (`str`, optional): type of report encoding. By default is set to `latin1`.
-
-        Returns:
-            `DataFrame`: `DataFrame` of the generated report
+        Parameters
+        ----------
+        df_names : `List[str]`
+            List of the names of the tables (dataframes) to report on.
+        report_name : `str`, optional
+            Name and path to be used to save the report. By default it is set to `./report-health-checker.html`.
+        encoding : `str`, optional
+            Type of report encoding. By default it is set to `latin1`.
+
+        Returns
+        -------
+        `DataFrame` :
+            `DataFrame` of the generated report.
         """
         df_sheet_files_info = self._iterative_evaluation(df_names)
         df_sheet_files_info.to_html(report_name, index=False, encoding=encoding)
@@ -36,11 +42,15 @@ def get_report(
     def _iterative_evaluation(self, df_names: List[str]) -> DataFrame:
         """Function that iterates over the set of tables to build the report

-        Args:
-            df_names (List[`str`]): `list` of names of the tables on which it iterates.
+        Parameters
+        ----------
+        df_names : `List[str]`
+            List of names of the tables on which it iterates.

-        Returns:
-            `DataFrame`: report generated from the set of tables.
+        Returns
+        -------
+        `DataFrame` :
+            Report generated from the set of tables.
         """
         df_sheet_files_info = pd.DataFrame()
         for i, df in enumerate(self.dfs):
@@ -91,13 +101,19 @@ def check_empty_df(
 ) -> Tuple[List[DataFrame], List[str]]:
     """Check if the `DataFrame` is empty or not

-    Args:
-        dfs (List[`DataFrame`]): List of dataframes to iterate over
-        names (List[`str`]): List of `DataFrame` names
-        num_cols (`int`): minimum number of columns of a `DataFrame`. By default is set to `2`
-
-    Returns:
-        Tuple[List[DataFrame], List[str]]: Verified dataframes and names
+    Parameters
+    ----------
+    dfs : `List[DataFrame]`
+        List of dataframes to iterate over.
+    names : `List[str]`
+        List of `DataFrame` names.
+    num_cols : `int`
+        Minimum number of columns of a `DataFrame`. By default it is set to `2`.
+
+    Returns
+    -------
+    `Tuple[List[DataFrame], List[str]]` :
+        Verified dataframes and names.
     """
     new_dfs = []
     new_names = []
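Side note for reviewers: a hedged usage sketch of `check_empty_df` as now documented. The sample frames and printed output are assumptions; only the signature, the import location, and the `num_cols` default of `2` come from the diff:

```python
import pandas as pd

from merge_by_lev.tools import check_empty_df

# Hypothetical input: one usable table and one empty placeholder.
dfs = [pd.DataFrame({"a": [1, 2], "b": [3, 4]}), pd.DataFrame()]
names = ["full_table", "empty_table"]

# Frames below the column threshold (or without rows) are dropped,
# and the returned names stay aligned with the surviving frames.
new_dfs, new_names = check_empty_df(dfs, names, num_cols=2)
print(new_names)  # expected: ['full_table']
```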