Merge pull request #4 from jzsmoreno/dev

develop
jzsmoreno · Aug 2, 2024 · b5eedf9 · b5eedf9
2 parents b877c52 + 140ebd6
commit b5eedf9
Show file tree

Hide file tree

Showing 4 changed files with 252 additions and 153 deletions.
diff --git a/merge_by_lev/VERSION b/merge_by_lev/VERSION
@@ -1 +1 @@
-0.2.9
+0.3.0
diff --git a/merge_by_lev/main.py b/merge_by_lev/main.py
@@ -18,9 +18,9 @@
 
 def progressbar(
     it: range, prefix: str = "", size: int = 40, out: TextIOWrapper = sys.stdout
-) -> None:
+) -> None:  # type: ignore
     """
-    Auxiliary function displaying a progress bar
+    Auxiliary function displaying a progress bar.
     """
     count = len(it)
 
@@ -42,7 +42,7 @@ def show(j):
 
 def clearConsole() -> None:
     """
-    Auxiliary function that cleans the console
+    Auxiliary function that cleans the console.
     """
     command = "clear"
     if os.name in ("nt", "dos"):
@@ -52,22 +52,29 @@ def clearConsole() -> None:
 
 def check_cols_to_match(dict_dfs: dict[DataFrame], df_names: List[DataFrame]) -> None:
     """
-    Receives a dictionary of dataframes (dict_dfs) and a list of dataframe names (dfs_names).
-    Then check if the dataframes have the same columns. Print the data frames that do not match
-
-    params:
-        dict_dfs (Dictionary) : Contains the dataframes to be analyzed
-        df_names (`List`) : Contains the keys (names of each dataframe) of the dictionary
-
-    returns:
-        This function returns a summary of the condition of the columns
-
-    example:
-        # dfs -> (`List` of dataframes)
-        # names -> (`List` of names)
-        dict_dfs = {name:df for df, name in zip(dfs, names)}
-        check_cols_to_match(dict_dfs, df_names)
-        >>
+    Receives a dictionary of dataframes (`dict_dfs`) and a list of dataframe names (`df_names`).
+    Then checks whether the dataframes have the same columns and prints the dataframes that do not match.
+
+    Parameters
+    ----------
+    dict_dfs : `dict`
+        Contains the dataframes to be analyzed.
+    df_names : `List`
+        Contains the keys (names of each dataframe) of the dictionary.
+
+    Returns
+    -------
+    This function returns a summary of the condition of the columns
+
+    Example
+    -------
+    ```
+    dfs -> List[DataFrame]
+    names -> List[str]
+    dict_dfs = {name:df for df, name in zip(dfs, names)}
+    check_cols_to_match(dict_dfs, df_names)
+    >>
+    ```
     """
     cols_set = set([col for name in df_names for col in dict_dfs[name].columns])
     for name in df_names:
@@ -89,16 +96,23 @@ def rename_cols(df: DataFrame) -> DataFrame:
     Identifying the cases in which there was a renaming of similar columns
     with different information, consolidating them.
 
-    params:
-        df (`Dataframe`) : The dataframe on which you want to operate
-
-    returns:
-        df (`Dataframe`) : The same df dataframe with the consolidated columns
-
-    example:
-        df_1 = df_1.merge(df_2, how = 'left')
-        df_1 = rename_cols(df_1)
-        >>
+    Parameters
+    ----------
+    df : `DataFrame`
+        The dataframe on which you want to operate.
+
+    Returns
+    -------
+    df : `DataFrame`
+        The same df dataframe with the consolidated columns.
+
+    Example
+    -------
+    ```
+    df_1 = df_1.merge(df_2, how = 'left')
+    df_1 = rename_cols(df_1)
+    >>
+    ```
     """
     cols = []
     for i in df.columns:
@@ -122,17 +136,25 @@ def clean_names(x: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str:
     """
     Receives a string for cleaning to be used in merge_by_similarity function.
 
-    params:
-        x (String) : Character string to which a regular expression is to be applied
-        pattern (regex) : By default extracts names without numerical characters
-
-    returns:
-        result (String) : The clean text string
-
-    example:
-        x = 'stamp_1'
-        clean_names(x)
-        >> 'stamp'
+    Parameters
+    ----------
+    x : `str`
+        Character string to which a regular expression is to be applied.
+    pattern : `regex`
+        By default extracts names without numerical characters.
+
+    Returns
+    -------
+    result : `str`
+        The clean text string.
+
+    Example
+    -------
+    ```
+    x = 'stamp_1'
+    clean_names(x)
+    >> 'stamp'
+    ```
     """
     result = re.findall(pattern, str(x).replace("_", ""))
     if len(result) > 0:
@@ -147,20 +169,27 @@ def clean_names(x: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str:
 def lev_dist(a: str, b: str) -> int:
     """
     This function will calculate the levenshtein distance between two input
-    strings a and b
-
-    params:
-        a (String) : The first string you want to compare
-        b (String) : The second string you want to compare
-
-    returns:
-        This function will return the distnace between string a and b.
-
-    example:
-        a = 'stamp'
-        b = 'stomp'
-        lev_dist(a,b)
-        >> 1.0
+    strings `a` and `b`.
+
+    Parameters
+    ----------
+    a : `str`
+        The first string you want to compare.
+    b : `str`
+        The second string you want to compare.
+
+    Returns
+    -------
+    This function will return the distance between string `a` and `b`.
+
+    Example
+    -------
+    ```
+    a = 'stamp'
+    b = 'stomp'
+    lev_dist(a,b)
+    >> 1.0
+    ```
     """
 
     @lru_cache(None)  # for memorization
@@ -183,15 +212,20 @@ def min_dist(s1, s2):
 
 def cal_cols_similarity(col_list: List[str]) -> ndarray:
     """
-    Calculate in pairs the levenshtein distance of the chars according to their name
+    Calculate in pairs the Levenshtein distance of the chars according to their name.
 
-    params:
-        col_list (`List`) : List with the chars names
+    Parameters
+    ----------
+    col_list : `List`
+        List with the chars names.
 
-    returns:
-        mtx (`np.array`) : Matrix of $n$ x $n$ containing the results for $n$ chars.
+    Returns
+    -------
+    mtx : `np.array`
+        Matrix of $n$ x $n$ containing the results for $n$ chars.
 
-    example:
+    Example
+    -------
         cal_cols_similarity(col_list)
         >>
     """
@@ -204,14 +238,19 @@ def cal_cols_similarity(col_list: List[str]) -> ndarray:
 
 
 def create_table_tabular(df1: DataFrame, df2: DataFrame) -> List[List[str]]:
-    """Create a table for column names from two dataframes
-
-    Args:
-        df1 (`DataFrame`): First dataframe
-        df2 (`DataFrame`): Second dataframe
-
-    Returns:
-        List[List[`str`]]: List of rows for each of the columns of both dataframes
+    """Create a table for column names from two dataframes.
+
+    Parameters
+    ----------
+    df1 : `DataFrame`
+        First dataframe.
+    df2 : `DataFrame`
+        Second dataframe.
+
+    Returns
+    -------
+    List[List[`str`]]
+        List of rows for each of the columns of both dataframes.
     """
     table = []
     col_names_df1 = df1.columns
@@ -227,13 +266,19 @@ def create_table_tabular(df1: DataFrame, df2: DataFrame) -> List[List[str]]:
 def rename_cols_dict(df_name: str, df: DataFrame, cols: list) -> DataFrame:
     """Function that allows to rename a segment of columns of a dataframe from a list as input.
 
-    Args:
-        df_name (str): Name of dataframe
-        df (DataFrame): Dataframe whose columns names will be changed
-        cols (list): List indicating the names of the columns to be changed
-
-    Returns:
-        DataFrame: Processed dataframe with changed names
+    Parameters
+    ----------
+    df_name : `str`
+        Name of dataframe.
+    df : `DataFrame`
+        Dataframe whose columns names will be changed.
+    cols : `list`
+        List indicating the names of the columns to be changed.
+
+    Returns
+    -------
+    `DataFrame`
+        Processed dataframe with changed names.
     """
     if not cols:
         return df
@@ -262,18 +307,26 @@ def merge_by_similarity(
     stdout: Any = sys.stdout,
 ) -> Tuple[List[DataFrame], List[str], ndarray]:
     """
-    It makes use of the levenshtein distance to calculate
+    It makes use of `lev_dist` (the Levenshtein distance) to calculate
     a similarity between dataframes according to a list of names
     to concatenate them or make a left join (if merge_mode = `True`).
 
-    params:
-        df_list (List of Dataframes) : The list of dataframes to be used in the process
-        col_list (List of chars) : The list of dataframe names
-        dist_min (`int`) : Minimum distance to determine that they are equal. By default is set to `2`.
-        match_cols (`int`) : Minimum number of columns to concatenate. By default is set to `2`.
-        merge_mode (Boolean) : If `True`, it seeks to take the largest dataframe and make a left join with those that share columns with each other.
-        manually (Boolean) : If `False` avoids inputs when there are differences in columns. By default is set to `False`.
-        drop_empty (Boolean) : If `True`, identify frames with few columns and rows to be discarded. By default is set to `False`.
+    Parameters
+    ----------
+    df_list : `List[DataFrame]`
+        The list of dataframes to be used in the process.
+    col_list : `List[str]`
+        The list of dataframe names.
+    dist_min : `int`
+        Minimum distance to determine that they are equal. By default is set to `2`.
+    match_cols : `int`
+        Minimum number of columns to concatenate. By default is set to `2`.
+    merge_mode : `bool`
+        If `True`, it seeks to take the largest dataframe and make a left join with those that share columns with each other.
+    manually : `bool`
+        If `False` avoids inputs when there are differences in columns. By default is set to `False`.
+    drop_empty : `bool`
+        If `True`, identify frames with few columns and rows to be discarded. By default is set to `False`.
     """
     if drop_empty:
         df_list, col_list = check_empty_df(df_list, col_list)