Skip to content

Commit

Permalink
Merge pull request #4 from jzsmoreno/dev
Browse files Browse the repository at this point in the history
develop
  • Loading branch information
jzsmoreno authored Aug 2, 2024
2 parents b877c52 + 140ebd6 commit b5eedf9
Show file tree
Hide file tree
Showing 4 changed files with 252 additions and 153 deletions.
2 changes: 1 addition & 1 deletion merge_by_lev/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.2.9
0.3.0
221 changes: 137 additions & 84 deletions merge_by_lev/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@

def progressbar(
it: range, prefix: str = "", size: int = 40, out: TextIOWrapper = sys.stdout
) -> None:
) -> None: # type: ignore
"""
Auxiliary function displaying a progress bar
Auxiliary function displaying a progress bar.
"""
count = len(it)

Expand All @@ -42,7 +42,7 @@ def show(j):

def clearConsole() -> None:
"""
Auxiliary function that cleans the console
Auxiliary function that clears the console.
"""
command = "clear"
if os.name in ("nt", "dos"):
Expand All @@ -52,22 +52,29 @@ def clearConsole() -> None:

def check_cols_to_match(dict_dfs: dict[DataFrame], df_names: List[DataFrame]) -> None:
"""
Receives a dictionary of dataframes (dict_dfs) and a list of dataframe names (dfs_names).
Then check if the dataframes have the same columns. Print the data frames that do not match
params:
dict_dfs (Dictionary) : Contains the dataframes to be analyzed
df_names (`List`) : Contains the keys (names of each dataframe) of the dictionary
returns:
This function returns a summary of the condition of the columns
example:
# dfs -> (`List` of dataframes)
# names -> (`List` of names)
dict_dfs = {name:df for df, name in zip(dfs, names)}
check_cols_to_match(dict_dfs, df_names)
>>
Receives a dictionary of dataframes (`dict_dfs`) and a list of dataframe names (`df_names`).
Then checks whether the dataframes have the same columns and prints the dataframes that do not match.
Parameters
----------
dict_dfs : `dict`
Contains the dataframes to be analyzed.
df_names : `List`
Contains the keys (names of each dataframe) of the dictionary.
Returns
-------
`None`. A summary of the condition of the columns is printed rather than returned.
Example
-------
```
dfs -> List[DataFrame]
names -> List[str]
dict_dfs = {name:df for df, name in zip(dfs, names)}
check_cols_to_match(dict_dfs, names)
>>
```
"""
cols_set = set([col for name in df_names for col in dict_dfs[name].columns])
for name in df_names:
Expand All @@ -89,16 +96,23 @@ def rename_cols(df: DataFrame) -> DataFrame:
Identifying the cases in which there was a renaming of similar columns
with different information, consolidating them.
params:
df (`Dataframe`) : The dataframe on which you want to operate
returns:
df (`Dataframe`) : The same df dataframe with the consolidated columns
example:
df_1 = df_1.merge(df_2, how = 'left')
df_1 = rename_cols(df_1)
>>
Parameters
----------
df : `DataFrame`
The dataframe on which you want to operate.
Returns
-------
df : `DataFrame`
The same df dataframe with the consolidated columns.
Example
-------
```
df_1 = df_1.merge(df_2, how = 'left')
df_1 = rename_cols(df_1)
>>
```
"""
cols = []
for i in df.columns:
Expand All @@ -122,17 +136,25 @@ def clean_names(x: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str:
"""
Receives a string for cleaning to be used in merge_by_similarity function.
params:
x (String) : Character string to which a regular expression is to be applied
pattern (regex) : By default extracts names without numerical characters
returns:
result (String) : The clean text string
example:
x = 'stamp_1'
clean_names(x)
>> 'stamp'
Parameters
----------
x : `str`
Character string to which a regular expression is to be applied.
pattern : `regex`
By default extracts names without numerical characters.
Returns
-------
result : `str`
The clean text string.
Example
-------
```
x = 'stamp_1'
clean_names(x)
>> 'stamp'
```
"""
result = re.findall(pattern, str(x).replace("_", ""))
if len(result) > 0:
Expand All @@ -147,20 +169,27 @@ def clean_names(x: str, pattern: str = r"[a-zA-Zñáéíóú_]+\b") -> str:
def lev_dist(a: str, b: str) -> int:
"""
This function will calculate the levenshtein distance between two input
strings a and b
params:
a (String) : The first string you want to compare
b (String) : The second string you want to compare
returns:
This function will return the distnace between string a and b.
example:
a = 'stamp'
b = 'stomp'
lev_dist(a,b)
>> 1.0
strings `a` and `b`.
Parameters
----------
a : `str`
The first string you want to compare
b : `str`
The second string you want to compare
Returns
-------
This function will return the Levenshtein distance between strings `a` and `b`.
Example
-------
```
a = 'stamp'
b = 'stomp'
lev_dist(a,b)
>> 1.0
```
"""

@lru_cache(None) # for memoization
Expand All @@ -183,15 +212,20 @@ def min_dist(s1, s2):

def cal_cols_similarity(col_list: List[str]) -> ndarray:
"""
Calculate in pairs the levenshtein distance of the chars according to their name
Calculate the pairwise Levenshtein distance between the column names.
params:
col_list (`List`) : List with the chars names
Parameters
----------
col_list : `List`
List of column names to compare.
returns:
mtx (`np.array`) : Matrix of $n$ x $n$ containing the results for $n$ chars.
Returns
-------
mtx : `np.array`
Matrix of $n$ x $n$ containing the results for $n$ chars.
example:
Example
-------
cal_cols_similarity(col_list)
>>
"""
Expand All @@ -204,14 +238,19 @@ def cal_cols_similarity(col_list: List[str]) -> ndarray:


def create_table_tabular(df1: DataFrame, df2: DataFrame) -> List[List[str]]:
"""Create a table for column names from two dataframes
Args:
df1 (`DataFrame`): First dataframe
df2 (`DataFrame`): Second dataframe
Returns:
List[List[`str`]]: List of rows for each of the columns of both dataframes
"""Create a table for column names from two dataframes.
Parameters
----------
df1 : `DataFrame`
First dataframe
df2 : `DataFrame`
Second dataframe
Returns
-------
List[List[`str`]]
List of rows for each of the columns of both dataframes.
"""
table = []
col_names_df1 = df1.columns
Expand All @@ -227,13 +266,19 @@ def create_table_tabular(df1: DataFrame, df2: DataFrame) -> List[List[str]]:
def rename_cols_dict(df_name: str, df: DataFrame, cols: list) -> DataFrame:
"""Rename a subset of the columns of a dataframe using a list given as input.
Args:
df_name (str): Name of dataframe
df (DataFrame): Dataframe whose columns names will be changed
cols (list): List indicating the names of the columns to be changed
Returns:
DataFrame: Processed dataframe with changed names
Parameters
----------
df_name : `str`
Name of dataframe.
df : `DataFrame`
Dataframe whose column names will be changed.
cols : `list`
List indicating the names of the columns to be changed.
Returns
-------
`DataFrame`
Processed dataframe with changed names.
"""
if not cols:
return df
Expand Down Expand Up @@ -262,18 +307,26 @@ def merge_by_similarity(
stdout: Any = sys.stdout,
) -> Tuple[List[DataFrame], List[str], ndarray]:
"""
It makes use of the levenshtein distance to calculate
It makes use of the `lev_dist` to calculate
a similarity between dataframes according to a list of names
to concatenate them or make a left join (if merge_mode = `True`).
params:
df_list (List of Dataframes) : The list of dataframes to be used in the process
col_list (List of chars) : The list of dataframe names
dist_min (`int`) : Minimum distance to determine that they are equal. By default is set to `2`.
match_cols (`int`) : Minimum number of columns to concatenate. By default is set to `2`.
merge_mode (Boolean) : If `True`, it seeks to take the largest dataframe and make a left join with those that share columns with each other.
manually (Boolean) : If `False` avoids inputs when there are differences in columns. By default is set to `False`.
drop_empty (Boolean) : If `True`, identify frames with few columns and rows to be discarded. By default is set to `False`.
Parameters
----------
df_list : `List[DataFrame]`
The list of dataframes to be used in the process.
col_list : `List[str]`
The list of dataframe names.
dist_min : `int`
Minimum distance to determine that they are equal. By default is set to `2`.
match_cols : `int`
Minimum number of columns to concatenate. By default is set to `2`.
merge_mode : `bool`
If `True`, it seeks to take the largest dataframe and make a left join with those that share columns with each other.
manually : `bool`
If `False` avoids inputs when there are differences in columns. By default is set to `False`.
drop_empty : `bool`
If `True`, identify frames with few columns and rows to be discarded. By default is set to `False`.
"""
if drop_empty:
df_list, col_list = check_empty_df(df_list, col_list)
Expand Down
Loading

0 comments on commit b5eedf9

Please sign in to comment.