From 7abbb1456dbb6725c451d16691078c7310885a1d Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 10 Jan 2025 09:17:10 +0100 Subject: [PATCH 01/11] fix merge taxonomy scripts and outputs --- bin/merge_taxonomy.py | 137 ++++++++++++++++++++++++++++++------------ 1 file changed, 98 insertions(+), 39 deletions(-) diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py index 44eed31a..d492e4b4 100755 --- a/bin/merge_taxonomy.py +++ b/bin/merge_taxonomy.py @@ -3,7 +3,7 @@ # Written by Anan Ibrahim and released under the MIT license. # See git repository (https://github.com/Darcy220606/AMPcombi) for full license text. # Date: March 2024 -# Version: 0.1.0 +# Version: 0.1.1 # Required modules import sys @@ -12,7 +12,7 @@ import numpy as np import argparse -tool_version = "0.1.0" +tool_version = "0.1.1" ######################################### # TOP LEVEL: AMPCOMBI ######################################### @@ -66,6 +66,15 @@ # TAXONOMY ######################################### def reformat_mmseqs_taxonomy(mmseqs_taxonomy): + """_summary_ + Reformats the taxonomy files and joins them in a list to be passed on to the tools functions + + Args: + mmseqs_taxonomy (tsv): mmseqs output file per sample + + Returns: + data frame: reformated tables + """ mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig']) # remove the lineage column mmseqs2_df.drop('lineage', axis=1, inplace=True) @@ -85,7 +94,19 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy): # FUNCTION: AMPCOMBI ######################################### def ampcombi_taxa(args): - merged_df = pd.DataFrame() + """_summary_ + Merges AMPcombi tool output with taxonomy information. + + Parameters: + ---------- + args: + Contains arguments for AMPcombi file path (`amp`) and list of taxonomy file paths (`taxa1`). + + Outputs: + ------- + Creates a file named `ampcombi_complete_summary_taxonomy.tsv` containing the merged results. + """ + combined_dfs = [] # assign input args to variables ampcombi = args.amp @@ -100,13 +121,6 @@ def ampcombi_taxa(args): # filter the tool df tool_df = pd.read_csv(ampcombi, sep='\t') - # remove the column with contig_id - duplicate #NOTE: will be fixed in AMPcombi v2.0.0 - tool_df = tool_df.drop('contig_id', axis=1) - # make sure 1st and 2nd column have the same column labels - tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True) - tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True) - # grab the real contig id in another column copy for merging - tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0] # merge rows from taxa to ampcombi_df based on substring match in sample_id # grab the unique sample names from the taxonomy table @@ -114,17 +128,18 @@ def ampcombi_taxa(args): # for every sampleID in taxadf merge the results for sampleID in samples_taxa: # subset ampcombi - subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)] # subset taxa - subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] + subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)] # merge - subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left') + subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left') # cleanup the table - columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y'] + columnsremove = ['sample_id_y'] subset_df.drop(columnsremove, axis=1, inplace=True) - subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True) + subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True) # append in the combined_df - merged_df = merged_df.append(subset_df, ignore_index=True) + combined_dfs.append(subset_df) + merged_df = pd.concat(combined_dfs, ignore_index=True) # write to file merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False) @@ -133,7 +148,20 @@ def ampcombi_taxa(args): # FUNCTION: COMBGC ######################################### def combgc_taxa(args): - merged_df = pd.DataFrame() + """_summary_ + + Merges comBGC tool output with taxonomy information. + + Parameters: + ---------- + args: + Contains arguments for comBGC file path (`bgc`) and list of taxonomy file paths (`taxa2`). + + Outputs: + ------- + Creates a file named `combgc_complete_summary_taxonomy.tsv` containing the merged results. + """ + combined_dfs = [] # assign input args to variables combgc = args.bgc @@ -152,23 +180,24 @@ def combgc_taxa(args): tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True) tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True) - # merge rows from taxa to ampcombi_df based on substring match in sample_id + # merge rows from taxa to combgc_df based on substring match in sample_id # grab the unique sample names from the taxonomy table samples_taxa = taxa_df['sample_id'].unique() # for every sampleID in taxadf merge the results for sampleID in samples_taxa: - # subset ampcombi - subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + # subset tool + subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)] # subset taxa - subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] + subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)] # merge - subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id', right_on='contig_id', how='left') + subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left') # cleanup the table columnsremove = ['sample_id_y'] subset_df.drop(columnsremove, axis=1, inplace=True) subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True) # append in the combined_df - merged_df = merged_df.append(subset_df, ignore_index=True) + combined_dfs.append(subset_df) + merged_df = pd.concat(combined_dfs, ignore_index=True) # write to file merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False) @@ -176,8 +205,21 @@ def combgc_taxa(args): ######################################### # FUNCTION: HAMRONIZATION ######################################### +# TODO : FIX THE MERGING in ARG pipeline def hamronization_taxa(args): - merged_df = pd.DataFrame() + """_summary_ + Merges Hamronization tool output with taxonomy information. + + Parameters: + ---------- + args: + Contains arguments for hamronization file path (`arg`) and list of taxonomy file paths (`taxa2`). + + Outputs: + ------- + Creates a file named `hamronization_complete_summary_taxonomy.tsv` containing the merged results. + """ + combined_dfs = [] # assign input args to variables hamronization = args.arg @@ -197,29 +239,46 @@ def hamronization_taxa(args): # reorder the columns new_order = ['sample_id', 'contig_id'] + [col for col in tool_df.columns if col not in ['sample_id', 'contig_id']] tool_df = tool_df.reindex(columns=new_order) - # grab the real contig id in another column copy for merging - tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0] - # merge rows from taxa to ampcombi_df based on substring match in sample_id + # merge rows from taxa to hamronization_df based on substring match in sample_id # grab the unique sample names from the taxonomy table samples_taxa = taxa_df['sample_id'].unique() # for every sampleID in taxadf merge the results for sampleID in samples_taxa: - # subset ampcombi - subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + # subset tool + subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)] # subset taxa - subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] - # merge - subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left') - # cleanup the table - columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y'] - subset_df.drop(columnsremove, axis=1, inplace=True) - subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True) - # append in the combined_df - merged_df = merged_df.append(subset_df, ignore_index=True) + subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)] + # ensure strings + subset_tool['contig_id'] = subset_tool['contig_id'].astype(str) + subset_taxa['contig_id'] = subset_taxa['contig_id'].astype(str) + # rename columns to avoid droping of mutual ones + rename_dict = {col: f"{col}_taxa" for col in subset_taxa.columns if col in subset_tool.columns} + subset_taxa = subset_taxa.rename(columns=rename_dict) + + # merge by string + merged_rows = [] + # iterate and find all matches + for _, tool_row in subset_tool.iterrows(): + tool_contig_id = tool_row['contig_id'] + matches = subset_taxa[subset_taxa['contig_id_taxa'].apply(lambda x: str(x) in tool_contig_id)] + # if match, merge row + if not matches.empty: + for _, taxa_row in matches.iterrows(): + merged_row = {**tool_row.to_dict(), **taxa_row.to_dict()} + merged_rows.append(merged_row) + else: + # if no match keep row as is + merged_row = {**tool_row.to_dict()} + merged_rows.append(merged_row) + + merged_df = pd.DataFrame(merged_rows) + combined_dfs.append(merged_df) + + merged_df_final = pd.concat(combined_dfs, ignore_index=True) # write to file - merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False) + merged_df_final.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False) ######################################### # SUBPARSERS: DEFAULT From e7630e81549c84366369f22de50b0d2339040beb Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 10 Jan 2025 09:18:55 +0100 Subject: [PATCH 02/11] fix ARG TABIX output file name --- conf/modules.config | 1 + 1 file changed, 1 insertion(+) diff --git a/conf/modules.config b/conf/modules.config index d4e473d2..7999b58a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -534,6 +534,7 @@ process { } withName: ARG_TABIX_BGZIP { + ext.prefix = { "hamronization_complete_summary_taxonomy" } publishDir = [ path: { "${params.outdir}/reports/hamronization_summarize" }, mode: params.publish_dir_mode, From 681906c42e4e934060183cbf62ffc1d181c34421 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 10 Jan 2025 09:31:03 +0100 Subject: [PATCH 03/11] fix output declaration for args --- docs/output.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/output.md b/docs/output.md index 0920236c..686ef2dc 100644 --- a/docs/output.md +++ b/docs/output.md @@ -522,7 +522,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `hamronization_summarize/` one of the following: - `hamronization_combined_report.json`: summarised output in .json format - `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default). - - `hamronization_combined_report.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`. + - `hamronization_complete_summary_taxonomy.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`. - `hamronization_combined_report.html`: interactive output in .html format From a039619bcac5a901df190885f92c693f5ea0ccbd Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 10 Jan 2025 09:39:08 +0100 Subject: [PATCH 04/11] update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index a27c12d3..9227c5b1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users, due to non-ASCII characters. (by @darcy220606) - [#430](https://github.com/nf-core/funcscan/pull/430) Updated `rgi/main` module to fix incorrect variable name. (by @amizeranschi and @jasmezz) +- [#435](https://github.com/nf-core/funcscan/pull/435) Fixed dependency errors within taxonomy merging scripts and updated the code and output for all three workflows. Bumped to version 0.1.1. (by @darcy220606) ### `Dependencies` @@ -24,6 +25,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | Macrel | 1.2.0 | 1.4.0 | | MultiQC | 1.24.0 | 1.25.1 | + ### `Deprecated` ## v2.0.0 - [2024-09-05] From 35423f435200889ae2ccb4fab1b2bde9b767c1be Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 10 Jan 2025 10:03:17 +0100 Subject: [PATCH 05/11] fix linting --- CHANGELOG.md | 1 - 1 file changed, 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9227c5b1..84df40b8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -25,7 +25,6 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | Macrel | 1.2.0 | 1.4.0 | | MultiQC | 1.24.0 | 1.25.1 | - ### `Deprecated` ## v2.0.0 - [2024-09-05] From 11a07ca24739d830d63014a08ce2df2a8db042c2 Mon Sep 17 00:00:00 2001 From: darcy220606 Date: Fri, 10 Jan 2025 11:28:24 +0100 Subject: [PATCH 06/11] update nf tests --- tests/test_taxonomy_bakta.nf.test | 2 +- tests/test_taxonomy_prokka.nf.test | 2 +- tests/test_taxonomy_pyrodigal.nf.test | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_taxonomy_bakta.nf.test b/tests/test_taxonomy_bakta.nf.test index 6498c4bd..5f076d29 100644 --- a/tests/test_taxonomy_bakta.nf.test +++ b/tests/test_taxonomy_bakta.nf.test @@ -79,7 +79,7 @@ nextflow_pipeline { ).match("fargene") }, // hAMRonization - { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() }, + { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() }, // antiSMASH { assert snapshot ( diff --git a/tests/test_taxonomy_prokka.nf.test b/tests/test_taxonomy_prokka.nf.test index 0628508a..64c67b4a 100644 --- a/tests/test_taxonomy_prokka.nf.test +++ b/tests/test_taxonomy_prokka.nf.test @@ -79,7 +79,7 @@ nextflow_pipeline { ).match("fargene") }, // hAMRonization - { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() }, + { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() }, // antiSMASH { assert snapshot ( diff --git a/tests/test_taxonomy_pyrodigal.nf.test b/tests/test_taxonomy_pyrodigal.nf.test index 8f325fc0..f0dc1012 100644 --- a/tests/test_taxonomy_pyrodigal.nf.test +++ b/tests/test_taxonomy_pyrodigal.nf.test @@ -79,7 +79,7 @@ nextflow_pipeline { ).match("fargene") }, // hAMRonization - { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() }, + { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() }, // antiSMASH { assert snapshot ( From 0e1e426972b430693431adc56d6a3c753bb9d741 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:27:35 +0100 Subject: [PATCH 07/11] Update bin/merge_taxonomy.py Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- bin/merge_taxonomy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py index d492e4b4..b3a4aa6b 100755 --- a/bin/merge_taxonomy.py +++ b/bin/merge_taxonomy.py @@ -73,7 +73,7 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy): mmseqs_taxonomy (tsv): mmseqs output file per sample Returns: - data frame: reformated tables + data frame: reformatted tables """ mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig']) # remove the lineage column From 8101fab9ba40054c53938f07a5dab3301034ad6b Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:27:52 +0100 Subject: [PATCH 08/11] Update bin/merge_taxonomy.py Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- bin/merge_taxonomy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py index b3a4aa6b..49a9a970 100755 --- a/bin/merge_taxonomy.py +++ b/bin/merge_taxonomy.py @@ -208,7 +208,7 @@ def combgc_taxa(args): # TODO : FIX THE MERGING in ARG pipeline def hamronization_taxa(args): """_summary_ - Merges Hamronization tool output with taxonomy information. + Merges hAMRonization tool output with taxonomy information. Parameters: ---------- From 727215308a2ccc18099eb7f66993c01c7c1e6eee Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:28:15 +0100 Subject: [PATCH 09/11] Update CHANGELOG.md Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 84df40b8..89c3fb0f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users, due to non-ASCII characters. (by @darcy220606) - [#430](https://github.com/nf-core/funcscan/pull/430) Updated `rgi/main` module to fix incorrect variable name. (by @amizeranschi and @jasmezz) -- [#435](https://github.com/nf-core/funcscan/pull/435) Fixed dependency errors within taxonomy merging scripts and updated the code and output for all three workflows. Bumped to version 0.1.1. (by @darcy220606) +- [#435](https://github.com/nf-core/funcscan/pull/435) Fixed dependency errors within taxonomy merging scripts, updated the code and output for all three workflows. Bumped to version 0.1.1. (by @darcy220606) ### `Dependencies` From 10d9504425186f89aeffca2e33bbc2c6f7b0d990 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:28:33 +0100 Subject: [PATCH 10/11] Update bin/merge_taxonomy.py Co-authored-by: Jasmin Frangenberg <73216762+jasmezz@users.noreply.github.com> --- bin/merge_taxonomy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py index 49a9a970..ead506e5 100755 --- a/bin/merge_taxonomy.py +++ b/bin/merge_taxonomy.py @@ -252,7 +252,7 @@ def hamronization_taxa(args): # ensure strings subset_tool['contig_id'] = subset_tool['contig_id'].astype(str) subset_taxa['contig_id'] = subset_taxa['contig_id'].astype(str) - # rename columns to avoid droping of mutual ones + # rename columns to avoid dropping of mutual ones rename_dict = {col: f"{col}_taxa" for col in subset_taxa.columns if col in subset_tool.columns} subset_taxa = subset_taxa.rename(columns=rename_dict) From eec773883aa97e0bc8871fc669d4bfc6a2c875a6 Mon Sep 17 00:00:00 2001 From: Anan Ibrahim <81744003+Darcy220606@users.noreply.github.com> Date: Wed, 15 Jan 2025 13:29:26 +0100 Subject: [PATCH 11/11] Update merge_taxonomy.py --- bin/merge_taxonomy.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py index ead506e5..d202bcbf 100755 --- a/bin/merge_taxonomy.py +++ b/bin/merge_taxonomy.py @@ -205,7 +205,6 @@ def combgc_taxa(args): ######################################### # FUNCTION: HAMRONIZATION ######################################### -# TODO : FIX THE MERGING in ARG pipeline def hamronization_taxa(args): """_summary_ Merges hAMRonization tool output with taxonomy information.