diff --git a/CHANGELOG.md b/CHANGELOG.md
index a27c12d3..89c3fb0f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users, due to non-ASCII characters. (by @darcy220606)
 - [#430](https://github.com/nf-core/funcscan/pull/430) Updated `rgi/main` module to fix incorrect variable name. (by @amizeranschi and @jasmezz)
+- [#435](https://github.com/nf-core/funcscan/pull/435) Fixed dependency errors in the taxonomy merging scripts and updated the code and output for all three workflows. Bumped the script version to 0.1.1. (by @darcy220606)

 ### `Dependencies`

diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py
index 44eed31a..d202bcbf 100755
--- a/bin/merge_taxonomy.py
+++ b/bin/merge_taxonomy.py
@@ -3,7 +3,7 @@
 # Written by Anan Ibrahim and released under the MIT license.
 # See git repository (https://github.com/Darcy220606/AMPcombi) for full license text.
 # Date: March 2024
-# Version: 0.1.0
+# Version: 0.1.1

 # Required modules
 import sys
@@ -12,7 +12,7 @@ import numpy as np
 import argparse

-tool_version = "0.1.0"
+tool_version = "0.1.1"

 #########################################
 # TOP LEVEL: AMPCOMBI
 #########################################
@@ -66,6 +66,15 @@
 # TAXONOMY
 #########################################
 def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
+    """
+    Reformats the taxonomy files and joins them in a list to be passed on to the tool functions.
+
+    Args:
+        mmseqs_taxonomy (tsv): MMseqs2 taxonomy output file per sample
+
+    Returns:
+        DataFrame: reformatted taxonomy table
+    """
     mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig'])
     # remove the lineage column
     mmseqs2_df.drop('lineage', axis=1, inplace=True)
@@ -85,7 +94,19 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
 # FUNCTION: AMPCOMBI
 #########################################
 def ampcombi_taxa(args):
-    merged_df = pd.DataFrame()
+    """
+    Merges AMPcombi tool output with taxonomy information.
+
+    Parameters:
+    ----------
+    args:
+        Contains arguments for the AMPcombi file path (`amp`) and the list of taxonomy file paths (`taxa1`).
+
+    Outputs:
+    -------
+    Creates a file named `ampcombi_complete_summary_taxonomy.tsv` containing the merged results.
+ """ + combined_dfs = [] # assign input args to variables ampcombi = args.amp @@ -100,13 +121,6 @@ def ampcombi_taxa(args): # filter the tool df tool_df = pd.read_csv(ampcombi, sep='\t') - # remove the column with contig_id - duplicate #NOTE: will be fixed in AMPcombi v2.0.0 - tool_df = tool_df.drop('contig_id', axis=1) - # make sure 1st and 2nd column have the same column labels - tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True) - tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True) - # grab the real contig id in another column copy for merging - tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0] # merge rows from taxa to ampcombi_df based on substring match in sample_id # grab the unique sample names from the taxonomy table @@ -114,17 +128,18 @@ def ampcombi_taxa(args): # for every sampleID in taxadf merge the results for sampleID in samples_taxa: # subset ampcombi - subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)] # subset taxa - subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] + subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)] # merge - subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left') + subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left') # cleanup the table - columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y'] + columnsremove = ['sample_id_y'] subset_df.drop(columnsremove, axis=1, inplace=True) - subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True) + subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True) # append in the combined_df - merged_df = merged_df.append(subset_df, ignore_index=True) + combined_dfs.append(subset_df) + merged_df = pd.concat(combined_dfs, ignore_index=True) # write to file merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False) @@ -133,7 +148,20 @@ def ampcombi_taxa(args): # FUNCTION: COMBGC ######################################### def combgc_taxa(args): - merged_df = pd.DataFrame() + """_summary_ + + Merges comBGC tool output with taxonomy information. + + Parameters: + ---------- + args: + Contains arguments for comBGC file path (`bgc`) and list of taxonomy file paths (`taxa2`). + + Outputs: + ------- + Creates a file named `combgc_complete_summary_taxonomy.tsv` containing the merged results. 
+ """ + combined_dfs = [] # assign input args to variables combgc = args.bgc @@ -152,23 +180,24 @@ def combgc_taxa(args): tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True) tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True) - # merge rows from taxa to ampcombi_df based on substring match in sample_id + # merge rows from taxa to combgc_df based on substring match in sample_id # grab the unique sample names from the taxonomy table samples_taxa = taxa_df['sample_id'].unique() # for every sampleID in taxadf merge the results for sampleID in samples_taxa: - # subset ampcombi - subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + # subset tool + subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)] # subset taxa - subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] + subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)] # merge - subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id', right_on='contig_id', how='left') + subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left') # cleanup the table columnsremove = ['sample_id_y'] subset_df.drop(columnsremove, axis=1, inplace=True) subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True) # append in the combined_df - merged_df = merged_df.append(subset_df, ignore_index=True) + combined_dfs.append(subset_df) + merged_df = pd.concat(combined_dfs, ignore_index=True) # write to file merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False) @@ -177,7 +206,19 @@ def combgc_taxa(args): # FUNCTION: HAMRONIZATION ######################################### def hamronization_taxa(args): - merged_df = pd.DataFrame() + """_summary_ + Merges hAMRonization tool output with taxonomy information. + + Parameters: + ---------- + args: + Contains arguments for hamronization file path (`arg`) and list of taxonomy file paths (`taxa2`). + + Outputs: + ------- + Creates a file named `hamronization_complete_summary_taxonomy.tsv` containing the merged results. 
+ """ + combined_dfs = [] # assign input args to variables hamronization = args.arg @@ -197,29 +238,46 @@ def hamronization_taxa(args): # reorder the columns new_order = ['sample_id', 'contig_id'] + [col for col in tool_df.columns if col not in ['sample_id', 'contig_id']] tool_df = tool_df.reindex(columns=new_order) - # grab the real contig id in another column copy for merging - tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0] - # merge rows from taxa to ampcombi_df based on substring match in sample_id + # merge rows from taxa to hamronization_df based on substring match in sample_id # grab the unique sample names from the taxonomy table samples_taxa = taxa_df['sample_id'].unique() # for every sampleID in taxadf merge the results for sampleID in samples_taxa: - # subset ampcombi - subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)] + # subset tool + subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)] # subset taxa - subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)] - # merge - subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left') - # cleanup the table - columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y'] - subset_df.drop(columnsremove, axis=1, inplace=True) - subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True) - # append in the combined_df - merged_df = merged_df.append(subset_df, ignore_index=True) + subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)] + # ensure strings + subset_tool['contig_id'] = subset_tool['contig_id'].astype(str) + subset_taxa['contig_id'] = subset_taxa['contig_id'].astype(str) + # rename columns to avoid dropping of mutual ones + rename_dict = {col: f"{col}_taxa" for col in subset_taxa.columns if col in subset_tool.columns} + subset_taxa = subset_taxa.rename(columns=rename_dict) + + # merge by string + merged_rows = [] + # iterate and find all matches + for _, tool_row in subset_tool.iterrows(): + tool_contig_id = tool_row['contig_id'] + matches = subset_taxa[subset_taxa['contig_id_taxa'].apply(lambda x: str(x) in tool_contig_id)] + # if match, merge row + if not matches.empty: + for _, taxa_row in matches.iterrows(): + merged_row = {**tool_row.to_dict(), **taxa_row.to_dict()} + merged_rows.append(merged_row) + else: + # if no match keep row as is + merged_row = {**tool_row.to_dict()} + merged_rows.append(merged_row) + + merged_df = pd.DataFrame(merged_rows) + combined_dfs.append(merged_df) + + merged_df_final = pd.concat(combined_dfs, ignore_index=True) # write to file - merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False) + merged_df_final.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False) ######################################### # SUBPARSERS: DEFAULT diff --git a/conf/modules.config b/conf/modules.config index d4e473d2..7999b58a 100644 --- a/conf/modules.config +++ b/conf/modules.config @@ -534,6 +534,7 @@ process { } withName: ARG_TABIX_BGZIP { + ext.prefix = { "hamronization_complete_summary_taxonomy" } publishDir = [ path: { "${params.outdir}/reports/hamronization_summarize" }, mode: params.publish_dir_mode, diff --git a/docs/output.md b/docs/output.md index 0920236c..686ef2dc 100644 --- a/docs/output.md +++ b/docs/output.md @@ -522,7 +522,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation - `hamronization_summarize/` one of the following: 
   - `hamronization_combined_report.json`: summarised output in .json format
   - `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default).
-  - `hamronization_combined_report.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
+  - `hamronization_complete_summary_taxonomy.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
   - `hamronization_combined_report.html`: interactive output in .html format

diff --git a/tests/test_taxonomy_bakta.nf.test b/tests/test_taxonomy_bakta.nf.test
index 6498c4bd..5f076d29 100644
--- a/tests/test_taxonomy_bakta.nf.test
+++ b/tests/test_taxonomy_bakta.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
             ).match("fargene") },

             // hAMRonization
-            { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
+            { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },

             // antiSMASH
             { assert snapshot (
diff --git a/tests/test_taxonomy_prokka.nf.test b/tests/test_taxonomy_prokka.nf.test
index 0628508a..64c67b4a 100644
--- a/tests/test_taxonomy_prokka.nf.test
+++ b/tests/test_taxonomy_prokka.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
             ).match("fargene") },

             // hAMRonization
-            { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
+            { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },

             // antiSMASH
             { assert snapshot (
diff --git a/tests/test_taxonomy_pyrodigal.nf.test b/tests/test_taxonomy_pyrodigal.nf.test
index 8f325fc0..f0dc1012 100644
--- a/tests/test_taxonomy_pyrodigal.nf.test
+++ b/tests/test_taxonomy_pyrodigal.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
             ).match("fargene") },

             // hAMRonization
-            { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
+            { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },

             // antiSMASH
             { assert snapshot (
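
Note on the merging pattern adopted above: `pandas.DataFrame.append` was deprecated in pandas 1.4 and removed in pandas 2.0, which is the likely source of the dependency errors this patch fixes. All three functions therefore collect the per-sample frames in a plain Python list and concatenate them once at the end. A minimal, self-contained sketch of that pattern (the sample IDs, columns, and output filename below are illustrative only, not taken from the pipeline):

import pandas as pd

# Collect per-sample results in a plain list rather than appending to a DataFrame.
frames = []
for sample_id in ("sample_a", "sample_b"):  # hypothetical sample IDs
    # Stand-in for the per-sample subset/merge performed in the functions above.
    subset = pd.DataFrame({"sample_id": [sample_id], "contig_id": [f"{sample_id}_contig_1"]})
    frames.append(subset)  # cheap list append; no intermediate DataFrame copies

# A single concatenation at the end replaces the removed DataFrame.append call.
merged = pd.concat(frames, ignore_index=True)
merged.to_csv("merged_summary.tsv", sep="\t", index=False)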