Merge pull request #435 from nf-core/fix_taxonomy_files

Darcy220606 · web-flow · commit b89ec1be6cac · 2025-01-15T14:22:58.000+01:00
Fix taxonomy merging scripts and output
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 - [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users, due to non-ASCII characters. (by @darcy220606)
 - [#430](https://github.com/nf-core/funcscan/pull/430) Updated `rgi/main` module to fix incorrect variable name. (by @amizeranschi and @jasmezz)
+- [#435](https://github.com/nf-core/funcscan/pull/435) Fixed dependency errors within taxonomy merging scripts, updated the code and output for all three workflows. Bumped to version 0.1.1. (by @darcy220606)
 
 ### `Dependencies`
 
diff --git a/bin/merge_taxonomy.py b/bin/merge_taxonomy.py
@@ -3,7 +3,7 @@
 # Written by Anan Ibrahim and released under the MIT license.
 # See git repository (https://github.com/Darcy220606/AMPcombi) for full license text.
 # Date: March 2024
-# Version: 0.1.0
+# Version: 0.1.1
 
 # Required modules
 import sys
@@ -12,7 +12,7 @@
 import numpy as np
 import argparse
 
-tool_version = "0.1.0"
+tool_version = "0.1.1"
 #########################################
 # TOP LEVEL: AMPCOMBI
 #########################################
@@ -66,6 +66,15 @@
 # TAXONOMY
 #########################################
 def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
+    """_summary_
+    Reformats the taxonomy files and joins them in a list to be passed on to the tools functions
+
+    Args:
+        mmseqs_taxonomy (tsv): mmseqs output file per sample
+
+    Returns:
+        data frame: reformatted tables
+    """
     mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig'])
     # remove the lineage column
     mmseqs2_df.drop('lineage', axis=1, inplace=True)
@@ -85,7 +94,19 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
 # FUNCTION: AMPCOMBI
 #########################################
 def ampcombi_taxa(args):
-    merged_df = pd.DataFrame()
+    """_summary_
+    Merges AMPcombi tool output with taxonomy information.
+
+    Parameters:
+    ----------
+    args:
+        Contains arguments for AMPcombi file path (`amp`) and list of taxonomy file paths (`taxa1`).
+
+    Outputs:
+    -------
+    Creates a file named `ampcombi_complete_summary_taxonomy.tsv` containing the merged results.
+    """
+    combined_dfs = []
 
     # assign input args to variables
     ampcombi = args.amp
@@ -100,31 +121,25 @@ def ampcombi_taxa(args):
 
     # filter the tool df
     tool_df = pd.read_csv(ampcombi, sep='\t')
-    # remove the column with contig_id - duplicate #NOTE: will be fixed in AMPcombi v2.0.0
-    tool_df = tool_df.drop('contig_id', axis=1)
-    # make sure 1st and 2nd column have the same column labels
-    tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
-    tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)
-    # grab the real contig id in another column copy for merging
-    tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]
 
     # merge rows from taxa to ampcombi_df based on substring match in sample_id
     # grab the unique sample names from the taxonomy table
     samples_taxa = taxa_df['sample_id'].unique()
     # for every sampleID in taxadf merge the results
     for sampleID in samples_taxa:
         # subset ampcombi
-        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
         # subset taxa
-        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
+        subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
         # merge
-        subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left')
+        subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left')
         # cleanup the table
-        columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y']
+        columnsremove = ['sample_id_y']
         subset_df.drop(columnsremove, axis=1, inplace=True)
-        subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True)
+        subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True)
         # append in the combined_df
-        merged_df = merged_df.append(subset_df, ignore_index=True)
+        combined_dfs.append(subset_df)
+    merged_df = pd.concat(combined_dfs, ignore_index=True)
 
     # write to file
     merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False)
@@ -133,7 +148,20 @@ def ampcombi_taxa(args):
 # FUNCTION: COMBGC
 #########################################
 def combgc_taxa(args):
-    merged_df = pd.DataFrame()
+    """_summary_
+
+    Merges comBGC tool output with taxonomy information.
+
+    Parameters:
+    ----------
+    args:
+        Contains arguments for comBGC file path (`bgc`) and list of taxonomy file paths (`taxa2`).
+
+    Outputs:
+    -------
+    Creates a file named `combgc_complete_summary_taxonomy.tsv` containing the merged results.
+    """
+    combined_dfs = []
 
     # assign input args to variables
     combgc = args.bgc
@@ -152,23 +180,24 @@ def combgc_taxa(args):
     tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
     tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)
 
-    # merge rows from taxa to ampcombi_df based on substring match in sample_id
+    # merge rows from taxa to combgc_df based on substring match in sample_id
     # grab the unique sample names from the taxonomy table
     samples_taxa = taxa_df['sample_id'].unique()
     # for every sampleID in taxadf merge the results
     for sampleID in samples_taxa:
-        # subset ampcombi
-        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        # subset tool
+        subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
         # subset taxa
-        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
+        subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
         # merge
-        subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id', right_on='contig_id', how='left')
+        subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left')
         # cleanup the table
         columnsremove = ['sample_id_y']
         subset_df.drop(columnsremove, axis=1, inplace=True)
         subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True)
         # append in the combined_df
-        merged_df = merged_df.append(subset_df, ignore_index=True)
+        combined_dfs.append(subset_df)
+    merged_df = pd.concat(combined_dfs, ignore_index=True)
 
     # write to file
     merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False)
@@ -177,7 +206,19 @@ def combgc_taxa(args):
 # FUNCTION: HAMRONIZATION
 #########################################
 def hamronization_taxa(args):
-    merged_df = pd.DataFrame()
+    """_summary_
+    Merges hAMRonization tool output with taxonomy information.
+
+    Parameters:
+    ----------
+    args:
+        Contains arguments for hamronization file path (`arg`) and list of taxonomy file paths (`taxa2`).
+
+    Outputs:
+    -------
+    Creates a file named `hamronization_complete_summary_taxonomy.tsv` containing the merged results.
+    """
+    combined_dfs = []
 
     # assign input args to variables
     hamronization = args.arg
@@ -197,29 +238,46 @@ def hamronization_taxa(args):
     # reorder the columns
     new_order = ['sample_id', 'contig_id'] + [col for col in tool_df.columns if col not in ['sample_id', 'contig_id']]
     tool_df = tool_df.reindex(columns=new_order)
-    # grab the real contig id in another column copy for merging
-    tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]
 
-    # merge rows from taxa to ampcombi_df based on substring match in sample_id
+    # merge rows from taxa to hamronization_df based on substring match in sample_id
     # grab the unique sample names from the taxonomy table
     samples_taxa = taxa_df['sample_id'].unique()
     # for every sampleID in taxadf merge the results
     for sampleID in samples_taxa:
-        # subset ampcombi
-        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        # subset tool
+        subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
         # subset taxa
-        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
-        # merge
-        subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left')
-        # cleanup the table
-        columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y']
-        subset_df.drop(columnsremove, axis=1, inplace=True)
-        subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True)
-        # append in the combined_df
-        merged_df = merged_df.append(subset_df, ignore_index=True)
+        subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
+        # ensure strings
+        subset_tool['contig_id'] = subset_tool['contig_id'].astype(str)
+        subset_taxa['contig_id'] = subset_taxa['contig_id'].astype(str)
+        # rename columns to avoid dropping of mutual ones
+        rename_dict = {col: f"{col}_taxa" for col in subset_taxa.columns if col in subset_tool.columns}
+        subset_taxa = subset_taxa.rename(columns=rename_dict)
+
+        # merge by string
+        merged_rows = []
+        # iterate and find all matches
+        for _, tool_row in subset_tool.iterrows():
+            tool_contig_id = tool_row['contig_id']
+            matches = subset_taxa[subset_taxa['contig_id_taxa'].apply(lambda x: str(x) in tool_contig_id)]
+            # if match, merge row
+            if not matches.empty:
+                for _, taxa_row in matches.iterrows():
+                    merged_row = {**tool_row.to_dict(), **taxa_row.to_dict()}
+                    merged_rows.append(merged_row)
+            else:
+                # if no match keep row as is
+                merged_row = {**tool_row.to_dict()}
+                merged_rows.append(merged_row)
+
+        merged_df = pd.DataFrame(merged_rows)
+        combined_dfs.append(merged_df)
+
+    merged_df_final = pd.concat(combined_dfs, ignore_index=True)
 
     # write to file
-    merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)
+    merged_df_final.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)
 
 #########################################
 # SUBPARSERS: DEFAULT
diff --git a/conf/modules.config b/conf/modules.config
@@ -534,6 +534,7 @@ process {
     }
 
     withName: ARG_TABIX_BGZIP {
+        ext.prefix = { "hamronization_complete_summary_taxonomy" }
         publishDir = [
             path: { "${params.outdir}/reports/hamronization_summarize" },
             mode: params.publish_dir_mode,
diff --git a/docs/output.md b/docs/output.md
@@ -522,7 +522,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation
 - `hamronization_summarize/` one of the following:
   - `hamronization_combined_report.json`: summarised output in .json format
   - `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default).
-  - `hamronization_combined_report.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
+  - `hamronization_complete_summary_taxonomy.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
   - `hamronization_combined_report.html`: interactive output in .html format
 
 </details>
diff --git a/tests/test_taxonomy_bakta.nf.test b/tests/test_taxonomy_bakta.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
                 ).match("fargene") },
 
                 // hAMRonization
-                { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
+                { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },
 
                 // antiSMASH
                 { assert snapshot (
diff --git a/tests/test_taxonomy_prokka.nf.test b/tests/test_taxonomy_prokka.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
                 ).match("fargene") },
 
                 // hAMRonization
-                { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
+                { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },
 
                 // antiSMASH
                 { assert snapshot (
diff --git a/tests/test_taxonomy_pyrodigal.nf.test b/tests/test_taxonomy_pyrodigal.nf.test
@@ -79,7 +79,7 @@ nextflow_pipeline {
                 ).match("fargene") },
 
                 // hAMRonization
-                { assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
+                { assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },
 
                 // antiSMASH
                 { assert snapshot (

Original file line number	Diff line number	Diff line change
`@@ -534,6 +534,7 @@ process {`
`534`	`534`	`}`
`535`	`535`
`536`	`536`	`withName: ARG_TABIX_BGZIP {`
	`537`	`+ ext.prefix = { "hamronization_complete_summary_taxonomy" }`
`537`	`538`	`publishDir = [`
`538`	`539`	`path: { "${params.outdir}/reports/hamronization_summarize" },`
`539`	`540`	`mode: params.publish_dir_mode,`