Skip to content

Commit b89ec1b

Browse files
authored
Merge pull request #435 from nf-core/fix_taxonomy_files
Fix taxonomy merging scripts and output
2 parents 640992d + eec7738 commit b89ec1b

File tree

7 files changed

+103
-43
lines changed

7 files changed

+103
-43
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1616

1717
- [#427](https://github.com/nf-core/funcscan/pull/427) Fixed the AMP reference database issues reported by users, due to non-ASCII characters. (by @darcy220606)
1818
- [#430](https://github.com/nf-core/funcscan/pull/430) Updated `rgi/main` module to fix incorrect variable name. (by @amizeranschi and @jasmezz)
19+
- [#435](https://github.com/nf-core/funcscan/pull/435) Fixed dependency errors within taxonomy merging scripts, updated the code and output for all three workflows. Bumped to version 0.1.1. (by @darcy220606)
1920

2021
### `Dependencies`
2122

bin/merge_taxonomy.py

Lines changed: 97 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
# Written by Anan Ibrahim and released under the MIT license.
44
# See git repository (https://github.com/Darcy220606/AMPcombi) for full license text.
55
# Date: March 2024
6-
# Version: 0.1.0
6+
# Version: 0.1.1
77

88
# Required modules
99
import sys
@@ -12,7 +12,7 @@
1212
import numpy as np
1313
import argparse
1414

15-
tool_version = "0.1.0"
15+
tool_version = "0.1.1"
1616
#########################################
1717
# TOP LEVEL: AMPCOMBI
1818
#########################################
@@ -66,6 +66,15 @@
6666
# TAXONOMY
6767
#########################################
6868
def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
69+
"""
70+
Reformats the taxonomy files and joins them in a list to be passed on to the tools functions
71+
72+
Args:
73+
mmseqs_taxonomy (tsv): mmseqs output file per sample
74+
75+
Returns:
76+
data frame: reformatted tables
77+
"""
6978
mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig'])
7079
# remove the lineage column
7180
mmseqs2_df.drop('lineage', axis=1, inplace=True)
@@ -85,7 +94,19 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
8594
# FUNCTION: AMPCOMBI
8695
#########################################
8796
def ampcombi_taxa(args):
88-
merged_df = pd.DataFrame()
97+
"""
98+
Merges AMPcombi tool output with taxonomy information.
99+
100+
Parameters:
101+
----------
102+
args:
103+
Contains arguments for AMPcombi file path (`amp`) and list of taxonomy file paths (`taxa1`).
104+
105+
Outputs:
106+
-------
107+
Creates a file named `ampcombi_complete_summary_taxonomy.tsv` containing the merged results.
108+
"""
109+
combined_dfs = []
89110

90111
# assign input args to variables
91112
ampcombi = args.amp
@@ -100,31 +121,25 @@ def ampcombi_taxa(args):
100121

101122
# filter the tool df
102123
tool_df = pd.read_csv(ampcombi, sep='\t')
103-
# remove the column with contig_id - duplicate #NOTE: will be fixed in AMPcombi v2.0.0
104-
tool_df = tool_df.drop('contig_id', axis=1)
105-
# make sure 1st and 2nd column have the same column labels
106-
tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
107-
tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)
108-
# grab the real contig id in another column copy for merging
109-
tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]
110124

111125
# merge rows from taxa to ampcombi_df based on substring match in sample_id
112126
# grab the unique sample names from the taxonomy table
113127
samples_taxa = taxa_df['sample_id'].unique()
114128
# for every sampleID in taxadf merge the results
115129
for sampleID in samples_taxa:
116130
# subset ampcombi
117-
subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
131+
subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
118132
# subset taxa
119-
subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
133+
subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
120134
# merge
121-
subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left')
135+
subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left')
122136
# cleanup the table
123-
columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y']
137+
columnsremove = ['sample_id_y']
124138
subset_df.drop(columnsremove, axis=1, inplace=True)
125-
subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True)
139+
subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True)
126140
# append in the combined_df
127-
merged_df = merged_df.append(subset_df, ignore_index=True)
141+
combined_dfs.append(subset_df)
142+
merged_df = pd.concat(combined_dfs, ignore_index=True)
128143

129144
# write to file
130145
merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False)
@@ -133,7 +148,20 @@ def ampcombi_taxa(args):
133148
# FUNCTION: COMBGC
134149
#########################################
135150
def combgc_taxa(args):
136-
merged_df = pd.DataFrame()
151+
"""
152+
153+
Merges comBGC tool output with taxonomy information.
154+
155+
Parameters:
156+
----------
157+
args:
158+
Contains arguments for comBGC file path (`bgc`) and list of taxonomy file paths (`taxa2`).
159+
160+
Outputs:
161+
-------
162+
Creates a file named `combgc_complete_summary_taxonomy.tsv` containing the merged results.
163+
"""
164+
combined_dfs = []
137165

138166
# assign input args to variables
139167
combgc = args.bgc
@@ -152,23 +180,24 @@ def combgc_taxa(args):
152180
tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
153181
tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)
154182

155-
# merge rows from taxa to ampcombi_df based on substring match in sample_id
183+
# merge rows from taxa to combgc_df based on substring match in sample_id
156184
# grab the unique sample names from the taxonomy table
157185
samples_taxa = taxa_df['sample_id'].unique()
158186
# for every sampleID in taxadf merge the results
159187
for sampleID in samples_taxa:
160-
# subset ampcombi
161-
subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
188+
# subset tool
189+
subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
162190
# subset taxa
163-
subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
191+
subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
164192
# merge
165-
subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id', right_on='contig_id', how='left')
193+
subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left')
166194
# cleanup the table
167195
columnsremove = ['sample_id_y']
168196
subset_df.drop(columnsremove, axis=1, inplace=True)
169197
subset_df.rename(columns={'sample_id_x':'sample_id'},inplace=True)
170198
# append in the combined_df
171-
merged_df = merged_df.append(subset_df, ignore_index=True)
199+
combined_dfs.append(subset_df)
200+
merged_df = pd.concat(combined_dfs, ignore_index=True)
172201

173202
# write to file
174203
merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False)
@@ -177,7 +206,19 @@ def combgc_taxa(args):
177206
# FUNCTION: HAMRONIZATION
178207
#########################################
179208
def hamronization_taxa(args):
180-
merged_df = pd.DataFrame()
209+
"""
210+
Merges hAMRonization tool output with taxonomy information.
211+
212+
Parameters:
213+
----------
214+
args:
215+
Contains arguments for hamronization file path (`arg`) and list of taxonomy file paths (`taxa2`).
216+
217+
Outputs:
218+
-------
219+
Creates a file named `hamronization_complete_summary_taxonomy.tsv` containing the merged results.
220+
"""
221+
combined_dfs = []
181222

182223
# assign input args to variables
183224
hamronization = args.arg
@@ -197,29 +238,46 @@ def hamronization_taxa(args):
197238
# reorder the columns
198239
new_order = ['sample_id', 'contig_id'] + [col for col in tool_df.columns if col not in ['sample_id', 'contig_id']]
199240
tool_df = tool_df.reindex(columns=new_order)
200-
# grab the real contig id in another column copy for merging
201-
tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]
202241

203-
# merge rows from taxa to ampcombi_df based on substring match in sample_id
242+
# merge rows from taxa to hamronization_df based on substring match in sample_id
204243
# grab the unique sample names from the taxonomy table
205244
samples_taxa = taxa_df['sample_id'].unique()
206245
# for every sampleID in taxadf merge the results
207246
for sampleID in samples_taxa:
208-
# subset ampcombi
209-
subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
247+
# subset tool
248+
subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
210249
# subset taxa
211-
subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
212-
# merge
213-
subset_df = pd.merge(subset_tool, subset_taxa, left_on = 'contig_id_merge', right_on='contig_id', how='left')
214-
# cleanup the table
215-
columnsremove = ['contig_id_merge','contig_id_y', 'sample_id_y']
216-
subset_df.drop(columnsremove, axis=1, inplace=True)
217-
subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x':'sample_id'},inplace=True)
218-
# append in the combined_df
219-
merged_df = merged_df.append(subset_df, ignore_index=True)
250+
subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
251+
# ensure strings
252+
subset_tool['contig_id'] = subset_tool['contig_id'].astype(str)
253+
subset_taxa['contig_id'] = subset_taxa['contig_id'].astype(str)
254+
# suffix taxonomy columns that share a name with a tool column with "_taxa" so the row merge below does not overwrite the tool values
255+
rename_dict = {col: f"{col}_taxa" for col in subset_taxa.columns if col in subset_tool.columns}
256+
subset_taxa = subset_taxa.rename(columns=rename_dict)
257+
258+
# merge by string
259+
merged_rows = []
260+
# iterate and find all matches
261+
for _, tool_row in subset_tool.iterrows():
262+
tool_contig_id = tool_row['contig_id']
263+
matches = subset_taxa[subset_taxa['contig_id_taxa'].apply(lambda x: str(x) in tool_contig_id)]
264+
# if match, merge row
265+
if not matches.empty:
266+
for _, taxa_row in matches.iterrows():
267+
merged_row = {**tool_row.to_dict(), **taxa_row.to_dict()}
268+
merged_rows.append(merged_row)
269+
else:
270+
# if no match keep row as is
271+
merged_row = {**tool_row.to_dict()}
272+
merged_rows.append(merged_row)
273+
274+
merged_df = pd.DataFrame(merged_rows)
275+
combined_dfs.append(merged_df)
276+
277+
merged_df_final = pd.concat(combined_dfs, ignore_index=True)
220278

221279
# write to file
222-
merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)
280+
merged_df_final.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)
223281

224282
#########################################
225283
# SUBPARSERS: DEFAULT

conf/modules.config

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -534,6 +534,7 @@ process {
534534
}
535535

536536
withName: ARG_TABIX_BGZIP {
537+
ext.prefix = { "hamronization_complete_summary_taxonomy" }
537538
publishDir = [
538539
path: { "${params.outdir}/reports/hamronization_summarize" },
539540
mode: params.publish_dir_mode,

docs/output.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -522,7 +522,7 @@ Note that filtered FASTA is only used for BGC workflow for run-time optimisation
522522
- `hamronization_summarize/` one of the following:
523523
- `hamronization_combined_report.json`: summarised output in .json format
524524
- `hamronization_combined_report.tsv`: summarised output in .tsv format when the taxonomic classification is turned off (pipeline default).
525-
- `hamronization_combined_report.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
525+
- `hamronization_complete_summary_taxonomy.tsv.gz`: summarised output in gzipped format when the taxonomic classification is turned on by `--run_taxa_classification`.
526526
- `hamronization_combined_report.html`: interactive output in .html format
527527

528528
</details>

tests/test_taxonomy_bakta.nf.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ nextflow_pipeline {
7979
).match("fargene") },
8080

8181
// hAMRonization
82-
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
82+
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },
8383

8484
// antiSMASH
8585
{ assert snapshot (

tests/test_taxonomy_prokka.nf.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ nextflow_pipeline {
7979
).match("fargene") },
8080

8181
// hAMRonization
82-
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
82+
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },
8383

8484
// antiSMASH
8585
{ assert snapshot (

tests/test_taxonomy_pyrodigal.nf.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ nextflow_pipeline {
7979
).match("fargene") },
8080

8181
// hAMRonization
82-
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_combined_report.tsv.gz").exists() },
82+
{ assert new File("$outputDir/reports/hamronization_summarize/hamronization_complete_summary_taxonomy.tsv.gz").exists() },
8383

8484
// antiSMASH
8585
{ assert snapshot (

0 commit comments

Comments
 (0)