# Written by Anan Ibrahim and released under the MIT license.
# See git repository (https://github.com/Darcy220606/AMPcombi) for full license text.
# Date: March 2024
-# Version: 0.1.0
+# Version: 0.1.1

# Required modules
import sys
import pandas as pd
import numpy as np
import argparse

-tool_version = "0.1.0"
+tool_version = "0.1.1"
#########################################
# TOP LEVEL: AMPCOMBI
#########################################
# TAXONOMY
#########################################
def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
+    """
+    Reformats the taxonomy files and joins them in a list to be passed on to the tool functions.
+
+    Args:
+        mmseqs_taxonomy (tsv): MMseqs2 taxonomy output file per sample
+
+    Returns:
+        pandas.DataFrame: reformatted taxonomy table
+    """
    mmseqs2_df = pd.read_csv(mmseqs_taxonomy, sep='\t', header=None, names=['contig_id', 'taxid', 'rank_label', 'scientific_name', 'lineage', 'mmseqs_lineage_contig'])
    # remove the lineage column
    mmseqs2_df.drop('lineage', axis=1, inplace=True)
@@ -85,7 +94,19 @@ def reformat_mmseqs_taxonomy(mmseqs_taxonomy):
# FUNCTION: AMPCOMBI
#########################################
def ampcombi_taxa(args):
-    merged_df = pd.DataFrame()
+    """
+    Merges AMPcombi tool output with taxonomy information.
+
+    Parameters:
+    ----------
+    args:
+        Contains arguments for the AMPcombi file path (`amp`) and the list of taxonomy file paths (`taxa1`).
+
+    Outputs:
+    -------
+    Creates a file named `ampcombi_complete_summary_taxonomy.tsv` containing the merged results.
+    """
+    combined_dfs = []

    # assign input args to variables
    ampcombi = args.amp
@@ -100,31 +121,25 @@ def ampcombi_taxa(args):

    # filter the tool df
    tool_df = pd.read_csv(ampcombi, sep='\t')
-    # remove the column with contig_id - duplicate #NOTE: will be fixed in AMPcombi v2.0.0
-    tool_df = tool_df.drop('contig_id', axis=1)
-    # make sure 1st and 2nd column have the same column labels
-    tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
-    tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)
-    # grab the real contig id in another column copy for merging
-    tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]

    # merge rows from taxa to ampcombi_df based on substring match in sample_id
    # grab the unique sample names from the taxonomy table
    samples_taxa = taxa_df['sample_id'].unique()
    # for every sampleID in taxadf merge the results
    for sampleID in samples_taxa:
        # subset ampcombi
-        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
        # subset taxa
-        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
+        subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
        # merge
-        subset_df = pd.merge(subset_tool, subset_taxa, left_on='contig_id_merge', right_on='contig_id', how='left')
+        subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left')
        # cleanup the table
-        columnsremove = ['contig_id_merge', 'contig_id_y', 'sample_id_y']
+        columnsremove = ['sample_id_y']
        subset_df.drop(columnsremove, axis=1, inplace=True)
-        subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x': 'sample_id'}, inplace=True)
+        subset_df.rename(columns={'sample_id_x': 'sample_id'}, inplace=True)
        # append in the combined_df
-        merged_df = merged_df.append(subset_df, ignore_index=True)
+        combined_dfs.append(subset_df)
+    merged_df = pd.concat(combined_dfs, ignore_index=True)

    # write to file
    merged_df.to_csv('ampcombi_complete_summary_taxonomy.tsv', sep='\t', index=False)
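# ----------------------------------------------------------------------
# Illustrative sketch (hypothetical toy data, not taken from this file):
# DataFrame.append() was deprecated and removed in pandas 2.0, so the
# per-sample tables are now collected in a list and combined once with
# pd.concat(), as in the loop above:
#
#     parts = []
#     for sample in ('sampleA', 'sampleB'):          # toy sample ids
#         parts.append(pd.DataFrame({'sample_id': [sample], 'n_hits': [1]}))
#     summary = pd.concat(parts, ignore_index=True)  # one combined table
# ----------------------------------------------------------------------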
@@ -133,7 +148,20 @@ def ampcombi_taxa(args):
# FUNCTION: COMBGC
#########################################
def combgc_taxa(args):
-    merged_df = pd.DataFrame()
+    """
+    Merges comBGC tool output with taxonomy information.
+
+    Parameters:
+    ----------
+    args:
+        Contains arguments for the comBGC file path (`bgc`) and the list of taxonomy file paths (`taxa2`).
+
+    Outputs:
+    -------
+    Creates a file named `combgc_complete_summary_taxonomy.tsv` containing the merged results.
+    """
+    combined_dfs = []

    # assign input args to variables
    combgc = args.bgc
@@ -152,23 +180,24 @@ def combgc_taxa(args):
    tool_df.rename(columns={tool_df.columns[0]: 'sample_id'}, inplace=True)
    tool_df.rename(columns={tool_df.columns[1]: 'contig_id'}, inplace=True)

-    # merge rows from taxa to ampcombi_df based on substring match in sample_id
+    # merge rows from taxa to combgc_df based on substring match in sample_id
    # grab the unique sample names from the taxonomy table
    samples_taxa = taxa_df['sample_id'].unique()
    # for every sampleID in taxadf merge the results
    for sampleID in samples_taxa:
-        # subset ampcombi
-        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        # subset tool
+        subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
        # subset taxa
-        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
+        subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
        # merge
-        subset_df = pd.merge(subset_tool, subset_taxa, left_on='contig_id', right_on='contig_id', how='left')
+        subset_df = pd.merge(subset_tool, subset_taxa, on='contig_id', how='left')
        # cleanup the table
        columnsremove = ['sample_id_y']
        subset_df.drop(columnsremove, axis=1, inplace=True)
        subset_df.rename(columns={'sample_id_x': 'sample_id'}, inplace=True)
        # append in the combined_df
-        merged_df = merged_df.append(subset_df, ignore_index=True)
+        combined_dfs.append(subset_df)
+    merged_df = pd.concat(combined_dfs, ignore_index=True)

    # write to file
    merged_df.to_csv('combgc_complete_summary_taxonomy.tsv', sep='\t', index=False)
@@ -177,7 +206,19 @@ def combgc_taxa(args):
# FUNCTION: HAMRONIZATION
#########################################
def hamronization_taxa(args):
-    merged_df = pd.DataFrame()
+    """
+    Merges hAMRonization tool output with taxonomy information.
+
+    Parameters:
+    ----------
+    args:
+        Contains arguments for the hAMRonization file path (`arg`) and the list of taxonomy file paths (`taxa2`).
+
+    Outputs:
+    -------
+    Creates a file named `hamronization_complete_summary_taxonomy.tsv` containing the merged results.
+    """
+    combined_dfs = []

    # assign input args to variables
    hamronization = args.arg
@@ -197,29 +238,46 @@ def hamronization_taxa(args):
    # reorder the columns
    new_order = ['sample_id', 'contig_id'] + [col for col in tool_df.columns if col not in ['sample_id', 'contig_id']]
    tool_df = tool_df.reindex(columns=new_order)
-    # grab the real contig id in another column copy for merging
-    tool_df['contig_id_merge'] = tool_df['contig_id'].str.rsplit('_', 1).str[0]

-    # merge rows from taxa to ampcombi_df based on substring match in sample_id
+    # merge rows from taxa to hamronization_df based on substring match in sample_id
    # grab the unique sample names from the taxonomy table
    samples_taxa = taxa_df['sample_id'].unique()
    # for every sampleID in taxadf merge the results
    for sampleID in samples_taxa:
-        # subset ampcombi
-        subset_tool = tool_df.loc[tool_df['sample_id'].str.contains(sampleID)]
+        # subset tool
+        subset_tool = tool_df[tool_df['sample_id'].str.contains(sampleID, na=False)]
        # subset taxa
-        subset_taxa = taxa_df.loc[taxa_df['sample_id'].str.contains(sampleID)]
-        # merge
-        subset_df = pd.merge(subset_tool, subset_taxa, left_on='contig_id_merge', right_on='contig_id', how='left')
-        # cleanup the table
-        columnsremove = ['contig_id_merge', 'contig_id_y', 'sample_id_y']
-        subset_df.drop(columnsremove, axis=1, inplace=True)
-        subset_df.rename(columns={'contig_id_x': 'contig_id', 'sample_id_x': 'sample_id'}, inplace=True)
-        # append in the combined_df
-        merged_df = merged_df.append(subset_df, ignore_index=True)
+        subset_taxa = taxa_df[taxa_df['sample_id'].str.contains(sampleID, na=False)]
+        # ensure strings
+        subset_tool['contig_id'] = subset_tool['contig_id'].astype(str)
+        subset_taxa['contig_id'] = subset_taxa['contig_id'].astype(str)
+        # rename columns to avoid dropping of mutual ones
+        rename_dict = {col: f"{col}_taxa" for col in subset_taxa.columns if col in subset_tool.columns}
+        subset_taxa = subset_taxa.rename(columns=rename_dict)
+
+        # merge by string
+        merged_rows = []
+        # iterate and find all matches
+        for _, tool_row in subset_tool.iterrows():
+            tool_contig_id = tool_row['contig_id']
+            matches = subset_taxa[subset_taxa['contig_id_taxa'].apply(lambda x: str(x) in tool_contig_id)]
+            # if match, merge row
+            if not matches.empty:
+                for _, taxa_row in matches.iterrows():
+                    merged_row = {**tool_row.to_dict(), **taxa_row.to_dict()}
+                    merged_rows.append(merged_row)
+            else:
+                # if no match keep row as is
+                merged_row = {**tool_row.to_dict()}
+                merged_rows.append(merged_row)
+
+        merged_df = pd.DataFrame(merged_rows)
+        combined_dfs.append(merged_df)
+
+    merged_df_final = pd.concat(combined_dfs, ignore_index=True)

    # write to file
-    merged_df.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)
+    merged_df_final.to_csv('hamronization_complete_summary_taxonomy.tsv', sep='\t', index=False)
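# ----------------------------------------------------------------------
# Illustrative sketch (toy values, not taken from this file): the row-wise
# match above keeps a taxonomy row whenever its contig id is a substring of
# the hAMRonization contig id, and tool rows without any match are kept
# unmerged:
#
#     tool_contig_id = 'contig_1_42'     # gene-level id reported by the tool
#     taxa_contig_id = 'contig_1'        # contig-level id from the taxonomy table
#     taxa_contig_id in tool_contig_id   # True -> the two rows are merged
# ----------------------------------------------------------------------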

#########################################
# SUBPARSERS: DEFAULT