1
1
#!/usr/bin/env python3
2
2
3
- __version__ = "0.1.2 "
3
+ __version__ = "0.1.3 "
4
4
5
5
import argparse
6
6
import operator
@@ -53,7 +53,7 @@ class ReportNode:
53
53
offset : int
54
54
55
55
56
- def validate_input_file (putative_classifications_file , verbose_input , minimum_hit_groups ):
56
+ def validate_input_file (putative_classifications_file , verbose_input , minimum_hit_groups , paired_input ):
57
57
"""
58
58
Perform simple validation of the input file.
59
59
"""
@@ -77,23 +77,62 @@ def validate_input_file(putative_classifications_file, verbose_input, minimum_hi
77
77
else :
78
78
num_cols = len (line_proc ) == 6 # 6 columns if the output was produced with the verbose version of kraken2 that outputs minimizer hit groups
79
79
80
+ # Line must start with C or U (as in Classified/unclassified)
80
81
line_start = line_proc [0 ] in ['U' , 'C' ]
81
- paired_data_1 = len (line_proc [3 ].split ('|' )) == 2
82
- paired_data_2 = len (line_proc [- 1 ].split ('|:|' )) == 2 # Should be enough to change this line if we want to accomodate reclassification of single reads
83
82
84
- if num_cols and line_start and paired_data_1 and paired_data_2 :
83
+ # If the data is paired
84
+ if paired_input :
85
+ # Must be information on both sides of the pipe character
86
+ data_col_1 = len (line_proc [3 ].split ('|' )) == 2
87
+
88
+ # If the data is paired in the 4th column, it must also be paired in the last column
89
+ if "|" in line_proc [- 1 ]:
90
+ data_col_2 = len (line_proc [- 1 ].split ('|:|' )) == 2
91
+ else :
92
+ data_col_2 = False
93
+
94
+ # If the input is from single-end reads, at least the read length column (4th) must be an int
95
+ else :
96
+ try :
97
+ int (line_proc [3 ])
98
+ except :
99
+ data_col_1 = False
100
+ else :
101
+ data_col_1 = True
102
+
103
+ # And the last column should contain colons between kmer/taxon pairs
104
+ if ":" in line_proc [4 ]:
105
+ data_col_2 = True
106
+ else :
107
+ data_col_2 = False
108
+
109
+ if num_cols and line_start and data_col_1 and data_col_2 :
85
110
log .debug ('Validation OK.' )
86
111
return
87
112
else :
88
113
log .error ('The classifications file is malformatted.' )
89
114
log .debug ('First line of input: {}' .format (line ))
90
115
log .debug ('num_cols: {}' .format (num_cols ))
91
116
log .debug ('line_start: {}' .format (line_start ))
92
- log .debug ('paired_data_1 : {}' .format (paired_data_1 ))
93
- log .debug ('paired_data_2 : {}' .format (paired_data_2 ))
117
+ log .debug ('data_col_1 : {}' .format (data_col_1 ))
118
+ log .debug ('data_col_1 : {}' .format (data_col_2 ))
94
119
sys .exit ()
95
120
96
121
122
def is_paired_input(classifications_file):
    """
    Returns True if the input file appears to contain paired read data,
    otherwise False.

    Only the first line of the file is inspected: it is stripped, split on
    tab characters, and its 4th column is searched for a pipe character.

    classifications_file: passed unchanged to read_file(), which must return
        a context manager yielding an object with a readline() method.
    """
    with read_file(classifications_file) as f:
        first_line = f.readline()

    columns = first_line.strip().split('\t')

    # An empty file, or a first line with fewer than 4 columns, cannot be
    # identified as paired. Answer False instead of raising IndexError.
    if len(columns) < 4:
        return False

    # If column 4 contains a pipe character "|", the data is paired.
    # Always return a real bool (never an implicit None) so callers get
    # the same kind of answer as from is_verbose_input().
    return "|" in columns[3]
134
+
135
+
97
136
def is_verbose_input (classifications_file ):
98
137
"""
99
138
Returns true if input file consists of 6 columns instead of 5.
@@ -109,7 +148,7 @@ def is_verbose_input(classifications_file):
109
148
return False
110
149
111
150
112
- def process_kmer_string (kmer_info_string ):
151
+ def process_kmer_string (kmer_info_string , paired_input ):
113
152
"""
114
153
Process a kmer info string (last column of a Kraken 2 output file), so that
115
154
we get a dictionary mapping of tax_ids to total sum of kmer hits.
@@ -120,7 +159,10 @@ def process_kmer_string(kmer_info_string):
120
159
tax_id_#N: Z kmer hits}
121
160
"""
122
161
kmer_info_string = kmer_info_string .split ()
123
- kmer_info_string .remove ('|:|' )
162
+
163
+ # Kraken2 classifications file for paired data contain the "|:|" delimiter
164
+ if paired_input :
165
+ kmer_info_string .remove ('|:|' )
124
166
125
167
# Messy list comprehension. Converts all "taxa":"num_kmer" string pairs
126
168
# into integer tuples like (taxa, num_kmers), and saves them in a list.
@@ -142,7 +184,7 @@ def process_kmer_string(kmer_info_string):
142
184
return taxa_kmer_dict
143
185
144
186
145
- def reclassify_read (read , confidence_threshold , taxonomy_tree , verbose_input , minimum_hit_groups , taxa_lineages ):
187
+ def reclassify_read (read , confidence_threshold , taxonomy_tree , verbose_input , minimum_hit_groups , taxa_lineages , paired_input ):
146
188
"""
147
189
Sums the number of kmers that hit in the clade rooted at "current_node",
148
190
and divides it with the total number of kmers queried against the database:
@@ -153,10 +195,11 @@ def reclassify_read(read, confidence_threshold, taxonomy_tree, verbose_input, mi
153
195
This is repeated until confidence >= confidence_threshold.
154
196
155
197
In this function it's envisionable to include other parameters for the
156
- classification... Right now I'm only considering the confidence score.
198
+ classification... Right now I'm only considering the confidence score
199
+ and minimum hit groups.
157
200
"""
158
201
# Process the kmer string into a dict of {tax_id: #kmers} key, value pairs
159
- taxa_kmer_dict = process_kmer_string (read .kmer_string )
202
+ taxa_kmer_dict = process_kmer_string (read .kmer_string , paired_input )
160
203
161
204
# Make the current node the same as the original classification
162
205
read .current_node = read .original_taxid
@@ -542,7 +585,7 @@ def create_read(kraken2_read, verbose_input=False):
542
585
return read
543
586
544
587
545
- def main_loop (f_handle , tax_reads_dict , taxonomy_tree , args , report_frequency , taxa_lineages , verbose_input = False , o_handle = None , v_handle = None ):
588
+ def main_loop (f_handle , tax_reads_dict , taxonomy_tree , args , report_frequency , taxa_lineages , paired_input , verbose_input = False , o_handle = None , v_handle = None ):
546
589
"""
547
590
f_handle: classifications input file to read from.
548
591
o_handle: output_classifications file to write to.
@@ -605,7 +648,8 @@ def write_verbose_output(read):
605
648
taxonomy_tree ,
606
649
verbose_input ,
607
650
args .minimum_hit_groups ,
608
- taxa_lineages )
651
+ taxa_lineages ,
652
+ paired_input )
609
653
610
654
# Counter for number of reads per taxon/node
611
655
if read .reclassified_taxid in tax_reads_dict ['hits_at_node' ]:
@@ -758,8 +802,15 @@ def stringmeup():
758
802
log .warning ('Will NOT reclassify based on minimizer hit groups.' )
759
803
args .minimum_hit_groups = None
760
804
805
+ # Check if the input data is paired or not
806
+ paired_input = is_paired_input (args .original_classifications_file )
807
+ if paired_input :
808
+ log .info ('Classifications were made from paired-end data.' )
809
+ else :
810
+ log .info ('Classifications were made from single-read data.' )
811
+
761
812
# Perform a naive check of the input file
762
- validate_input_file (args .original_classifications_file , verbose_input , args .minimum_hit_groups )
813
+ validate_input_file (args .original_classifications_file , verbose_input , args .minimum_hit_groups , paired_input )
763
814
764
815
# If user provided names.dmp and nodes.dmp, create taxonomy tree from that,
765
816
# otherwise, create it from a pickled taxonomy file
@@ -794,7 +845,7 @@ def stringmeup():
794
845
v = write_file (args .output_verbose , args .gz_output )
795
846
796
847
# Run the main loop (reclassification)
797
- main_loop (f , tax_reads_dict , taxonomy_tree , args , report_frequency , taxa_lineages , verbose_input , o , v )
848
+ main_loop (f , tax_reads_dict , taxonomy_tree , args , report_frequency , taxa_lineages , paired_input , verbose_input , o , v )
798
849
799
850
# Remember to close files
800
851
if o :
0 commit comments