Skip to content

Commit db6cab5

Browse files
committed
StringMeUp now works with single-read data, in addition to paired-end.
1 parent e0ff91e commit db6cab5

File tree

1 file changed

+67
-16
lines changed

1 file changed

+67
-16
lines changed

stringmeup/stringmeup.py

+67-16
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env python3
22

3-
__version__ = "0.1.2"
3+
__version__ = "0.1.3"
44

55
import argparse
66
import operator
@@ -53,7 +53,7 @@ class ReportNode:
5353
offset: int
5454

5555

56-
def validate_input_file(putative_classifications_file, verbose_input, minimum_hit_groups):
56+
def validate_input_file(putative_classifications_file, verbose_input, minimum_hit_groups, paired_input):
5757
"""
5858
Perform simple validation of the input file.
5959
"""
@@ -77,23 +77,62 @@ def validate_input_file(putative_classifications_file, verbose_input, minimum_hi
7777
else:
7878
num_cols = len(line_proc) == 6 # 6 columns if the output was produced with the verbose version of kraken2 that outputs minimizer hit groups
7979

80+
# Line must start with C or U (as in Classified/Unclassified)
8081
line_start = line_proc[0] in ['U', 'C']
81-
paired_data_1 = len(line_proc[3].split('|')) == 2
82-
paired_data_2 = len(line_proc[-1].split('|:|')) == 2 # Should be enough to change this line if we want to accomodate reclassification of single reads
8382

84-
if num_cols and line_start and paired_data_1 and paired_data_2:
83+
# If the data is paired
84+
if paired_input:
85+
# Must be information on both sides of the pipe character
86+
data_col_1 = len(line_proc[3].split('|')) == 2
87+
88+
# If the data is paired in the 4th column (index 3), it must also be paired in the last column
89+
if "|" in line_proc[-1]:
90+
data_col_2 = len(line_proc[-1].split('|:|')) == 2
91+
else:
92+
data_col_2 = False
93+
94+
# If the input is from single-end reads, at least the read length column (4th, index 3) must be an int
95+
else:
96+
try:
97+
int(line_proc[3])
98+
except:
99+
data_col_1 = False
100+
else:
101+
data_col_1 = True
102+
103+
# And the last column should contain colons between kmer/taxon pairs
104+
if ":" in line_proc[4]:
105+
data_col_2 = True
106+
else:
107+
data_col_2 = False
108+
109+
if num_cols and line_start and data_col_1 and data_col_2:
85110
log.debug('Validation OK.')
86111
return
87112
else:
88113
log.error('The classifications file is malformatted.')
89114
log.debug('First line of input: {}'.format(line))
90115
log.debug('num_cols: {}'.format(num_cols))
91116
log.debug('line_start: {}'.format(line_start))
92-
log.debug('paired_data_1: {}'.format(paired_data_1))
93-
log.debug('paired_data_2: {}'.format(paired_data_2))
117+
log.debug('data_col_1: {}'.format(data_col_1))
118+
log.debug('data_col_1: {}'.format(data_col_2))
94119
sys.exit()
95120

96121

122+
def is_paired_input(classifications_file):
    """
    Return True if the input file appears to contain paired read data.

    Only the first line of the file is inspected. The line is stripped,
    split on tabs, and the data is considered paired if the 4th column
    (index 3) contains a pipe character "|".

    classifications_file: handed to read_file() to open the input.

    Returns False if the 4th column contains no pipe character, or if the
    first line has fewer than four tab-separated columns (for example when
    the file is empty).
    """
    with read_file(classifications_file) as f:
        line = f.readline()

    line_proc = line.strip().split('\t')

    # A short or empty first line cannot describe paired data; answer False
    # explicitly rather than raising an IndexError on the lookup below.
    if len(line_proc) < 4:
        return False

    # If column 4 contains a pipe character "|", the data is paired
    return "|" in line_proc[3]
134+
135+
97136
def is_verbose_input(classifications_file):
98137
"""
99138
Returns true if input file consists of 6 columns instead of 5.
@@ -109,7 +148,7 @@ def is_verbose_input(classifications_file):
109148
return False
110149

111150

112-
def process_kmer_string(kmer_info_string):
151+
def process_kmer_string(kmer_info_string, paired_input):
113152
"""
114153
Process a kmer info string (last column of a Kraken 2 output file), so that
115154
we get a dictionary mapping of tax_ids to total sum of kmer hits.
@@ -120,7 +159,10 @@ def process_kmer_string(kmer_info_string):
120159
tax_id_#N: Z kmer hits}
121160
"""
122161
kmer_info_string = kmer_info_string.split()
123-
kmer_info_string.remove('|:|')
162+
163+
# Kraken2 classifications files for paired data contain the "|:|" delimiter
164+
if paired_input:
165+
kmer_info_string.remove('|:|')
124166

125167
# Messy list comprehension. Converts all "taxa":"num_kmer" string pairs
126168
# into integer tuples like (taxa, num_kmers), and saves them in a list.
@@ -142,7 +184,7 @@ def process_kmer_string(kmer_info_string):
142184
return taxa_kmer_dict
143185

144186

145-
def reclassify_read(read, confidence_threshold, taxonomy_tree, verbose_input, minimum_hit_groups, taxa_lineages):
187+
def reclassify_read(read, confidence_threshold, taxonomy_tree, verbose_input, minimum_hit_groups, taxa_lineages, paired_input):
146188
"""
147189
Sums the number of kmers that hit in the clade rooted at "current_node",
148190
and divides it with the total number of kmers queried against the database:
@@ -153,10 +195,11 @@ def reclassify_read(read, confidence_threshold, taxonomy_tree, verbose_input, mi
153195
This is repeated until confidence >= confidence_threshold.
154196
155197
In this function it's envisionable to include other parameters for the
156-
classification... Right now I'm only considering the confidence score.
198+
classification... Right now I'm only considering the confidence score
199+
and minimum hit groups.
157200
"""
158201
# Process the kmer string into a dict of {tax_id: #kmers} key, value pairs
159-
taxa_kmer_dict = process_kmer_string(read.kmer_string)
202+
taxa_kmer_dict = process_kmer_string(read.kmer_string, paired_input)
160203

161204
# Make the current node the same as the original classification
162205
read.current_node = read.original_taxid
@@ -542,7 +585,7 @@ def create_read(kraken2_read, verbose_input=False):
542585
return read
543586

544587

545-
def main_loop(f_handle, tax_reads_dict, taxonomy_tree, args, report_frequency, taxa_lineages, verbose_input=False, o_handle=None, v_handle=None):
588+
def main_loop(f_handle, tax_reads_dict, taxonomy_tree, args, report_frequency, taxa_lineages, paired_input, verbose_input=False, o_handle=None, v_handle=None):
546589
"""
547590
f_handle: classifications input file to read from.
548591
o_handle: output_classifications file to write to.
@@ -605,7 +648,8 @@ def write_verbose_output(read):
605648
taxonomy_tree,
606649
verbose_input,
607650
args.minimum_hit_groups,
608-
taxa_lineages)
651+
taxa_lineages,
652+
paired_input)
609653

610654
# Counter for number of reads per taxon/node
611655
if read.reclassified_taxid in tax_reads_dict['hits_at_node']:
@@ -758,8 +802,15 @@ def stringmeup():
758802
log.warning('Will NOT reclassify based on minimizer hit groups.')
759803
args.minimum_hit_groups = None
760804

805+
# Check if the input data is paired or not
806+
paired_input = is_paired_input(args.original_classifications_file)
807+
if paired_input:
808+
log.info('Classifications were made from paired-end data.')
809+
else:
810+
log.info('Classifications were made from single-read data.')
811+
761812
# Perform a naive check of the input file
762-
validate_input_file(args.original_classifications_file, verbose_input, args.minimum_hit_groups)
813+
validate_input_file(args.original_classifications_file, verbose_input, args.minimum_hit_groups, paired_input)
763814

764815
# If user provided names.dmp and nodes.dmp, create taxonomy tree from that,
765816
# otherwise, create it from a pickled taxonomy file
@@ -794,7 +845,7 @@ def stringmeup():
794845
v = write_file(args.output_verbose, args.gz_output)
795846

796847
# Run the main loop (reclassification)
797-
main_loop(f, tax_reads_dict, taxonomy_tree, args, report_frequency, taxa_lineages, verbose_input, o, v)
848+
main_loop(f, tax_reads_dict, taxonomy_tree, args, report_frequency, taxa_lineages, paired_input, verbose_input, o, v)
798849

799850
# Remember to close files
800851
if o:

0 commit comments

Comments
 (0)