forked from emilhaegglund/bioinfo_scripts
-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract_fastq_kraken.py
executable file
·161 lines (136 loc) · 6.15 KB
/
extract_fastq_kraken.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
#!/usr/bin/env python2.7
''' Script to extract fastq sequences mapping to taxid in kraken '''
import os
import sys
import argparse
import subprocess
import pandas as pd
import ncbiTaxonomyTree as nbt
def parse_arguments():
''' Parse command line arguments '''
parser = argparse.ArgumentParser()
# Add arguments
parser.add_argument('--forward', '-f', required=True,
help='Forward read')
parser.add_argument('--reverse', '-r',
help='If paired, reverse read')
parser.add_argument('--tax_id', '-t', required=True,
help='Taxid to extract')
parser.add_argument('--kraken', '-k', required=True,
help='Kraken result file')
parser.add_argument('--names', '-n',
help='Name file used in the kraken-db, only required if \
--descendents is set')
parser.add_argument('--nodes', '-p',
help='Node file used in the kraken-db, only required if \
--descendents is set')
parser.add_argument('--descendents', action='store_true',
help='Extract sequences for the taxonomical ID and \
sequences in descendent groups. Requires --nodes \
and --names argument')
parser.add_argument('--merge', action='store_true',
help='Write all extracted sequences to a single \
fastq-file.')
parser.add_argument('--output', '-o',
help='Write fastq-files to a directory')
args = parser.parse_args()
if args.descendents:
if args.names is None or args.nodes is None:
parser.error('--descendents requires --names and --nodes.')
return args
def extract_reads(taxid, taxid_sequences, reads, direction, merge, outdir):
""" Extract fastq sequences based on headers """
if merge:
headers_path = os.path.join(outdir, taxid + '.list')
with open(headers_path, 'w') as headers_file:
for node_id in taxid_sequences.keys():
for seq in taxid_sequences[node_id]:
headers_file.write(seq + '\n')
seqtk(taxid, reads, headers_path, outdir)
else:
for node_id in taxid_sequences.keys():
headers_path = os.path.join(outdir, str(node_id) + '.list')
with open(headers_path, 'w') as headers_file:
for seq in taxid_sequences[node_id]:
headers_file.write(seq + '\n')
seqtk(node_id, reads, headers_path, outdir)
def seqtk(taxid, reads, headers_path, outdir):
seqtk_args = ['seqtk', 'subseq', reads, headers_path]
process = subprocess.Popen(seqtk_args, stderr=subprocess.PIPE,
stdout=subprocess.PIPE)
out, err = process.communicate()
fastq_path = os.path.join(outdir, str(taxid) + '.fastq')
with open(fastq_path, 'w') as fastq_out:
fastq_out.write(out)
def main():
""" Main application """
# Get command line arguments
args = parse_arguments()
# Create output directory
if args.output == None:
outdir = ''
else:
outdir = args.output
if not os.path.isdir(outdir):
os.mkdir(outdir)
print("\nLoad data")
print("=========")
# Import kraken result file
print("Load kraken-report file")
kraken_df = pd.read_csv(args.kraken, sep='\t', header=None,
names=['Classified', 'SeqID', 'TaxID', 'SeqLength', 'LCAmapping'])
if args.reverse:
reverse = True
else:
reverse = False
taxid = args.tax_id
# Extract unclassified reads
if int(taxid) == 0:
if args.descendents == True:
print("Warning: --descendents will be unset for unclassified reads")
args.descendents = False
# Read Taxonomy tree
if args.descendents: # Use NCBI taxonomy to extract sequences from descendents
print("Load NCBI taxonomy")
tree = nbt.NcbiTaxonomyTree(nodes_filename=args.nodes, names_filename=args.names)
print("\nFind Nodes and Sequences")
print("========================")
child_nodes = tree.getDescendants([int(taxid)])
# Get sequences from nodes
taxid_sequences = {} # Bin sequences to nodes
nr_nodes = 0 # Nodes containing sequences
tot_sequences = 0 # Total number of sequences to extract
for child_taxid in child_nodes[int(taxid)]:
nr_reads = len(list(kraken_df.loc[kraken_df['TaxID'] == child_taxid].SeqID))
if nr_reads != 0:
print("Node " + str(child_taxid) + " contains " + str(nr_reads) + ' sequences')
taxid_sequences[child_taxid] = list(kraken_df.loc[kraken_df['TaxID'] == child_taxid].SeqID)
nr_nodes += 1
tot_sequences += nr_reads
else: # Extract only sequences for the specific node
print('\nFind Sequences')
print('===============')
taxid_sequences = {}
nr_reads = len(list(kraken_df.loc[kraken_df['TaxID'] == int(taxid)].SeqID))
if nr_reads != 0:
print("Node " + taxid + " contains " + str(nr_reads) + ' sequences')
taxid_sequences[int(taxid)] = list(kraken_df.loc[kraken_df['TaxID'] == int(taxid)].SeqID)
nr_nodes = 1
tot_sequences = nr_reads
# Print results
if nr_nodes == 1:
print("\nFound " + str(tot_sequences) + " sequences to extract from " + str(len(taxid_sequences)) + " node\n")
elif nr_nodes == 0:
sys.exit("No sequences where found for that node")
else:
print("\nFound " + str(tot_sequences) + " sequences to extract from " + str(len(taxid_sequences)) + " different nodes\n")
# Extract reads and write to new fastq-file
print("Extract reads")
print("=============")
print("Extract forward reads")
extract_reads(taxid, taxid_sequences, args.forward, 'forward', args.merge, outdir)
if reverse:
print("Extract reverse reads")
extract_reads(taxid, taxid_sequences, args.reverse, 'reverse', args._merge, outdir)
if __name__ == '__main__':
main()