#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
This script uses nanopore sequencing reads to confirm splicing of the predicted introns
and to map exact splice sites for highly represented variants.
"""
__author__ = "Jakub Barylski"
__maintainer__ = "Jakub Barylski"
__license__ = "GNU GENERAL PUBLIC LICENSE"
__email__ = "[email protected]"
from pathlib import Path
from random import randint
import click
import porechop_custom
from annotations import Exon, Intron, Gene, AnnotationBase
from tweaks import run_external


@click.command(no_args_is_help=True)
@click.option("-ca", "--custom_adapters",
required=True,
type=click.Path(exists=True, path_type=Path),
help='fasta file with custom adapters (Forward Forward orientation >__>)')
@click.option("-cb", "--custom_barcodes",
required=True,
type=click.Path(exists=True, path_type=Path),
help='fasta file with custom barcodes (Forward Forward orientation >__>)')
@click.option("-fq", "--read_fastq",
required=False,
type=click.Path(exists=True, path_type=Path),
help='path to fastq file')
@click.option("-cr", "--clean_reads",
required=False,
type=click.Path(exists=True, path_type=Path),
help='directory with demultiplexed clean reads (skips demultiplexing and trimming steps)')
@click.option("-rd", "--reference_dir",
required=True,
type=click.Path(exists=True, path_type=Path),
help='directory with reference fasta files')
@click.option("-o", "--output",
required=True,
type=click.Path(exists=False, path_type=Path),
help='output directory')
@click.option("-s", "--separator",
required=True,
type=str,
help='separator for the primer names')
@click.option("-mr", "--max_reads",
required=False,
type=int,
default=500,
help='number of reads to subsample')
@click.option("-ml", "--min_length",
required=False,
type=int,
default=500,
help='minimum read length to keep during quality filtering')
@click.option("-mq", "--min_mean_q",
required=False,
type=float,
default=90,
help='minimum mean quality to keep during read filtering')
def rnaseq_analysis(custom_adapters: Path,
custom_barcodes: Path,
read_fastq: Path,
clean_reads: Path,
reference_dir: Path,
output: Path,
separator: str,
max_reads: int,
min_length: int,
min_mean_q: float):
"""
De-multiplex reads and map them to respective references
Usage example: ./confirm_introns.py -ca custom_adapters -cb custom_barcodes -fq reads.fastq -rd reference_genomes_dir -o output_dir -s '__' -mr 1000
"""
assert read_fastq or clean_reads, 'Provide either fastq to trim or clean_reads'
output.mkdir()
if not clean_reads:
demultiplexing_dir = output.joinpath('demultiplexing')
demultiplexing_dir.mkdir()
filtered_read_dir = output.joinpath(f'filtered.{min_length}bp_q{min_mean_q}')
filtered_read_dir.mkdir()
clean_reads = trim_and_demultiplex(read_fastq=read_fastq,
custom_adapters=custom_adapters,
custom_barcodes=custom_barcodes,
demultiplexing_dir=demultiplexing_dir,
filtered_dir=filtered_read_dir,
min_length=min_length,
min_mean_q=min_mean_q)
else:
clean_reads = [Path(f) for f in clean_reads.iterdir() if f.suffix in ('.fastq', '.fq')]
minimap_sorted_dir = output.joinpath('minimap2_sorted')
minimap_sorted_dir.mkdir()
    minimap_filtered_dir = output.joinpath('minimap2_filtered')
    minimap_filtered_dir.mkdir()
raw_gff_dir = output.joinpath('raw_gff')
raw_gff_dir.mkdir()
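    # confirmed introns from every sample are accumulated here and written to a single gff at the end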
filtered_introns = AnnotationBase()
for read_fastq in clean_reads:
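        # the filename prefix before the separator names the reference genome this sample should be mapped against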
reference_name = read_fastq.stem.split(separator)[0]
reference = reference_dir.joinpath(f'{reference_name}.fasta')
if reference.exists():
filtered_file = map_to_reference(reference=reference,
read_fastq=read_fastq,
minimap_sorted_dir=minimap_sorted_dir,
                                             minimap_filtered_dir=minimap_filtered_dir)
n_mapped_reads = count_mapped_reads(filtered_file)
filtered_introns = splice_sites(spliced_bam=filtered_file,
raw_gff_dir=raw_gff_dir,
mapped_reads=n_mapped_reads,
output=filtered_introns)
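            # for deeply covered samples, additionally write a bam downsampled to roughly max_reads reads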
if n_mapped_reads > max_reads:
subsampled_bam = filtered_file.parent.joinpath(f'{filtered_file.stem}.{max_reads}_sample_reads.bam')
subsample_bam(bam=filtered_file,
subsampled_bam=subsampled_bam,
max_reads=max_reads,
mapped_reads=n_mapped_reads)
n_subsampled_reads = count_mapped_reads(subsampled_bam)
print(f'sampled {n_subsampled_reads} out of {n_mapped_reads} reads')
else:
print(f"Reference {reference_name} not found")
filtered_introns.save_gff(output.joinpath('Introns.gff'))


def trim_and_demultiplex(read_fastq: Path,
custom_adapters: Path,
custom_barcodes: Path,
demultiplexing_dir: Path,
filtered_dir: Path,
min_length: int,
min_mean_q: float):
"""
Run de-multiplexing and trimming of a single fastq file using porechop
:param read_fastq: path to fastq file to be trimmed and demultiplexed
:param custom_adapters: path to fasta file with custom adapters
:param custom_barcodes: path to fasta file with custom barcodes
:param demultiplexing_dir: directory to save demultiplexed reads
:param filtered_dir: directory to save filtered reads
:param min_length: minimum read length to keep during quality filtering
:param min_mean_q: minimum mean quality to keep during read filtering
"""
porechop_custom.main(input=read_fastq,
barcode_dir=demultiplexing_dir,
custom_adapters=custom_adapters,
custom_barcodes=custom_barcodes,
require_two_barcodes=True,
discard_unassigned=True)
# filtlong quality filtering
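    # filtlong writes the passing reads to stdout, so capture it and save one filtered fastq per demultiplexed file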
output_files = []
for fastq in demultiplexing_dir.glob('*.fastq'):
filtlong_command = ['filtlong',
'--min_length', min_length,
'--min_mean_q', min_mean_q,
fastq]
filtlong_output = run_external(filtlong_command, stdout='capture')
filtered_reads = f'{fastq.stem}.fastq'
filtered_fastq = filtered_dir.joinpath(filtered_reads)
with filtered_fastq.open('w') as handle:
handle.write(filtlong_output.decode())
output_files.append(filtered_fastq)
return output_files


def map_to_reference(reference: Path,
read_fastq: Path,
minimap_sorted_dir: Path,
                     minimap_filtered_dir: Path):
"""
Map reads to reference gene sequence
using split-read mapping with minimap2
and post-process the files
by sorting and filtering alignments
:param reference: path to reference fasta file with gene sequence
:param read_fastq: path to fastq file with reads to map
:param minimap_sorted_dir: directory to save sorted bam files
    :param minimap_filtered_dir: directory to save filtered bam files
"""
minimap_command = ['minimap2', '-ax', 'splice', '-uf', '-k14', '--splice-flank=no', # '-p', 0.9, '--secondary=no',
reference, read_fastq]
sorted_bam = minimap_sorted_dir.joinpath(read_fastq.stem + '.bam')
sorting_command = ['samtools', 'sort', '-O', 'bam', '-o', sorted_bam]
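    # capture minimap2's sam output and feed it into samtools sort to obtain a coordinate-sorted bam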
minimap_output = run_external(minimap_command, stdout='capture')
run_external(sorting_command, stdin=minimap_output)
    filtered_file = minimap_filtered_dir.joinpath(read_fastq.stem + '.bam')
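    # keep only mapped alignments (-F 4) with mapping quality >= 30 (-q 30), written as bam (-b)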
filter_command = ['samtools', 'view', '-b', '-F', '4', '-q', '30', sorted_bam, '-o', filtered_file]
run_external(filter_command)
index_file = filtered_file.with_suffix('.bai')
index_command = ['samtools', 'index', '-o', index_file, filtered_file]
run_external(index_command)
return filtered_file


def count_mapped_reads(bam: Path) -> int:
"""
Count number of reads mapped in the bam file
:param bam: path to bam file to count
:return: number of mapped reads
"""
count_command = ['samtools', 'view', '-c', '-F', '260', bam]
mapped_reads = int(run_external(count_command, stdout='capture').decode().strip())
return mapped_reads


def splice_sites(spliced_bam: Path,
raw_gff_dir: Path,
mapped_reads: int,
output: AnnotationBase,
minimal_fraction: float = 0.1,
minimal_coverage: int = 100):
"""
Use spliced_bam2gff to extract intron borders from spliced bam file and convert them to gff format
:param spliced_bam: path to spliced bam file
:param raw_gff_dir: directory to save raw gff files
:param mapped_reads: number of mapped reads in the bam file
:param output: AnnotationBase object to store introns
:param minimal_fraction: minimal fraction of reads supporting the intron required to report it in the final gff
:param minimal_coverage: minimal number of reads supporting the intron required to report it in the final gff
"""
spliced_bam2gff_command = ['spliced_bam2gff', '-M', spliced_bam]
raw_gff = raw_gff_dir.joinpath(f'{spliced_bam.stem}.gff')
raw_gff_lines = run_external(spliced_bam2gff_command, stdout='capture')
with raw_gff.open('w') as handle:
handle.write(raw_gff_lines.decode())
annotation_lines = [l for l in raw_gff_lines.decode().splitlines() if not l.startswith('#') and l.strip()]
genes = []
exons = None
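    # spliced_bam2gff emits one mRNA feature per spliced alignment followed by its exon lines;
    # collect each run of exons into a Gene, from which the introns between consecutive exons are derived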
while annotation_lines:
line = annotation_lines.pop(0)
if line.split('\t')[2] == 'mRNA':
if exons:
genes.append(Gene.from_exons(exons))
exons = []
elif line.split('\t')[2] == 'exon':
exons.append(Exon.from_gff_line(line, in_attr_separator=' '))
if exons: # add the last gene
genes.append(Gene.from_exons(exons))
intron_versions = {}
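    # tally identical intron variants across all reads: key on (start, end, strand), score counts the supporting reads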
for gene in genes:
possible_introns = gene.children(Intron)
for p_intron in possible_introns:
if (p_intron.start, p_intron.end, p_intron.strand) in intron_versions:
intron_versions[(p_intron.start, p_intron.end, p_intron.strand)].score += 1
else:
intron_versions[(p_intron.start, p_intron.end, p_intron.strand)] = Intron(seq_id=p_intron.seq_id,
strand=p_intron.strand,
model_id=f'pinfish',
model_name=spliced_bam.stem,
start=p_intron.start,
end=p_intron.end,
score=1,
evalue=0,
method='averaged_pinfish')
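    # report an intron only if enough reads support it, both absolutely and as a fraction of all mapped reads;
    # the supporting count is stored in model_id and the read fraction in the evalue field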
for intron_variant in intron_versions.values():
supporting_read_fraction = intron_variant.score / mapped_reads
if intron_variant.score > minimal_coverage and supporting_read_fraction > minimal_fraction:
intron_variant.model_id = f'{intron_variant.score} / {mapped_reads}'
intron_variant.evalue = supporting_read_fraction
output.annotate(intron_variant)
return output


def subsample_bam(bam: Path,
subsampled_bam: Path,
max_reads: int,
mapped_reads: int):
"""
Subsample bam file to a given number of reads
    :param bam: path to the input bam file
    :param subsampled_bam: path to write the subsampled bam file
:param max_reads: number of reads to subsample
:param mapped_reads: number of mapped reads in the bam file
"""
assert max_reads < mapped_reads, f'Cannot subsample {max_reads} out of {mapped_reads} reads'
fraction_to_keep = max_reads / mapped_reads
subsample_command = ['samtools', 'view', '--subsample', fraction_to_keep, '--subsample-seed', randint(0, 100000),
'-b', '-o', subsampled_bam, bam]
run_external(subsample_command)
index_command = ['samtools', 'index', '-o', subsampled_bam.with_suffix('.bai'), subsampled_bam]
run_external(index_command)
return subsampled_bam


if __name__ == '__main__':
rnaseq_analysis()