Skip to content

Commit 2c89e0c

Browse files
committed
fix the ONT deduplication code
1 parent 57b805e commit 2c89e0c

File tree

5 files changed

+138
-8
lines changed

5 files changed

+138
-8
lines changed

docker/lr-bam-dedup/Dockerfile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
FROM python:3.9.16-slim-bullseye
2+
3+
COPY remove_duplicate_ont_aln.py /opt/
4+
5+
RUN pip install pysam==0.21.0
6+
7+
ARG DEBIAN_FRONTEND=noninteractive
8+
RUN apt-get -qqy update --fix-missing && \
9+
apt-get -qqy dist-upgrade && \
10+
apt-get -qqy install --no-install-recommends \
11+
samtools && \
12+
apt-get -qqy clean && \
13+
rm -rf /tmp/* \
14+
/var/tmp/* \
15+
/var/cache/apt/* \
16+
/var/lib/apt/lists/* \
17+
/usr/share/man/?? \
18+
/usr/share/man/??_*

docker/lr-bam-dedup/Makefile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
VERSION = 0.1.1
2+
TAG1 = us.gcr.io/broad-dsp-lrma/lr-bam-dedup:$(VERSION)
3+
TAG2 = us.gcr.io/broad-dsp-lrma/lr-bam-dedup:latest
4+
5+
all: build push
6+
7+
build:
8+
docker build -t $(TAG1) -t $(TAG2) .
9+
10+
push:
11+
docker push $(TAG1)
12+
docker push $(TAG2)
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import argparse
2+
import pysam
3+
4+
5+
def main():
6+
parser = argparse.ArgumentParser(description='Remove redundant alignment records from ONT BAM file',
7+
prog='remove_redundant_reads')
8+
parser.add_argument('-p', '--prefix', type=str, default="shard", help="Output prefix")
9+
parser.add_argument('-a', '--annotations', type=str, help="Annotations on (potential) duplicate reads")
10+
11+
parser.add_argument('bam', type=str, help="BAM")
12+
args = parser.parse_args()
13+
14+
# create a dict of set's, a trick to avoid Hash collisions
15+
guilty_dict_per_chr = dict()
16+
with open(args.annotations) as f:
17+
for line in f:
18+
arr = line.strip().split('\t')
19+
name = arr[0]
20+
chrom = arr[2]
21+
guilty_dict_per_chr.setdefault(chrom, set())
22+
guilty_dict_per_chr[chrom].add(name)
23+
24+
print("chromosomes on which there are duplicate records:")
25+
print(f" {guilty_dict_per_chr}")
26+
27+
# Silence message about the .bai file not being found.
28+
pysam.set_verbosity(0)
29+
30+
num_alignments, num_dropped_alignments = 0, 0
31+
duplicate_signatures = []
32+
bf = pysam.Samfile(args.bam, 'rb', check_sq=False)
33+
with pysam.Samfile(f'{args.prefix}.bam', 'wb', header=bf.header) as out:
34+
# we rely on the observation that for coordinate sorted BAM,
35+
# duplicate records will appear in blocks, hence once we step off a position with duplicates, we start afresh
36+
current_position = -1
37+
current_signatures = set()
38+
for read in bf:
39+
num_alignments += 1
40+
41+
chrom = read.reference_name
42+
n = read.query_name
43+
if chrom in guilty_dict_per_chr and n in guilty_dict_per_chr[chrom]:
44+
45+
mq = read.mapping_quality
46+
sam_flag = read.flag
47+
pos = read.reference_start
48+
cigar = read.cigarstring
49+
signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-{cigar}"
50+
51+
if current_position != pos: # new position, let's write and reset
52+
out.write(read)
53+
current_position = pos
54+
current_signatures = set()
55+
current_signatures.add(signature)
56+
elif signature in current_signatures: # You're a duplicate record, and not appearing for the 1st time!
57+
num_dropped_alignments += 1
58+
duplicate_signatures.append(signature) # same signature may appear more than twice, hence list and append
59+
pass
60+
else: # you are in a new group of duplicates that map to this location
61+
out.write(read)
62+
current_signatures.add(signature)
63+
else:
64+
out.write(read)
65+
66+
print(f'num_alignments: {num_alignments}')
67+
print(f'num_dropped_alignments: {num_dropped_alignments}')
68+
print(f'num_kept_alignments: {num_alignments - num_dropped_alignments}')
69+
70+
with open(f'{args.prefix}.duplicate.signatures.txt', 'w') as out:
71+
for sig in duplicate_signatures:
72+
out.write(f"{sig}\n")
73+
74+
75+
if __name__ == "__main__":
76+
main()

docker/lr-utils/remove_duplicate_ont_aln.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ def main():
2121
guilty_dict_per_chr.setdefault(chrom, set())
2222
guilty_dict_per_chr[chrom].add(name)
2323

24+
print("chromosomes on which there are duplicate records:")
25+
print(f" {guilty_dict_per_chr}")
26+
2427
# Silence message about the .bai file not being found.
2528
pysam.set_verbosity(0)
2629

@@ -36,12 +39,13 @@ def main():
3639

3740
chrom = read.reference_name
3841
n = read.query_name
39-
if n in guilty_dict_per_chr[chrom]:
42+
if chrom in guilty_dict_per_chr and n in guilty_dict_per_chr[chrom]:
4043

4144
mq = read.mapping_quality
4245
sam_flag = read.flag
4346
pos = read.reference_start
44-
signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-"
47+
cigar = read.cigarstring
48+
signature = f"{n}-{chrom}-{pos}-{mq}-{sam_flag}-{cigar}"
4549

4650
if current_position != pos: # new position, let's write and reset
4751
out.write(read)

wdl/tasks/Utility/Utils.wdl

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1950,11 +1950,11 @@ task FilterBamOnTag {
19501950
task DeduplicateBam {
19511951

19521952
meta {
1953-
description: "Utility to drop (occationally happening) duplicate records in input BAM"
1953+
description: "Utility to drop (occationally happening) literal duplicate records in input BAM"
19541954
}
19551955

19561956
parameter_meta {
1957-
aligned_bam: "input BAM file"
1957+
aligned_bam: "input BAM file (must be coordinate sorted)"
19581958
aligned_bai: "input BAM index file"
19591959
same_name_as_input: "if true, output BAM will have the same name as input BAM, otherwise it will have the input basename with .dedup suffix"
19601960
runtime_attr_override: "override default runtime attributes"
@@ -1975,19 +1975,38 @@ task DeduplicateBam {
19751975
String prefix = if (same_name_as_input) then base else (base + ".dedup")
19761976

19771977
command <<<
1978+
set -eux
1979+
1980+
samtools view -H "~{aligned_bam}" | grep "@HD" > hd.line
1981+
if ! grep -qF "SO:coordinate" hd.line;
1982+
then
1983+
echo "BAM must be coordinate sorted!" && echo && cat hd.line && exit 1
1984+
fi
19781985
echo "==========================================================="
19791986
echo "collecting duplicate information"
19801987
time \
19811988
samtools view -@ 1 "~{aligned_bam}" | \
1982-
awk -F "\t" 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5}' | \
1989+
awk -F "\t" 'BEGIN {OFS="\t"} {print $1, $2, $3, $4, $5, $6}' | \
19831990
sort | uniq -d \
19841991
> "~{aligned_bam}".duplicates.txt
1992+
1993+
cnt=$(wc -l "~{aligned_bam}".duplicates.txt | awk '{print $1}')
1994+
if [[ ${cnt} -eq 0 ]];
1995+
then
1996+
echo "No duplicates found"
1997+
if ! ~{same_name_as_input} ;
1998+
then
1999+
mv "~{aligned_bam}" "~{prefix}.bam"
2000+
mv "~{aligned_bai}" "~{prefix}.bam.bai"
2001+
fi
2002+
exit 0
2003+
fi
19852004
echo "==========================================================="
19862005
echo "de-duplicating"
19872006
time python3 /opt/remove_duplicate_ont_aln.py \
2007+
"~{aligned_bam}" \
19882008
--prefix "~{prefix}" \
1989-
--annotations "~{aligned_bam}".duplicates.txt \
1990-
"~{aligned_bam}"
2009+
--annotations "~{aligned_bam}".duplicates.txt
19912010
echo "==========================================================="
19922011
echo "DONE"
19932012
samtools index "~{prefix}.bam"
@@ -1996,6 +2015,7 @@ task DeduplicateBam {
19962015
output {
19972016
File corrected_bam = "~{prefix}.bam"
19982017
File corrected_bai = "~{prefix}.bam.bai"
2018+
File duplicate_record_signatures = "~{prefix}.duplicate.signatures.txt"
19992019
}
20002020

20012021
#########################
@@ -2006,7 +2026,7 @@ task DeduplicateBam {
20062026
boot_disk_gb: 10,
20072027
preemptible_tries: 0,
20082028
max_retries: 1,
2009-
docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.10"
2029+
docker: "us.gcr.io/broad-dsp-lrma/lr-bam-dedup:0.1.1"
20102030
}
20112031
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
20122032
runtime {

0 commit comments

Comments
 (0)