Merge pull request #10 from Layth17/master

Adding a WDL-optimized Optitype Script
genome · Aug 23, 2022 · c48b7ca · c48b7ca
2 parents 0152184 + e3e19a3
commit c48b7ca
Show file tree

Hide file tree

Showing 3 changed files with 106 additions and 0 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -76,6 +76,18 @@ RUN wget https://github.com/arq5x/bedtools2/releases/download/v2.28.0/bedtools
 RUN chmod +x bedtools
 
 COPY optitype_script.sh /usr/bin/optitype_script.sh
+COPY optitype_script_wdl.sh /usr/bin/optitype_script_wdl.sh
+
+###############
+#Picard 2.18.1#
+###############
+ADD https://github.com/broadinstitute/picard/releases/download/2.18.1/picard.jar ./
+RUN mkdir /opt/picard-2.18.1
+RUN chmod -R ugo+r /opt/picard-2.18.1
+RUN mv picard.jar /opt/picard-2.18.1
+RUN chmod -R ugo+r /opt/picard-2.18.1/picard.jar
+RUN ln -s /opt/picard-2.18.1 /opt/picard
+RUN ln -s /opt/picard-2.18.1 /usr/picard 
 
 #include the optitype hla reference and pre-calculated bwa index outputs
 RUN mkdir -p /ref_data/optitype_ref

diff --git a/README.md b/README.md
@@ -0,0 +1,5 @@
+Due to worries about backward compatibility, the original file `optitype_script.sh` was left untouched and
+a new file `optitype_script_wdl.sh` has been created, tailored for the `wdl` pipeline.
+
+`optitype_script_wdl.sh` allows for control over RAM and CPU usage. 
+It also massively improves performance (from ~5h to ~20m) at the expense of a bit of sensitivity.
diff --git a/optitype_script_wdl.sh b/optitype_script_wdl.sh
@@ -0,0 +1,89 @@
+#!/bin/bash
+
+# $1 is the directory where all intermediate files will be written
+# $2 is the directory final outputs should be written to
+# $3 is the prefix that will be added to all files
+# $4 is the location of the input (normal) cram file
+# $5 is the reference file used to create the cram
+# $6 is the number of cores that will be used by processes
+# $7 is the total memory in the node
+#bsub -R 'select[mem>64000] rusage[mem=64000]' -e <error_file_name> -o <output_file_name> -q research-hpc -a 'docker(johnegarza/immuno-testing:latest)' /bin/bash /usr/bin/optitype_script.sh <intermediate files directory> <final results directory> <output file prefix>  <cram path>
+
+set -e -o pipefail
+
+# Optitype DNA reference file
+dnaref="/ref_data/optitype_ref/hla_reference_dna.fasta";
+
+DEFAULT_THREADS=4
+DEFAULT_MEM=8
+MEM_UTIL=80 # percent
+
+TEMPDIR="$1";
+outdir="$2";
+name="$3";
+cram="$4";
+reference="$5";
+THREADS=${6:-$DEFAULT_THREADS}
+if [ ${7:-$DEFAULT_MEM} -eq 1 ]; then MEM=1G; else MEM="$((${7:-$DEFAULT_MEM}*$MEM_UTIL/100))"G; fi
+
+mkdir -p $TEMPDIR
+mkdir -p $outdir
+
+echo "Step 1: extracting hla region from chr6, reads to alternate HLA sequences, and all unmapped reads ..."
+
+# filter out reads directly from an existing CRAM file of alignments, only those reads that align to this region: chr6:29836259-33148325
+echo "[INFO] /opt/samtools/bin/samtools view -h -T $reference $cram chr6:29836259-33148325 >$TEMPDIR/reads.sam"
+time /opt/samtools/bin/samtools view -h -T $reference $cram chr6:29836259-33148325 >$TEMPDIR/reads.sam
+
+# pull out only the *header* lines from the CRAM with the -H parameter. then get the sequence names and for those that match the string "HLA" do the following
+echo "[INFO] /opt/samtools/bin/samtools view -H -T $reference $cram | grep "^@SQ" | cut -f 2 | cut -f 2- -d : | grep HLA | while read chr;do ..."
+time /opt/samtools/bin/samtools view -H -T $reference $cram | grep "^@SQ" | cut -f 2 | cut -f 2- -d : | grep HLA | while read chr;do 
+# echo "checking $chr:1-9999999"
+# grab all the reads that align to each alternate "HLA" sequence
+/opt/samtools/bin/samtools view -T $reference $cram "$chr:1-9999999" >>$TEMPDIR/reads.sam
+done
+
+# grab all the reads that are unaligned
+echo "[INFO]  /opt/samtools/bin/samtools view -f 4 -T $reference $cram >>$TEMPDIR/reads.sam"
+time /opt/samtools/bin/samtools view -f 4 -T $reference $cram >>$TEMPDIR/reads.sam
+# covert from .sam to .bam format
+echo "[INFO] /opt/samtools/bin/samtools view -Sb -o $TEMPDIR/reads.bam $TEMPDIR/reads.sam"
+time /opt/samtools/bin/samtools view -Sb -o $TEMPDIR/reads.bam $TEMPDIR/reads.sam 
+
+echo "Step 2: running picard ..."
+echo "[INFO] /usr/bin/java -Xmx6g -jar /usr/picard/picard.jar SamToFastq VALIDATION_STRINGENCY=LENIENT F=$TEMPDIR/$name.q.fwd.fastq.gz F2=$TEMPDIR/$name.q.rev.fastq.gz I=$TEMPDIR/reads.bam R=$reference FU=$TEMPDIR/unpaired.fastq.gz"
+time /usr/bin/java -Xmx6g -jar /usr/picard/picard.jar SamToFastq VALIDATION_STRINGENCY=LENIENT F=$TEMPDIR/$name.q.fwd.fastq.gz F2=$TEMPDIR/$name.q.rev.fastq.gz I=$TEMPDIR/reads.bam R=$reference FU=$TEMPDIR/unpaired.fastq.gz
+
+
+#echo step 0
+#0 index the Optitype reference file:
+#/usr/local/bin/bwa index $dnaref
+
+echo "Step 3: Aligning forward reads to reference HLA locus sequence"
+echo "[INFO] /usr/local/bin/bwa mem -t $THREADS $dnaref $TEMPDIR/$name.q.fwd.fastq.gz > $TEMPDIR/$name.aln.fwd.sam"
+time /usr/local/bin/bwa mem -t $THREADS $dnaref $TEMPDIR/$name.q.fwd.fastq.gz > $TEMPDIR/$name.aln.fwd.sam # use bwa mem, store output IN TEMP, and skip samse step
+rm -f $TEMPDIR/$name.q.fwd.fastq.gz;
+
+echo "Step 4: Aligning reverse reads to reference HLA locus sequence"
+echo "[INFO] /usr/local/bin/bwa mem -t $THREADS $dnaref $TEMPDIR/$name.q.rev.fastq.gz > $TEMPDIR/$name.aln.rev.sam"
+time /usr/local/bin/bwa mem -t $THREADS $dnaref $TEMPDIR/$name.q.rev.fastq.gz > $TEMPDIR/$name.aln.rev.sam # use bwa mem, store output IN TEMP, and skip samse step
+rm -f $TEMPDIR/$name.q.rev.fastq.gz
+
+echo "Step 5: Select only the mapped reads from the sam files:"
+echo "[INFO] /opt/samtools/bin/samtools view -S -F 4 $TEMPDIR/$name.aln.fwd.sam > $TEMPDIR/$name.aln.map.fwd.sam"
+time /opt/samtools/bin/samtools view -S -F 4 $TEMPDIR/$name.aln.fwd.sam > $TEMPDIR/$name.aln.map.fwd.sam
+echo "[INFO] /opt/samtools/bin/samtools view -S -F 4 $TEMPDIR/$name.aln.rev.sam > $TEMPDIR/$name.aln.map.rev.sam"
+time /opt/samtools/bin/samtools view -S -F 4 $TEMPDIR/$name.aln.rev.sam > $TEMPDIR/$name.aln.map.rev.sam
+rm -f $TEMPDIR/$name.aln.fwd.sam
+rm -f $TEMPDIR/$name.aln.rev.sam
+
+echo "Step 6: Convert sam files to fastq files, also stored in temp dir"
+time cat $TEMPDIR/$name.aln.map.fwd.sam | grep -v ^@ | /usr/bin/awk '{print "@"$1"\n"$10"\n+\n"$11}' > $outdir/$name.hla.fwd.fastq
+time cat $TEMPDIR/$name.aln.map.rev.sam | grep -v ^@ | /usr/bin/awk '{print "@"$1"\n"$10"\n+\n"$11}' > $outdir/$name.hla.rev.fastq
+rm -f $TEMPDIR/$name.aln.map.fwd.sam
+rm -f $TEMPDIR/$name.aln.map.rev.sam
+
+echo "Step 7: run Optitype"
+# run optitype
+echo "[INFO] /usr/bin/python /usr/local/bin/OptiType/OptiTypePipeline.py -i $outdir/$name.hla.fwd.fastq $outdir/$name.hla.rev.fastq --dna -v -p $name -o $outdir" 
+time /usr/bin/python /usr/local/bin/OptiType/OptiTypePipeline.py -i $outdir/$name.hla.fwd.fastq $outdir/$name.hla.rev.fastq --dna -v -p $name -o $outdir