-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #10 from Layth17/master
Adding a WDL-optimized Optitype Script
- Loading branch information
Showing
3 changed files
with
106 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
Due to worries about backward compatibility, the original file `optitype_script.sh` was left untouched and | ||
a new file `optitype_script_wdl.sh` has been created, tailored for the `wdl` pipeline. | ||
|
||
`optitype_script_wdl.sh` allows for control over RAM and CPU usage. | ||
It also massively improves performance (from ~5h to ~20m) at the expense of a bit of sensitivity. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
#!/bin/bash | ||
|
||
# $1 is the directory where all intermediate files will be written | ||
# $2 is the directory final outputs should be written to | ||
# $3 is the prefix that will be added to all files | ||
# $4 is the location of the input (normal) cram file | ||
# $5 is the reference file used to create the cram | ||
# $6 is the number of cores that will be used by processes | ||
# $7 is the total memory in the node | ||
#bsub -R 'select[mem>64000] rusage[mem=64000]' -e <error_file_name> -o <output_file_name> -q research-hpc -a 'docker(johnegarza/immuno-testing:latest)' /bin/bash /usr/bin/optitype_script.sh <intermediate files directory> <final results directory> <output file prefix> <cram path> | ||
|
||
set -e -o pipefail | ||
|
||
# Optitype DNA reference file | ||
dnaref="/ref_data/optitype_ref/hla_reference_dna.fasta"; | ||
|
||
DEFAULT_THREADS=4 | ||
DEFAULT_MEM=8 | ||
MEM_UTIL=80 # percent | ||
|
||
TEMPDIR="$1"; | ||
outdir="$2"; | ||
name="$3"; | ||
cram="$4"; | ||
reference="$5"; | ||
THREADS=${6:-$DEFAULT_THREADS} | ||
if [ ${7:-$DEFAULT_MEM} -eq 1 ]; then MEM=1G; else MEM="$((${7:-$DEFAULT_MEM}*$MEM_UTIL/100))"G; fi | ||
|
||
mkdir -p $TEMPDIR | ||
mkdir -p $outdir | ||
|
||
echo "Step 1: extracting hla region from chr6, reads to alternate HLA sequences, and all unmapped reads ..." | ||
|
||
# filter out reads directly from an existing CRAM file of alignments, only those reads that align to this region: chr6:29836259-33148325 | ||
echo "[INFO] /opt/samtools/bin/samtools view -h -T $reference $cram chr6:29836259-33148325 >$TEMPDIR/reads.sam" | ||
time /opt/samtools/bin/samtools view -h -T $reference $cram chr6:29836259-33148325 >$TEMPDIR/reads.sam | ||
|
||
# pull out only the *header* lines from the CRAM with the -H parameter. then get the sequence names and for those that match the string "HLA" do the following | ||
echo "[INFO] /opt/samtools/bin/samtools view -H -T $reference $cram | grep "^@SQ" | cut -f 2 | cut -f 2- -d : | grep HLA | while read chr;do ..." | ||
time /opt/samtools/bin/samtools view -H -T $reference $cram | grep "^@SQ" | cut -f 2 | cut -f 2- -d : | grep HLA | while read chr;do | ||
# echo "checking $chr:1-9999999" | ||
# grab all the reads that align to each alternate "HLA" sequence | ||
/opt/samtools/bin/samtools view -T $reference $cram "$chr:1-9999999" >>$TEMPDIR/reads.sam | ||
done | ||
|
||
# grab all the reads that are unaligned | ||
echo "[INFO] /opt/samtools/bin/samtools view -f 4 -T $reference $cram >>$TEMPDIR/reads.sam" | ||
time /opt/samtools/bin/samtools view -f 4 -T $reference $cram >>$TEMPDIR/reads.sam | ||
# covert from .sam to .bam format | ||
echo "[INFO] /opt/samtools/bin/samtools view -Sb -o $TEMPDIR/reads.bam $TEMPDIR/reads.sam" | ||
time /opt/samtools/bin/samtools view -Sb -o $TEMPDIR/reads.bam $TEMPDIR/reads.sam | ||
|
||
echo "Step 2: running picard ..." | ||
echo "[INFO] /usr/bin/java -Xmx6g -jar /usr/picard/picard.jar SamToFastq VALIDATION_STRINGENCY=LENIENT F=$TEMPDIR/$name.q.fwd.fastq.gz F2=$TEMPDIR/$name.q.rev.fastq.gz I=$TEMPDIR/reads.bam R=$reference FU=$TEMPDIR/unpaired.fastq.gz" | ||
time /usr/bin/java -Xmx6g -jar /usr/picard/picard.jar SamToFastq VALIDATION_STRINGENCY=LENIENT F=$TEMPDIR/$name.q.fwd.fastq.gz F2=$TEMPDIR/$name.q.rev.fastq.gz I=$TEMPDIR/reads.bam R=$reference FU=$TEMPDIR/unpaired.fastq.gz | ||
|
||
|
||
#echo step 0 | ||
#0 index the Optitype reference file: | ||
#/usr/local/bin/bwa index $dnaref | ||
|
||
echo "Step 3: Aligning forward reads to reference HLA locus sequence" | ||
echo "[INFO] /usr/local/bin/bwa mem -t $THREADS $dnaref $TEMPDIR/$name.q.fwd.fastq.gz > $TEMPDIR/$name.aln.fwd.sam" | ||
time /usr/local/bin/bwa mem -t $THREADS $dnaref $TEMPDIR/$name.q.fwd.fastq.gz > $TEMPDIR/$name.aln.fwd.sam # use bwa mem, store output IN TEMP, and skip samse step | ||
rm -f $TEMPDIR/$name.q.fwd.fastq.gz; | ||
|
||
echo "Step 4: Aligning reverse reads to reference HLA locus sequence" | ||
echo "[INFO] /usr/local/bin/bwa mem -t $THREADS $dnaref $TEMPDIR/$name.q.rev.fastq.gz > $TEMPDIR/$name.aln.rev.sam" | ||
time /usr/local/bin/bwa mem -t $THREADS $dnaref $TEMPDIR/$name.q.rev.fastq.gz > $TEMPDIR/$name.aln.rev.sam # use bwa mem, store output IN TEMP, and skip samse step | ||
rm -f $TEMPDIR/$name.q.rev.fastq.gz | ||
|
||
echo "Step 5: Select only the mapped reads from the sam files:" | ||
echo "[INFO] /opt/samtools/bin/samtools view -S -F 4 $TEMPDIR/$name.aln.fwd.sam > $TEMPDIR/$name.aln.map.fwd.sam" | ||
time /opt/samtools/bin/samtools view -S -F 4 $TEMPDIR/$name.aln.fwd.sam > $TEMPDIR/$name.aln.map.fwd.sam | ||
echo "[INFO] /opt/samtools/bin/samtools view -S -F 4 $TEMPDIR/$name.aln.rev.sam > $TEMPDIR/$name.aln.map.rev.sam" | ||
time /opt/samtools/bin/samtools view -S -F 4 $TEMPDIR/$name.aln.rev.sam > $TEMPDIR/$name.aln.map.rev.sam | ||
rm -f $TEMPDIR/$name.aln.fwd.sam | ||
rm -f $TEMPDIR/$name.aln.rev.sam | ||
|
||
echo "Step 6: Convert sam files to fastq files, also stored in temp dir" | ||
time cat $TEMPDIR/$name.aln.map.fwd.sam | grep -v ^@ | /usr/bin/awk '{print "@"$1"\n"$10"\n+\n"$11}' > $outdir/$name.hla.fwd.fastq | ||
time cat $TEMPDIR/$name.aln.map.rev.sam | grep -v ^@ | /usr/bin/awk '{print "@"$1"\n"$10"\n+\n"$11}' > $outdir/$name.hla.rev.fastq | ||
rm -f $TEMPDIR/$name.aln.map.fwd.sam | ||
rm -f $TEMPDIR/$name.aln.map.rev.sam | ||
|
||
echo "Step 7: run Optitype" | ||
# run optitype | ||
echo "[INFO] /usr/bin/python /usr/local/bin/OptiType/OptiTypePipeline.py -i $outdir/$name.hla.fwd.fastq $outdir/$name.hla.rev.fastq --dna -v -p $name -o $outdir" | ||
time /usr/bin/python /usr/local/bin/OptiType/OptiTypePipeline.py -i $outdir/$name.hla.fwd.fastq $outdir/$name.hla.rev.fastq --dna -v -p $name -o $outdir |