@@ -716,53 +716,76 @@ task GetDuplicateReadnamesInQnameSortedBam {
716716 }
717717 parameter_meta {
718718 qns_bam : {
719+ description : "Query name sorted BAM to be de-duplicated" ,
719720 localization_optional : true
720721 }
722+ trial_idx : "1-based index of the current attempt; when this value is >= trial_max, the BAM is localized and the task uses a persistent SSD instead of a persistent HDD."
723+ trial_max : "the maximum number of attempts made by streaming in the BAM; together with trial_idx, this design keeps call-caching from blocking retries."
721724 }
722725 input {
723726 File qns_bam
727+ Int trial_idx = 1
728+ Int trial_max = 3
724729 }
725730
726731 output {
727732 File dup_names_txt = "dup_read_names.txt"
728733 Boolean result_may_be_corrupted = read_boolean ("samtools.failed.txt" )
729734 }
730735
736+ Boolean localize_bam = trial_idx >= trial_max
737+
731738 command <<<
732- # the way this works is the following:
733- # 0) relying on the re-auth.sh script to export the credentials
734- # 1) perform the remote sam-view subsetting in the background
735- # 2) listen to the PID of the background process, while re-auth every 1200 seconds
739+
736740 source /opt/re-auth.sh
737741 set -euxo pipefail
738742
739743 # assumption: the BAM header declares queryname sort order (SO: field of the @HD line); fail fast otherwise
740744 sort_order = $(samtools view -H ~{qns_bam } | grep "^@HD" | tr '\t' '\n' | grep "^SO:" | awk -F ':' '{print $2}' )
741745 if [[ "queryname" != "${sort_order} " ]]; then echo -e "Sort order ${sort_order} isn't the expected 'queryname'." && exit 1 ; fi
742746
743- # remote grab read names
744- echo "false" > samtools.failed.txt
745- samtools view ~{qns_bam } \
746- | awk -F '\t' '{print $1}' \
747- | uniq -d \
748- > "dup_read_names.txt" \
749- || { echo "true" > samtools.failed.txt ; exit 77 ; } &
750- pid = $!
747+ if ~{localize_bam }; then
748+ time \
749+ gcloud storage cp ~{qns_bam } name_does_not_matter.bam
751750
752- set +e
753- count = 1
754- while true ; do
755- sleep 1200 && date && source /opt/re-auth.sh
756- if [[ ${count } -gt 2 ]]; then exit 0 ; fi
757- if ! pgrep -x -P $pid ; then exit 0 ; fi
758- count = $(( count + 1 ))
759- done
751+ samtools view name_does_not_matter.bam \
752+ | awk -F '\t' '{print $1}' \
753+ | uniq -d \
754+ > "dup_read_names.txt"
755+
756+ echo "false" > samtools.failed.txt
757+ else
758+ # the way this works is the following:
759+ # 0) relying on the re-auth.sh script to export the credentials
760+ # 1) perform the remote sam-view operation in the background
761+ # 2) listen to the PID of the background process, while re-auth every 1200 seconds
762+
763+ # remote grab read names
764+ echo "false" > samtools.failed.txt
765+ samtools view ~{qns_bam } \
766+ | awk -F '\t' '{print $1}' \
767+ | uniq -d \
768+ > "dup_read_names.txt" \
769+ || { echo "true" > samtools.failed.txt ; exit 77 ; } &
770+ pid = $!
771+
772+ set +e
773+ count = 1
774+ while true ; do
775+ sleep 1200 && date && source /opt/re-auth.sh
776+ if [[ ${count } -gt 2 ]]; then exit 0 ; fi
777+ if ! pgrep -x -P $pid ; then exit 0 ; fi
778+ count = $(( count + 1 ))
779+ done
780+ fi
760781 >>>
761782
783+ Int disk_size = 5 + (if (localize_bam ) then ceil (size (qns_bam , "Gib" )) else 0 )
784+ String disk_type = if (localize_bam ) then "SSD" else "HDD"
762785 runtime {
763786 cpu : 1
764787 memory : "4 GiB"
765- disks : "local-disk 10 HDD "
788+ disks : "local-disk ~{ disk_size } ~{ disk_type } "
766789 preemptible : 2
767790 maxRetries : 1
768791 docker : "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
0 commit comments