Skip to content

Commit b64ef26

Browse files
committed
New utility tasks to:
* verify primrose was run on PacBio BAM
* get basecall model from ONT BAM
1 parent a019403 commit b64ef26

File tree

2 files changed

+85
-0
lines changed

2 files changed

+85
-0
lines changed

wdl/tasks/Utility/ONTUtils.wdl

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,3 +229,45 @@ task PartitionManifest {
229229
}
230230
}
231231

232+
task GetBasecallModel {
    meta {
        description: "Getting the basecall model string of an ONT BAM"
    }
    parameter_meta {
        bam: {
            description: "BAM to operate on",
            localization_optional: true
        }
        runid_2_model: "The basecall model for each run."
    }
    input {
        File bam
    }
    output {
        # results.tsv holds one "<runid>\t<basecall_model>" line per distinct pair.
        Map[String, String] runid_2_model = read_map("results.tsv")
    }

    command <<<
        set -eux

        # Fetch the token with xtrace switched off: `set -x` would otherwise echo
        # the expanded assignment, token included, into the task's stderr log.
        # Assign and export separately so that a gcloud failure trips `set -e`
        # (the exit status of `export VAR=$(...)` is that of `export`).
        set +x
        GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
        export GCS_OAUTH_TOKEN
        set -x

        # Only the header is read; samtools is given the path as-is because the
        # input is declared localization_optional.
        samtools view -H "~{bam}" > header.txt

        # One @RG header line per read group. grep exits non-zero when the header
        # has no @RG line at all, which (under `set -e`) fails the task.
        grep "^@RG" header.txt > one_rg_per_line.txt

        touch results.unsorted.tsv
        while IFS= read -r line
        do
            # Break the read group into its tab-separated tags, keep the DS tag,
            # strip the "DS:" prefix, and split its space-separated key=value items.
            printf '%s\n' "$line" | tr '\t' '\n' | grep "^DS:" | sed "s/^DS://" | tr ' ' '\n' > tmp.txt
            runid=$(grep "^runid=" tmp.txt | awk -F '=' '{print $2}')
            model=$(grep "^basecall_model=" tmp.txt | awk -F '=' '{print $2}')

            # A read group whose DS tag lacks either item would yield a line with an
            # empty column, which cannot become a usable key/value pair; skip it
            # and say so instead of corrupting the table.
            if [ -z "${runid}" ] || [ -z "${model}" ]; then
                echo "WARNING: skipping read group without runid/basecall_model in DS: ${line}" >&2
                continue
            fi
            printf '%s\t%s\n' "${runid}" "${model}" >> results.unsorted.tsv
        done < one_rg_per_line.txt

        # Fail with a clear message when nothing could be extracted.
        if [ ! -s results.unsorted.tsv ]; then
            echo "ERROR: no read group in the BAM header carries both runid and basecall_model" >&2
            exit 1
        fi

        # Several read groups may share one run id; collapse identical lines so
        # the table does not contain repeated keys.
        sort -u results.unsorted.tsv > results.tsv
    >>>

    runtime {
        cpu:            1
        memory:         "4 GiB"
        disks:          "local-disk 10 HDD"
        preemptible:    2
        maxRetries:     1
        docker:         "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
    }
}

wdl/tasks/Utility/PBUtils.wdl

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1280,3 +1280,46 @@ task SummarizePBI {
12801280
docker: select_first([runtime_attr.docker, default_attr.docker])
12811281
}
12821282
}
1283+
1284+
# todo: primrose is rebranded as jasmine, take care of that later
1285+
task VerifyPacBioBamHasAppropriatePrimroseRuns {
    meta {
        description: "Verify that a PacBio's BAM has primrose run on all its read groups"
    }
    parameter_meta {
        bam: "Path to the BAM whose header is inspected; passed to samtools as a string, so it is not localized."
        readgroups_missing_primrose: "Movie names (read group PU values) that do not appear in any primrose @PG command line; empty when every movie is covered."
    }
    input {
        String bam
    }

    output {
        Array[String] readgroups_missing_primrose = read_lines("movies_without_primrose.txt")
    }

    command <<<
        set -eux

        # Fetch the token with xtrace switched off: `set -x` would otherwise echo
        # the expanded assignment, token included, into the task's stderr log.
        # Assign and export separately so that a gcloud failure trips `set -e`
        # (the exit status of `export VAR=$(...)` is that of `export`).
        set +x
        GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
        export GCS_OAUTH_TOKEN
        set -x

        samtools view -H "~{bam}" > header.txt

        # get read groups' movies (the value of each @RG line's PU tag)
        grep "^@RG" header.txt | tr '\t' '\n' | grep "^PU:" | awk -F ':' '{print $2}' | sort > readgroup.movies.txt
        cat readgroup.movies.txt

        # get the command lines (CL tag) of primrose PG lines.
        # grep exits 1 when nothing matches; without `|| true` a BAM that never
        # went through primrose would abort the task under `set -e` instead of
        # being reported with all of its movies listed as missing.
        grep "^@PG" header.txt | grep -F 'ID:primrose' | tr '\t' '\n' | grep '^CL:' > primrose.pg.lines.txt || true
        tr ' ' '\n' < primrose.pg.lines.txt

        # A movie counts as covered when its name occurs anywhere in a primrose
        # command line; -F matches the name literally rather than as a regex.
        touch movies_without_primrose.txt
        while IFS= read -r readgroup; do
            if ! grep -qF -- "${readgroup}" primrose.pg.lines.txt; then
                echo "${readgroup}" >> movies_without_primrose.txt
            fi
        done < readgroup.movies.txt
    >>>

    runtime {
        cpu:            1
        memory:         "4 GiB"
        disks:          "local-disk 10 HDD"
        preemptible:    2
        maxRetries:     1
        docker:         "us.gcr.io/broad-dsp-lrma/lr-gcloud-samtools:0.1.3"
    }
}

0 commit comments

Comments
 (0)