Skip to content

Commit 57dcfaf

Browse files
committed
Two new workflows:
handle (ref-indepedent, alignment) a single piece of on-instrument demultiplexed CCS bam
1 parent 332ad96 commit 57dcfaf

File tree

3 files changed

+356
-0
lines changed

3 files changed

+356
-0
lines changed

.dockstore.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,12 @@ workflows:
8181
- name: PBMASIsoSeqDemultiplex
8282
subclass: wdl
8383
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/PBMASIsoSeqDemultiplex.wdl
84+
- name: ProcessOnInstrumentDemuxedChunkRefFree
85+
subclass: wdl
86+
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunkRefFree.wdl
87+
- name: ProcessOnInstrumentDemuxedChunk
88+
subclass: wdl
89+
primaryDescriptorPath: /wdl/pipelines/PacBio/Utility/ProcessOnInstrumentDemuxedChunk.wdl
8490

8591
###################################################
8692
# TechAgnostic - *mics data processing
Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
version 1.0
2+
3+
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
4+
5+
import "../../../tasks/Alignment/AlignHiFiUBAM.wdl" as ALN
6+
7+
import "../../TechAgnostic/Utility/AlignedBamQCandMetrics.wdl" as QCMetrics
8+
9+
import "../../../tasks/Utility/Finalize.wdl" as FF
10+
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE
11+
12+
workflow ProcessOnInstrumentDemuxedChunk {
13+
14+
meta {
15+
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform alignment and QC check, and collect a few metrics."
16+
}
17+
18+
parameter_meta {
19+
20+
#########
21+
# inputs
22+
readgroup_id:
23+
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>; no whitespaces allowed"
24+
25+
bam_SM_field:
26+
"value to place in the SM field of the resulting BAM header's @RG line"
27+
28+
platform:
29+
"PacBio platform used for generating the data; accepted value: [Sequel, Revio]"
30+
31+
bam_descriptor:
32+
"a one-word description of the purpose of the BAM (e.g. 'rg_post_aln'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)"
33+
34+
gcs_out_root_dir:
35+
"output files will be copied over there"
36+
37+
cov_bed:
38+
"An optional BED file on which coverage will be collected (a mean value for each interval)"
39+
cov_bed_descriptor:
40+
"A short description of the BED provided for targeted coverage estimation; will be used in naming output files."
41+
42+
fingerprint_store:
43+
"A GCS 'folder' holding fingerprint VCF files"
44+
sample_id_at_store:
45+
"The ID of the sample supposedly this BAM belongs to; note that the fingerprint VCF is assumed to be located at {fingerprint_store}/{sample_id_at_store}*.vcf(.gz?)"
46+
47+
vbid2_config_json:
48+
"A config json to for running the VBID2 contamination estimation sub-workflow; if provided, will trigger the VBID2 sub-workflow for cross-(human)individual contamination estimation."
49+
50+
expected_sex_type:
51+
"If provided, triggers sex concordance check. Accepted value: [M, F, NA, na]"
52+
53+
check_postaln_methyl_tags:
54+
"If true, will run a sub-workflow to collect methylation tags information."
55+
56+
#########
57+
# outputs
58+
wgs_cov:
59+
"whole genome mean coverage"
60+
61+
nanoplot_summ:
62+
"Summary on alignment metrics provided by Nanoplot (todo: study the value of this output)"
63+
64+
sam_flag_stats:
65+
"SAM flag stats"
66+
67+
contamination_est:
68+
"cross-(human)individual contamination estimation by VerifyBAMID2"
69+
70+
inferred_sex_info:
71+
"Inferred sex concordance information if expected sex type is provided"
72+
73+
methyl_tag_simple_stats:
74+
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."
75+
76+
aBAM_metrics_files_out:
77+
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
78+
}
79+
80+
input {
81+
File uBAM
82+
File? uPBI
83+
84+
String readgroup_id
85+
86+
String bam_SM_field
87+
88+
String platform
89+
90+
File ref_map_file
91+
92+
String gcs_out_root_dir
93+
94+
String disk_type = "SSD"
95+
96+
# args for optional QC subworkflows
97+
String bam_descriptor
98+
String tech
99+
100+
File? cov_bed
101+
String? cov_bed_descriptor
102+
103+
String? fingerprint_store
104+
String? sample_id_at_store
105+
106+
File? vbid2_config_json
107+
String? expected_sex_type
108+
Boolean check_postaln_methyl_tags = true
109+
}
110+
111+
output {
112+
String last_processing_date = today.yyyy_mm_dd
113+
114+
File aligned_bam = FinalizeAlignedBam.gcs_path
115+
File aligned_bai = FinalizeAlignedBai.gcs_path
116+
File aligned_pbi = FinalizeAlignedPbi.gcs_path
117+
118+
String movie = AlignHiFiUBAM.movie
119+
120+
# metrics block (caveat: always has to keep an eye on the QC subworkflow about outputs)
121+
Float wgs_cov = QCandMetrics.wgs_cov
122+
Map[String, Float] nanoplot_summ = QCandMetrics.nanoplot_summ
123+
Map[String, Float] sam_flag_stats = QCandMetrics.sam_flag_stats
124+
125+
# fingerprint
126+
Map[String, String]? fingerprint_check = QCandMetrics.fingerprint_check
127+
128+
# contam
129+
Float? contamination_est = QCandMetrics.contamination_est
130+
131+
# sex concordance
132+
Map[String, String]? inferred_sex_info = QCandMetrics.inferred_sex_info
133+
134+
# methyl
135+
Map[String, String]? methyl_tag_simple_stats = QCandMetrics.methyl_tag_simple_stats
136+
137+
# file-based QC/metrics outputs all packed into a finalization map
138+
Map[String, String] aBAM_metrics_files_out = QCandMetrics.aBAM_metrics_files_out
139+
}
140+
141+
###################################################################################
142+
# prep work
143+
144+
# where to store final results
145+
String workflow_name = "ProcessOnInstrumentDemuxedChunk"
146+
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name
147+
148+
# String bc_specific_out = outdir + '/' + readgroup_id
149+
String bc_specific_aln_out = outdir + '/alignments/' + readgroup_id
150+
String bc_specific_metrics_out = outdir + "/metrics/" + readgroup_id
151+
152+
###################################################################################
153+
# align
154+
call ALN.AlignHiFiUBAM as AlignHiFiUBAM { input:
155+
uBAM = uBAM,
156+
uPBI = uPBI,
157+
bam_sample_name = bam_SM_field,
158+
ref_map_file = ref_map_file,
159+
application = 'HIFI'
160+
}
161+
162+
File aBAM = AlignHiFiUBAM.aligned_bam
163+
File aBAI = AlignHiFiUBAM.aligned_bai
164+
File aPBI = AlignHiFiUBAM.aligned_pbi
165+
166+
# save
167+
call FF.FinalizeToFile as FinalizeAlignedBam { input: outdir = bc_specific_aln_out, file = aBAM, name = readgroup_id + '.bam' }
168+
call FF.FinalizeToFile as FinalizeAlignedBai { input: outdir = bc_specific_aln_out, file = aBAI, name = readgroup_id + '.bai' }
169+
call FF.FinalizeToFile as FinalizeAlignedPbi { input: outdir = bc_specific_aln_out, file = aPBI, name = readgroup_id + '.pbi' }
170+
171+
###################################################################################
172+
# QC
173+
call QCMetrics.Work as QCandMetrics { input:
174+
bam = aBAM,
175+
bai = aBAI,
176+
177+
bam_descriptor = bam_descriptor,
178+
tech = platform,
179+
180+
cov_bed = cov_bed,
181+
cov_bed_descriptor = cov_bed_descriptor,
182+
183+
fingerprint_store = fingerprint_store,
184+
sample_id_at_store = sample_id_at_store,
185+
186+
vbid2_config_json = vbid2_config_json,
187+
expected_sex_type = expected_sex_type,
188+
check_postaln_methyl_tags = check_postaln_methyl_tags,
189+
190+
ref_map_file = ref_map_file,
191+
disk_type = disk_type,
192+
193+
output_prefix = readgroup_id, # String output_prefix = bam_sample_name + "." + flowcell
194+
gcs_out_root_dir = bc_specific_metrics_out,
195+
}
196+
197+
###################################################################################
198+
199+
call GU.GetTodayDate as today {}
200+
}
Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
version 1.0
2+
3+
4+
import "../../../tasks/Utility/BAMutils.wdl" as BU
5+
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
6+
7+
import "../../../tasks/Visualization/NanoPlot.wdl" as QC0
8+
import "../../TechAgnostic/Utility/CountTheBeans.wdl" as QC1
9+
import "../../TechAgnostic/Utility/DystPeaker.wdl" as QC2
10+
import "../../TechAgnostic/Utility/FASTQstats.wdl" as QC3
11+
12+
import "../../../tasks/Utility/Finalize.wdl" as FF
13+
import "../../TechAgnostic/Utility/SaveFilesToDestination.wdl" as SAVE
14+
15+
workflow ProcessOnInstrumentDemuxedChunkRefFree {
16+
17+
meta {
18+
desciption: "Given an on-instrument demultiplexed hifi_reads.bam, perform ref-independent prep work, and collect a few metrics"
19+
}
20+
21+
parameter_meta {
22+
readgroup_id:
23+
"Unique ID for a readgroup, for example (D|E)A[0-9]{6}-<barcode>"
24+
25+
short_reads_threshold:
26+
"a length threshold below which reads are classified as short"
27+
28+
bam_descriptor:
29+
"a one-word description of the purpose of the BAM (e.g. 'a_particular_seq_center'; used for saving the reads that don't have any MM/ML tags; doesn't need to be single-file specific)"
30+
31+
hifi_stats:
32+
"A few metrics output by Nanoplot"
33+
hifi_fq_stats:
34+
"A few metrics output by seqkit stats"
35+
36+
read_len_summaries:
37+
"A few metrics summarizing the read length distribution"
38+
read_len_peaks:
39+
"Peaks of the read length distribution (heruistic)"
40+
read_len_deciles:
41+
"Deciles of the read length distribution"
42+
43+
methyl_tag_simple_stats:
44+
"Simple stats on the reads with & without SAM methylation tags (MM/ML)."
45+
46+
uBAM_metrics_files_out:
47+
"A map where keys are summary-names and values are paths to files generated from the various QC/metrics tasks"
48+
}
49+
50+
input {
51+
File uBAM
52+
String readgroup_id
53+
54+
String bam_descriptor
55+
Int short_reads_threshold
56+
57+
String disk_type = "SSD"
58+
59+
String gcs_out_root_dir
60+
}
61+
62+
output {
63+
String movie = movie_name
64+
65+
File hifi_fq = FinalizeFQ.gcs_path
66+
67+
String last_processing_date = today.yyyy_mm_dd
68+
69+
# todo merge these two together
70+
Map[String, Float] hifi_stats = NanoPlotFromUBam.stats_map
71+
Map[String, Float] hifi_fq_stats = FASTQstats.stats
72+
73+
Map[String, String] read_len_summaries = DystPeaker.read_len_summaries
74+
Array[Int] read_len_peaks = DystPeaker.read_len_peaks
75+
Array[Int] read_len_deciles = DystPeaker.read_len_deciles
76+
77+
# methylation call rate stats
78+
Map[String, String] methyl_tag_simple_stats = NoMissingBeans.methyl_tag_simple_stats
79+
80+
# file-based outputs all packed into a finalization map
81+
Map[String, String] uBAM_metrics_files_out = FF.result
82+
}
83+
84+
###################################################################################
85+
# prep work
86+
87+
# where to store final results
88+
String workflow_name = "ProcessOnInstrumentDemuxedChunkRefFree"
89+
String outdir = sub(gcs_out_root_dir, "/$", "") + "/" + workflow_name
90+
91+
String outdir_ref_free = outdir + '/RefFree' # ideally, this isn't necessary, but it's a relic we'd not touch right now (2023-12-23)
92+
String bc_specific_out = outdir_ref_free + '/' + readgroup_id
93+
String bc_specific_metrics_out = bc_specific_out + "/metrics"
94+
95+
###################################################################################
96+
call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['PU']}
97+
String movie_name = RG.read_group_info['PU']
98+
99+
###################################################################################
100+
# convert to FASTQ (most for assembly)
101+
call BU.BamToFastq {input: bam = uBAM, prefix = "does_not_matter"}
102+
call FF.FinalizeToFile as FinalizeFQ { input:
103+
outdir = bc_specific_out,
104+
file = BamToFastq.reads_fq,
105+
name = readgroup_id + '.hifi.fq.gz'
106+
}
107+
108+
###################################################################################
109+
# more QCs and metrics
110+
111+
##########
112+
call QC0.NanoPlotFromUBam {input: uBAM = uBAM}
113+
FinalizationManifestLine a = object
114+
{files_to_save: flatten([[NanoPlotFromUBam.stats], NanoPlotFromUBam.plots]),
115+
is_singleton_file: false,
116+
destination: bc_specific_metrics_out + "/nanoplot",
117+
output_attribute_name: "nanoplot"}
118+
##########
119+
call QC1.CountTheBeans as NoMissingBeans { input:
120+
bam=uBAM,
121+
bam_descriptor=bam_descriptor,
122+
gcs_out_root_dir=bc_specific_metrics_out,
123+
use_local_ssd=disk_type=='LOCAL'
124+
}
125+
Map[String, String] methyl_out = {"missing_methyl_tag_reads":
126+
NoMissingBeans.methyl_tag_simple_stats['files_holding_reads_without_tags']}
127+
##########
128+
call QC2.DystPeaker { input:
129+
input_file=uBAM, input_is_bam=true,
130+
id=readgroup_id, short_reads_threshold=short_reads_threshold,
131+
gcs_out_root_dir=bc_specific_metrics_out
132+
}
133+
Map[String, String] read_len_out = {"read_len_hist": DystPeaker.read_len_hist,
134+
"raw_rl_file": DystPeaker.read_len_summaries['raw_rl_file']}
135+
136+
##########
137+
call QC3.FASTQstats { input:
138+
reads=BamToFastq.reads_fq,
139+
file_type='FASTQ'
140+
}
141+
142+
###################################################################################
143+
call SAVE.SaveFilestoDestination as FF { input:
144+
instructions = select_all([a]),
145+
already_finalized = select_all([methyl_out,
146+
read_len_out])
147+
}
148+
149+
call GU.GetTodayDate as today {}
150+
}

0 commit comments

Comments
 (0)