From ce90d5380c88f31f284839b2509f6ac16683ac0a Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 14:49:56 -0400 Subject: [PATCH 01/65] add peak calling --- pipelines/skylab/atac/atac.wdl | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 8918a8d8ad..2516b8088b 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -537,6 +537,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" + peakcalling_bool = true # calculate chrom size dictionary based on text file chrom_size_dict={} @@ -553,10 +554,10 @@ task CreateFragmentFile { import csv # extract CB or BB (if preindex is true) tag from bam file to create fragment file - if preindex == "true": - data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) - elif preindex == "false": - data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + if preindex: + data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) + else: + data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) # Add NHashID to metrics nhash_ID_value = "XXX" @@ -575,7 +576,6 @@ task CreateFragmentFile { with open(csv_file_path, mode='w', newline='') as file: writer = csv.writer(file) writer.writerows(flattened_data) # Write data - print(f"Dictionary successfully written to {csv_file_path}") atac_data = ad.read_h5ad("temp_metrics.h5ad") @@ -583,9 +583,23 @@ task CreateFragmentFile { atac_data.uns['NHashID'] = atac_nhash_id # calculate tsse metrics snap.metrics.tsse(atac_data, atac_gtf) - # Write new atac file atac_data.write_h5ad("~{bam_base_name}.metrics.h5ad") + # peak calling? https://kzhang.org/SnapATAC2/tutorials/diff.html#Peak-calling-at-the-cluster-level + if peakcalling_bool: + print("Peak calling using MACS3") + snap.tl.macs3(atac_data) # were not including cell type? + print("test") + print(data.uns['macs3']) + + print("Writing peak files") + for k, peaks in data.uns['macs3'].items(): + peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) + ls + + # Write atac file + print("Writing h5ad file") + atac_data.write_h5ad("~{bam_base_name}.peaks.h5ad") CODE >>> From 82c5c454daed51a1a8c748b61b78c54b3201cea1 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 15:32:09 -0400 Subject: [PATCH 02/65] added output for peaks --- pipelines/skylab/atac/atac.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 2516b8088b..61f38361d6 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -598,7 +598,7 @@ task CreateFragmentFile { ls # Write atac file - print("Writing h5ad file") + print("Writing h5ad file with peaks") atac_data.write_h5ad("~{bam_base_name}.peaks.h5ad") CODE >>> @@ -614,6 +614,7 @@ task CreateFragmentFile { output { File fragment_file = "~{bam_base_name}.fragments.tsv" File Snap_metrics = "~{bam_base_name}.metrics.h5ad" + File peaks_h5 = "~{bam_base_name}.peaks.h5ad" # test File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" } } From 6e833d2f3cf02a3ad7cf865651679aee0c8f914a Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 16:59:52 -0400 Subject: [PATCH 03/65] fix --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 61f38361d6..c547c0ce8e 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -537,7 +537,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" - peakcalling_bool = true + peakcalling_bool = True # calculate chrom size dictionary based on text file chrom_size_dict={} @@ -554,7 +554,7 @@ task CreateFragmentFile { import csv # extract CB or BB (if preindex is true) tag from bam file to create fragment file - if preindex: + if preindex == "true": data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="BB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) else: data = pp.recipe_10x_metrics("~{bam}", "~{bam_base_name}.fragments.tsv", "temp_metrics.h5ad", is_paired=True, barcode_tag="CB", chrom_sizes=chrom_size_dict, gene_anno=atac_gtf, peaks=None) From 76286b482f916296c02f21e6c703b85be8da2c6b Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 17:45:15 -0400 Subject: [PATCH 04/65] fix --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index c547c0ce8e..15e938c3e5 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -590,10 +590,10 @@ task CreateFragmentFile { print("Peak calling using MACS3") snap.tl.macs3(atac_data) # were not including cell type? print("test") - print(data.uns['macs3']) + print(atac_data.uns['macs3']) print("Writing peak files") - for k, peaks in data.uns['macs3'].items(): + for k, peaks in atac_data.uns['macs3'].items(): peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) ls From 90805ca0736cdce0c3c0e0b16c565a89ceaea06d Mon Sep 17 00:00:00 2001 From: aawdeh Date: Wed, 16 Oct 2024 18:33:48 -0400 Subject: [PATCH 05/65] test --- pipelines/skylab/atac/atac.wdl | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 15e938c3e5..6073ac0cfb 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -590,11 +590,6 @@ task CreateFragmentFile { print("Peak calling using MACS3") snap.tl.macs3(atac_data) # were not including cell type? print("test") - print(atac_data.uns['macs3']) - - print("Writing peak files") - for k, peaks in atac_data.uns['macs3'].items(): - peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) ls # Write atac file From b27011c80bd5cea6cb3d79e03b68e2494754fb52 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 17 Oct 2024 11:27:40 -0400 Subject: [PATCH 06/65] remove ls --- pipelines/skylab/atac/atac.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 6073ac0cfb..2326d27b64 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -590,7 +590,6 @@ task CreateFragmentFile { print("Peak calling using MACS3") snap.tl.macs3(atac_data) # were not including cell type? print("test") - ls # Write atac file print("Writing h5ad file with peaks") From c1f0e5709aedf2a4460695619575fb4e45a599ba Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 18 Oct 2024 09:41:05 -0400 Subject: [PATCH 07/65] test --- pipelines/skylab/atac/atac.wdl | 51 ++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 2326d27b64..e354ca13a8 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -550,6 +550,7 @@ task CreateFragmentFile { import snapatac2.preprocessing as pp import snapatac2 as snap import anndata as ad + import scanpy as sc from collections import OrderedDict import csv @@ -585,10 +586,56 @@ task CreateFragmentFile { snap.metrics.tsse(atac_data, atac_gtf) atac_data.write_h5ad("~{bam_base_name}.metrics.h5ad") - # peak calling? https://kzhang.org/SnapATAC2/tutorials/diff.html#Peak-calling-at-the-cluster-level + # Peak calling if peakcalling_bool: + print("Peak calling") + # Calculate and plot the size distribution of fragments + print("1.1 Calculating fragment size distribution") + snap.pl.frag_size_distr(atac_data) + # Need to parameterize + # Filter cells + print("1.2 Filtering cells") + snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) + print(atac_data) + # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. + print("1.3 Creating cell by bin matrix") + snap.pp.add_tile_matrix(atac_data) + print(atac_data) + # Feature selection + print("1.4 Feature selection") + snap.pp.select_features(atac_data) + # Run customized scrublet algorithm to identify potential doublets + print("1.5 Run scrublet to identify potential doublets") + snap.pp.scrublet(atac_data) + # Employ spectral embedding for dimensionality reduction + print("1.6 Employ spectral embedding for dimensionality reduction") + snap.tl.spectral(atac_data) + # Filter doublets based on scrublet scores + print("1.7 Filter doublets based on scrublet scores") + snap.pp.filter_doublets(atac_data, probability_threshold=0.5) + # Perform graph-based clustering to identify cell clusters. + # Build a k-nearest neighbour graph using snap.pp.knn + print("1.8 Perform knn graph-based clustering to identify cell clusters") + snap.pp.knn(atac_data) + # Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph + print("1.9 Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph") + snap.tl.leiden(atac_data) + # Create the cell by gene activity matrix + print("1.10 Create the cell by gene activity matrix") + gene_mat = snap.pp.make_gene_matrix(atac_data, gene_anno=atac_gtf) + # Normalize the gene matrix + print("1.11 Normalize the gene matrix") + gene_mat.obs['leiden'] = atac_data.obs['leiden'] + sc.pp.normalize_total(gene_mat) + sc.pp.log1p(gene_mat) + sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon") + + for i in np.unique(gene_mat.obs['leiden']): + markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] + print(f"Cluster {i}: {', '.join(markers)}") + print("Peak calling using MACS3") - snap.tl.macs3(atac_data) # were not including cell type? + snap.tl.macs3(atac_data, groupby='leiden') print("test") # Write atac file From 779fe9bd27d4cb0cb8efad0e2fc9842f9685c765 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 18 Oct 2024 13:23:45 -0400 Subject: [PATCH 08/65] test --- pipelines/skylab/atac/atac.wdl | 71 ++++++++++++++++++++++------------ 1 file changed, 46 insertions(+), 25 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index e354ca13a8..87b2cf9e83 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -588,44 +588,63 @@ task CreateFragmentFile { # Peak calling if peakcalling_bool: - print("Peak calling") + print("Peak calling starting...") + # Calculate and plot the size distribution of fragments - print("1.1 Calculating fragment size distribution") + print("Calculating fragment size distribution") snap.pl.frag_size_distr(atac_data) - # Need to parameterize - # Filter cells - print("1.2 Filtering cells") + print(atac_data) + + # Filter cells -- Need to parameterize + print("Filtering cells") snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) print(atac_data) + # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. - print("1.3 Creating cell by bin matrix") - snap.pp.add_tile_matrix(atac_data) - print(atac_data) + print("Creating cell by bin matrix") + atac_data_mod = snap.pp.add_tile_matrix(atac_data) + atac_data_mod.obsm = atac_data.obsm + print(atac_data_mod) + # Feature selection - print("1.4 Feature selection") - snap.pp.select_features(atac_data) + print("Feature selection") + snap.pp.select_features(atac_data_mod) + print(atac_data_mod) + # Run customized scrublet algorithm to identify potential doublets - print("1.5 Run scrublet to identify potential doublets") - snap.pp.scrublet(atac_data) + print("Run scrublet to identify potential doublets") + snap.pp.scrublet(atac_data_mod) + print(atac_data_mod) + # Employ spectral embedding for dimensionality reduction - print("1.6 Employ spectral embedding for dimensionality reduction") - snap.tl.spectral(atac_data) + print("Employ spectral embedding for dimensionality reduction") + snap.tl.spectral(atac_data_mod) + print(atac_data_mod) + # Filter doublets based on scrublet scores - print("1.7 Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data, probability_threshold=0.5) + print("Filter doublets based on scrublet scores") + snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) + print(atac_data_mod) + # Perform graph-based clustering to identify cell clusters. # Build a k-nearest neighbour graph using snap.pp.knn - print("1.8 Perform knn graph-based clustering to identify cell clusters") - snap.pp.knn(atac_data) + print("Perform knn graph-based clustering to identify cell clusters") + snap.pp.knn(atac_data_mod) + print(atac_data_mod) + # Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph - print("1.9 Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph") - snap.tl.leiden(atac_data) + print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph") + snap.tl.leiden(atac_data_mod) + print(atac_data_mod) + # Create the cell by gene activity matrix - print("1.10 Create the cell by gene activity matrix") - gene_mat = snap.pp.make_gene_matrix(atac_data, gene_anno=atac_gtf) + print("Create the cell by gene activity matrix") + gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf) + print(atac_data_mod) + # Normalize the gene matrix - print("1.11 Normalize the gene matrix") - gene_mat.obs['leiden'] = atac_data.obs['leiden'] + print("Normalize the gene matrix") + gene_mat.obs['leiden'] = atac_data_mod.obs['leiden'] sc.pp.normalize_total(gene_mat) sc.pp.log1p(gene_mat) sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon") @@ -635,7 +654,9 @@ task CreateFragmentFile { print(f"Cluster {i}: {', '.join(markers)}") print("Peak calling using MACS3") - snap.tl.macs3(atac_data, groupby='leiden') + snap.tl.macs3(atac_data_mod, groupby='leiden') + + print(atac_data_mod.uns['macs3']) print("test") # Write atac file From 5743eff9753a3ccb183730ea275c59003aa4c89c Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 18 Oct 2024 13:38:54 -0400 Subject: [PATCH 09/65] test --- pipelines/skylab/atac/atac.wdl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 87b2cf9e83..1d3a03d2fe 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -527,7 +527,8 @@ task CreateFragmentFile { command <<< set -e pipefail - + pip3 install snapatac2==2.7.0 scanpy + python3 < Date: Fri, 18 Oct 2024 14:09:06 -0400 Subject: [PATCH 10/65] test --- pipelines/skylab/atac/atac.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 1d3a03d2fe..ab32f30397 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -605,6 +605,7 @@ task CreateFragmentFile { print("Creating cell by bin matrix") atac_data_mod = snap.pp.add_tile_matrix(atac_data) atac_data_mod.obsm = atac_data.obsm + new_adata.uns["reference_sequences"] = atac_data.uns["reference_sequences"] print(atac_data_mod) # Feature selection From 1de1c85103a0edb23d234bce4adc714f7d000bc2 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 18 Oct 2024 15:38:37 -0400 Subject: [PATCH 11/65] test --- pipelines/skylab/atac/atac.wdl | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ab32f30397..d7d2e5b34f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -526,7 +526,9 @@ task CreateFragmentFile { } command <<< - set -e pipefail + set -eou pipefail + set -x + # install packages -- need to add to the docker pip3 install snapatac2==2.7.0 scanpy python3 < Date: Fri, 18 Oct 2024 15:59:36 -0400 Subject: [PATCH 12/65] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index d7d2e5b34f..90953769fa 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -540,7 +540,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" - peakcalling_bool = False + peakcalling_bool = True # calculate chrom size dictionary based on text file chrom_size_dict={} From 13a44f9ff6eef6356d34a528f2aeb88f7a348581 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 10:11:17 -0400 Subject: [PATCH 13/65] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 90953769fa..ae563eb49a 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -607,7 +607,7 @@ task CreateFragmentFile { print("Creating cell by bin matrix") atac_data_mod = snap.pp.add_tile_matrix(atac_data) print("set obsm") - atac_data_mod.obsm = atac_data.obsm + atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"] print("set uns") new_adata.uns["reference_sequences"] = atac_data.uns["reference_sequences"] print(atac_data_mod) From 3f12b9e3dc84031a401132792d6c7e6ff9549c96 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 16:38:31 -0400 Subject: [PATCH 14/65] made another task --- pipelines/skylab/atac/atac.wdl | 198 +++++++++++++++++++-------------- 1 file changed, 113 insertions(+), 85 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ae563eb49a..db4b94de64 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -155,6 +155,13 @@ workflow ATAC { } } + call PeakCalling { + input: + bam = BWAPairedEndAlignment.bam_aligned_output, + annotations_gtf = annotations_gtf, + metrics_h5ad = CreateFragmentFile.Snap_metrics + } + File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output]) File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file]) File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics]) @@ -504,7 +511,6 @@ task CreateFragmentFile { File bam File annotations_gtf File chrom_sizes - File annotations_gtf Boolean preindex Int disk_size = 500 Int mem_size = 64 @@ -528,9 +534,7 @@ task CreateFragmentFile { command <<< set -eou pipefail set -x - # install packages -- need to add to the docker - pip3 install snapatac2==2.7.0 scanpy - + python3 <>> - # Peak calling - if peakcalling_bool: - print("Peak calling starting...") + runtime { + docker: docker_path + disks: "local-disk ${disk_size} SSD" + memory: "${mem_size} GiB" + cpu: nthreads + cpuPlatform: cpuPlatform + } + + output { + File fragment_file = "~{bam_base_name}.fragments.tsv" + File Snap_metrics = "~{bam_base_name}.metrics.h5ad" + File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" + } +} - # Calculate and plot the size distribution of fragments - print("Calculating fragment size distribution") - snap.pl.frag_size_distr(atac_data) - print(atac_data) - # Filter cells -- Need to parameterize - print("Filtering cells") - snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) - print(atac_data) +task PeakCalling { + input { + File bam + File annotations_gtf + File metrics_h5ad + } + String bam_base_name = basename(bam, ".bam") + + command <<< + set -euo pipefail + set -x + + # install packages -- need to add to the docker + pip3 install snapatac2==2.7.0 scanpy + + python3 <>> - - runtime { - docker: docker_path - disks: "local-disk ${disk_size} SSD" - memory: "${mem_size} GiB" - cpu: nthreads - cpuPlatform: cpuPlatform - } - + output { - File fragment_file = "~{bam_base_name}.fragments.tsv" - File Snap_metrics = "~{bam_base_name}.metrics.h5ad" - File peaks_h5 = "~{bam_base_name}.peaks.h5ad" # test - File atac_library_metrics = "~{bam_base_name}_~{atac_nhash_id}.atac_metrics.csv" + File peaks_h5ad = "~{bam_base_name}.peaks.h5ad" } -} + +} \ No newline at end of file From 08b2f2410dca33918a15456bd4963959cbbc6170 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 17:04:27 -0400 Subject: [PATCH 15/65] test --- pipelines/skylab/atac/atac.wdl | 38 +++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index db4b94de64..1a8a5533bd 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -153,14 +153,14 @@ workflow ATAC { atac_nhash_id = atac_nhash_id } - } - - call PeakCalling { + call PeakCalling { input: bam = BWAPairedEndAlignment.bam_aligned_output, annotations_gtf = annotations_gtf, - metrics_h5ad = CreateFragmentFile.Snap_metrics + metrics_h5ad = CreateFragmentFile.Snap_metrics, + docker_path = docker_prefix + snap_atac_docker } + } File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output]) File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file]) @@ -267,7 +267,6 @@ task GetNumSplits { } } - # trim read 1 and read 2 adapter sequeunce with cutadapt task TrimAdapters { input { @@ -615,8 +614,21 @@ task PeakCalling { File bam File annotations_gtf File metrics_h5ad + # copied from previous task + Int disk_size = 500 + Int mem_size = 64 + Int nthreads = 4 + String docker_path } String bam_base_name = basename(bam, ".bam") + + parameter_meta { + bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task." + annotations_gtf: "GTF for SnapATAC2 to calculate TSS sites of fragment file." + disk_size: "Disk size used in create fragment file step." + mem_size: "The size of memory used in create fragment file." + docker_path: "The docker image path containing the runtime environment for this task" + } command <<< set -euo pipefail @@ -627,12 +639,17 @@ task PeakCalling { python3 <>> + runtime { + docker: docker_path + disks: "local-disk ${disk_size} SSD" + memory: "${mem_size} GiB" + cpu: nthreads + } + output { File peaks_h5ad = "~{bam_base_name}.peaks.h5ad" } From 80c1416b5debd07775a8fefd0b5f0d3038a2bf6c Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 17:19:26 -0400 Subject: [PATCH 16/65] test --- pipelines/skylab/atac/atac.wdl | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 1a8a5533bd..d0ef3ee92e 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -510,6 +510,7 @@ task CreateFragmentFile { File bam File annotations_gtf File chrom_sizes + File annotations_gtf Boolean preindex Int disk_size = 500 Int mem_size = 64 @@ -531,8 +532,7 @@ task CreateFragmentFile { } command <<< - set -eou pipefail - set -x + set -e pipefail python3 <>> @@ -608,7 +609,6 @@ task CreateFragmentFile { } } - task PeakCalling { input { File bam @@ -666,8 +666,10 @@ task PeakCalling { atac_data_mod = snap.pp.add_tile_matrix(atac_data) print("set obsm") atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"] - print("set uns") - new_adata.uns["reference_sequences"] = atac_data.uns["reference_sequences"] + print("set all uns") + for key in atac_data.uns.keys(): + print("set ",key) + new_adata.uns[key] = atac_data.uns[key] print(atac_data_mod) # Feature selection From b5191e526d28b1b8a83dc89c7a792b62d089351e Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 17:44:49 -0400 Subject: [PATCH 17/65] test --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index d0ef3ee92e..6a39a3472d 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -663,13 +663,13 @@ task PeakCalling { # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. print("Creating cell by bin matrix") - atac_data_mod = snap.pp.add_tile_matrix(atac_data) + atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False) print("set obsm") atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"] print("set all uns") for key in atac_data.uns.keys(): print("set ",key) - new_adata.uns[key] = atac_data.uns[key] + atac_data_mod.uns[key] = atac_data.uns[key] print(atac_data_mod) # Feature selection From 4725080516d8cffaa47382c659c3989bbb72775e Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 21 Oct 2024 18:49:20 -0400 Subject: [PATCH 18/65] test --- pipelines/skylab/atac/atac.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 6a39a3472d..28d746600d 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -642,6 +642,7 @@ task PeakCalling { # use snap atac2 import snapatac2 as snap import scanpy as sc + import numpy as np bam = "~{bam}" bam_base_name = "~{bam_base_name}" From f139f84743b66aefe5cab0a0c16d7e6838ed7179 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 22 Oct 2024 08:51:50 -0400 Subject: [PATCH 19/65] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 28d746600d..8af59d2a3f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -721,7 +721,7 @@ task PeakCalling { print(f"Cluster {i}: {', '.join(markers)}") print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden') + snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") print("test") From 9a0ea9b47e5de7dec7d968a001894247351d225e Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 22 Oct 2024 10:08:42 -0400 Subject: [PATCH 20/65] test --- pipelines/skylab/atac/atac.wdl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 8af59d2a3f..ccb474acc2 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -720,8 +720,9 @@ task PeakCalling { markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] print(f"Cluster {i}: {', '.join(markers)}") - print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) + if __name__ == '__main__': + print("Peak calling using MACS3") + snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") print("test") From 04cf0d2a91df97d737a70ec562fe61cc1301fe15 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 22 Oct 2024 10:28:26 -0400 Subject: [PATCH 21/65] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ccb474acc2..5227c534ff 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -722,7 +722,7 @@ task PeakCalling { if __name__ == '__main__': print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) + snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8) atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") print("test") From f9bc8718b488bc13d1e20028fe56a2180d81013b Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 24 Oct 2024 12:58:18 -0400 Subject: [PATCH 22/65] change a line --- pipelines/skylab/atac/atac.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 5227c534ff..9576c42602 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -637,6 +637,9 @@ task PeakCalling { # install packages -- need to add to the docker pip3 install snapatac2==2.7.0 scanpy + file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py + sed -i '164s/.*/\tpeaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change + python3 < Date: Thu, 24 Oct 2024 14:44:52 -0400 Subject: [PATCH 23/65] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 9576c42602..5c57a7c6ba 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -638,7 +638,7 @@ task PeakCalling { pip3 install snapatac2==2.7.0 scanpy file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/\tpeaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change + sed -i '164s/.*/peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change python3 < Date: Thu, 24 Oct 2024 15:10:32 -0400 Subject: [PATCH 24/65] test --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 5c57a7c6ba..60bd780f22 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -638,7 +638,7 @@ task PeakCalling { pip3 install snapatac2==2.7.0 scanpy file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change + sed -i '164s/.*/\t\t\t\tpeaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change python3 < Date: Thu, 24 Oct 2024 16:00:38 -0400 Subject: [PATCH 25/65] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 60bd780f22..563796f72f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -638,7 +638,7 @@ task PeakCalling { pip3 install snapatac2==2.7.0 scanpy file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/\t\t\t\tpeaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change + sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change python3 < Date: Thu, 24 Oct 2024 18:50:22 -0400 Subject: [PATCH 26/65] test --- pipelines/skylab/atac/atac.wdl | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 563796f72f..e11f6853c2 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -646,6 +646,7 @@ task PeakCalling { import snapatac2 as snap import scanpy as sc import numpy as np + import polars as pl bam = "~{bam}" bam_base_name = "~{bam_base_name}" @@ -723,11 +724,29 @@ task PeakCalling { markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] print(f"Cluster {i}: {', '.join(markers)}") - if __name__ == '__main__': - print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8) - - atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") + + print("Peak calling using MACS3") + snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8) + + print("NarrowPeak") + for k, peaks in atac_data_mod.uns['macs3'].items(): + peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) + + print("test") + snap.ex.export_coverage( + atac_data_mod, + groupby='leiden', + ) + + print("Convert pl.DataFrame to pandas DataFrame") + # Convert pl.DataFrame to pandas DataFrame + for key in new_adata.uns.keys(): + if isinstance(new_adata.uns[key], pl.DataFrame): + print(key) + new_adata.uns[key] = new_adata.uns[key].to_pandas() + + print("Write into h5ad file") + atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad", compression='gzip') print("test") CODE From b15b146281e944f4a809ac364870a84fc387b458 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 24 Oct 2024 18:50:52 -0400 Subject: [PATCH 27/65] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index e11f6853c2..f645309639 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -746,7 +746,7 @@ task PeakCalling { new_adata.uns[key] = new_adata.uns[key].to_pandas() print("Write into h5ad file") - atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad", compression='gzip') + atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") print("test") CODE From ff36c155c5b010c117ed3d34c9574344f54e0fe2 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 24 Oct 2024 18:51:29 -0400 Subject: [PATCH 28/65] test --- pipelines/skylab/atac/atac.wdl | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index f645309639..dc41406540 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -724,20 +724,9 @@ task PeakCalling { markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] print(f"Cluster {i}: {', '.join(markers)}") - print("Peak calling using MACS3") snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=8) - print("NarrowPeak") - for k, peaks in atac_data_mod.uns['macs3'].items(): - peaks.to_csv(f'{k}.NarrowPeak', sep='\t', header=False, index=False) - - print("test") - snap.ex.export_coverage( - atac_data_mod, - groupby='leiden', - ) - print("Convert pl.DataFrame to pandas DataFrame") # Convert pl.DataFrame to pandas DataFrame for key in new_adata.uns.keys(): From 27a419678613a1a624cdbcab2f7426a0476a9e34 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Thu, 24 Oct 2024 19:49:17 -0400 Subject: [PATCH 29/65] test --- pipelines/skylab/atac/atac.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index dc41406540..41abc21bcd 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -729,10 +729,10 @@ task PeakCalling { print("Convert pl.DataFrame to pandas DataFrame") # Convert pl.DataFrame to pandas DataFrame - for key in new_adata.uns.keys(): - if isinstance(new_adata.uns[key], pl.DataFrame): + for key in atac_data_mod.uns.keys(): + if isinstance(atac_data_mod.uns[key], pl.DataFrame): print(key) - new_adata.uns[key] = new_adata.uns[key].to_pandas() + atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() print("Write into h5ad file") atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") From 987b5d2d6f0bb8201d13e7fd1936eb992dba03e0 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 28 Oct 2024 15:02:49 -0400 Subject: [PATCH 30/65] Add peakcall to task --- pipelines/skylab/atac/atac.wdl | 96 +++++++++++++++++++++++++++++++++- 1 file changed, 95 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 41abc21bcd..6b563244e9 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -532,7 +532,14 @@ task CreateFragmentFile { } command <<< - set -e pipefail + set -euo pipefail + set -x + + # install packages -- need to add to the docker + pip3 install snapatac2==2.7.0 scanpy + + file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py + sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change python3 <>> From cce58417212c7d576a7a049cd66579d010bf3cae Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 28 Oct 2024 15:21:15 -0400 Subject: [PATCH 31/65] test --- pipelines/skylab/atac/atac.wdl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 6b563244e9..265bb5e90e 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -562,6 +562,9 @@ task CreateFragmentFile { # use snap atac2 import snapatac2.preprocessing as pp import snapatac2 as snap + import scanpy as sc + import numpy as np + import polars as pl import anndata as ad from collections import OrderedDict import csv From bf0e9ce33829aafd1166d96fc08c6183effda6da Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 28 Oct 2024 17:57:44 -0400 Subject: [PATCH 32/65] add parameters --- pipelines/skylab/atac/atac.wdl | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 265bb5e90e..974578dbd7 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -710,12 +710,17 @@ task PeakCalling { input { File bam File annotations_gtf - File metrics_h5ad - # copied from previous task + File metrics_h5ad + # SnapATAC2 parameters + Int min_counts = 5000 + Int min_tsse = 10 + Int max_counts = 100000 + Int doublet_probability_threshold = 0.5 + # Runtime attributes/docker + String docker_path Int disk_size = 500 Int mem_size = 64 - Int nthreads = 4 - String docker_path + Int nthreads = 4 } String bam_base_name = basename(bam, ".bam") @@ -749,6 +754,10 @@ task PeakCalling { bam_base_name = "~{bam_base_name}" atac_gtf = "~{annotations_gtf}" metrics_h5ad = "~{metrics_h5ad}" + min_counts = "~{min_counts}" + min_tsse = "~{min_tsse}" + max_counts = "~{max_counts}" + probability_threshold = "~{doublet_probability_threshold}" print("Peak calling starting...") atac_data = snap.read(metrics_h5ad) @@ -760,7 +769,7 @@ task PeakCalling { # Filter cells -- Need to parameterize print("Filtering cells") - snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) + snap.pp.filter_cells(atac_data, min_counts=min_counts, min_tsse=min_tsse, max_counts=max_counts) print(atac_data) # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. @@ -791,7 +800,7 @@ task PeakCalling { # Filter doublets based on scrublet scores print("Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) + snap.pp.filter_doublets(atac_data_mod, probability_threshold=probability_threshold) print(atac_data_mod) # Perform graph-based clustering to identify cell clusters. From 18f77d8d0189ba30ec445b88ace4ba528be8882b Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 4 Nov 2024 09:46:07 -0500 Subject: [PATCH 33/65] test --- pipelines/skylab/atac/atac.wdl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 89646be7ea..8ac108e959 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -715,7 +715,6 @@ task PeakCalling { Int min_counts = 5000 Int min_tsse = 10 Int max_counts = 100000 - Int doublet_probability_threshold = 0.5 # Runtime attributes/docker String docker_path Int disk_size = 500 @@ -757,7 +756,6 @@ task PeakCalling { min_counts = "~{min_counts}" min_tsse = "~{min_tsse}" max_counts = "~{max_counts}" - probability_threshold = "~{doublet_probability_threshold}" print("Peak calling starting...") atac_data = snap.read(metrics_h5ad) @@ -800,7 +798,7 @@ task PeakCalling { # Filter doublets based on scrublet scores print("Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data_mod, probability_threshold=probability_threshold) + snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) print(atac_data_mod) # Perform graph-based clustering to identify cell clusters. From db59231f001dfa60306e62f23cc024b1b9fdcbd4 Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Fri, 13 Dec 2024 10:24:41 -0500 Subject: [PATCH 34/65] peak calling --- pipelines/skylab/atac/atac.wdl | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 7e42a65802..87df29afa0 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -763,13 +763,8 @@ task PeakCalling { command <<< set -euo pipefail set -x - - # install packages -- need to add to the docker - pip3 install snapatac2==2.7.0 scanpy file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change - python3 < Date: Fri, 13 Dec 2024 11:00:33 -0500 Subject: [PATCH 35/65] remove bam_base_name from CreateFragments task --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 3290d22498..0a55334c15 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -625,7 +625,7 @@ task CreateFragmentFile { if peakcalling_bool: print("Peak calling starting...") - atac_data = snap.read("~{bam_base_name}.metrics.h5ad") + atac_data = snap.read("~{input_id}.metrics.h5ad") # Calculate and plot the size distribution of fragments print("Calculating fragment size distribution") @@ -706,7 +706,7 @@ task CreateFragmentFile { atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() print("Write into h5ad file") - atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") + atac_data_mod.write_h5ad("~{input_id}.peaks.h5ad") print("test") CODE From 987e1e527e5cf1d4de8436aaa4ccdaddb7c79c48 Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Fri, 13 Dec 2024 11:10:40 -0500 Subject: [PATCH 36/65] separate task for macs3 consistent with create fragments --- pipelines/skylab/atac/atac.wdl | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 0a55334c15..e03edb7e2c 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -542,12 +542,6 @@ task CreateFragmentFile { set -euo pipefail set -x - # install packages -- need to add to the docker - pip3 install snapatac2==2.7.0 scanpy - - file_to_change=/usr/local/lib/python3.10/site-packages/snapatac2/tools/_call_peaks.py - sed -i '164s/.*/ peaks = [_call_peaks(x) for x in fragments.values()]/' $file_to_change - python3 < Date: Mon, 16 Dec 2024 09:40:14 -0500 Subject: [PATCH 37/65] Update atac.wdl update snap_atac_docker tag to 2.0.0 --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index e03edb7e2c..ffd4ae3f0e 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -61,7 +61,7 @@ workflow ATAC { String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919" String samtools_docker = "samtools-dist-bwa:3.0.0" String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311" - String snap_atac_docker = "snapatac2:1.1.0" + String snap_atac_docker = "snapatac2:2.0.0" # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error if ((cloud_provider != "gcp") && (cloud_provider != "azure")) { From b136ba30e94df8895f0def5592c012cd87be7628 Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Mon, 16 Dec 2024 10:26:30 -0500 Subject: [PATCH 38/65] snapatac2 updates --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ffd4ae3f0e..a2375ad802 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -551,7 +551,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" - peakcalling_bool = True + peakcalling_bool = True # set peakcalling expected_cells = ~{atac_expected_cells} # calculate chrom size dictionary based on text file From 573a29fea508cc75552bcd4b76b7bfddb8d02817 Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Mon, 13 Jan 2025 15:12:49 -0500 Subject: [PATCH 39/65] turn off createfragments peakcalling --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index a2375ad802..868ddbf161 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -551,7 +551,7 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" - peakcalling_bool = True # set peakcalling + peakcalling_bool = False # set peakcalling expected_cells = ~{atac_expected_cells} # calculate chrom size dictionary based on text file From 7b8d1fcb82d503f2ed12b66d9f33511ceca7d8ee Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Tue, 14 Jan 2025 08:53:19 -0500 Subject: [PATCH 40/65] remove peakcalling from createfragments and parameterize probability_threshold --- pipelines/skylab/atac/atac.wdl | 93 ++-------------------------------- 1 file changed, 3 insertions(+), 90 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 868ddbf161..f9edb0362b 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -551,7 +551,6 @@ task CreateFragmentFile { atac_gtf = "~{annotations_gtf}" preindex = "~{preindex}" atac_nhash_id = "~{atac_nhash_id}" - peakcalling_bool = False # set peakcalling expected_cells = ~{atac_expected_cells} # calculate chrom size dictionary based on text file @@ -587,8 +586,7 @@ task CreateFragmentFile { atac_percent_target = number_of_cells / expected_cells*100 print("Setting percent target in nested dictionary") data['Cells']['atac_percent_target'] = atac_percent_target - - + # Flatten the dictionary flattened_data = [] for category, metrics in data.items(): @@ -617,92 +615,6 @@ task CreateFragmentFile { # Write new atac file atac_data.write_h5ad("~{input_id}.metrics.h5ad") - if peakcalling_bool: - print("Peak calling starting...") - atac_data = snap.read("~{input_id}.metrics.h5ad") - - # Calculate and plot the size distribution of fragments - print("Calculating fragment size distribution") - snap.pl.frag_size_distr(atac_data) - print(atac_data) - - # Filter cells -- Need to parameterize - print("Filtering cells") - snap.pp.filter_cells(atac_data, min_counts=5000, min_tsse=10, max_counts=100000) - print(atac_data) - - # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins. - print("Creating cell by bin matrix") - atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False) - print("set obsm") - atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"] - print("set all uns") - for key in atac_data.uns.keys(): - print("set ",key) - atac_data_mod.uns[key] = atac_data.uns[key] - print(atac_data_mod) - - # Feature selection - print("Feature selection") - snap.pp.select_features(atac_data_mod) - print(atac_data_mod) - - # Run customized scrublet algorithm to identify potential doublets - print("Run scrublet to identify potential doublets") - snap.pp.scrublet(atac_data_mod) - print(atac_data_mod) - - # Employ spectral embedding for dimensionality reduction - print("Employ spectral embedding for dimensionality reduction") - snap.tl.spectral(atac_data_mod) - print(atac_data_mod) - - # Filter doublets based on scrublet scores - print("Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) - print(atac_data_mod) - - # Perform graph-based clustering to identify cell clusters. - # Build a k-nearest neighbour graph using snap.pp.knn - print("Perform knn graph-based clustering to identify cell clusters") - snap.pp.knn(atac_data_mod) - print(atac_data_mod) - - # Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph - print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph") - snap.tl.leiden(atac_data_mod) - print(atac_data_mod) - - # Create the cell by gene activity matrix - print("Create the cell by gene activity matrix") - gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf) - print(atac_data_mod) - - # Normalize the gene matrix - print("Normalize the gene matrix") - gene_mat.obs['leiden'] = atac_data_mod.obs['leiden'] - sc.pp.normalize_total(gene_mat) - sc.pp.log1p(gene_mat) - sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon") - - for i in np.unique(gene_mat.obs['leiden']): - markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names'] - print(f"Cluster {i}: {', '.join(markers)}") - - print("Peak calling using MACS3") - snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) - - print("Convert pl.DataFrame to pandas DataFrame") - # Convert pl.DataFrame to pandas DataFrame - for key in atac_data_mod.uns.keys(): - if isinstance(atac_data_mod.uns[key], pl.DataFrame): - print(key) - atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() - - print("Write into h5ad file") - atac_data_mod.write_h5ad("~{input_id}.peaks.h5ad") - print("test") - CODE # sorting the file @@ -740,6 +652,7 @@ task PeakCalling { Int min_counts = 5000 Int min_tsse = 10 Int max_counts = 100000 + Float probability_threshold = 0.5 # Runtime attributes/docker String docker_path Int disk_size = 500 @@ -817,7 +730,7 @@ task PeakCalling { # Filter doublets based on scrublet scores print("Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) + snap.pp.filter_doublets(atac_data_mod, probability_threshold=probability_threshold) print(atac_data_mod) # Perform graph-based clustering to identify cell clusters. From 0689868c2543282088c020bc575c35e594660dad Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Tue, 14 Jan 2025 15:54:40 -0500 Subject: [PATCH 41/65] remove probability_threshold parameterization --- pipelines/skylab/atac/atac.wdl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index f9edb0362b..03a5f466e5 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -652,7 +652,6 @@ task PeakCalling { Int min_counts = 5000 Int min_tsse = 10 Int max_counts = 100000 - Float probability_threshold = 0.5 # Runtime attributes/docker String docker_path Int disk_size = 500 @@ -730,7 +729,7 @@ task PeakCalling { # Filter doublets based on scrublet scores print("Filter doublets based on scrublet scores") - snap.pp.filter_doublets(atac_data_mod, probability_threshold=probability_threshold) + snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5) print(atac_data_mod) # Perform graph-based clustering to identify cell clusters. From 32f4b7fb70ffba7c436c1947b1bd846ed065caca Mon Sep 17 00:00:00 2001 From: Sid Cox Date: Tue, 14 Jan 2025 17:39:55 -0500 Subject: [PATCH 42/65] remove repeated line --- pipelines/skylab/atac/atac.wdl | 1 - 1 file changed, 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 03a5f466e5..43b2ad21ec 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -517,7 +517,6 @@ task CreateFragmentFile { File bam File annotations_gtf File chrom_sizes - File annotations_gtf Boolean preindex Int disk_size = 500 Int mem_size = 64 From fffb2564a740484931853a68127adadcc08f4ac9 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Tue, 14 Jan 2025 22:43:31 +0000 Subject: [PATCH 43/65] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 56 +++++++++++++++++++++---------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 5ecffd5530..729f50381e 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -1,40 +1,40 @@ Pipeline Name Version Date of Last Commit -CheckFingerprint 1.0.22 2024-10-28 -RNAWithUMIsPipeline 1.0.18 2024-11-04 -AnnotationFiltration 1.2.7 2024-11-04 -UltimaGenomicsWholeGenomeGermline 1.1.2 2024-11-04 -WholeGenomeGermlineSingleSample 3.3.3 2024-11-04 -ExomeGermlineSingleSample 3.2.3 2024-11-04 -JointGenotypingByChromosomePartTwo 1.5.2 2024-11-04 -JointGenotypingByChromosomePartOne 1.5.2 2024-11-04 +ExomeReprocessing 3.3.3 2024-11-04 +CramToUnmappedBams 1.1.3 2024-08-02 +WholeGenomeReprocessing 3.3.3 2024-11-04 +ExternalExomeReprocessing 3.3.3 2024-11-04 +ExternalWholeGenomeReprocessing 2.3.3 2024-11-04 +UltimaGenomicsJointGenotyping 1.2.2 2024-11-04 ReblockGVCF 2.3.2 2024-11-04 +JointGenotypingByChromosomePartOne 1.5.2 2024-11-04 +JointGenotypingByChromosomePartTwo 1.5.2 2024-11-04 JointGenotyping 1.7.2 2024-11-04 -UltimaGenomicsJointGenotyping 1.2.2 2024-11-04 +ExomeGermlineSingleSample 3.2.3 2024-11-04 +WholeGenomeGermlineSingleSample 3.3.3 2024-11-04 +UltimaGenomicsWholeGenomeGermline 1.1.2 2024-11-04 VariantCalling 2.2.4 2024-11-04 -UltimaGenomicsWholeGenomeCramOnly 1.0.23 2024-11-04 GDCWholeGenomeSomaticSingleSample 1.3.4 2024-11-04 -BroadInternalRNAWithUMIs 1.0.36 2024-11-04 -BroadInternalUltimaGenomics 1.1.2 2024-11-04 -BroadInternalArrays 1.1.14 2024-11-04 -BroadInternalImputation 1.1.14 2024-11-04 -Arrays 2.6.30 2024-11-04 +UltimaGenomicsWholeGenomeCramOnly 1.0.23 2024-11-04 +IlluminaGenotypingArray 1.12.24 2024-11-04 +AnnotationFiltration 1.2.7 2024-11-04 +CheckFingerprint 1.0.22 2024-10-28 ValidateChip 1.16.7 2024-11-04 -MultiSampleArrays 1.6.2 2024-08-02 Imputation 1.1.15 2024-11-04 -IlluminaGenotypingArray 1.12.24 2024-11-04 -ExternalWholeGenomeReprocessing 2.3.3 2024-11-04 -ExternalExomeReprocessing 3.3.3 2024-11-04 -CramToUnmappedBams 1.1.3 2024-08-02 -WholeGenomeReprocessing 3.3.3 2024-11-04 -ExomeReprocessing 3.3.3 2024-11-04 +MultiSampleArrays 1.6.2 2024-08-02 +Arrays 2.6.30 2024-11-04 +BroadInternalUltimaGenomics 1.1.2 2024-11-04 +BroadInternalImputation 1.1.14 2024-11-04 +BroadInternalArrays 1.1.14 2024-11-04 +BroadInternalRNAWithUMIs 1.0.36 2024-11-04 +RNAWithUMIsPipeline 1.0.18 2024-11-04 +Multiome 5.9.3 2024-12-3 +MultiSampleSmartSeq2SingleNucleus 2.0.6 2024-11-15 BuildIndices 3.1.0 2024-11-26 +SlideSeq 3.4.7 2024-12-3 +PairedTag 1.8.4 2024-12-3 +atac 2.5.3 2024-11-22 scATAC 1.3.2 2023-08-03 snm3C 4.0.4 2024-08-06 -Multiome 5.9.3 2024-12-3 -PairedTag 1.8.4 2024-12-3 -MultiSampleSmartSeq2 2.2.22 2024-09-11 -MultiSampleSmartSeq2SingleNucleus 2.0.6 2024-11-15 Optimus 7.8.4 2024-12-3 -atac 2.5.3 2024-11-22 +MultiSampleSmartSeq2 2.2.22 2024-09-11 SmartSeq2SingleSample 5.1.21 2024-09-11 -SlideSeq 3.4.7 2024-12-3 From cf2a77b841882fe7bc7dfc96d8eae86c4cdb63ab Mon Sep 17 00:00:00 2001 From: aawdeh Date: Sun, 19 Jan 2025 13:17:50 -0500 Subject: [PATCH 44/65] add optional parameter --- pipelines/skylab/atac/atac.wdl | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 43b2ad21ec..43b9e920f0 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -28,6 +28,8 @@ workflow ATAC { # Option for running files with preindex Boolean preindex = false + # Option for running peak calling + Boolean peak_calling = true # BWA ref File tar_bwa_reference @@ -71,7 +73,6 @@ workflow ATAC { } } - parameter_meta { read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads" read2_fastq_gzipped: "read 2 FASTQ file as input for the pipeline, contains the cellular barcodes corresponding to the reads in the read1 FASTQ and read 3 FASTQ" @@ -160,15 +161,17 @@ workflow ATAC { input_id = input_id } - call PeakCalling { - input: - bam = BWAPairedEndAlignment.bam_aligned_output, - annotations_gtf = annotations_gtf, - metrics_h5ad = CreateFragmentFile.Snap_metrics, - docker_path = docker_prefix + snap_atac_docker + if (peak_calling) { + call PeakCalling { + input: + bam = BWAPairedEndAlignment.bam_aligned_output, + annotations_gtf = annotations_gtf, + metrics_h5ad = CreateFragmentFile.Snap_metrics, + docker_path = docker_prefix + snap_atac_docker + } } } - + File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output]) File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file]) File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics]) @@ -642,6 +645,7 @@ task CreateFragmentFile { } } +# peak calling using SnapATAC2 task PeakCalling { input { File bam From cf63ad1f3ca83054941842906efd9c497bb66797 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 21 Jan 2025 10:29:17 -0500 Subject: [PATCH 45/65] add output --- pipelines/skylab/atac/atac.wdl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 43b9e920f0..7941597e50 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -176,12 +176,16 @@ workflow ATAC { File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file]) File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics]) File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics]) - + + # if peakcalling task not called set peakcall_h5ad to null + File peakcall_h5ad = if (peak_calling) PeakCalling.peaks_h5ad else snap_metrics_atac + output { File bam_aligned_output = bam_aligned_output_atac File fragment_file = fragment_file_atac File snap_metrics = snap_metrics_atac File library_metrics_file = library_metrics + File peakcall_h5ad_file = peakcall_h5ad } } @@ -639,7 +643,6 @@ task CreateFragmentFile { output { File fragment_file = "~{input_id}.fragments.sorted.tsv.gz" File fragment_file_index = "~{input_id}.fragments.sorted.tsv.gz.csi" - File Snap_metrics = "~{input_id}.metrics.h5ad" File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv" } From 5e81613d7a61cc2b3283a7947e7c5ad8548d1d5c Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 21 Jan 2025 10:38:27 -0500 Subject: [PATCH 46/65] add output --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 0fa2d09aff..9563fd0896 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -178,8 +178,8 @@ workflow ATAC { File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics]) # if peakcalling task not called set peakcall_h5ad to null - File peakcall_h5ad = if (peak_calling) PeakCalling.peaks_h5ad else snap_metrics_atac - + File peakcall_h5ad = (if (peak_calling) PeakCalling.peaks_h5ad else snap_metrics_atac) + output { File bam_aligned_output = bam_aligned_output_atac File fragment_file = fragment_file_atac From f0216ab5f4d3d0e4418a8afc10970547f6bb8517 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 21 Jan 2025 10:47:10 -0500 Subject: [PATCH 47/65] test --- pipelines/skylab/atac/atac.wdl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 9563fd0896..cbbcbf461f 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -178,8 +178,11 @@ workflow ATAC { File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics]) # if peakcalling task not called set peakcall_h5ad to null - File peakcall_h5ad = (if (peak_calling) PeakCalling.peaks_h5ad else snap_metrics_atac) - + File peakcall_h5ad = snap_metrics_atac + if (peak_calling) { + File peakcall_h5ad_file = PeakCalling.peaks_h5ad + } + output { File bam_aligned_output = bam_aligned_output_atac File fragment_file = fragment_file_atac From ef09ec90c283d99c78ad6e65678e321e25def90d Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 21 Jan 2025 10:51:46 -0500 Subject: [PATCH 48/65] test --- pipelines/skylab/atac/atac.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index cbbcbf461f..fae7e2bf8c 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -180,7 +180,7 @@ workflow ATAC { # if peakcalling task not called set peakcall_h5ad to null File peakcall_h5ad = snap_metrics_atac if (peak_calling) { - File peakcall_h5ad_file = PeakCalling.peaks_h5ad + File peakcall_h5ad = PeakCalling.peaks_h5ad } output { From d64ad4dfca29a50140bc916fba78db73258e0cb4 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 21 Jan 2025 12:39:55 -0500 Subject: [PATCH 49/65] test --- pipelines/skylab/atac/atac.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index fae7e2bf8c..c633a348b8 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -180,7 +180,7 @@ workflow ATAC { # if peakcalling task not called set peakcall_h5ad to null File peakcall_h5ad = snap_metrics_atac if (peak_calling) { - File peakcall_h5ad = PeakCalling.peaks_h5ad + peakcall_h5ad = PeakCalling.peaks_h5ad } output { @@ -188,7 +188,7 @@ workflow ATAC { File fragment_file = fragment_file_atac File snap_metrics = snap_metrics_atac File library_metrics_file = library_metrics - File peakcall_h5ad_file = peakcall_h5ad + File? peakcall_h5ad_file = peakcall_h5ad } } From e3fe25fa55c09f8d5c9a516864c4718b9594ea00 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 21 Jan 2025 12:42:32 -0500 Subject: [PATCH 50/65] test --- pipelines/skylab/atac/atac.wdl | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index c633a348b8..6a434061d5 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -176,19 +176,13 @@ workflow ATAC { File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file]) File snap_metrics_atac = select_first([BB_fragment.Snap_metrics,CreateFragmentFile.Snap_metrics]) File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics]) - - # if peakcalling task not called set peakcall_h5ad to null - File peakcall_h5ad = snap_metrics_atac - if (peak_calling) { - peakcall_h5ad = PeakCalling.peaks_h5ad - } output { File bam_aligned_output = bam_aligned_output_atac File fragment_file = fragment_file_atac File snap_metrics = snap_metrics_atac File library_metrics_file = library_metrics - File? peakcall_h5ad_file = peakcall_h5ad + File? peakcall_h5ad_file = PeakCalling.peaks_h5ad } } From e6dad583dcff51b7470bc7b720988bedbc001f4f Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 21 Jan 2025 12:48:19 -0500 Subject: [PATCH 51/65] test --- pipelines/skylab/multiome/Multiome.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 1553513fd7..1c2e11f08b 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -149,6 +149,7 @@ workflow Multiome { File fragment_file_index = JoinBarcodes.atac_fragment_tsv_index File snap_metrics_atac = JoinBarcodes.atac_h5ad_file File atac_library_metrics = Atac.library_metrics_file + File? peak_call_atac = Atac.peakcall_h5ad_file # optimus outputs File genomic_reference_version_gex = Optimus.genomic_reference_version From 80e3fa1cdff2fa339a2ba094cd6ae0a869625eee Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 27 Jan 2025 15:07:36 -0500 Subject: [PATCH 52/65] Added matrix as output --- pipelines/skylab/atac/atac.wdl | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 6a434061d5..971a98006a 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -167,6 +167,7 @@ workflow ATAC { bam = BWAPairedEndAlignment.bam_aligned_output, annotations_gtf = annotations_gtf, metrics_h5ad = CreateFragmentFile.Snap_metrics, + chrom_sizes = chrom_sizes, docker_path = docker_prefix + snap_atac_docker } } @@ -657,7 +658,8 @@ task PeakCalling { input { File bam File annotations_gtf - File metrics_h5ad + File metrics_h5ad + File chrom_sizes # SnapATAC2 parameters Int min_counts = 5000 Int min_tsse = 10 @@ -694,6 +696,7 @@ task PeakCalling { bam_base_name = "~{bam_base_name}" atac_gtf = "~{annotations_gtf}" metrics_h5ad = "~{metrics_h5ad}" + chrom_sizes = "~{chrom_sizes}" min_counts = "~{min_counts}" min_tsse = "~{min_tsse}" max_counts = "~{max_counts}" @@ -772,6 +775,9 @@ task PeakCalling { print("Peak calling using MACS3") snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) + peaks = snap.tl.merge_peaks(atac_data_mod.uns['macs3'], chrom_sizes) + peak_matrix = snap.pp.make_peak_matrix(atac_data_mod, use_rep=peaks['Peaks']) + print("Convert pl.DataFrame to pandas DataFrame") # Convert pl.DataFrame to pandas DataFrame for key in atac_data_mod.uns.keys(): @@ -781,6 +787,7 @@ task PeakCalling { print("Write into h5ad file") atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") + peak_matrix.write_h5ad("~{bam_base_name}.matrix.h5ad") print("test") CODE @@ -795,6 +802,6 @@ task PeakCalling { output { File peaks_h5ad = "~{bam_base_name}.peaks.h5ad" + File matrix_h5ad = "~{bam_base_name}.matrix.h5ad" } - } From b3ebae925351d439a84ed1e44675621badd0ffda Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 27 Jan 2025 20:08:33 +0000 Subject: [PATCH 53/65] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 50 +++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 85b1dbb384..745d09057e 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -1,40 +1,40 @@ Pipeline Name Version Date of Last Commit -ExomeReprocessing 3.3.3 2024-11-04 -CramToUnmappedBams 1.1.3 2024-08-02 +IlluminaGenotypingArray 1.12.24 2024-11-04 WholeGenomeReprocessing 3.3.3 2024-11-04 -ExternalExomeReprocessing 3.3.3 2024-11-04 ExternalWholeGenomeReprocessing 2.3.3 2024-11-04 -UltimaGenomicsJointGenotyping 1.2.2 2024-11-04 -ReblockGVCF 2.4.0 2024-12-05 -JointGenotypingByChromosomePartOne 1.5.2 2024-11-04 -JointGenotypingByChromosomePartTwo 1.5.2 2024-11-04 -JointGenotyping 1.7.2 2024-11-04 -ExomeGermlineSingleSample 3.2.3 2024-11-04 +ExternalExomeReprocessing 3.3.3 2024-11-04 +CramToUnmappedBams 1.1.3 2024-08-02 +ExomeReprocessing 3.3.3 2024-11-04 +GDCWholeGenomeSomaticSingleSample 1.3.4 2024-11-04 +UltimaGenomicsWholeGenomeCramOnly 1.0.23 2024-11-04 WholeGenomeGermlineSingleSample 3.3.3 2024-11-04 UltimaGenomicsWholeGenomeGermline 1.1.3 2024-12-05 +ExomeGermlineSingleSample 3.2.3 2024-11-04 +JointGenotyping 1.7.2 2024-11-04 +ReblockGVCF 2.4.0 2024-12-05 +UltimaGenomicsJointGenotyping 1.2.2 2024-11-04 +JointGenotypingByChromosomePartTwo 1.5.2 2024-11-04 +JointGenotypingByChromosomePartOne 1.5.2 2024-11-04 VariantCalling 2.2.4 2024-11-04 -GDCWholeGenomeSomaticSingleSample 1.3.4 2024-11-04 -UltimaGenomicsWholeGenomeCramOnly 1.0.23 2024-11-04 -IlluminaGenotypingArray 1.12.24 2024-11-04 -AnnotationFiltration 1.2.7 2024-11-04 CheckFingerprint 1.0.22 2024-10-28 -ValidateChip 1.16.7 2024-11-04 -Imputation 1.1.15 2024-11-04 -MultiSampleArrays 1.6.2 2024-08-02 -Arrays 2.6.30 2024-11-04 +RNAWithUMIsPipeline 1.0.18 2024-11-04 BroadInternalUltimaGenomics 1.1.3 2024-12-05 +BroadInternalRNAWithUMIs 1.0.36 2024-11-04 BroadInternalImputation 1.1.14 2024-11-04 BroadInternalArrays 1.1.14 2024-11-04 -BroadInternalRNAWithUMIs 1.0.36 2024-11-04 -RNAWithUMIsPipeline 1.0.18 2024-11-04 +Imputation 1.1.15 2024-11-04 +MultiSampleArrays 1.6.2 2024-08-02 +ValidateChip 1.16.7 2024-11-04 +Arrays 2.6.30 2024-11-04 +AnnotationFiltration 1.2.7 2024-11-04 Multiome 5.9.6 2025-01-21 -MultiSampleSmartSeq2SingleNucleus 2.0.7 2025-01-13 -BuildIndices 4.0.0 2025-01-17 +snm3C 4.0.4 2024-08-06 SlideSeq 3.4.8 2025-01-13 -PairedTag 1.10.0 2025-01-21 -atac 2.6.0 2025-01-21 scATAC 1.3.2 2023-08-03 -snm3C 4.0.4 2024-08-06 -Optimus 7.9.1 2025-01-13 +BuildIndices 4.0.0 2025-01-17 MultiSampleSmartSeq2 2.2.22 2024-09-11 +Optimus 7.9.1 2025-01-13 +atac 2.6.0 2025-01-21 +PairedTag 1.10.0 2025-01-21 SmartSeq2SingleSample 5.1.21 2024-09-11 +MultiSampleSmartSeq2SingleNucleus 2.0.7 2025-01-13 From 9e4916aaab19ee0e45aae955efb18aa24fbaaccc Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 27 Jan 2025 15:09:32 -0500 Subject: [PATCH 54/65] Added matrix as output --- pipelines/skylab/atac/atac.wdl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index e159a9c855..4883a27ec7 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -777,8 +777,10 @@ task PeakCalling { print("Peak calling using MACS3") snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) + print("Merge peaks and create peak matrix") peaks = snap.tl.merge_peaks(atac_data_mod.uns['macs3'], chrom_sizes) peak_matrix = snap.pp.make_peak_matrix(atac_data_mod, use_rep=peaks['Peaks']) + print(type(peak_matrix)) print("Convert pl.DataFrame to pandas DataFrame") # Convert pl.DataFrame to pandas DataFrame From cd7c100633351128395c08ef27824137621be851 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Mon, 27 Jan 2025 16:23:21 -0500 Subject: [PATCH 55/65] Added matrix as output --- pipelines/skylab/atac/atac.wdl | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 4883a27ec7..bfce85f386 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -778,7 +778,11 @@ task PeakCalling { snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1) print("Merge peaks and create peak matrix") - peaks = snap.tl.merge_peaks(atac_data_mod.uns['macs3'], chrom_sizes) + # read chrom sizes + chromsize_dict = pd.read_csv(chrom_sizes, sep='\t', header=None) + chromsize_dict = pd.Series(mouse_chromsize[1].values, index=mouse_chromsize[0]).to_dict() + # merge peaks and create peak matrix + peaks = snap.tl.merge_peaks(atac_data_mod.uns['macs3'], chromsize_dict) peak_matrix = snap.pp.make_peak_matrix(atac_data_mod, use_rep=peaks['Peaks']) print(type(peak_matrix)) From d7486d98999f120cf41d230d5a16e3e79c54bf06 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 28 Jan 2025 07:34:42 -0500 Subject: [PATCH 56/65] test --- pipelines/skylab/atac/atac.wdl | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index bfce85f386..75a38c0010 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -693,6 +693,7 @@ task PeakCalling { import scanpy as sc import numpy as np import polars as pl + import pandas as pd bam = "~{bam}" bam_base_name = "~{bam_base_name}" From 89824b03c9606c6cef4212dcb19ea7091c062251 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 28 Jan 2025 08:23:12 -0500 Subject: [PATCH 57/65] test --- pipelines/skylab/atac/atac.wdl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 75a38c0010..35a2676e58 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -781,11 +781,13 @@ task PeakCalling { print("Merge peaks and create peak matrix") # read chrom sizes chromsize_dict = pd.read_csv(chrom_sizes, sep='\t', header=None) - chromsize_dict = pd.Series(mouse_chromsize[1].values, index=mouse_chromsize[0]).to_dict() + chromsize_dict = pd.Series(chromsize_dict[1].values, index=chromsize_dict[0]).to_dict() # merge peaks and create peak matrix peaks = snap.tl.merge_peaks(atac_data_mod.uns['macs3'], chromsize_dict) + print(peaks) peak_matrix = snap.pp.make_peak_matrix(atac_data_mod, use_rep=peaks['Peaks']) print(type(peak_matrix)) + print(peak_matrix) print("Convert pl.DataFrame to pandas DataFrame") # Convert pl.DataFrame to pandas DataFrame From aee56ecf1b720d32e7b34144875a0c1d957ed51b Mon Sep 17 00:00:00 2001 From: aawdeh Date: Tue, 28 Jan 2025 14:09:37 -0500 Subject: [PATCH 58/65] remove comments --- pipelines/skylab/atac/atac.wdl | 3 --- 1 file changed, 3 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 35a2676e58..4ad785eeec 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -784,10 +784,7 @@ task PeakCalling { chromsize_dict = pd.Series(chromsize_dict[1].values, index=chromsize_dict[0]).to_dict() # merge peaks and create peak matrix peaks = snap.tl.merge_peaks(atac_data_mod.uns['macs3'], chromsize_dict) - print(peaks) peak_matrix = snap.pp.make_peak_matrix(atac_data_mod, use_rep=peaks['Peaks']) - print(type(peak_matrix)) - print(peak_matrix) print("Convert pl.DataFrame to pandas DataFrame") # Convert pl.DataFrame to pandas DataFrame From 83fbe0506f19cf58211e5bf38ed112e5ed74cd52 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 31 Jan 2025 09:46:05 -0500 Subject: [PATCH 59/65] update changelogs/versions --- pipelines/skylab/atac/atac.changelog.md | 4 ++++ pipelines/skylab/atac/atac.wdl | 2 +- pipelines/skylab/multiome/Multiome.changelog.md | 4 ++++ pipelines/skylab/multiome/Multiome.wdl | 7 +++++-- pipelines/skylab/paired_tag/PairedTag.changelog.md | 4 ++++ pipelines/skylab/paired_tag/PairedTag.wdl | 2 +- 6 files changed, 19 insertions(+), 4 deletions(-) diff --git a/pipelines/skylab/atac/atac.changelog.md b/pipelines/skylab/atac/atac.changelog.md index d47665f255..8145cb8761 100644 --- a/pipelines/skylab/atac/atac.changelog.md +++ b/pipelines/skylab/atac/atac.changelog.md @@ -1,3 +1,7 @@ +# 2.6.1 +2025-01-31 (Date of Last Commit) +* Added an optional PeakCalling task. Added a boolean variable run_peak_calling; default is false. + # 2.6.0 2025-01-21 (Date of Last Commit) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 4ad785eeec..47383df63a 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -51,7 +51,7 @@ workflow ATAC { String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG" } - String pipeline_version = "2.6.0" + String pipeline_version = "2.6.1" # Determine docker prefix based on cloud provider String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/" diff --git a/pipelines/skylab/multiome/Multiome.changelog.md b/pipelines/skylab/multiome/Multiome.changelog.md index c73565dbbc..e870007734 100644 --- a/pipelines/skylab/multiome/Multiome.changelog.md +++ b/pipelines/skylab/multiome/Multiome.changelog.md @@ -1,3 +1,7 @@ +# 5.9.7 +2025-01-31 (Date of Last Commit) +* Added an optional PeakCalling task to the ATAC workflow. Added a boolean variable run_peak_calling to the Multiome pipeline; default is false. + # 5.9.6 2025-01-21 (Date of Last Commit) diff --git a/pipelines/skylab/multiome/Multiome.wdl b/pipelines/skylab/multiome/Multiome.wdl index 6cf6b92e4e..d6d97a0f84 100644 --- a/pipelines/skylab/multiome/Multiome.wdl +++ b/pipelines/skylab/multiome/Multiome.wdl @@ -7,7 +7,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow Multiome { - String pipeline_version = "5.9.6" + String pipeline_version = "5.9.7" input { String cloud_provider @@ -50,6 +50,8 @@ workflow Multiome { # CellBender Boolean run_cellbender = false + # Peak Calling + Boolean run_peak_calling = false } # Determine docker prefix based on cloud provider @@ -121,7 +123,8 @@ workflow Multiome { annotations_gtf = annotations_gtf, atac_nhash_id = atac_nhash_id, adapter_seq_read3 = adapter_seq_read3, - atac_expected_cells = expected_cells + atac_expected_cells = expected_cells, + peak_calling = run_peak_calling } call H5adUtils.JoinMultiomeBarcodes as JoinBarcodes { input: diff --git a/pipelines/skylab/paired_tag/PairedTag.changelog.md b/pipelines/skylab/paired_tag/PairedTag.changelog.md index d26db399ff..1bc923a3cd 100644 --- a/pipelines/skylab/paired_tag/PairedTag.changelog.md +++ b/pipelines/skylab/paired_tag/PairedTag.changelog.md @@ -1,3 +1,7 @@ +# 1.10.1 +2025-01-31 (Date of Last Commit) +* Added an optional PeakCalling task to the ATAC workflow. Added a boolean variable run_peak_calling to the Multiome pipeline; default is false. This does not affect the outputs of the pipeline + # 1.10.0 2025-01-21 (Date of Last Commit) diff --git a/pipelines/skylab/paired_tag/PairedTag.wdl b/pipelines/skylab/paired_tag/PairedTag.wdl index fdf942bf90..05d1b74b96 100644 --- a/pipelines/skylab/paired_tag/PairedTag.wdl +++ b/pipelines/skylab/paired_tag/PairedTag.wdl @@ -8,7 +8,7 @@ import "../../../tasks/broad/Utilities.wdl" as utils workflow PairedTag { - String pipeline_version = "1.10.0" + String pipeline_version = "1.10.1" input { From a0dd316c26238a85928eaa8f7164c2e881d8bf73 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Fri, 31 Jan 2025 14:46:42 +0000 Subject: [PATCH 60/65] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index 745d09057e..f2cfe20fed 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -27,14 +27,14 @@ MultiSampleArrays 1.6.2 2024-08-02 ValidateChip 1.16.7 2024-11-04 Arrays 2.6.30 2024-11-04 AnnotationFiltration 1.2.7 2024-11-04 -Multiome 5.9.6 2025-01-21 +Multiome 5.9.7 2025-01-31 snm3C 4.0.4 2024-08-06 SlideSeq 3.4.8 2025-01-13 scATAC 1.3.2 2023-08-03 BuildIndices 4.0.0 2025-01-17 MultiSampleSmartSeq2 2.2.22 2024-09-11 Optimus 7.9.1 2025-01-13 -atac 2.6.0 2025-01-21 -PairedTag 1.10.0 2025-01-21 +atac 2.6.1 2025-01-31 +PairedTag 1.10.1 2025-01-31 SmartSeq2SingleSample 5.1.21 2024-09-11 MultiSampleSmartSeq2SingleNucleus 2.0.7 2025-01-13 From 56806372f6c297f2a1664399226eb1a22763f9d4 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 31 Jan 2025 14:35:16 -0500 Subject: [PATCH 61/65] documenation --- website/docs/Pipelines/ATAC/README.md | 8 ++++++-- website/docs/Pipelines/Multiome_Pipeline/README.md | 1 + 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 86d3eaab3e..2651b080f4 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -65,6 +65,7 @@ The following describes the inputs of the ATAC workflow. For more details on how | adapter_seq_read3 | TrimAdapters input: Sequence adapter for read 3 fastq. | | vm_size | String defining the Azure virtual machine family for the workflow (default: "Standard_M128s"). | atac_nhash_id | String that represents an optional library aliquot identifier. When used, it is echoed in the h5ad unstructured data. | +| peak_calling | Optional boolean used to determine if the ATAC pipeline should run Peak Calling; default is "false". | Boolean | ## ATAC tasks and tools @@ -86,7 +87,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie | [TrimAdapters](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | Cutadapt v4.4 | cutadapt | Trims adaptor sequences. | | [BWAPairedEndAlignment](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | bwa-mem2 | mem | Aligns reads from each set of partitioned FASTQ files to the genome and outputs a BAM with ATAC barcodes in the CB:Z tag. | | [CreateFragmentFile](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | make_fragment_file, import_data | SnapATAC2 | Generates a fragment file from the final aligned BAM and outputs per barcode quality metrics in h5ad. A detailed list of these metrics is found in the [ATAC Count Matrix Overview](./count-matrix-overview.md). | - +| [PeakCalling](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | macs3 | SnapATAC2 | Generates two h5ad files (`peaks.h5ad` and `matrix.h5ad`) from the CreateFragmentFile h5ad output file (`metrics.h5ad`). The `peaks.h5ad` contains the peak called per cluster in the macs3 unstructured metadata and `matrix.h5ad` contains the merged peaks and the count matrix of peaks per fragment. A detailed list of these metrics is found in the [ATAC Count Matrix Overview](./count-matrix-overview.md). | ## Output variables @@ -95,7 +96,10 @@ To see specific tool parameters, select the task WDL link in the table; then vie | bam_aligned_output | ``.bam | BAM containing aligned reads from ATAC workflow. | | fragment_file | ``.fragments.sorted.tsv.gz | Bgzipped TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". | | snap_metrics | ``_`_library_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) + library_metrics | ``_`_library_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) | +| snap_peaks | ` Date: Fri, 31 Jan 2025 14:46:47 -0500 Subject: [PATCH 62/65] rename --- pipelines/skylab/atac/atac.wdl | 9 ++++----- website/docs/Pipelines/ATAC/README.md | 7 +++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 47383df63a..ff34a7239d 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -794,9 +794,8 @@ task PeakCalling { atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() print("Write into h5ad file") - atac_data_mod.write_h5ad("~{bam_base_name}.peaks.h5ad") - peak_matrix.write_h5ad("~{bam_base_name}.matrix.h5ad") - print("test") + atac_data_mod.write_h5ad("~{bam_base_name}.cellbybin.h5ad") + peak_matrix.write_h5ad("~{bam_base_name}.cellbypeak.h5ad") CODE >>> @@ -809,7 +808,7 @@ task PeakCalling { } output { - File peaks_h5ad = "~{bam_base_name}.peaks.h5ad" - File matrix_h5ad = "~{bam_base_name}.matrix.h5ad" + File peaks_h5ad = "~{bam_base_name}.cellbybin.h5ad" + File matrix_h5ad = "~{bam_base_name}.cellbypeak.h5ad" } } diff --git a/website/docs/Pipelines/ATAC/README.md b/website/docs/Pipelines/ATAC/README.md index 2651b080f4..f3d7bc6767 100644 --- a/website/docs/Pipelines/ATAC/README.md +++ b/website/docs/Pipelines/ATAC/README.md @@ -87,7 +87,7 @@ To see specific tool parameters, select the task WDL link in the table; then vie | [TrimAdapters](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | Cutadapt v4.4 | cutadapt | Trims adaptor sequences. | | [BWAPairedEndAlignment](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | bwa-mem2 | mem | Aligns reads from each set of partitioned FASTQ files to the genome and outputs a BAM with ATAC barcodes in the CB:Z tag. | | [CreateFragmentFile](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | make_fragment_file, import_data | SnapATAC2 | Generates a fragment file from the final aligned BAM and outputs per barcode quality metrics in h5ad. A detailed list of these metrics is found in the [ATAC Count Matrix Overview](./count-matrix-overview.md). | -| [PeakCalling](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | macs3 | SnapATAC2 | Generates two h5ad files (`peaks.h5ad` and `matrix.h5ad`) from the CreateFragmentFile h5ad output file (`metrics.h5ad`). The `peaks.h5ad` contains the peak called per cluster in the macs3 unstructured metadata and `matrix.h5ad` contains the merged peaks and the count matrix of peaks per fragment. A detailed list of these metrics is found in the [ATAC Count Matrix Overview](./count-matrix-overview.md). | +| [PeakCalling](https://github.com/broadinstitute/warp/blob/master/pipelines/skylab/atac/atac.wdl) | macs3 | SnapATAC2 | Generates two h5ad files (`cellbybin.h5ad` and `cellbypeak.h5ad`) from the CreateFragmentFile h5ad output file (`metrics.h5ad`). The `peaks.h5ad` contains the peak called per cluster in the macs3 unstructured metadata and `matrix.h5ad` contains the merged peaks and the count matrix of peaks per fragment. A detailed list of these metrics is found in the [ATAC Count Matrix Overview](./count-matrix-overview.md). | ## Output variables @@ -97,9 +97,8 @@ To see specific tool parameters, select the task WDL link in the table; then vie | fragment_file | ``.fragments.sorted.tsv.gz | Bgzipped TSV containing fragment start and stop coordinates per barcode. In order, the columns are "Chromosome", "Start", "Stop", "ATAC Barcode", and "Number Reads". | | snap_metrics | ``_`_library_metrics.csv | CSV file containing library-level metrics. Read more in the [Library Metrics Overview](library-metrics.md) | -| snap_peaks | ` Date: Fri, 31 Jan 2025 14:59:44 -0500 Subject: [PATCH 63/65] rename --- pipelines/skylab/atac/atac.wdl | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index ff34a7239d..3ed3228f25 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -658,7 +658,6 @@ task CreateFragmentFile { # peak calling using SnapATAC2 task PeakCalling { input { - File bam File annotations_gtf File metrics_h5ad File chrom_sizes @@ -672,7 +671,7 @@ task PeakCalling { Int mem_size = 64 Int nthreads = 4 } - String bam_base_name = basename(bam, ".bam") + String base_name = basename(metrics_h5ad, ".h5ad") parameter_meta { bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task." @@ -695,8 +694,7 @@ task PeakCalling { import polars as pl import pandas as pd - bam = "~{bam}" - bam_base_name = "~{bam_base_name}" + base_name = "~{base_name}" atac_gtf = "~{annotations_gtf}" metrics_h5ad = "~{metrics_h5ad}" chrom_sizes = "~{chrom_sizes}" @@ -794,8 +792,8 @@ task PeakCalling { atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() print("Write into h5ad file") - atac_data_mod.write_h5ad("~{bam_base_name}.cellbybin.h5ad") - peak_matrix.write_h5ad("~{bam_base_name}.cellbypeak.h5ad") + atac_data_mod.write_h5ad("~{base_name}.cellbybin.h5ad") + peak_matrix.write_h5ad("~{base_name}.cellbypeak.h5ad") CODE >>> @@ -808,7 +806,7 @@ task PeakCalling { } output { - File peaks_h5ad = "~{bam_base_name}.cellbybin.h5ad" - File matrix_h5ad = "~{bam_base_name}.cellbypeak.h5ad" + File peaks_h5ad = "~{base_name}.cellbybin.h5ad" + File matrix_h5ad = "~{base_name}.cellbypeak.h5ad" } } From c664618367be41e10d321cb859a8115c7a5cfbc0 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 31 Jan 2025 15:05:35 -0500 Subject: [PATCH 64/65] rename --- pipelines/skylab/atac/atac.wdl | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 3ed3228f25..98c1a5b990 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -661,6 +661,8 @@ task PeakCalling { File annotations_gtf File metrics_h5ad File chrom_sizes + String output_base_name + # SnapATAC2 parameters Int min_counts = 5000 Int min_tsse = 10 @@ -671,7 +673,6 @@ task PeakCalling { Int mem_size = 64 Int nthreads = 4 } - String base_name = basename(metrics_h5ad, ".h5ad") parameter_meta { bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task." @@ -694,7 +695,7 @@ task PeakCalling { import polars as pl import pandas as pd - base_name = "~{base_name}" + base_name = "~{output_base_name}" atac_gtf = "~{annotations_gtf}" metrics_h5ad = "~{metrics_h5ad}" chrom_sizes = "~{chrom_sizes}" @@ -710,7 +711,7 @@ task PeakCalling { snap.pl.frag_size_distr(atac_data) print(atac_data) - # Filter cells -- Need to parameterize + # Filter cells print("Filtering cells") snap.pp.filter_cells(atac_data, min_counts=min_counts, min_tsse=min_tsse, max_counts=max_counts) print(atac_data) @@ -792,8 +793,8 @@ task PeakCalling { atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() print("Write into h5ad file") - atac_data_mod.write_h5ad("~{base_name}.cellbybin.h5ad") - peak_matrix.write_h5ad("~{base_name}.cellbypeak.h5ad") + atac_data_mod.write_h5ad("~{output_base_name}.cellbybin.h5ad") + peak_matrix.write_h5ad("~{output_base_name}.cellbypeak.h5ad") CODE >>> @@ -806,7 +807,7 @@ task PeakCalling { } output { - File peaks_h5ad = "~{base_name}.cellbybin.h5ad" - File matrix_h5ad = "~{base_name}.cellbypeak.h5ad" + File peaks_h5ad = "~{output_base_name}.cellbybin.h5ad" + File matrix_h5ad = "~{output_base_name}.cellbypeak.h5ad" } } From 328ec9a0362f819317ece5d111d2941dd6f33359 Mon Sep 17 00:00:00 2001 From: aawdeh Date: Fri, 31 Jan 2025 15:24:02 -0500 Subject: [PATCH 65/65] rename --- pipelines/skylab/atac/atac.wdl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/skylab/atac/atac.wdl b/pipelines/skylab/atac/atac.wdl index 98c1a5b990..10cfb5af66 100644 --- a/pipelines/skylab/atac/atac.wdl +++ b/pipelines/skylab/atac/atac.wdl @@ -164,7 +164,6 @@ workflow ATAC { if (peak_calling) { call PeakCalling { input: - bam = BWAPairedEndAlignment.bam_aligned_output, annotations_gtf = annotations_gtf, metrics_h5ad = CreateFragmentFile.Snap_metrics, chrom_sizes = chrom_sizes, @@ -793,8 +792,8 @@ task PeakCalling { atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas() print("Write into h5ad file") - atac_data_mod.write_h5ad("~{output_base_name}.cellbybin.h5ad") - peak_matrix.write_h5ad("~{output_base_name}.cellbypeak.h5ad") + atac_data_mod.write_h5ad("~{base_name}.cellbybin.h5ad") + peak_matrix.write_h5ad("~{base_name}.cellbypeak.h5ad") CODE >>>