Rc peakcall 2797 #1478
Open: rsc3 wants to merge 81 commits into develop from rc_peakcall_2797 (base: develop)
Changes from all commits (81 commits):
ce90d53 add peak calling (aawdeh)
82c5c45 added output for peaks (aawdeh)
6e833d2 fix (aawdeh)
76286b4 fix (aawdeh)
90805ca test (aawdeh)
b27011c remove ls (aawdeh)
c1f0e57 test (aawdeh)
90b193c Merge branch 'develop' into aa-peakcall (aawdeh)
779fe9b test (aawdeh)
6d52ad2 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh)
5743eff test (aawdeh)
d3749c8 test (aawdeh)
1de1c85 test (aawdeh)
2d18756 test (aawdeh)
13a44f9 test (aawdeh)
3f12b9e made another task (aawdeh)
08b2f24 test (aawdeh)
80c1416 test (aawdeh)
b5191e5 test (aawdeh)
4725080 test (aawdeh)
f139f84 test (aawdeh)
9a0ea9b test (aawdeh)
04cf0d2 test (aawdeh)
f9bc871 change a line (aawdeh)
93d3360 test (aawdeh)
256c4e5 test (aawdeh)
cd849b5 test (aawdeh)
7c62fd1 test (aawdeh)
b15b146 test (aawdeh)
ff36c15 test (aawdeh)
27a4196 test (aawdeh)
987b5d2 Add peakcall to task (aawdeh)
cce5841 test (aawdeh)
39ef1a5 Merge branch 'develop' into aa-peakcall (aawdeh)
bf0e9ce add parameters (aawdeh)
d9240d6 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh)
1ec5ff1 Merge branch 'develop' into aa-peakcall (aawdeh)
90172ee Merge branch 'develop' into aa-peakcall (aawdeh)
18f77d8 test (aawdeh)
f7bdfa2 Merge branch 'aa-peakcall' of https://github.com/broadinstitute/warp … (aawdeh)
db59231 peak calling (rsc3)
7b5a87e looking for conflicts in atac.wdl (rsc3)
0d4f010 remove bam_base_name from CreateFragments task (rsc3)
987e1e5 separate task for macs3 consistent with create fragments (rsc3)
0bc38f6 Update atac.wdl (rsc3)
b136ba3 snapatac2 updates (rsc3)
573a29f turn off createfragments peakcalling (rsc3)
7b8d1fc remove peakcalling from createfragments and parameterize probability_… (rsc3)
0689868 remove probability_threshold parameterization (rsc3)
32f4b7f remove repeated line (rsc3)
fffb256 Updated pipeline_versions.txt with all pipeline version information (actions-user)
cf2a77b add optional parameter (aawdeh)
41de09c Merge branch 'develop' into rc_peakcall_2797 (aawdeh)
cf63ad1 add output (aawdeh)
093faea Merge branch 'rc_peakcall_2797' of https://github.com/broadinstitute/… (aawdeh)
5e81613 add output (aawdeh)
f0216ab test (aawdeh)
ef09ec9 test (aawdeh)
d64ad4d test (aawdeh)
e3fe25f test (aawdeh)
e6dad58 test (aawdeh)
dd9c77b Merge branch 'develop' into rc_peakcall_2797 (aawdeh)
80e3fa1 Added matrix as output (aawdeh)
75e8f2f Merge branch 'rc_peakcall_2797' of https://github.com/broadinstitute/… (aawdeh)
b3ebae9 Updated pipeline_versions.txt with all pipeline version information (actions-user)
9e4916a Added matrix as output (aawdeh)
13a448a Merge branch 'rc_peakcall_2797' of https://github.com/broadinstitute/… (aawdeh)
cd7c100 Added matrix as output (aawdeh)
d7486d9 test (aawdeh)
89824b0 test (aawdeh)
aee56ec remove comments (aawdeh)
f90905b Merge branch 'develop' into rc_peakcall_2797 (aawdeh)
83fbe05 update changelogs/versions (aawdeh)
af76a98 Merge branch 'rc_peakcall_2797' of https://github.com/broadinstitute/… (aawdeh)
a0dd316 Updated pipeline_versions.txt with all pipeline version information (actions-user)
5680637 documenation (aawdeh)
f53e592 Merge branch 'rc_peakcall_2797' of https://github.com/broadinstitute/… (aawdeh)
745d9c4 rename (aawdeh)
54694bc rename (aawdeh)
c664618 rename (aawdeh)
328ec9a rename (aawdeh)
@@ -28,6 +28,8 @@ workflow ATAC {
     # Option for running files with preindex
     Boolean preindex = false
+    # Option for running peak calling
+    Boolean peak_calling = true

     # BWA ref
     File tar_bwa_reference
@@ -49,7 +51,7 @@ workflow ATAC {
     String adapter_seq_read3 = "TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG"
   }

-  String pipeline_version = "2.6.0"
+  String pipeline_version = "2.6.1"

   # Determine docker prefix based on cloud provider
   String gcr_docker_prefix = "us.gcr.io/broad-gotc-prod/"
@@ -61,7 +63,7 @@ workflow ATAC {
     String cutadapt_docker = "cutadapt:1.0.0-4.4-1686752919"
     String samtools_docker = "samtools-dist-bwa:3.0.0"
     String upstools_docker = "upstools:1.0.0-2023.03.03-1704300311"
-    String snap_atac_docker = "snapatac2:1.1.0"
+    String snap_atac_docker = "snapatac2:2.0.0"

     # Make sure either 'gcp' or 'azure' is supplied as cloud_provider input. If not, raise an error
     if ((cloud_provider != "gcp") && (cloud_provider != "azure")) {
@@ -71,7 +73,6 @@ workflow ATAC {
     }
   }

   parameter_meta {
     read1_fastq_gzipped: "read 1 FASTQ file as input for the pipeline, contains read 1 of paired reads"
     read2_fastq_gzipped: "read 2 FASTQ file as input for the pipeline, contains the cellular barcodes corresponding to the reads in the read1 FASTQ and read 3 FASTQ"
@@ -160,20 +161,30 @@ workflow ATAC {
         input_id = input_id
     }
+    if (peak_calling) {
+      call PeakCalling {
+        input:
+          annotations_gtf = annotations_gtf,
+          metrics_h5ad = CreateFragmentFile.Snap_metrics,
+          chrom_sizes = chrom_sizes,
+          docker_path = docker_prefix + snap_atac_docker
+      }
+    }
   }

   File bam_aligned_output_atac = select_first([BBTag.bb_bam, BWAPairedEndAlignment.bam_aligned_output])
   File fragment_file_atac = select_first([BB_fragment.fragment_file, CreateFragmentFile.fragment_file])
   File fragment_file_index_atac = select_first([BB_fragment.fragment_file_index, CreateFragmentFile.fragment_file_index])
   File snap_metrics_atac = select_first([BB_fragment.Snap_metrics, CreateFragmentFile.Snap_metrics])
   File library_metrics = select_first([BB_fragment.atac_library_metrics, CreateFragmentFile.atac_library_metrics])

   output {
     File bam_aligned_output = bam_aligned_output_atac
     File fragment_file = fragment_file_atac
     File fragment_file_index = fragment_file_index_atac
     File snap_metrics = snap_metrics_atac
     File library_metrics_file = library_metrics
+    File? peakcall_h5ad_file = PeakCalling.peaks_h5ad
   }
 }
@@ -269,7 +280,6 @@ task GetNumSplits {
     }
   }

   # trim read 1 and read 2 adapter sequence with cutadapt
   task TrimAdapters {
     input {
@@ -513,7 +523,6 @@ task CreateFragmentFile {
     File bam
     File annotations_gtf
     File chrom_sizes
-    File annotations_gtf
     Boolean preindex
     Int disk_size = 500
     Int mem_size = 64
@@ -536,7 +545,8 @@ task CreateFragmentFile {
   }

   command <<<
-    set -e pipefail
+    set -euo pipefail
+    set -x

     python3 <<CODE
@@ -559,6 +569,9 @@ task CreateFragmentFile {
     # use snap atac2
     import snapatac2.preprocessing as pp
     import snapatac2 as snap
     import scanpy as sc
     import numpy as np
     import polars as pl
     import anndata as ad
     from collections import OrderedDict
     import csv
@@ -579,8 +592,7 @@ task CreateFragmentFile {
     atac_percent_target = number_of_cells / expected_cells*100
     print("Setting percent target in nested dictionary")
     data['Cells']['atac_percent_target'] = atac_percent_target

     # Flatten the dictionary
     flattened_data = []
     for category, metrics in data.items():
@@ -637,8 +649,164 @@ task CreateFragmentFile {
   output {
     File fragment_file = "~{input_id}.fragments.sorted.tsv.gz"
     File fragment_file_index = "~{input_id}.fragments.sorted.tsv.gz.csi"
     File Snap_metrics = "~{input_id}.metrics.h5ad"
     File atac_library_metrics = "~{input_id}_~{atac_nhash_id}_library_metrics.csv"
   }
 }

 # peak calling using SnapATAC2
 task PeakCalling {
   input {
     File annotations_gtf
     File metrics_h5ad
     File chrom_sizes
     String output_base_name

     # SnapATAC2 parameters
     Int min_counts = 5000
     Int min_tsse = 10
     Int max_counts = 100000
     # Runtime attributes/docker
     String docker_path
     Int disk_size = 500
     Int mem_size = 64
     Int nthreads = 4
   }

   parameter_meta {
     bam: "Aligned bam with CB in CB tag. This is the output of the BWAPairedEndAlignment task."
     annotations_gtf: "GTF for SnapATAC2 to calculate TSS sites of fragment file."
     disk_size: "Disk size used in create fragment file step."
     mem_size: "The size of memory used in create fragment file."
     docker_path: "The docker image path containing the runtime environment for this task"
   }

   command <<<
     set -euo pipefail
     set -x

     python3 <<CODE

     # use snap atac2
     import snapatac2 as snap
     import scanpy as sc
     import numpy as np
     import polars as pl
     import pandas as pd

     base_name = "~{output_base_name}"
     atac_gtf = "~{annotations_gtf}"
     metrics_h5ad = "~{metrics_h5ad}"
     chrom_sizes = "~{chrom_sizes}"
     min_counts = "~{min_counts}"
     min_tsse = "~{min_tsse}"
     max_counts = "~{max_counts}"

     print("Peak calling starting...")
     atac_data = snap.read(metrics_h5ad)

     # Calculate and plot the size distribution of fragments
     print("Calculating fragment size distribution")
     snap.pl.frag_size_distr(atac_data)
     print(atac_data)

     # Filter cells
     print("Filtering cells")
     snap.pp.filter_cells(atac_data, min_counts=min_counts, min_tsse=min_tsse, max_counts=max_counts)
     print(atac_data)

     # Create a cell by bin matrix containing insertion counts across genome-wide 500-bp bins.
     print("Creating cell by bin matrix")
     atac_data_mod = snap.pp.add_tile_matrix(atac_data, inplace=False)
     print("set obsm")
     atac_data_mod.obsm["fragment_paired"] = atac_data.obsm["fragment_paired"]
     print("set all uns")
     for key in atac_data.uns.keys():
         print("set ", key)
         atac_data_mod.uns[key] = atac_data.uns[key]
     print(atac_data_mod)

     # Feature selection
     print("Feature selection")
     snap.pp.select_features(atac_data_mod)
     print(atac_data_mod)

     # Run customized scrublet algorithm to identify potential doublets
     print("Run scrublet to identify potential doublets")
     snap.pp.scrublet(atac_data_mod)
     print(atac_data_mod)

     # Employ spectral embedding for dimensionality reduction
     print("Employ spectral embedding for dimensionality reduction")
     snap.tl.spectral(atac_data_mod)
     print(atac_data_mod)

     # Filter doublets based on scrublet scores
     print("Filter doublets based on scrublet scores")
     snap.pp.filter_doublets(atac_data_mod, probability_threshold=0.5)
     print(atac_data_mod)

     # Perform graph-based clustering to identify cell clusters.
     # Build a k-nearest neighbour graph using snap.pp.knn
     print("Perform knn graph-based clustering to identify cell clusters")
     snap.pp.knn(atac_data_mod)
     print(atac_data_mod)

     # Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph
     print("Use the Leiden community detection algorithm to identify densely-connected subgraphs/clusters in the graph")
     snap.tl.leiden(atac_data_mod)
     print(atac_data_mod)

     # Create the cell by gene activity matrix
     print("Create the cell by gene activity matrix")
     gene_mat = snap.pp.make_gene_matrix(atac_data_mod, gene_anno=atac_gtf)
     print(atac_data_mod)

     # Normalize the gene matrix
     print("Normalize the gene matrix")
     gene_mat.obs['leiden'] = atac_data_mod.obs['leiden']
     sc.pp.normalize_total(gene_mat)
     sc.pp.log1p(gene_mat)
     sc.tl.rank_genes_groups(gene_mat, groupby="leiden", method="wilcoxon")

     for i in np.unique(gene_mat.obs['leiden']):
         markers = sc.get.rank_genes_groups_df(gene_mat, group=i).head(7)['names']
         print(f"Cluster {i}: {', '.join(markers)}")

     print("Peak calling using MACS3")
     snap.tl.macs3(atac_data_mod, groupby='leiden', n_jobs=1)

     print("Merge peaks and create peak matrix")
     # read chrom sizes
     chromsize_dict = pd.read_csv(chrom_sizes, sep='\t', header=None)
     chromsize_dict = pd.Series(chromsize_dict[1].values, index=chromsize_dict[0]).to_dict()
     # merge peaks and create peak matrix
     peaks = snap.tl.merge_peaks(atac_data_mod.uns['macs3'], chromsize_dict)
     peak_matrix = snap.pp.make_peak_matrix(atac_data_mod, use_rep=peaks['Peaks'])

     print("Convert pl.DataFrame to pandas DataFrame")
     # Convert pl.DataFrame to pandas DataFrame
     for key in atac_data_mod.uns.keys():
         if isinstance(atac_data_mod.uns[key], pl.DataFrame):
             print(key)
             atac_data_mod.uns[key] = atac_data_mod.uns[key].to_pandas()

     print("Write into h5ad file")
     atac_data_mod.write_h5ad("~{base_name}.cellbybin.h5ad")
     peak_matrix.write_h5ad("~{base_name}.cellbypeak.h5ad")

     CODE
   >>>

   runtime {
     docker: docker_path
     disks: "local-disk ${disk_size} SSD"
     memory: "${mem_size} GiB"
     cpu: nthreads
   }

   output {
     File peaks_h5ad = "~{output_base_name}.cellbybin.h5ad"
     [Review comment on this line: "Should we adjust output variable name as well?"]
     File matrix_h5ad = "~{output_base_name}.cellbypeak.h5ad"
   }
 }
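One detail worth double-checking in the new task: WDL heredoc interpolation is textual, so the quoted assignments (`min_counts = "~{min_counts}"` and friends) leave the thresholds as Python strings, which are then passed to `snap.pp.filter_cells`. If that function expects numbers, an explicit cast is needed. A sketch using the task's declared defaults:

```python
# After WDL interpolation, min_counts = "~{min_counts}" renders as quoted
# text in the Python heredoc, i.e. a str rather than an int.
min_counts = "5000"    # from the task default: Int min_counts = 5000
min_tsse = "10"        # from the task default: Int min_tsse = 10
max_counts = "100000"  # from the task default: Int max_counts = 100000
assert isinstance(min_counts, str)

# Cast before handing the values to numeric APIs such as
# snap.pp.filter_cells(min_counts=..., min_tsse=..., max_counts=...).
min_counts, min_tsse, max_counts = int(min_counts), int(min_tsse), int(max_counts)
assert (min_counts, min_tsse, max_counts) == (5000, 10, 100000)
```

Dropping the quotes in the heredoc (`min_counts = ~{min_counts}`) would achieve the same thing, since the WDL engine substitutes the literal integer text.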
Review comment: "Should we add both peak calling output files? The cellbybin matrix and the cellbypeak matrix?"