feat: add proksee viewer

matinnuhamunada · Sep 9, 2024 · d03c6cc · d03c6cc
1 parent 62cdd8a
commit d03c6cc
Show file tree

Hide file tree

Showing 6 changed files with 245 additions and 2 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,34 @@
+repos:
+-   repo: https://github.com/Lucas-C/pre-commit-hooks
+    rev: v1.5.1
+    hooks:
+    -   id: forbid-crlf
+    -   id: remove-crlf
+    -   id: forbid-tabs
+        exclude_types: [csv]
+    -   id: remove-tabs
+        exclude_types: [csv]
+-   repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.3.0
+    hooks:
+    - id: trailing-whitespace
+    - id: end-of-file-fixer
+    - id: check-merge-conflict
+    - id: check-yaml
+      args: [--unsafe]
+-   repo: https://github.com/pre-commit/mirrors-isort
+    rev: v5.10.1
+    hooks:
+    - id: isort
+-   repo: https://github.com/ambv/black
+    rev: 23.3.0
+    hooks:
+    - id: black
+      language_version: python3
+-   repo: https://github.com/PyCQA/flake8
+    rev: 5.0.4
+    hooks:
+    -   id: flake8
+        args: ['--ignore=E501,W503']
+        additional_dependencies: [flake8-typing-imports==1.10.0]
+        exclude: ^tests
diff --git a/workflow/Snakefile-consensus b/workflow/Snakefile-consensus
@@ -10,7 +10,11 @@ except KeyError as e:
 ##### Target rules #####
 rule all:
     input:
-        expand('data/processed/{strains}/03_trycycler_consensus/{strains}.fna', strains=STRAINS)
+        expand('data/processed/{strains}/03_trycycler_consensus/{strains}.fna', strains=STRAINS),
+        expand('data/processed/{strains}/04_final_qc/{strains}_depth.txt', strains=STRAINS),
+        expand('data/processed/{strains}/05_proksee_{strains}_output', strains=STRAINS),
 
 ##### Modules #####
-include: "rules/03_trycycler_consensus.smk"
+include: "rules/03_trycycler_consensus.smk"
+include: "rules/04_final_qc.smk"
+include: "rules/05_proksee.smk"
diff --git a/workflow/envs/proksee.yaml b/workflow/envs/proksee.yaml
@@ -0,0 +1,8 @@
+name: proksee-batch
+channels:
+  - conda-forge
+dependencies:
+  - python=3.11
+  - pip
+  - pip:
+      - proksee-batch
diff --git a/workflow/rules/04_final_qc.smk b/workflow/rules/04_final_qc.smk
@@ -0,0 +1,32 @@
+rule final_qc:
+    input:
+        assembly = 'data/processed/{strains}/03_trycycler_consensus/{strains}.fna',
+        raw_reads = 'data/interim/01_trycycler_assembly/{strains}/nanopore/min1kb.fq',
+    output:
+        index_assembly = 'data/processed/{strains}/04_final_qc/{strains}.mmi',
+        aligned_reads_sam = 'data/processed/{strains}/04_final_qc/{strains}.sam',
+        aligned_reads_bam = temp('data/processed/{strains}/04_final_qc/{strains}.bam'),
+        aligned_reads_bam_sorted = 'data/processed/{strains}/04_final_qc/{strains}_sorted.bam',
+        depth = 'data/processed/{strains}/04_final_qc/{strains}_depth.txt',
+    threads: 4
+    log:
+        "logs/04_final_qc/final_qc-{strains}.log"
+    conda:
+        "../envs/utilities.yaml"
+    shell:
+        """
+        # Step 1: Index the genome (optional for minimap2)
+        minimap2 -d {output.index_assembly} {input.assembly} 2>> {log}
+
+        # Step 2: Align the reads
+        minimap2 -ax map-ont -t {threads} {output.index_assembly} {input.raw_reads} > {output.aligned_reads_sam} 2>> {log}
+
+        # Step 3: Convert SAM to BAM
+        samtools view -@ {threads} -S -b {output.aligned_reads_sam} > {output.aligned_reads_bam} 2>> {log}
+
+        # Step 4: Sort the BAM file
+        samtools sort -@ {threads} {output.aligned_reads_bam} -o {output.aligned_reads_bam_sorted} 2>> {log}
+
+        # Step 5: Calculate depth
+        samtools depth {output.aligned_reads_bam_sorted} > {output.depth} 2>> {log}
+        """
diff --git a/workflow/rules/05_proksee.smk b/workflow/rules/05_proksee.smk
@@ -0,0 +1,70 @@
+rule prepare_proksee:
+    input:
+        assembly = 'data/processed/{strains}/03_trycycler_consensus/{strains}.fna',
+        depth = 'data/processed/{strains}/04_final_qc/{strains}_depth.txt'
+    output:
+        proksee_input = directory('data/interim/05_proksee/{strains}/05_proksee_{strains}'),
+        output_depth = directory('data/processed/{strains}/05_proksee_{strains}/')
+    threads: 4
+    log:
+        "logs/05_proksee/proksee-{strains}.log"
+    conda:
+        "../envs/proksee.yaml"
+    shell:
+        """
+        # Step 1: Split fasta files per contigs
+        input_fasta={input.assembly}
+
+        # Initialize variables
+        output_file=""
+        contig_name=""
+
+        # Read the input FASTA file line by line
+        while IFS= read -r line; do
+            if [[ $line == ">"* ]]; then
+                # If the line starts with '>', it's a header line
+                # Close the previous output file if it exists
+                if [ -n "$output_file" ]; then
+                    exec 3>&-
+                fi
+                # Extract the contig name and create a new output file
+                contig_name=$(echo "$line" | sed 's/>//; s/ .*//')
+
+                output_directory="{output.proksee_input}/$contig_name/fasta"
+                mkdir -p "$output_directory"
+
+                output_file="$output_directory/$contig_name.fasta"
+                exec 3>"$output_file"
+                echo "$line" >&3
+            else
+                # Write the sequence line to the current output file
+                echo "$line" >&3
+            fi
+        done < "$input_fasta"
+
+        # Close the last output file
+        if [ -n "$output_file" ]; then
+            exec 3>&-
+        fi
+
+        echo "Splitting fasta completed. Fasta files are saved in $output_directory" >> {log}
+
+        # Step 2: Split depth table per contig
+        python workflow/scripts/proksee_prepare.py {input.depth} {output.proksee_input} {wildcards.strains} --output_depth {output.output_depth} 2>> {log}
+
+        # echo "Splitting depth table completed. Depth files are saved in $output_directory" >> {log}
+        """
+
+rule run_proksee:
+    input:
+        proksee_input = 'data/interim/05_proksee/{strains}/05_proksee_{strains}'
+    output:
+        proksee_output = directory('data/processed/{strains}/05_proksee_{strains}_output')
+    log:
+        "logs/05_proksee/proksee-run-{strains}.log"
+    conda:
+        "../envs/proksee.yaml"
+    shell:
+        """
+        proksee-batch --input {input.proksee_input} --output {output.proksee_output} 2>> {log}
+        """
diff --git a/workflow/scripts/proksee_prepare.py b/workflow/scripts/proksee_prepare.py
@@ -0,0 +1,95 @@
+import argparse
+import json
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
+)
+
+
+def split_depth_file(
+    input_file, output_dir, genome_id, output_depth=None, window_size=100
+):
+    # Read the depth file
+    logging.info(f"Reading depth file: {input_file}")
+    df = pd.read_csv(input_file, sep="\t", header=None)
+    df = df.rename(columns={0: "contig_id", 1: "start", 2: "score"})
+
+    # Create the output directory if it doesn't exist
+    outdir = Path(output_dir)
+    outdir.mkdir(parents=True, exist_ok=True)
+    logging.info(f"Output directory created: {output_dir}")
+
+    # Split the depth file by contig and save each to a separate file
+    for contig in df.contig_id.unique():
+        logging.info(f"Processing contig {contig}")
+        subset = df[df.contig_id == contig]
+
+        # Calculate non-overlapping window average
+        subset["window"] = (subset["start"] - 1) // window_size
+        window_avg = subset.groupby("window")["score"].mean().reset_index()
+        window_avg["start"] = window_avg["window"] * window_size + 1
+        window_avg["end"] = window_avg["start"] + window_size - 1
+
+        if output_depth is None:
+            output_depth = outdir
+
+        outfile = Path(output_depth) / f"{contig}/plot/depth.txt"
+        outfile.parent.mkdir(parents=True, exist_ok=True)
+        window_avg.loc[:, ["start", "end", "score"]].to_csv(
+            outfile, sep="\t", index=False
+        )
+        logging.info(f"Depth file saved for contig {contig}: {outfile}")
+
+        # Create metadata
+        metadata = {
+            "metadata": {
+                "description": f"Contig {contig} extracted from genome {genome_id}"
+            }
+        }
+
+        # Save metadata to a JSON file
+        metadata_file = outdir / f"{contig}/metadata/{contig}.json"
+        metadata_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(metadata_file, "w") as f:
+            json.dump(metadata, f, indent=2)
+        logging.info(f"Metadata file saved for contig {contig}: {metadata_file}")
+
+    logging.info(f"Splitting completed. Files are saved in {output_dir}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Split depth file by contig and save each to a separate file with metadata."
+    )
+    parser.add_argument("input_file", help="Path to the input depth file")
+    parser.add_argument("output_dir", help="Directory to save the output files")
+    parser.add_argument("genome_id", help="Genome ID for metadata")
+    parser.add_argument(
+        "--output_depth",
+        help="Optional directory to save the depth files",
+        default=None,
+    )
+    parser.add_argument(
+        "--window_size",
+        type=int,
+        help="Window size for calculating the non-overlapping window average",
+        default=100,
+    )
+
+    args = parser.parse_args()
+
+    split_depth_file(
+        args.input_file,
+        args.output_dir,
+        args.genome_id,
+        args.output_depth,
+        args.window_size,
+    )
+
+
+if __name__ == "__main__":
+    main()