Skip to content

Commit

Permalink
feat: add proksee viewer
Browse files Browse the repository at this point in the history
  • Loading branch information
matinnuhamunada committed Sep 9, 2024
1 parent 62cdd8a commit d03c6cc
Show file tree
Hide file tree
Showing 6 changed files with 245 additions and 2 deletions.
34 changes: 34 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
repos:
- repo: https://github.com/Lucas-C/pre-commit-hooks
rev: v1.5.1
hooks:
- id: forbid-crlf
- id: remove-crlf
- id: forbid-tabs
exclude_types: [csv]
- id: remove-tabs
exclude_types: [csv]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-merge-conflict
- id: check-yaml
args: [--unsafe]
- repo: https://github.com/pre-commit/mirrors-isort
rev: v5.10.1
hooks:
- id: isort
- repo: https://github.com/ambv/black
rev: 23.3.0
hooks:
- id: black
language_version: python3
- repo: https://github.com/PyCQA/flake8
rev: 5.0.4
hooks:
- id: flake8
args: ['--ignore=E501,W503']
additional_dependencies: [flake8-typing-imports==1.10.0]
exclude: ^tests
8 changes: 6 additions & 2 deletions workflow/Snakefile-consensus
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,11 @@ except KeyError as e:
##### Target rules #####
rule all:
input:
expand('data/processed/{strains}/03_trycycler_consensus/{strains}.fna', strains=STRAINS)
expand('data/processed/{strains}/03_trycycler_consensus/{strains}.fna', strains=STRAINS),
expand('data/processed/{strains}/04_final_qc/{strains}_depth.txt', strains=STRAINS),
expand('data/processed/{strains}/05_proksee_{strains}_output', strains=STRAINS),

##### Modules #####
include: "rules/03_trycycler_consensus.smk"
include: "rules/03_trycycler_consensus.smk"
include: "rules/04_final_qc.smk"
include: "rules/05_proksee.smk"
8 changes: 8 additions & 0 deletions workflow/envs/proksee.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
name: proksee-batch
channels:
- conda-forge
dependencies:
- python=3.11
- pip
- pip:
- proksee-batch
32 changes: 32 additions & 0 deletions workflow/rules/04_final_qc.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
rule final_qc:
input:
assembly = 'data/processed/{strains}/03_trycycler_consensus/{strains}.fna',
raw_reads = 'data/interim/01_trycycler_assembly/{strains}/nanopore/min1kb.fq',
output:
index_assembly = 'data/processed/{strains}/04_final_qc/{strains}.mmi',
aligned_reads_sam = 'data/processed/{strains}/04_final_qc/{strains}.sam',
aligned_reads_bam = temp('data/processed/{strains}/04_final_qc/{strains}.bam'),
aligned_reads_bam_sorted = 'data/processed/{strains}/04_final_qc/{strains}_sorted.bam',
depth = 'data/processed/{strains}/04_final_qc/{strains}_depth.txt',
threads: 4
log:
"logs/04_final_qc/final_qc-{strains}.log"
conda:
"../envs/utilities.yaml"
shell:
"""
# Step 1: Index the genome (optional for minimap2)
minimap2 -d {output.index_assembly} {input.assembly} 2>> {log}
# Step 2: Align the reads
minimap2 -ax map-ont -t {threads} {output.index_assembly} {input.raw_reads} > {output.aligned_reads_sam} 2>> {log}
# Step 3: Convert SAM to BAM
samtools view -@ {threads} -S -b {output.aligned_reads_sam} > {output.aligned_reads_bam} 2>> {log}
# Step 4: Sort the BAM file
samtools sort -@ {threads} {output.aligned_reads_bam} -o {output.aligned_reads_bam_sorted} 2>> {log}
# Step 5: Calculate depth
samtools depth {output.aligned_reads_bam_sorted} > {output.depth} 2>> {log}
"""
70 changes: 70 additions & 0 deletions workflow/rules/05_proksee.smk
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
rule prepare_proksee:
input:
assembly = 'data/processed/{strains}/03_trycycler_consensus/{strains}.fna',
depth = 'data/processed/{strains}/04_final_qc/{strains}_depth.txt'
output:
proksee_input = directory('data/interim/05_proksee/{strains}/05_proksee_{strains}'),
output_depth = directory('data/processed/{strains}/05_proksee_{strains}/')
threads: 4
log:
"logs/05_proksee/proksee-{strains}.log"
conda:
"../envs/proksee.yaml"
shell:
"""
# Step 1: Split fasta files per contigs
input_fasta={input.assembly}
# Initialize variables
output_file=""
contig_name=""
# Read the input FASTA file line by line
while IFS= read -r line; do
if [[ $line == ">"* ]]; then
# If the line starts with '>', it's a header line
# Close the previous output file if it exists
if [ -n "$output_file" ]; then
exec 3>&-
fi
# Extract the contig name and create a new output file
contig_name=$(echo "$line" | sed 's/>//; s/ .*//')
output_directory="{output.proksee_input}/$contig_name/fasta"
mkdir -p "$output_directory"
output_file="$output_directory/$contig_name.fasta"
exec 3>"$output_file"
echo "$line" >&3
else
# Write the sequence line to the current output file
echo "$line" >&3
fi
done < "$input_fasta"
# Close the last output file
if [ -n "$output_file" ]; then
exec 3>&-
fi
echo "Splitting fasta completed. Fasta files are saved in $output_directory" >> {log}
# Step 2: Split depth table per contig
python workflow/scripts/proksee_prepare.py {input.depth} {output.proksee_input} {wildcards.strains} --output_depth {output.output_depth} 2>> {log}
# echo "Splitting depth table completed. Depth files are saved in $output_directory" >> {log}
"""

rule run_proksee:
input:
proksee_input = 'data/interim/05_proksee/{strains}/05_proksee_{strains}'
output:
proksee_output = directory('data/processed/{strains}/05_proksee_{strains}_output')
log:
"logs/05_proksee/proksee-run-{strains}.log"
conda:
"../envs/proksee.yaml"
shell:
"""
proksee-batch --input {input.proksee_input} --output {output.proksee_output} 2>> {log}
"""
95 changes: 95 additions & 0 deletions workflow/scripts/proksee_prepare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import argparse
import json
import logging
from pathlib import Path

import pandas as pd

logging.basicConfig(
format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
)


def split_depth_file(
input_file, output_dir, genome_id, output_depth=None, window_size=100
):
# Read the depth file
logging.info(f"Reading depth file: {input_file}")
df = pd.read_csv(input_file, sep="\t", header=None)
df = df.rename(columns={0: "contig_id", 1: "start", 2: "score"})

# Create the output directory if it doesn't exist
outdir = Path(output_dir)
outdir.mkdir(parents=True, exist_ok=True)
logging.info(f"Output directory created: {output_dir}")

# Split the depth file by contig and save each to a separate file
for contig in df.contig_id.unique():
logging.info(f"Processing contig {contig}")
subset = df[df.contig_id == contig]

# Calculate non-overlapping window average
subset["window"] = (subset["start"] - 1) // window_size
window_avg = subset.groupby("window")["score"].mean().reset_index()
window_avg["start"] = window_avg["window"] * window_size + 1
window_avg["end"] = window_avg["start"] + window_size - 1

if output_depth is None:
output_depth = outdir

outfile = Path(output_depth) / f"{contig}/plot/depth.txt"
outfile.parent.mkdir(parents=True, exist_ok=True)
window_avg.loc[:, ["start", "end", "score"]].to_csv(
outfile, sep="\t", index=False
)
logging.info(f"Depth file saved for contig {contig}: {outfile}")

# Create metadata
metadata = {
"metadata": {
"description": f"Contig {contig} extracted from genome {genome_id}"
}
}

# Save metadata to a JSON file
metadata_file = outdir / f"{contig}/metadata/{contig}.json"
metadata_file.parent.mkdir(parents=True, exist_ok=True)
with open(metadata_file, "w") as f:
json.dump(metadata, f, indent=2)
logging.info(f"Metadata file saved for contig {contig}: {metadata_file}")

logging.info(f"Splitting completed. Files are saved in {output_dir}")


def main():
parser = argparse.ArgumentParser(
description="Split depth file by contig and save each to a separate file with metadata."
)
parser.add_argument("input_file", help="Path to the input depth file")
parser.add_argument("output_dir", help="Directory to save the output files")
parser.add_argument("genome_id", help="Genome ID for metadata")
parser.add_argument(
"--output_depth",
help="Optional directory to save the depth files",
default=None,
)
parser.add_argument(
"--window_size",
type=int,
help="Window size for calculating the non-overlapping window average",
default=100,
)

args = parser.parse_args()

split_depth_file(
args.input_file,
args.output_dir,
args.genome_id,
args.output_depth,
args.window_size,
)


if __name__ == "__main__":
main()

0 comments on commit d03c6cc

Please sign in to comment.