generated from snakemake-workflows/snakemake-workflow-template
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
62cdd8a
commit d03c6cc
Showing
6 changed files
with
245 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
repos: | ||
- repo: https://github.com/Lucas-C/pre-commit-hooks | ||
rev: v1.5.1 | ||
hooks: | ||
- id: forbid-crlf | ||
- id: remove-crlf | ||
- id: forbid-tabs | ||
exclude_types: [csv] | ||
- id: remove-tabs | ||
exclude_types: [csv] | ||
- repo: https://github.com/pre-commit/pre-commit-hooks | ||
rev: v4.3.0 | ||
hooks: | ||
- id: trailing-whitespace | ||
- id: end-of-file-fixer | ||
- id: check-merge-conflict | ||
- id: check-yaml | ||
args: [--unsafe] | ||
- repo: https://github.com/pre-commit/mirrors-isort | ||
rev: v5.10.1 | ||
hooks: | ||
- id: isort | ||
- repo: https://github.com/ambv/black | ||
rev: 23.3.0 | ||
hooks: | ||
- id: black | ||
language_version: python3 | ||
- repo: https://github.com/PyCQA/flake8 | ||
rev: 5.0.4 | ||
hooks: | ||
- id: flake8 | ||
args: ['--ignore=E501,W503'] | ||
additional_dependencies: [flake8-typing-imports==1.10.0] | ||
exclude: ^tests |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
name: proksee-batch | ||
channels: | ||
- conda-forge | ||
dependencies: | ||
- python=3.11 | ||
- pip | ||
- pip: | ||
- proksee-batch |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
rule final_qc: | ||
input: | ||
assembly = 'data/processed/{strains}/03_trycycler_consensus/{strains}.fna', | ||
raw_reads = 'data/interim/01_trycycler_assembly/{strains}/nanopore/min1kb.fq', | ||
output: | ||
index_assembly = 'data/processed/{strains}/04_final_qc/{strains}.mmi', | ||
aligned_reads_sam = 'data/processed/{strains}/04_final_qc/{strains}.sam', | ||
aligned_reads_bam = temp('data/processed/{strains}/04_final_qc/{strains}.bam'), | ||
aligned_reads_bam_sorted = 'data/processed/{strains}/04_final_qc/{strains}_sorted.bam', | ||
depth = 'data/processed/{strains}/04_final_qc/{strains}_depth.txt', | ||
threads: 4 | ||
log: | ||
"logs/04_final_qc/final_qc-{strains}.log" | ||
conda: | ||
"../envs/utilities.yaml" | ||
shell: | ||
""" | ||
# Step 1: Index the genome (optional for minimap2) | ||
minimap2 -d {output.index_assembly} {input.assembly} 2>> {log} | ||
# Step 2: Align the reads | ||
minimap2 -ax map-ont -t {threads} {output.index_assembly} {input.raw_reads} > {output.aligned_reads_sam} 2>> {log} | ||
# Step 3: Convert SAM to BAM | ||
samtools view -@ {threads} -S -b {output.aligned_reads_sam} > {output.aligned_reads_bam} 2>> {log} | ||
# Step 4: Sort the BAM file | ||
samtools sort -@ {threads} {output.aligned_reads_bam} -o {output.aligned_reads_bam_sorted} 2>> {log} | ||
# Step 5: Calculate depth | ||
samtools depth {output.aligned_reads_bam_sorted} > {output.depth} 2>> {log} | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
rule prepare_proksee: | ||
input: | ||
assembly = 'data/processed/{strains}/03_trycycler_consensus/{strains}.fna', | ||
depth = 'data/processed/{strains}/04_final_qc/{strains}_depth.txt' | ||
output: | ||
proksee_input = directory('data/interim/05_proksee/{strains}/05_proksee_{strains}'), | ||
output_depth = directory('data/processed/{strains}/05_proksee_{strains}/') | ||
threads: 4 | ||
log: | ||
"logs/05_proksee/proksee-{strains}.log" | ||
conda: | ||
"../envs/proksee.yaml" | ||
shell: | ||
""" | ||
# Step 1: Split fasta files per contigs | ||
input_fasta={input.assembly} | ||
# Initialize variables | ||
output_file="" | ||
contig_name="" | ||
# Read the input FASTA file line by line | ||
while IFS= read -r line; do | ||
if [[ $line == ">"* ]]; then | ||
# If the line starts with '>', it's a header line | ||
# Close the previous output file if it exists | ||
if [ -n "$output_file" ]; then | ||
exec 3>&- | ||
fi | ||
# Extract the contig name and create a new output file | ||
contig_name=$(echo "$line" | sed 's/>//; s/ .*//') | ||
output_directory="{output.proksee_input}/$contig_name/fasta" | ||
mkdir -p "$output_directory" | ||
output_file="$output_directory/$contig_name.fasta" | ||
exec 3>"$output_file" | ||
echo "$line" >&3 | ||
else | ||
# Write the sequence line to the current output file | ||
echo "$line" >&3 | ||
fi | ||
done < "$input_fasta" | ||
# Close the last output file | ||
if [ -n "$output_file" ]; then | ||
exec 3>&- | ||
fi | ||
echo "Splitting fasta completed. Fasta files are saved in $output_directory" >> {log} | ||
# Step 2: Split depth table per contig | ||
python workflow/scripts/proksee_prepare.py {input.depth} {output.proksee_input} {wildcards.strains} --output_depth {output.output_depth} 2>> {log} | ||
# echo "Splitting depth table completed. Depth files are saved in $output_directory" >> {log} | ||
""" | ||
|
||
rule run_proksee: | ||
input: | ||
proksee_input = 'data/interim/05_proksee/{strains}/05_proksee_{strains}' | ||
output: | ||
proksee_output = directory('data/processed/{strains}/05_proksee_{strains}_output') | ||
log: | ||
"logs/05_proksee/proksee-run-{strains}.log" | ||
conda: | ||
"../envs/proksee.yaml" | ||
shell: | ||
""" | ||
proksee-batch --input {input.proksee_input} --output {output.proksee_output} 2>> {log} | ||
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,95 @@ | ||
import argparse | ||
import json | ||
import logging | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
|
||
logging.basicConfig( | ||
format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO | ||
) | ||
|
||
|
||
def split_depth_file( | ||
input_file, output_dir, genome_id, output_depth=None, window_size=100 | ||
): | ||
# Read the depth file | ||
logging.info(f"Reading depth file: {input_file}") | ||
df = pd.read_csv(input_file, sep="\t", header=None) | ||
df = df.rename(columns={0: "contig_id", 1: "start", 2: "score"}) | ||
|
||
# Create the output directory if it doesn't exist | ||
outdir = Path(output_dir) | ||
outdir.mkdir(parents=True, exist_ok=True) | ||
logging.info(f"Output directory created: {output_dir}") | ||
|
||
# Split the depth file by contig and save each to a separate file | ||
for contig in df.contig_id.unique(): | ||
logging.info(f"Processing contig {contig}") | ||
subset = df[df.contig_id == contig] | ||
|
||
# Calculate non-overlapping window average | ||
subset["window"] = (subset["start"] - 1) // window_size | ||
window_avg = subset.groupby("window")["score"].mean().reset_index() | ||
window_avg["start"] = window_avg["window"] * window_size + 1 | ||
window_avg["end"] = window_avg["start"] + window_size - 1 | ||
|
||
if output_depth is None: | ||
output_depth = outdir | ||
|
||
outfile = Path(output_depth) / f"{contig}/plot/depth.txt" | ||
outfile.parent.mkdir(parents=True, exist_ok=True) | ||
window_avg.loc[:, ["start", "end", "score"]].to_csv( | ||
outfile, sep="\t", index=False | ||
) | ||
logging.info(f"Depth file saved for contig {contig}: {outfile}") | ||
|
||
# Create metadata | ||
metadata = { | ||
"metadata": { | ||
"description": f"Contig {contig} extracted from genome {genome_id}" | ||
} | ||
} | ||
|
||
# Save metadata to a JSON file | ||
metadata_file = outdir / f"{contig}/metadata/{contig}.json" | ||
metadata_file.parent.mkdir(parents=True, exist_ok=True) | ||
with open(metadata_file, "w") as f: | ||
json.dump(metadata, f, indent=2) | ||
logging.info(f"Metadata file saved for contig {contig}: {metadata_file}") | ||
|
||
logging.info(f"Splitting completed. Files are saved in {output_dir}") | ||
|
||
|
||
def main(): | ||
parser = argparse.ArgumentParser( | ||
description="Split depth file by contig and save each to a separate file with metadata." | ||
) | ||
parser.add_argument("input_file", help="Path to the input depth file") | ||
parser.add_argument("output_dir", help="Directory to save the output files") | ||
parser.add_argument("genome_id", help="Genome ID for metadata") | ||
parser.add_argument( | ||
"--output_depth", | ||
help="Optional directory to save the depth files", | ||
default=None, | ||
) | ||
parser.add_argument( | ||
"--window_size", | ||
type=int, | ||
help="Window size for calculating the non-overlapping window average", | ||
default=100, | ||
) | ||
|
||
args = parser.parse_args() | ||
|
||
split_depth_file( | ||
args.input_file, | ||
args.output_dir, | ||
args.genome_id, | ||
args.output_depth, | ||
args.window_size, | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
main() |