diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 0000000..2eb5c4f
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,34 @@
+repos:
+- repo: https://github.com/Lucas-C/pre-commit-hooks
+  rev: v1.5.1
+  hooks:
+  - id: forbid-crlf
+  - id: remove-crlf
+  - id: forbid-tabs
+    exclude_types: [csv]
+  - id: remove-tabs
+    exclude_types: [csv]
+- repo: https://github.com/pre-commit/pre-commit-hooks
+  rev: v4.3.0
+  hooks:
+  - id: trailing-whitespace
+  - id: end-of-file-fixer
+  - id: check-merge-conflict
+  - id: check-yaml
+    args: [--unsafe]
+- repo: https://github.com/pre-commit/mirrors-isort
+  rev: v5.10.1
+  hooks:
+  - id: isort
+- repo: https://github.com/ambv/black
+  rev: 23.3.0
+  hooks:
+  - id: black
+    language_version: python3
+- repo: https://github.com/PyCQA/flake8
+  rev: 5.0.4
+  hooks:
+  - id: flake8
+    args: ['--ignore=E501,W503']
+    additional_dependencies: [flake8-typing-imports==1.10.0]
+    exclude: ^tests
diff --git a/workflow/Snakefile-consensus b/workflow/Snakefile-consensus
index 1881b8e..98adc36 100644
--- a/workflow/Snakefile-consensus
+++ b/workflow/Snakefile-consensus
@@ -10,7 +10,11 @@ except KeyError as e:
 ##### Target rules #####
 rule all:
     input:
-        expand('data/processed/{strains}/03_trycycler_consensus/{strains}.fna', strains=STRAINS)
+        expand('data/processed/{strains}/03_trycycler_consensus/{strains}.fna', strains=STRAINS),
+        expand('data/processed/{strains}/04_final_qc/{strains}_depth.txt', strains=STRAINS),
+        expand('data/processed/{strains}/05_proksee_{strains}_output', strains=STRAINS),
 
 ##### Modules #####
-include: "rules/03_trycycler_consensus.smk"
\ No newline at end of file
+include: "rules/03_trycycler_consensus.smk"
+include: "rules/04_final_qc.smk"
+include: "rules/05_proksee.smk"
diff --git a/workflow/envs/proksee.yaml b/workflow/envs/proksee.yaml
new file mode 100644
index 0000000..54bd752
--- /dev/null
+++ b/workflow/envs/proksee.yaml
@@ -0,0 +1,8 @@
+name: proksee-batch
+channels:
+  - conda-forge
+dependencies:
+  - python=3.11
+  - pip
+  - pip:
+      - proksee-batch
diff --git a/workflow/rules/04_final_qc.smk b/workflow/rules/04_final_qc.smk
new file mode 100644
index 0000000..bef4b5f
--- /dev/null
+++ b/workflow/rules/04_final_qc.smk
@@ -0,0 +1,32 @@
+rule final_qc:
+    input:
+        assembly = 'data/processed/{strains}/03_trycycler_consensus/{strains}.fna',
+        raw_reads = 'data/interim/01_trycycler_assembly/{strains}/nanopore/min1kb.fq',
+    output:
+        index_assembly = 'data/processed/{strains}/04_final_qc/{strains}.mmi',
+        aligned_reads_sam = 'data/processed/{strains}/04_final_qc/{strains}.sam',
+        aligned_reads_bam = temp('data/processed/{strains}/04_final_qc/{strains}.bam'),
+        aligned_reads_bam_sorted = 'data/processed/{strains}/04_final_qc/{strains}_sorted.bam',
+        depth = 'data/processed/{strains}/04_final_qc/{strains}_depth.txt',
+    threads: 4
+    log:
+        "logs/04_final_qc/final_qc-{strains}.log"
+    conda:
+        "../envs/utilities.yaml"
+    shell:
+        """
+        # Step 1: Index the genome (optional for minimap2)
+        minimap2 -d {output.index_assembly} {input.assembly} 2>> {log}
+
+        # Step 2: Align the reads
+        minimap2 -ax map-ont -t {threads} {output.index_assembly} {input.raw_reads} > {output.aligned_reads_sam} 2>> {log}
+
+        # Step 3: Convert SAM to BAM
+        samtools view -@ {threads} -S -b {output.aligned_reads_sam} > {output.aligned_reads_bam} 2>> {log}
+
+        # Step 4: Sort the BAM file
+        samtools sort -@ {threads} {output.aligned_reads_bam} -o {output.aligned_reads_bam_sorted} 2>> {log}
+
+        # Step 5: Calculate depth
+        samtools depth {output.aligned_reads_bam_sorted} > {output.depth} 2>> {log}
+        """
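
Note: `samtools depth` emits three tab-separated columns (contig, 1-based position, depth), which is the format `proksee_prepare.py` below consumes. A quick per-contig mean-coverage spot check can be run on that file along these lines; the strain ID `STRAIN_A` is a placeholder, not a real sample in this repo:

    # Hypothetical sanity check; replace STRAIN_A with a real strain ID.
    # Sums column 3 (depth) per contig (column 1) and prints the mean.
    awk '{sum[$1] += $3; n[$1]++} END {for (c in sum) printf "%s\t%.1f\n", c, sum[c] / n[c]}' \
        data/processed/STRAIN_A/04_final_qc/STRAIN_A_depth.txt
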
diff --git a/workflow/rules/05_proksee.smk b/workflow/rules/05_proksee.smk
new file mode 100644
index 0000000..152f483
--- /dev/null
+++ b/workflow/rules/05_proksee.smk
@@ -0,0 +1,70 @@
+rule prepare_proksee:
+    input:
+        assembly = 'data/processed/{strains}/03_trycycler_consensus/{strains}.fna',
+        depth = 'data/processed/{strains}/04_final_qc/{strains}_depth.txt'
+    output:
+        proksee_input = directory('data/interim/05_proksee/{strains}/05_proksee_{strains}'),
+        output_depth = directory('data/processed/{strains}/05_proksee_{strains}/')
+    threads: 4
+    log:
+        "logs/05_proksee/proksee-{strains}.log"
+    conda:
+        "../envs/proksee.yaml"
+    shell:
+        """
+        # Step 1: Split fasta file per contig
+        input_fasta="{input.assembly}"
+
+        # Initialize variables
+        output_file=""
+        contig_name=""
+
+        # Read the input FASTA file line by line
+        while IFS= read -r line; do
+            if [[ $line == ">"* ]]; then
+                # If the line starts with '>', it's a header line
+                # Close the previous output file if it exists
+                if [ -n "$output_file" ]; then
+                    exec 3>&-
+                fi
+                # Extract the contig name and create a new output file
+                contig_name=$(echo "$line" | sed 's/>//; s/ .*//')
+
+                output_directory="{output.proksee_input}/$contig_name/fasta"
+                mkdir -p "$output_directory"
+
+                output_file="$output_directory/$contig_name.fasta"
+                exec 3>"$output_file"
+                echo "$line" >&3
+            else
+                # Write the sequence line to the current output file
+                echo "$line" >&3
+            fi
+        done < "$input_fasta"
+
+        # Close the last output file
+        if [ -n "$output_file" ]; then
+            exec 3>&-
+        fi
+
+        echo "Splitting fasta completed. Fasta files are saved in $output_directory" >> {log}
+
+        # Step 2: Split depth table per contig
+        python workflow/scripts/proksee_prepare.py {input.depth} {output.proksee_input} {wildcards.strains} --output_depth {output.output_depth} 2>> {log}
+
+        # echo "Splitting depth table completed. Depth files are saved in $output_directory" >> {log}
+        """
+
+rule run_proksee:
+    input:
+        proksee_input = 'data/interim/05_proksee/{strains}/05_proksee_{strains}'
+    output:
+        proksee_output = directory('data/processed/{strains}/05_proksee_{strains}_output')
+    log:
+        "logs/05_proksee/proksee-run-{strains}.log"
+    conda:
+        "../envs/proksee.yaml"
+    shell:
+        """
+        proksee-batch --input {input.proksee_input} --output {output.proksee_output} 2>> {log}
+        """
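
Note: the helper invoked in Step 2 above can also be run by hand, which helps when debugging the per-contig layout that `proksee-batch` consumes. A minimal sketch, assuming the argparse interface defined in the script below; `STRAIN_A` and the paths are placeholders:

    # Hypothetical manual run of the depth-splitting helper.
    python workflow/scripts/proksee_prepare.py \
        data/processed/STRAIN_A/04_final_qc/STRAIN_A_depth.txt \
        data/interim/05_proksee/STRAIN_A/05_proksee_STRAIN_A \
        STRAIN_A \
        --output_depth data/processed/STRAIN_A/05_proksee_STRAIN_A/ \
        --window_size 100

This writes `<contig>/plot/depth.txt` under the `--output_depth` directory and `<contig>/metadata/<contig>.json` under the positional output directory, mirroring what `prepare_proksee` does inside the workflow.
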
diff --git a/workflow/scripts/proksee_prepare.py b/workflow/scripts/proksee_prepare.py
new file mode 100644
index 0000000..83a4a69
--- /dev/null
+++ b/workflow/scripts/proksee_prepare.py
@@ -0,0 +1,95 @@
+import argparse
+import json
+import logging
+from pathlib import Path
+
+import pandas as pd
+
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO
+)
+
+
+def split_depth_file(
+    input_file, output_dir, genome_id, output_depth=None, window_size=100
+):
+    # Read the depth file
+    logging.info(f"Reading depth file: {input_file}")
+    df = pd.read_csv(input_file, sep="\t", header=None)
+    df = df.rename(columns={0: "contig_id", 1: "start", 2: "score"})
+
+    # Create the output directory if it doesn't exist
+    outdir = Path(output_dir)
+    outdir.mkdir(parents=True, exist_ok=True)
+    logging.info(f"Output directory created: {output_dir}")
+
+    # Split the depth file by contig and save each to a separate file
+    for contig in df.contig_id.unique():
+        logging.info(f"Processing contig {contig}")
+        subset = df[df.contig_id == contig].copy()  # copy to avoid SettingWithCopyWarning below
+
+        # Calculate non-overlapping window average
+        subset["window"] = (subset["start"] - 1) // window_size
+        window_avg = subset.groupby("window")["score"].mean().reset_index()
+        window_avg["start"] = window_avg["window"] * window_size + 1
+        window_avg["end"] = window_avg["start"] + window_size - 1
+
+        if output_depth is None:
+            output_depth = outdir
+
+        outfile = Path(output_depth) / f"{contig}/plot/depth.txt"
+        outfile.parent.mkdir(parents=True, exist_ok=True)
+        window_avg.loc[:, ["start", "end", "score"]].to_csv(
+            outfile, sep="\t", index=False
+        )
+        logging.info(f"Depth file saved for contig {contig}: {outfile}")
+
+        # Create metadata
+        metadata = {
+            "metadata": {
+                "description": f"Contig {contig} extracted from genome {genome_id}"
+            }
+        }
+
+        # Save metadata to a JSON file
+        metadata_file = outdir / f"{contig}/metadata/{contig}.json"
+        metadata_file.parent.mkdir(parents=True, exist_ok=True)
+        with open(metadata_file, "w") as f:
+            json.dump(metadata, f, indent=2)
+        logging.info(f"Metadata file saved for contig {contig}: {metadata_file}")
+
+    logging.info(f"Splitting completed. Files are saved in {output_dir}")
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Split depth file by contig and save each to a separate file with metadata."
+    )
+    parser.add_argument("input_file", help="Path to the input depth file")
+    parser.add_argument("output_dir", help="Directory to save the output files")
+    parser.add_argument("genome_id", help="Genome ID for metadata")
+    parser.add_argument(
+        "--output_depth",
+        help="Optional directory to save the depth files",
+        default=None,
+    )
+    parser.add_argument(
+        "--window_size",
+        type=int,
+        help="Window size for calculating the non-overlapping window average",
+        default=100,
+    )
+
+    args = parser.parse_args()
+
+    split_depth_file(
+        args.input_file,
+        args.output_dir,
+        args.genome_id,
+        args.output_depth,
+        args.window_size,
+    )
+
+
+if __name__ == "__main__":
+    main()
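
Note: with these rules included, `rule all` now requests the new QC depth tables and Proksee output directories, so a run along the following lines should build them end to end. The core count and the conda flag are assumptions about how this workflow is normally launched, and `STRAINS` must be resolvable from the workflow config:

    # Hypothetical invocation; adjust cores and config to your setup.
    snakemake --snakefile workflow/Snakefile-consensus --use-conda --cores 8
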