Skip to content

Commit

Permalink
2 alignment based cnv calling (#11)
Browse files Browse the repository at this point in the history
Add high-recall deletion and duplication detection without clustering
  • Loading branch information
jonperdomo authored Nov 17, 2023
1 parent 7456f53 commit 51dcdeb
Show file tree
Hide file tree
Showing 52 changed files with 3,695 additions and 992 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,4 @@ jobs:
shell: bash --login {0}
run: |
mkdir -p tests/output
python -m pytest
python -m pytest -s -v tests/test_general.py
10 changes: 6 additions & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -50,12 +50,14 @@ __pycache__/
*.code-workspace
CMakeSettings.json

# Doxygen
doc/doxygen
*.code-workspace

# Shell scripts
*.sh

# Output folder
output/

# Doxygen
docs/html/

# Singularity images
*.sif
859 changes: 572 additions & 287 deletions doc/Doxyfile → Doxyfile

Large diffs are not rendered by default.

5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,7 @@
An alignment-based, generalized structural variant caller for long-read
sequencing/mapping data

[![build tests](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml/badge.svg)](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml)
[![build
tests](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml/badge.svg)](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml)

Documentation available at [https://wglab.openbioinformatics.org/ContextSV](https://wglab.openbioinformatics.org/ContextSV)
212 changes: 191 additions & 21 deletions __main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,52 +3,222 @@
"""

import os
import sys
import argparse
import logging as log
from lib import contextsv
from python import cnv_plots

# Set up logging.
log.basicConfig(
level=log.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
log.StreamHandler(sys.stdout)
]
)

def main():
"""Entry point and user interface for the program."""

# Grab the command line arguments using argparse.
parser = argparse.ArgumentParser(
description="ContextSV: A tool for integrative structural variant detection."
)

# Add common arguments.
parser.add_argument(
"-r", "--region",
help="The region to analyze.",
required=True
)

parser.add_argument(
"-o", "--output",
help="The path to the output directory.",
required=True
)

parser.add_argument(
"-v", "--version",
help="Print the version number and exit.",
action="version",
version="%(prog)s 0.0.1"
)

# Thread count.
parser.add_argument(
"-t", "--threads",
help="The number of threads to use.",
required=False,
default=1,
type=int
)

# CNV data path.
parser.add_argument(
"--cnv",
help="The path to the CNV data in TSV format.",
required=False
)

# Mode 1: SV detection mode.
parser.add_argument(
"-b", "--bam",
help="The path to the BAM file.",
required=True
required=False
)
parser.add_argument(
"-g", "--reference",
help="The path to the reference genome.",
required=False
)
parser.add_argument(
"-s", "--snps",
help="The path to the SNPs file.",
required=True
required=False
)

# PFB file of population allele frequencies.
parser.add_argument(
"-o", "--output",
help="The path to the output file.",
required=True
"-p", "--pfb",
help="The path to the PFB file of population allele frequencies.",
required=False
)

# HMM file path.
parser.add_argument(
"-r", "--region",
help="The region to analyze.",
required=True
"--hmm",
help="The path to the HMM file.",
required=False
)

# Pass in chromosome mean coverage data for speediness.
parser.add_argument(
"--chr-cov",
help="Chromosome mean coverage values passed in as a comma-separated list (e.g. chr1:100,chr2:200,chr3:300).",
required=False
)

# Turn off CIGAR string SV detection. This is for debugging purposes (speeds
# up the program).
parser.add_argument(
"--disable-cigar",
help="Turn off CIGAR string SV detection.",
required=False,
action="store_true",
default=False
)

# Mode 2: CNV plots mode. If only the VCF file is provided, then the program
# will run in this mode.
parser.add_argument(
"--vcf",
help="The path to the VCF file of SV calls.",
required=False
)

# Get the command line arguments.
args = parser.parse_args()

# Run the program.
print("Running contextsv with the following arguments:")
print("BAM: {}".format(args.bam))
print("SNPs: {}".format(args.snps))
print("Output: {}".format(args.output))
print("Region: {}".format(args.region))

contextsv.run(
args.bam,
args.snps,
args.output,
args.region
)
# Remove commas or spaces from the region.
region = args.region
region = region.replace(",", "")
region = region.replace(" ", "")

# Determine the selected program mode (SV detection or CNV plots).
if (args.vcf is not None):
# Run SV analysis mode.

# Ensure that the CNV data path is provided.
arg_error = False
if (args.cnv is None):
log.error("Please provide the CNV data path.")
arg_error = True

# Exit if there are any errors.
if (arg_error):
sys.exit(1)

# Create the output directory if it doesn't exist.
if (not os.path.exists(args.output)):
os.makedirs(args.output)

# Set the data paths from user input for downstream analysis.
vcf_path = args.vcf
cnv_data_path = args.cnv
output_dir = args.output

log.info("Analyzing SV calls in VCF file %s...", args.vcf)

else:
# Run SV detection mode.

# Ensure BAM, reference, and SNPs files are provided.
arg_error = False
if (args.bam is None):
log.error("Please provide the BAM file.")
arg_error = True

if (args.reference is None):
log.error("Please provide the reference genome.")
arg_error = True

if (args.snps is None):
log.error("Please provide the SNPs file.")
arg_error = True

# Exit if there are any errors.
if (arg_error):
# Exit with error code 1.
sys.exit(1)

# Set all None values to empty strings.
for key, value in vars(args).items():
if value is None:
setattr(args, key, "")

else:
log.info("Setting %s to %s", key, value)

# Loop and set all None values to empty strings.
for key, value in vars(args).items():
if value is None:
setattr(args, key, "")

log.info("Chromosome mean coverage: %s", args.chr_cov)
log.info("PFB filepath: %s", args.pfb)

# Set input parameters.
input_data = contextsv.InputData()
input_data.setBAMFilepath(args.bam)
input_data.setRefGenome(args.reference)
input_data.setSNPFilepath(args.snps)
input_data.setRegion(args.region)
input_data.setThreadCount(args.threads)
input_data.setChrCov(args.chr_cov)
input_data.setPFBFilepath(args.pfb)
input_data.setHMMFilepath(args.hmm)
input_data.setOutputDir(args.output)
input_data.setDisableCIGAR(args.disable_cigar)
input_data.setCNVFilepath(args.cnv)

# Run the analysis.
contextsv.run(input_data)

# Determine the data paths for downstream analysis.
vcf_path = os.path.join(args.output, "sv_calls.vcf")
output_dir = args.output

if (args.cnv == ""):
cnv_data_path = os.path.join(args.output, "cnv_data.tsv")
else:
cnv_data_path = args.cnv

# Run the python-based analysis.
log.info("Running python-based analysis...")
cnv_plots.run(vcf_path, cnv_data_path, output_dir, region)
log.info("Done.")


if __name__ == '__main__':
Expand Down
3 changes: 3 additions & 0 deletions data/chr_cov.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# This text file includes chromosome coverage values for speeding up runtime
chr3=39.561
chr6=39.4096
36 changes: 36 additions & 0 deletions data/hhall.hmm
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
M=6
N=6
A:
0.936719716 0.006332139 0.048770575 0.000000001 0.008177573 0.000000001
0.000801036 0.949230924 0.048770575 0.000000001 0.001168245 0.000029225
0.000004595 0.000047431 0.999912387 0.000000001 0.000034971 0.000000621
0.000049998 0.000049998 0.000049998 0.999750015 0.000049998 0.000049998
0.000916738 0.001359036 0.048770575 0.000000001 0.948953653 0.000000002
0.000000001 0.000000001 0.027257213 0.000000001 0.000000004 0.972742785
B:
0.950000 0.000001 0.050000 0.000001 0.000001 0.000001
0.000001 0.950000 0.050000 0.000001 0.000001 0.000001
0.000001 0.000001 0.999995 0.000001 0.000001 0.000001
0.000001 0.000001 0.050000 0.950000 0.000001 0.000001
0.000001 0.000001 0.050000 0.000001 0.950000 0.000001
0.000001 0.000001 0.050000 0.000001 0.000001 0.950000
pi:
0.000001 0.000500 0.999000 0.000001 0.000500 0.000001
B1_mean:
-3.527211 -0.664184 0.000000 100.000000 0.395621 0.678345
B1_sd:
1.329152 0.284338 0.159645 0.211396 0.209089 0.191579
B1_uf:
0.010000
B2_mean:
0.000000 0.250000 0.333333 0.500000 0.500000
B2_sd:
0.016372 0.042099 0.045126 0.034982 0.304243
B2_uf:
0.010000
B3_mean:
-2.051407 -0.572210 0.000000 0.000000 0.361669 0.626711
B3_sd:
2.132843 0.382025 0.184001 0.200297 0.253551 0.353183
B3_uf:
0.010000
36 changes: 36 additions & 0 deletions data/hhall_loh.hmm
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
M=6
N=6
A:
0.936719716 0.006332139 0.048770575 0.000000001 0.008177573 0.000000001
0.000801036 0.949230924 0.048770575 0.000000001 0.001168245 0.000029225
0.000004595 0.000047431 0.999912387 0.000000001 0.000034971 0.000000621
0.000049998 0.000049998 0.000049998 0.999750015 0.000049998 0.000049998
0.000916738 0.001359036 0.048770575 0.000000001 0.948953653 0.000000002
0.000000001 0.000000001 0.027257213 0.000000001 0.000000004 0.972742785
B:
0.950000 0.000001 0.050000 0.000001 0.000001 0.000001
0.000001 0.950000 0.050000 0.000001 0.000001 0.000001
0.000001 0.000001 0.999995 0.000001 0.000001 0.000001
0.000001 0.000001 0.050000 0.950000 0.000001 0.000001
0.000001 0.000001 0.050000 0.000001 0.950000 0.000001
0.000001 0.000001 0.050000 0.000001 0.000001 0.950000
pi:
0.000001 0.000500 0.999000 0.000001 0.000500 0.000001
B1_mean:
-3.527211 -0.664184 0.000000 0.000000 0.395621 0.678345
B1_sd:
1.329152 0.284338 0.159645 0.211396 0.209089 0.191579
B1_uf:
0.010000
B2_mean:
0.000000 0.250000 0.333333 0.500000 0.500000
B2_sd:
0.016372 0.042099 0.045126 0.034982 0.304243
B2_uf:
0.010000
B3_mean:
-2.051407 -0.572210 0.000000 0.000000 0.361669 0.626711
B3_sd:
2.132843 0.382025 0.184001 0.200297 0.253551 0.353183
B3_uf:
0.010000
Loading

0 comments on commit 51dcdeb

Please sign in to comment.