From 51dcdeb28559d7ef379bac9f5ac90175358342ff Mon Sep 17 00:00:00 2001 From: Jonathan Perdomo Date: Fri, 17 Nov 2023 11:35:03 -0500 Subject: [PATCH] 2 alignment based cnv calling (#11) Add high-recall deletion and duplication detection without clustering --- .github/workflows/build-tests.yml | 2 +- .gitignore | 10 +- doc/Doxyfile => Doxyfile | 859 ++++++++++++++------- README.md | 5 +- __main__.py | 212 ++++- data/chr_cov.txt | 3 + data/hhall.hmm | 36 + data/hhall_loh.hmm | 36 + docs/genome-assembly-guide.md | 43 ++ doc/OUTLINE.md => docs/project-overview.md | 84 +- environment.yml | 4 + include/cli.h | 15 - include/cnv_caller.h | 44 +- include/cnv_data.h | 26 + include/common.h | 45 -- include/contextsv.h | 27 +- include/fasta_query.h | 25 + include/input_data.h | 93 +++ include/integrative_caller.h | 36 - include/kc.h | 9 +- include/khmm.h | 47 +- include/sv_caller.h | 38 + include/sv_data.h | 54 ++ include/swig_interface.h | 17 + include/types.h | 63 ++ include/utils.h | 12 +- include/vcf_writer.h | 23 + python/cnv_plots.py | 263 +++++++ python/scoring_model.py | 6 + python/train_model.py | 69 ++ python/utils.py | 44 ++ samtools/htslib | 1 - scripts/find_cigar_deletions.py | 58 ++ setup.py | 44 +- src/cli.cpp | 40 - src/cnv_caller.cpp | 494 +++++++++--- src/cnv_data.cpp | 114 +++ src/common.cpp | 169 ---- src/contextsv.cpp | 87 +++ src/fasta_query.cpp | 168 ++++ src/input_data.cpp | 324 ++++++++ src/integrative_caller.cpp | 40 - src/kc.cpp | 18 +- src/khmm.cpp | 145 ++-- src/sv_caller.cpp | 350 +++++++++ src/sv_data.cpp | 194 +++++ src/swig_interface.cpp | 31 + src/swig_wrapper.i | 9 +- src/utils.cpp | 36 + src/vcf_writer.cpp | 73 ++ tests/data/ref.fa | 0 tests/test_general.py | 42 +- 52 files changed, 3695 insertions(+), 992 deletions(-) rename doc/Doxyfile => Doxyfile (77%) create mode 100644 data/chr_cov.txt create mode 100644 data/hhall.hmm create mode 100644 data/hhall_loh.hmm create mode 100644 docs/genome-assembly-guide.md rename doc/OUTLINE.md => docs/project-overview.md (80%) delete mode 100644 include/cli.h create mode 100644 include/cnv_data.h delete mode 100644 include/common.h create mode 100644 include/fasta_query.h create mode 100644 include/input_data.h delete mode 100644 include/integrative_caller.h create mode 100644 include/sv_caller.h create mode 100644 include/sv_data.h create mode 100644 include/swig_interface.h create mode 100644 include/types.h create mode 100644 include/vcf_writer.h create mode 100644 python/cnv_plots.py create mode 100644 python/scoring_model.py create mode 100644 python/train_model.py create mode 100644 python/utils.py delete mode 160000 samtools/htslib create mode 100644 scripts/find_cigar_deletions.py delete mode 100644 src/cli.cpp create mode 100644 src/cnv_data.cpp delete mode 100644 src/common.cpp create mode 100644 src/contextsv.cpp create mode 100644 src/fasta_query.cpp create mode 100644 src/input_data.cpp delete mode 100644 src/integrative_caller.cpp create mode 100644 src/sv_caller.cpp create mode 100644 src/sv_data.cpp create mode 100644 src/swig_interface.cpp create mode 100644 src/vcf_writer.cpp create mode 100644 tests/data/ref.fa diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 18ba20bd..82269bbf 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -34,4 +34,4 @@ jobs: shell: bash --login {0} run: | mkdir -p tests/output - python -m pytest + python -m pytest -s -v tests/test_general.py diff --git a/.gitignore b/.gitignore index ff375ec3..e5b1f803 100644 
--- a/.gitignore +++ b/.gitignore @@ -50,12 +50,14 @@ __pycache__/ *.code-workspace CMakeSettings.json -# Doxygen -doc/doxygen -*.code-workspace - # Shell scripts *.sh # Output folder output/ + +# Doxygen +docs/html/ + +# Singularity images +*.sif diff --git a/doc/Doxyfile b/Doxyfile similarity index 77% rename from doc/Doxyfile rename to Doxyfile index 691382ab..12593cc4 100644 --- a/doc/Doxyfile +++ b/Doxyfile @@ -1,4 +1,4 @@ -# Doxyfile 1.8.17 +# Doxyfile 1.9.8 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. @@ -12,6 +12,16 @@ # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). +# +# Note: +# +# Use doxygen to compare the used configuration file with the template +# configuration file: +# doxygen -x [configFile] +# Use doxygen to compare the used configuration file with the template +# configuration file without replacing the environment variables or CMake type +# replacement variables: +# doxygen -x_noenv [configFile] #--------------------------------------------------------------------------- # Project related configuration options @@ -58,18 +68,30 @@ PROJECT_LOGO = # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. -OUTPUT_DIRECTORY = doc/doxygen +OUTPUT_DIRECTORY = ./docs -# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- -# directories (in 2 levels) under the output directory of each output format and -# will distribute the generated files over these directories. Enabling this +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create up to 4096 +# sub-directories (in 2 levels) under the output directory of each output format +# and will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise causes -# performance problems for the file system. +# performance problems for the file system. Adapt CREATE_SUBDIRS_LEVEL to +# control the number of sub-directories. # The default value is: NO. CREATE_SUBDIRS = NO +# Controls the number of sub-directories that will be created when +# CREATE_SUBDIRS tag is set to YES. Level 0 represents 16 directories, and every +# level increment doubles the number of directories, resulting in 4096 +# directories at level 8 which is the default and also the maximum value. The +# sub-directories are organized in 2 levels, the first level always has a fixed +# number of 16 directories. +# Minimum value: 0, maximum value: 8, default value: 8. +# This tag requires that the tag CREATE_SUBDIRS is set to YES. + +CREATE_SUBDIRS_LEVEL = 8 + # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode @@ -81,26 +103,18 @@ ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. 
-# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, -# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), -# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, -# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), -# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, -# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, -# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, -# Ukrainian and Vietnamese. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Bulgarian, +# Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch, English +# (United States), Esperanto, Farsi (Persian), Finnish, French, German, Greek, +# Hindi, Hungarian, Indonesian, Italian, Japanese, Japanese-en (Japanese with +# English messages), Korean, Korean-en (Korean with English messages), Latvian, +# Lithuanian, Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, +# Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, +# Swedish, Turkish, Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English -# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all generated output in the proper direction. -# Possible values are: None, LTR, RTL and Context. -# The default value is: None. - -OUTPUT_TEXT_DIRECTION = None - # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. @@ -227,6 +241,14 @@ QT_AUTOBRIEF = NO MULTILINE_CPP_IS_BRIEF = NO +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. @@ -250,25 +272,19 @@ TAB_SIZE = 4 # the documentation. An alias has the form: # name=value # For example adding -# "sideeffect=@par Side Effects:\n" +# "sideeffect=@par Side Effects:^^" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading -# "Side Effects:". You can put \n's in the value part of an alias to insert -# newlines (in the resulting output). You can put ^^ in the value part of an -# alias to insert a newline as if a physical newline was in the original file. -# When you need a literal { or } or , in the value part of an alias you have to -# escape them by means of a backslash (\), this can lead to conflicts with the -# commands \{ and \} for these it is advised to use the version @{ and @} or use -# a double escape (\\{ and \\}) +# "Side Effects:". Note that you cannot put \n's in the value part of an alias +# to insert newlines (in the resulting output). You can put ^^ in the value part +# of an alias to insert a newline as if a physical newline was in the original +# file. 
When you need a literal { or } or , in the value part of an alias you +# have to escape them by means of a backslash (\), this can lead to conflicts +# with the commands \{ and \} for these it is advised to use the version @{ and +# @} or use a double escape (\\{ and \\}) ALIASES = -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding "class=itcl::class" -# will allow you to use the command class in the itcl::class meaning. - -TCL_SUBST = - # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all @@ -310,18 +326,21 @@ OPTIMIZE_OUTPUT_SLICE = NO # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, JavaScript, -# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, -# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# Csharp (C#), C, C++, Lex, D, PHP, md (Markdown), Objective-C, Python, Slice, +# VHDL, Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: # FortranFree, unknown formatted Fortran: Fortran. In the later case the parser # tries to guess whether the code is fixed or free formatted code, this is the -# default for Fortran type files), VHDL, tcl. For instance to make doxygen treat -# .inc files as Fortran files (default is PHP), and .f files as C (default is -# Fortran), use: inc=Fortran f=C. +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise -# the files are not read by doxygen. +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. +# +# Note see also the list of default file extension mappings. EXTENSION_MAPPING = @@ -344,6 +363,17 @@ MARKDOWN_SUPPORT = YES TOC_INCLUDE_HEADINGS = 5 +# The MARKDOWN_ID_STYLE tag can be used to specify the algorithm used to +# generate identifiers for the Markdown headings. Note: Every identifier is +# unique. +# Possible values are: DOXYGEN use a fixed 'autotoc_md' string followed by a +# sequence number starting at 0 and GITHUB use the lower case version of title +# with any whitespace replaced by '-' and punctuation characters removed. +# The default value is: DOXYGEN. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +MARKDOWN_ID_STYLE = DOXYGEN + # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or @@ -455,6 +485,27 @@ TYPEDEF_HIDES_STRUCT = NO LOOKUP_CACHE_SIZE = 0 +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use +# during processing. When set to 0 doxygen will based this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. 
At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which effectively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + +# If the TIMESTAMP tag is set different from NO then each generated page will +# contain the date or date and time when the page was generated. Setting this to +# NO can help when comparing the output of multiple runs. +# Possible values are: YES, NO, DATETIME and DATE. +# The default value is: NO. + +TIMESTAMP = NO + #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- @@ -467,7 +518,7 @@ LOOKUP_CACHE_SIZE = 0 # normally produced when WARNINGS is set to YES. # The default value is: NO. -EXTRACT_ALL = NO +EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. @@ -518,6 +569,13 @@ EXTRACT_LOCAL_METHODS = NO EXTRACT_ANON_NSPACES = NO +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation @@ -529,7 +587,8 @@ HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option -# has no effect if EXTRACT_ALL is enabled. +# will also hide undocumented C++ concepts if enabled. This option has no effect +# if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO @@ -555,12 +614,20 @@ HIDE_IN_BODY_DOCS = NO INTERNAL_DOCS = NO -# If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file -# names in lower-case letters. If set to YES, upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# (including Cygwin) ands Mac users are advised to set this option to NO. -# The default value is: system dependent. +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. For filesystems that +# are not case sensitive the option should be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. 
On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# Possible values are: SYSTEM, NO and YES. +# The default value is: SYSTEM. CASE_SENSE_NAMES = NO @@ -578,6 +645,12 @@ HIDE_SCOPE_NAMES = NO HIDE_COMPOUND_REFERENCE= NO +# If the SHOW_HEADERFILE tag is set to YES then the documentation for a class +# will show which file needs to be included to use the class. +# The default value is: YES. + +SHOW_HEADERFILE = YES + # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. @@ -735,7 +808,8 @@ FILE_VERSION_FILTER = # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml -# will be used as the name of the layout file. +# will be used as the name of the layout file. See also section "Changing the +# layout of pages" for information. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE @@ -781,24 +855,50 @@ WARNINGS = YES WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for -# potential errors in the documentation, such as not documenting some parameters -# in a documented function, or documenting parameters that don't exist or using -# markup commands wrongly. +# potential errors in the documentation, such as documenting some parameters in +# a documented function twice, or documenting parameters that don't exist or +# using markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES +# If WARN_IF_INCOMPLETE_DOC is set to YES, doxygen will warn about incomplete +# function parameter documentation. If set to NO, doxygen will accept that some +# parameters have no documentation without warning. +# The default value is: YES. + +WARN_IF_INCOMPLETE_DOC = YES + # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return -# value. If set to NO, doxygen will only warn about wrong or incomplete -# parameter documentation, but not about the absence of documentation. If -# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# value. If set to NO, doxygen will only warn about wrong parameter +# documentation, but not about the absence of documentation. If EXTRACT_ALL is +# set to YES then this flag will automatically be disabled. See also +# WARN_IF_INCOMPLETE_DOC # The default value is: NO. WARN_NO_PARAMDOC = NO +# If WARN_IF_UNDOC_ENUM_VAL option is set to YES, doxygen will warn about +# undocumented enumeration values. If set to NO, doxygen will accept +# undocumented enumeration values. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: NO. + +WARN_IF_UNDOC_ENUM_VAL = NO + # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when -# a warning is encountered. +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. 
+# If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS_PRINT then doxygen behaves +# like FAIL_ON_WARNINGS but in case no WARN_LOGFILE is defined doxygen will not +# write the warning messages in between other messages but write them at the end +# of a run, in case a WARN_LOGFILE is defined the warning messages will be +# besides being in the defined file also be shown at the end of a run, unless +# the WARN_LOGFILE is defined as - i.e. standard output (stdout) in that case +# the behavior will remain as with the setting FAIL_ON_WARNINGS. +# Possible values are: NO, YES, FAIL_ON_WARNINGS and FAIL_ON_WARNINGS_PRINT. # The default value is: NO. WARN_AS_ERROR = NO @@ -809,13 +909,27 @@ WARN_AS_ERROR = NO # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) +# See also: WARN_LINE_FORMAT # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" +# In the $text part of the WARN_FORMAT command it is possible that a reference +# to a more specific place is given. To make it easier to jump to this place +# (outside of doxygen) the user can define a custom "cut" / "paste" string. +# Example: +# WARN_LINE_FORMAT = "'vi $file +$line'" +# See also: WARN_FORMAT +# The default value is: at line $line of file $file. + +WARN_LINE_FORMAT = "at line $line of file $file" + # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard -# error (stderr). +# error (stderr). In case the file specified cannot be opened for writing the +# warning and error messages are written to standard error. When as file - is +# specified the warning and error messages are written to standard output +# (stdout). WARN_LOGFILE = @@ -829,17 +943,30 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = . +INPUT = include \ + src \ + docs/project-overview.md \ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv -# documentation (see: https://www.gnu.org/software/libiconv/) for the list of -# possible encodings. +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# See also: INPUT_FILE_ENCODING # The default value is: UTF-8. INPUT_ENCODING = UTF-8 +# This tag can be used to specify the character encoding of the source files +# that doxygen parses The INPUT_FILE_ENCODING tag can be used to specify +# character encoding on a per file pattern basis. Doxygen will compare the file +# name with each pattern and apply the encoding instead of the default +# INPUT_ENCODING) if there is a match. The character encodings are a list of the +# form: pattern=encoding (like *.php=ISO-8859-1). See cfg_input_encoding +# "INPUT_ENCODING" for further information on supported encodings. + +INPUT_FILE_ENCODING = + # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. @@ -848,13 +975,15 @@ INPUT_ENCODING = UTF-8 # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. 
# -# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, -# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, -# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, -# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), -# *.doc (to be provided as doxygen C comment), *.txt (to be provided as doxygen -# C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f, *.for, *.tcl, *.vhd, -# *.vhdl, *.ucf, *.qsf and *.ice. +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cxxm, +# *.cpp, *.cppm, *.c++, *.c++m, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, +# *.ddl, *.odl, *.h, *.hh, *.hxx, *.hpp, *.h++, *.ixx, *.l, *.cs, *.d, *.php, +# *.php4, *.php5, *.phtml, *.inc, *.m, *.markdown, *.md, *.mm, *.dox (to be +# provided as doxygen C comment), *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, +# *.f18, *.f, *.for, *.vhd, *.vhdl, *.ucf, *.qsf and *.ice. FILE_PATTERNS = *.c \ *.cc \ @@ -918,7 +1047,10 @@ RECURSIVE = YES # run. EXCLUDE = build \ - cmake-build-debug + cmake-build-debug \ + lib \ + docs \ + src/swig_wrapper.cpp # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded @@ -940,12 +1072,9 @@ EXCLUDE_PATTERNS = # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, -# AClass::ANamespace, ANamespace::*Test -# -# Note that the wildcards are matched against the file with absolute path, so to -# exclude all test directories use the pattern */test/* +# ANamespace::AClass, ANamespace::*Test -EXCLUDE_SYMBOLS = +EXCLUDE_SYMBOLS = swig::* # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include @@ -988,6 +1117,11 @@ IMAGE_PATH = # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # +# Note that doxygen will use the data processed and written to standard output +# for further processing, therefore nothing else, like debug statements or used +# commands (so in case of a Windows batch file always use @echo OFF), should be +# written to standard output. +# # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. @@ -1029,6 +1163,15 @@ FILTER_SOURCE_PATTERNS = USE_MDFILE_AS_MAINPAGE = +# The Fortran standard specifies that for fixed formatted Fortran code all +# characters from position 72 are to be considered as comment. A common +# extension is to allow longer lines before the automatic comment starts. The +# setting FORTRAN_COMMENT_AFTER will also make it possible that longer lines can +# be processed before the automatic comment starts. +# Minimum value: 7, maximum value: 10000, default value: 72. 
+ +FORTRAN_COMMENT_AFTER = 72 + #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- @@ -1116,16 +1259,24 @@ USE_HTAGS = NO VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the -# clang parser (see: http://clang.llvm.org/) for more accurate parsing at the -# cost of reduced performance. This can be particularly helpful with template -# rich C++ code for which doxygen's built-in parser lacks the necessary type -# information. +# clang parser (see: +# http://clang.llvm.org/) for more accurate parsing at the cost of reduced +# performance. This can be particularly helpful with template rich C++ code for +# which doxygen's built-in parser lacks the necessary type information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO +# If the CLANG_ASSISTED_PARSING tag is set to YES and the CLANG_ADD_INC_PATHS +# tag is set to YES then doxygen will add the directory of each input to the +# include path. +# The default value is: YES. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_ADD_INC_PATHS = YES + # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories @@ -1135,10 +1286,13 @@ CLANG_ASSISTED_PARSING = NO CLANG_OPTIONS = # If clang assisted parsing is enabled you can provide the clang parser with the -# path to the compilation database (see: -# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) used when the files -# were built. This is equivalent to specifying the "-p" option to a clang tool, -# such as clang-check. These options will then be passed to the parser. +# path to the directory containing a file called compile_commands.json. This +# file is the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the +# options used when the source files were built. This is equivalent to +# specifying the -p option to a clang tool, such as clang-check. These options +# will then be passed to the parser. Any options specified with CLANG_OPTIONS +# will be added as well. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse_libclang=ON option for CMake. @@ -1155,17 +1309,11 @@ CLANG_DATABASE_PATH = ALPHABETICAL_INDEX = YES -# The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in -# which the alphabetical index list will be split. -# Minimum value: 1, maximum value: 20, default value: 5. -# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all classes will -# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag -# can be used to specify a prefix (or a list of prefixes) that should be ignored -# while generating the index headers. +# The IGNORE_PREFIX tag can be used to specify a prefix (or a list of prefixes) +# that should be ignored while generating the index headers. The IGNORE_PREFIX +# tag works for classes, function and member names. 
The entity will be placed in +# the alphabetical list under the first letter of the entity name that remains +# after removing the prefix. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = @@ -1244,7 +1392,12 @@ HTML_STYLESHEET = # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the -# list). For an example see the documentation. +# list). +# Note: Since the styling of scrollbars can currently not be overruled in +# Webkit/Chromium, the styling will be left out of the default doxygen.css if +# one or more extra stylesheets have been specified. So if scrollbar +# customization is desired it has to be added explicitly. For an example see the +# documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = @@ -1259,9 +1412,22 @@ HTML_EXTRA_STYLESHEET = HTML_EXTRA_FILES = +# The HTML_COLORSTYLE tag can be used to specify if the generated HTML output +# should be rendered with a dark or light theme. +# Possible values are: LIGHT always generate light mode output, DARK always +# generate dark mode output, AUTO_LIGHT automatically set the mode according to +# the user preference, use light mode if no preference is set (the default), +# AUTO_DARK automatically set the mode according to the user preference, use +# dark mode if no preference is set and TOGGLE allow to user to switch between +# light and dark mode via a button. +# The default value is: AUTO_LIGHT. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE = AUTO_LIGHT + # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to -# this color. Hue is specified as an angle on a colorwheel, see +# this color. Hue is specified as an angle on a color-wheel, see # https://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. @@ -1271,7 +1437,7 @@ HTML_EXTRA_FILES = HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors -# in the HTML output. For a value of 0 the output will use grayscales only. A +# in the HTML output. For a value of 0 the output will use gray-scales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. @@ -1289,15 +1455,6 @@ HTML_COLORSTYLE_SAT = 100 HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via JavaScript. 
If disabled, the navigation index will @@ -1317,6 +1474,13 @@ HTML_DYNAMIC_MENUS = YES HTML_DYNAMIC_SECTIONS = NO +# If the HTML_CODE_FOLDING tag is set to YES then classes and functions can be +# dynamically folded and expanded in the generated HTML source code. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_CODE_FOLDING = YES + # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to @@ -1332,10 +1496,11 @@ HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development -# environment (see: https://developer.apple.com/xcode/), introduced with OSX -# 10.5 (Leopard). To create a documentation set, doxygen will generate a -# Makefile in the HTML output directory. Running make will produce the docset in -# that directory and running make install will install the docset in +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy # genXcode/_index.html for more information. @@ -1352,6 +1517,13 @@ GENERATE_DOCSET = NO DOCSET_FEEDNAME = "Doxygen generated docs" +# This tag determines the URL of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDURL = + # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. @@ -1377,8 +1549,12 @@ DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop -# (see: https://www.microsoft.com/en-us/download/details.aspx?id=21138) on -# Windows. +# on Windows. In the beginning of 2021 Microsoft took the original page, with +# a.o. the download links, offline the HTML help workshop was already many years +# in maintenance mode). You can download the HTML help workshop from the web +# archives at Installation executable (see: +# http://web.archive.org/web/20160201063255/http://download.microsoft.com/downlo +# ad/0/A/9/0A939EF6-E31C-430F-A3DF-DFAE7960D564/htmlhelp.exe). # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML @@ -1408,7 +1584,7 @@ CHM_FILE = HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated -# (YES) or that it should be included in the master .chm file (NO). +# (YES) or that it should be included in the main .chm file (NO). # The default value is: NO. 
# This tag requires that the tag GENERATE_HTMLHELP is set to YES. @@ -1435,6 +1611,16 @@ BINARY_TOC = NO TOC_EXPAND = NO +# The SITEMAP_URL tag is used to specify the full URL of the place where the +# generated documentation will be placed on the server by the user during the +# deployment of the documentation. The generated sitemap is called sitemap.xml +# and placed on the directory specified by HTML_OUTPUT. In case no SITEMAP_URL +# is specified no sitemap is generated. For information about the sitemap +# protocol see https://www.sitemaps.org +# This tag requires that the tag GENERATE_HTML is set to YES. + +SITEMAP_URL = + # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help @@ -1453,7 +1639,8 @@ QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace -# (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1461,8 +1648,8 @@ QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual -# Folders (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual- -# folders). +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. @@ -1470,16 +1657,16 @@ QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom -# Filters (see: https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom- -# filters). +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = @@ -1491,9 +1678,9 @@ QHP_CUST_FILTER_ATTRS = QHP_SECT_FILTER_ATTRS = -# The QHG_LOCATION tag can be used to specify the location of Qt's -# qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the -# generated .qhp file. +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = @@ -1536,16 +1723,28 @@ DISABLE_INDEX = NO # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can -# further fine-tune the look of the index. 
As an example, the default style -# sheet generated by doxygen has an example that shows how to put an image at -# the root of the tree instead of the PROJECT_NAME. Since the tree basically has -# the same information as the tab index, you could consider setting -# DISABLE_INDEX to YES when enabling this option. +# further fine tune the look of the index (see "Fine-tuning the output"). As an +# example, the default style sheet generated by doxygen has an example that +# shows how to put an image at the root of the tree instead of the PROJECT_NAME. +# Since the tree basically has the same information as the tab index, you could +# consider setting DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = YES +# When both GENERATE_TREEVIEW and DISABLE_INDEX are set to YES, then the +# FULL_SIDEBAR option determines if the side bar is limited to only the treeview +# area (value NO) or if it should extend to the full height of the window (value +# YES). Setting this to YES gives a layout similar to +# https://docs.readthedocs.io with more room for contents, but less room for the +# project logo, title, and description. If either GENERATE_TREEVIEW or +# DISABLE_INDEX is set to NO, this option has no effect. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FULL_SIDEBAR = NO + # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # @@ -1570,6 +1769,24 @@ TREEVIEW_WIDTH = 250 EXT_LINKS_IN_WINDOW = NO +# If the OBFUSCATE_EMAILS tag is set to YES, doxygen will obfuscate email +# addresses. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +OBFUSCATE_EMAILS = YES + +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML @@ -1579,17 +1796,6 @@ EXT_LINKS_IN_WINDOW = NO FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. -# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - # The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands # to create new LaTeX commands to be used in formulas as building blocks. See # the section "Including formulas" for details. 
@@ -1607,11 +1813,29 @@ FORMULA_MACROFILE = USE_MATHJAX = NO +# With MATHJAX_VERSION it is possible to specify the MathJax version to be used. +# Note that the different versions of MathJax have different requirements with +# regards to the different settings, so it is possible that also other MathJax +# settings have to be changed when switching between the different MathJax +# versions. +# Possible values are: MathJax_2 and MathJax_3. +# The default value is: MathJax_2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_VERSION = MathJax_2 + # When MathJax is enabled you can set the default output format to be used for -# the MathJax output. See the MathJax site (see: -# http://docs.mathjax.org/en/latest/output.html) for more details. +# the MathJax output. For more details about the output format see MathJax +# version 2 (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) and MathJax version 3 +# (see: +# http://docs.mathjax.org/en/latest/web/components/output.html). # Possible values are: HTML-CSS (which is slower, but has the best -# compatibility), NativeMML (i.e. MathML) and SVG. +# compatibility. This is the name for Mathjax version 2, for MathJax version 3 +# this will be translated into chtml), NativeMML (i.e. MathML. Only supported +# for NathJax 2. For MathJax version 3 chtml will be used instead.), chtml (This +# is the name for Mathjax version 3, for MathJax version 2 this will be +# translated into HTML-CSS) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1624,22 +1848,29 @@ MATHJAX_FORMAT = HTML-CSS # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of -# MathJax from https://www.mathjax.org before deployment. -# The default value is: https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/. +# MathJax from https://www.mathjax.org before deployment. The default value is: +# - in case of MathJax version 2: https://cdn.jsdelivr.net/npm/mathjax@2 +# - in case of MathJax version 3: https://cdn.jsdelivr.net/npm/mathjax@3 # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.5/ # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example +# for MathJax version 2 (see +# https://docs.mathjax.org/en/v2.7-latest/tex.html#tex-and-latex-extensions): # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# For example for MathJax version 3 (see +# http://docs.mathjax.org/en/latest/input/tex/extensions/index.html): +# MATHJAX_EXTENSIONS = ams # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site -# (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. @@ -1686,7 +1917,8 @@ SERVER_BASED_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). 
+# Xapian (see: +# https://xapian.org/). # # See the section "External Indexing and Searching" for details. # The default value is: NO. @@ -1699,8 +1931,9 @@ EXTERNAL_SEARCH = NO # # Doxygen ships with an example indexer (doxyindexer) and search engine # (doxysearch.cgi) which are based on the open source search engine library -# Xapian (see: https://xapian.org/). See the section "External Indexing and -# Searching" for details. +# Xapian (see: +# https://xapian.org/). See the section "External Indexing and Searching" for +# details. # This tag requires that the tag SEARCHENGINE is set to YES. SEARCHENGINE_URL = @@ -1738,7 +1971,7 @@ EXTRA_SEARCH_MAPPINGS = # If the GENERATE_LATEX tag is set to YES, doxygen will generate LaTeX output. # The default value is: YES. -GENERATE_LATEX = YES +GENERATE_LATEX = NO # The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of @@ -1809,29 +2042,31 @@ PAPER_TYPE = a4 EXTRA_PACKAGES = -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for the -# generated LaTeX document. The header should contain everything until the first -# chapter. If it is left blank doxygen will generate a standard header. See -# section "Doxygen usage" for information on how to let doxygen write the -# default header to a separate file. +# The LATEX_HEADER tag can be used to specify a user-defined LaTeX header for +# the generated LaTeX document. The header should contain everything until the +# first chapter. If it is left blank doxygen will generate a standard header. It +# is highly recommended to start with a default header using +# doxygen -w latex new_header.tex new_footer.tex new_stylesheet.sty +# and then modify the file new_header.tex. See also section "Doxygen usage" for +# information on how to generate the default header that doxygen normally uses. # -# Note: Only use a user-defined header if you know what you are doing! The -# following commands have a special meaning inside the header: $title, -# $datetime, $date, $doxygenversion, $projectname, $projectnumber, -# $projectbrief, $projectlogo. Doxygen will replace $title with the empty -# string, for the replacement values of the other commands the user is referred -# to HTML_HEADER. +# Note: Only use a user-defined header if you know what you are doing! +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. The following +# commands have a special meaning inside the header (and footer): For a +# description of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_HEADER = -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for the -# generated LaTeX document. The footer should contain everything after the last -# chapter. If it is left blank doxygen will generate a standard footer. See +# The LATEX_FOOTER tag can be used to specify a user-defined LaTeX footer for +# the generated LaTeX document. The footer should contain everything after the +# last chapter. If it is left blank doxygen will generate a standard footer. See # LATEX_HEADER for more information on how to generate a default footer and what -# special commands can be used inside the footer. -# -# Note: Only use a user-defined footer if you know what you are doing! +# special commands can be used inside the footer. 
See also section "Doxygen +# usage" for information on how to generate the default footer that doxygen +# normally uses. Note: Only use a user-defined footer if you know what you are +# doing! # This tag requires that the tag GENERATE_LATEX is set to YES. LATEX_FOOTER = @@ -1864,18 +2099,26 @@ LATEX_EXTRA_FILES = PDF_HYPERLINKS = YES -# If the USE_PDFLATEX tag is set to YES, doxygen will use pdflatex to generate -# the PDF file directly from the LaTeX files. Set this option to YES, to get a -# higher quality PDF documentation. +# If the USE_PDFLATEX tag is set to YES, doxygen will use the engine as +# specified with LATEX_CMD_NAME to generate the PDF file directly from the LaTeX +# files. Set this option to YES, to get a higher quality PDF documentation. +# +# See also section LATEX_CMD_NAME for selecting the engine. # The default value is: YES. # This tag requires that the tag GENERATE_LATEX is set to YES. USE_PDFLATEX = YES -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode -# command to the generated LaTeX files. This will instruct LaTeX to keep running -# if errors occur, instead of asking the user for help. This option is also used -# when generating formulas in HTML. +# The LATEX_BATCHMODE tag signals the behavior of LaTeX in case of an error. +# Possible values are: NO same as ERROR_STOP, YES same as BATCH, BATCH In batch +# mode nothing is printed on the terminal, errors are scrolled as if is +# hit at every error; missing files that TeX tries to input or request from +# keyboard input (\read on a not open input stream) cause the job to abort, +# NON_STOP In nonstop mode the diagnostic message will appear on the terminal, +# but there is no possibility of user interaction just like in batch mode, +# SCROLL In scroll mode, TeX will stop only for missing files to input or if +# keyboard input is necessary and ERROR_STOP In errorstop mode, TeX will stop at +# each error, asking for user intervention. # The default value is: NO. # This tag requires that the tag GENERATE_LATEX is set to YES. @@ -1888,16 +2131,6 @@ LATEX_BATCHMODE = NO LATEX_HIDE_INDICES = NO -# If the LATEX_SOURCE_CODE tag is set to YES then doxygen will include source -# code with syntax highlighting in the LaTeX output. -# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_SOURCE_CODE = NO - # The LATEX_BIB_STYLE tag can be used to specify the style to use for the # bibliography, e.g. plainnat, or ieeetr. See # https://en.wikipedia.org/wiki/BibTeX and \cite for more info. @@ -1906,14 +2139,6 @@ LATEX_SOURCE_CODE = NO LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. If left blank the @@ -1978,16 +2203,6 @@ RTF_STYLESHEET_FILE = RTF_EXTENSIONS_FILE = -# If the RTF_SOURCE_CODE tag is set to YES then doxygen will include source code -# with syntax highlighting in the RTF output. 
-# -# Note that which sources are shown also depends on other settings such as -# SOURCE_BROWSER. -# The default value is: NO. -# This tag requires that the tag GENERATE_RTF is set to YES. - -RTF_SOURCE_CODE = NO - #--------------------------------------------------------------------------- # Configuration options related to the man page output #--------------------------------------------------------------------------- @@ -2084,27 +2299,44 @@ GENERATE_DOCBOOK = NO DOCBOOK_OUTPUT = docbook -# If the DOCBOOK_PROGRAMLISTING tag is set to YES, doxygen will include the -# program listings (including syntax highlighting and cross-referencing -# information) to the DOCBOOK output. Note that enabling this will significantly -# increase the size of the DOCBOOK output. -# The default value is: NO. -# This tag requires that the tag GENERATE_DOCBOOK is set to YES. - -DOCBOOK_PROGRAMLISTING = NO - #--------------------------------------------------------------------------- # Configuration options for the AutoGen Definitions output #--------------------------------------------------------------------------- # If the GENERATE_AUTOGEN_DEF tag is set to YES, doxygen will generate an -# AutoGen Definitions (see http://autogen.sourceforge.net/) file that captures +# AutoGen Definitions (see https://autogen.sourceforge.net/) file that captures # the structure of the code including all documentation. Note that this feature # is still experimental and incomplete at the moment. # The default value is: NO. GENERATE_AUTOGEN_DEF = NO +#--------------------------------------------------------------------------- +# Configuration options related to Sqlite3 output +#--------------------------------------------------------------------------- + +# If the GENERATE_SQLITE3 tag is set to YES doxygen will generate a Sqlite3 +# database with symbols found by doxygen stored in tables. +# The default value is: NO. + +GENERATE_SQLITE3 = NO + +# The SQLITE3_OUTPUT tag is used to specify where the Sqlite3 database will be +# put. If a relative path is entered the value of OUTPUT_DIRECTORY will be put +# in front of it. +# The default directory is: sqlite3. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_OUTPUT = sqlite3 + +# The SQLITE3_OVERWRITE_DB tag is set to YES, the existing doxygen_sqlite3.db +# database file will be recreated with each doxygen run. If set to NO, doxygen +# will warn if an a database file is already found and not modify it. +# The default value is: YES. +# This tag requires that the tag GENERATE_SQLITE3 is set to YES. + +SQLITE3_RECREATE_DB = YES + #--------------------------------------------------------------------------- # Configuration options related to the Perl module output #--------------------------------------------------------------------------- @@ -2179,7 +2411,8 @@ SEARCH_INCLUDES = YES # The INCLUDE_PATH tag can be used to specify one or more directories that # contain include files that are not input files but should be processed by the -# preprocessor. +# preprocessor. Note that the INCLUDE_PATH is not recursive, so the setting of +# RECURSIVE has no effect here. # This tag requires that the tag SEARCH_INCLUDES is set to YES. INCLUDE_PATH = @@ -2246,15 +2479,15 @@ TAGFILES = GENERATE_TAGFILE = -# If the ALLEXTERNALS tag is set to YES, all external class will be listed in -# the class index. If set to NO, only the inherited external classes will be -# listed. 
+# If the ALLEXTERNALS tag is set to YES, all external classes and namespaces +# will be listed in the class and namespace index. If set to NO, only the +# inherited external classes will be listed. # The default value is: NO. ALLEXTERNALS = NO # If the EXTERNAL_GROUPS tag is set to YES, all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will be +# in the topic index. If set to NO, only the current project's groups will be # listed. # The default value is: YES. @@ -2268,25 +2501,9 @@ EXTERNAL_GROUPS = YES EXTERNAL_PAGES = YES #--------------------------------------------------------------------------- -# Configuration options related to the dot tool +# Configuration options related to diagram generator tools #--------------------------------------------------------------------------- -# If the CLASS_DIAGRAMS tag is set to YES, doxygen will generate a class diagram -# (in HTML and LaTeX) for classes with base or super classes. Setting the tag to -# NO turns the diagrams off. Note that this option also works with HAVE_DOT -# disabled, but it is recommended to install and use dot, since it yields more -# powerful graphs. -# The default value is: YES. - -CLASS_DIAGRAMS = YES - -# You can include diagrams made with dia in doxygen documentation. Doxygen will -# then run dia to produce the diagram and insert it in the documentation. The -# DIA_PATH tag allows you to specify the directory where the dia binary resides. -# If left empty dia is assumed to be found in the default search path. - -DIA_PATH = - # If set to YES the inheritance and collaboration graphs will hide inheritance # and usage relations if the target is undocumented or is not a class. # The default value is: YES. @@ -2295,10 +2512,10 @@ HIDE_UNDOC_RELATIONS = YES # If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is # available from the path. This tool is part of Graphviz (see: -# http://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent +# https://www.graphviz.org/), a graph visualization toolkit from AT&T and Lucent # Bell Labs. The other options in this section have no effect if this option is # set to NO -# The default value is: YES. +# The default value is: NO. HAVE_DOT = YES @@ -2312,49 +2529,73 @@ HAVE_DOT = YES DOT_NUM_THREADS = 0 -# When you want a differently looking font in the dot files that doxygen -# generates you can specify the font name using DOT_FONTNAME. You need to make -# sure dot is able to find the font, which can be done by putting it in a -# standard location or by setting the DOTFONTPATH environment variable or by -# setting DOT_FONTPATH to the directory containing the font. -# The default value is: Helvetica. +# DOT_COMMON_ATTR is common attributes for nodes, edges and labels of +# subgraphs. When you want a differently looking font in the dot files that +# doxygen generates you can specify fontname, fontcolor and fontsize attributes. +# For details please see Node, +# Edge and Graph Attributes specification You need to make sure dot is able +# to find the font, which can be done by putting it in a standard location or by +# setting the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. Default graphviz fontsize is 14. +# The default value is: fontname=Helvetica,fontsize=10. # This tag requires that the tag HAVE_DOT is set to YES. 
-DOT_FONTNAME = Helvetica +DOT_COMMON_ATTR = "fontname=Helvetica,fontsize=10" -# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of -# dot graphs. -# Minimum value: 4, maximum value: 24, default value: 10. +# DOT_EDGE_ATTR is concatenated with DOT_COMMON_ATTR. For elegant style you can +# add 'arrowhead=open, arrowtail=open, arrowsize=0.5'. Complete documentation about +# arrows shapes. +# The default value is: labelfontname=Helvetica,labelfontsize=10. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTSIZE = 10 +DOT_EDGE_ATTR = "labelfontname=Helvetica,labelfontsize=10" -# By default doxygen will tell dot to use the default font as specified with -# DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set -# the path where dot can find it using this tag. +# DOT_NODE_ATTR is concatenated with DOT_COMMON_ATTR. For view without boxes +# around nodes set 'shape=plain' or 'shape=plaintext' Shapes specification +# The default value is: shape=box,height=0.2,width=0.4. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_NODE_ATTR = "shape=box,height=0.2,width=0.4" + +# You can set the path where dot can find font specified with fontname in +# DOT_COMMON_ATTR and others dot attributes. # This tag requires that the tag HAVE_DOT is set to YES. DOT_FONTPATH = -# If the CLASS_GRAPH tag is set to YES then doxygen will generate a graph for -# each documented class showing the direct and indirect inheritance relations. -# Setting this tag to YES will force the CLASS_DIAGRAMS tag to NO. +# If the CLASS_GRAPH tag is set to YES or GRAPH or BUILTIN then doxygen will +# generate a graph for each documented class showing the direct and indirect +# inheritance relations. In case the CLASS_GRAPH tag is set to YES or GRAPH and +# HAVE_DOT is enabled as well, then dot will be used to draw the graph. In case +# the CLASS_GRAPH tag is set to YES and HAVE_DOT is disabled or if the +# CLASS_GRAPH tag is set to BUILTIN, then the built-in generator will be used. +# If the CLASS_GRAPH tag is set to TEXT the direct and indirect inheritance +# relations will be shown as texts / links. +# Possible values are: NO, YES, TEXT, GRAPH and BUILTIN. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. CLASS_GRAPH = YES # If the COLLABORATION_GRAPH tag is set to YES then doxygen will generate a # graph for each documented class showing the direct and indirect implementation # dependencies (inheritance, containment, and class references variables) of the -# class with other documented classes. +# class with other documented classes. Explicit enabling a collaboration graph, +# when COLLABORATION_GRAPH is set to NO, can be accomplished by means of the +# command \collaborationgraph. Disabling a collaboration graph can be +# accomplished by means of the command \hidecollaborationgraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. COLLABORATION_GRAPH = YES # If the GROUP_GRAPHS tag is set to YES then doxygen will generate a graph for -# groups, showing the direct groups dependencies. +# groups, showing the direct groups dependencies. Explicit enabling a group +# dependency graph, when GROUP_GRAPHS is set to NO, can be accomplished by means +# of the command \groupgraph. Disabling a directory graph can be accomplished by +# means of the command \hidegroupgraph. See also the chapter Grouping in the +# manual. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. 
@@ -2377,10 +2618,32 @@ UML_LOOK = NO # but if the number exceeds 15, the total amount of fields shown is limited to # 10. # Minimum value: 0, maximum value: 100, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. +# This tag requires that the tag UML_LOOK is set to YES. UML_LIMIT_NUM_FIELDS = 10 +# If the DOT_UML_DETAILS tag is set to NO, doxygen will show attributes and +# methods without types and arguments in the UML graphs. If the DOT_UML_DETAILS +# tag is set to YES, doxygen will add type and arguments for attributes and +# methods in the UML graphs. If the DOT_UML_DETAILS tag is set to NONE, doxygen +# will not generate fields with class member information in the UML graphs. The +# class diagrams will look similar to the default class diagrams but using UML +# notation for the relationships. +# Possible values are: NO, YES and NONE. +# The default value is: NO. +# This tag requires that the tag UML_LOOK is set to YES. + +DOT_UML_DETAILS = NO + +# The DOT_WRAP_THRESHOLD tag can be used to set the maximum number of characters +# to display on a single line. If the actual line length exceeds this threshold +# significantly it will wrapped across multiple lines. Some heuristics are apply +# to avoid ugly line breaks. +# Minimum value: 0, maximum value: 1000, default value: 17. +# This tag requires that the tag HAVE_DOT is set to YES. + +DOT_WRAP_THRESHOLD = 17 + # If the TEMPLATE_RELATIONS tag is set to YES then the inheritance and # collaboration graphs will show the relations between templates and their # instances. @@ -2392,7 +2655,9 @@ TEMPLATE_RELATIONS = NO # If the INCLUDE_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are set to # YES then doxygen will generate a graph for each documented file showing the # direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an include graph, when INCLUDE_GRAPH is is set to NO, +# can be accomplished by means of the command \includegraph. Disabling an +# include graph can be accomplished by means of the command \hideincludegraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2401,7 +2666,10 @@ INCLUDE_GRAPH = YES # If the INCLUDED_BY_GRAPH, ENABLE_PREPROCESSING and SEARCH_INCLUDES tags are # set to YES then doxygen will generate a graph for each documented file showing # the direct and indirect include dependencies of the file with other documented -# files. +# files. Explicit enabling an included by graph, when INCLUDED_BY_GRAPH is set +# to NO, can be accomplished by means of the command \includedbygraph. Disabling +# an included by graph can be accomplished by means of the command +# \hideincludedbygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. @@ -2441,28 +2709,36 @@ GRAPHICAL_HIERARCHY = YES # If the DIRECTORY_GRAPH tag is set to YES then doxygen will show the # dependencies a directory has on other directories in a graphical way. The # dependency relations are determined by the #include relations between the -# files in the directories. +# files in the directories. Explicit enabling a directory graph, when +# DIRECTORY_GRAPH is set to NO, can be accomplished by means of the command +# \directorygraph. Disabling a directory graph can be accomplished by means of +# the command \hidedirectorygraph. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. 
DIRECTORY_GRAPH = YES +# The DIR_GRAPH_MAX_DEPTH tag can be used to limit the maximum number of levels +# of child directories generated in directory dependency graphs by dot. +# Minimum value: 1, maximum value: 25, default value: 1. +# This tag requires that the tag DIRECTORY_GRAPH is set to YES. + +DIR_GRAPH_MAX_DEPTH = 10 + # The DOT_IMAGE_FORMAT tag can be used to set the image format of the images # generated by dot. For an explanation of the image formats see the section # output formats in the documentation of the dot tool (Graphviz (see: -# http://www.graphviz.org/)). +# https://www.graphviz.org/)). # Note: If you choose svg you need to set HTML_FILE_EXTENSION to xhtml in order # to make the SVG files visible in IE 9+ (other browsers do not have this # requirement). -# Possible values are: png, png:cairo, png:cairo:cairo, png:cairo:gd, png:gd, -# png:gd:gd, jpg, jpg:cairo, jpg:cairo:gd, jpg:gd, jpg:gd:gd, gif, gif:cairo, -# gif:cairo:gd, gif:gd, gif:gd:gd, svg, png:gd, png:gd:gd, png:cairo, +# Possible values are: png, jpg, gif, svg, png:gd, png:gd:gd, png:cairo, # png:cairo:gd, png:cairo:cairo, png:cairo:gdiplus, png:gdiplus and # png:gdiplus:gdiplus. # The default value is: png. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_IMAGE_FORMAT = png +DOT_IMAGE_FORMAT = svg # If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to # enable generation of interactive SVG images that allow zooming and panning. @@ -2489,11 +2765,12 @@ DOT_PATH = DOTFILE_DIRS = -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the \mscfile -# command). +# You can include diagrams made with dia in doxygen documentation. Doxygen will +# then run dia to produce the diagram and insert it in the documentation. The +# DIA_PATH tag allows you to specify the directory where the dia binary resides. +# If left empty dia is assumed to be found in the default search path. -MSCFILE_DIRS = +DIA_PATH = # The DIAFILE_DIRS tag can be used to specify one or more directories that # contain dia files that are included in the documentation (see the \diafile @@ -2502,10 +2779,10 @@ MSCFILE_DIRS = DIAFILE_DIRS = # When using plantuml, the PLANTUML_JAR_PATH tag should be used to specify the -# path where java can find the plantuml.jar file. If left blank, it is assumed -# PlantUML is not used or called during a preprocessing step. Doxygen will -# generate a warning when it encounters a \startuml command in this case and -# will not generate output for the diagram. +# path where java can find the plantuml.jar file or to the filename of jar file +# to be used. If left blank, it is assumed PlantUML is not used or called during +# a preprocessing step. Doxygen will generate a warning when it encounters a +# \startuml command in this case and will not generate output for the diagram. PLANTUML_JAR_PATH = @@ -2541,19 +2818,7 @@ DOT_GRAPH_MAX_NODES = 50 # Minimum value: 0, maximum value: 1000, default value: 0. # This tag requires that the tag HAVE_DOT is set to YES. -MAX_DOT_GRAPH_DEPTH = 0 - -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not seem -# to support this out of the box. -# -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). -# The default value is: NO. 
-# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_TRANSPARENT = NO +MAX_DOT_GRAPH_DEPTH = 1000 # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This @@ -2567,14 +2832,34 @@ DOT_MULTI_TARGETS = NO # If the GENERATE_LEGEND tag is set to YES doxygen will generate a legend page # explaining the meaning of the various boxes and arrows in the dot generated # graphs. +# Note: This tag requires that UML_LOOK isn't set, i.e. the doxygen internal +# graphical representation for inheritance and collaboration diagrams is used. # The default value is: YES. # This tag requires that the tag HAVE_DOT is set to YES. GENERATE_LEGEND = YES -# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate dot +# If the DOT_CLEANUP tag is set to YES, doxygen will remove the intermediate # files that are used to generate the various graphs. +# +# Note: This setting is not only used for dot files but also for msc temporary +# files. # The default value is: YES. -# This tag requires that the tag HAVE_DOT is set to YES. DOT_CLEANUP = YES + +# You can define message sequence charts within doxygen comments using the \msc +# command. If the MSCGEN_TOOL tag is left empty (the default), then doxygen will +# use a built-in version of mscgen tool to produce the charts. Alternatively, +# the MSCGEN_TOOL tag can also specify the name an external tool. For instance, +# specifying prog as the value, doxygen will call the tool as prog -T +# <outfile_format> -o <outfile> <infile>. The external tool should support +# output file formats "png", "eps", "svg", and "ismap". + +MSCGEN_TOOL = + +# The MSCFILE_DIRS tag can be used to specify one or more directories that +# contain msc files that are included in the documentation (see the \mscfile +# command). + +MSCFILE_DIRS = diff --git a/README.md b/README.md index 04c7b886..a053f78d 100644 --- a/README.md +++ b/README.md @@ -2,4 +2,7 @@ An alignment-based, generalized structural variant caller for long-read sequencing/mapping data -[![build tests](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml/badge.svg)](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml) \ No newline at end of file +[![build +tests](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml/badge.svg)](https://github.com/WGLab/ContextSV/actions/workflows/build-tests.yml) + +Documentation available at [https://wglab.openbioinformatics.org/ContextSV](https://wglab.openbioinformatics.org/ContextSV) diff --git a/__main__.py b/__main__.py index 137e7109..4d946d5f 100644 --- a/__main__.py +++ b/__main__.py @@ -3,52 +3,222 @@ """ import os +import sys import argparse +import logging as log from lib import contextsv +from python import cnv_plots + +# Set up logging. +log.basicConfig( + level=log.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + handlers=[ + log.StreamHandler(sys.stdout) + ] +) def main(): + """Entry point and user interface for the program.""" # Grab the command line arguments using argparse. parser = argparse.ArgumentParser( description="ContextSV: A tool for integrative structural variant detection." ) + + # Add common arguments.
+ parser.add_argument( + "-r", "--region", + help="The region to analyze.", + required=True + ) + + parser.add_argument( + "-o", "--output", + help="The path to the output directory.", + required=True + ) + + parser.add_argument( + "-v", "--version", + help="Print the version number and exit.", + action="version", + version="%(prog)s 0.0.1" + ) + + # Thread count. + parser.add_argument( + "-t", "--threads", + help="The number of threads to use.", + required=False, + default=1, + type=int + ) + + # CNV data path. + parser.add_argument( + "--cnv", + help="The path to the CNV data in TSV format.", + required=False + ) + + # Mode 1: SV detection mode. parser.add_argument( "-b", "--bam", help="The path to the BAM file.", - required=True + required=False + ) + parser.add_argument( + "-g", "--reference", + help="The path to the reference genome.", + required=False ) parser.add_argument( "-s", "--snps", help="The path to the SNPs file.", - required=True + required=False ) + + # PFB file of population allele frequencies. parser.add_argument( - "-o", "--output", - help="The path to the output file.", - required=True + "-p", "--pfb", + help="The path to the PFB file of population allele frequencies.", + required=False ) + + # HMM file path. parser.add_argument( - "-r", "--region", - help="The region to analyze.", - required=True + "--hmm", + help="The path to the HMM file.", + required=False + ) + + # Pass in chromosome mean coverage data for speediness. + parser.add_argument( + "--chr-cov", + help="Chromosome mean coverage values passed in as a comma-separated list (e.g. chr1:100,chr2:200,chr3:300).", + required=False + ) + + # Turn off CIGAR string SV detection. This is for debugging purposes (speeds + # up the program). + parser.add_argument( + "--disable-cigar", + help="Turn off CIGAR string SV detection.", + required=False, + action="store_true", + default=False + ) + + # Mode 2: CNV plots mode. If only the VCF file is provided, then the program + # will run in this mode. + parser.add_argument( + "--vcf", + help="The path to the VCF file of SV calls.", + required=False ) # Get the command line arguments. args = parser.parse_args() - # Run the program. - print("Running contextsv with the following arguments:") - print("BAM: {}".format(args.bam)) - print("SNPs: {}".format(args.snps)) - print("Output: {}".format(args.output)) - print("Region: {}".format(args.region)) - - contextsv.run( - args.bam, - args.snps, - args.output, - args.region - ) + # Remove commas or spaces from the region. + region = args.region + region = region.replace(",", "") + region = region.replace(" ", "") + + # Determine the selected program mode (SV detection or CNV plots). + if (args.vcf is not None): + # Run SV analysis mode. + + # Ensure that the CNV data path is provided. + arg_error = False + if (args.cnv is None): + log.error("Please provide the CNV data path.") + arg_error = True + + # Exit if there are any errors. + if (arg_error): + sys.exit(1) + + # Create the output directory if it doesn't exist. + if (not os.path.exists(args.output)): + os.makedirs(args.output) + + # Set the data paths from user input for downstream analysis. + vcf_path = args.vcf + cnv_data_path = args.cnv + output_dir = args.output + + log.info("Analyzing SV calls in VCF file %s...", args.vcf) + + else: + # Run SV detection mode. + + # Ensure BAM, reference, and SNPs files are provided. 
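+        # For example, a minimal detection-mode run supplies all three
+        # (illustrative invocation only; the file names are placeholders,
+        # run from the repository root):
+        #   python __main__.py -b sample.bam -g ref.fa -s snps.vcf.gz -o out/ -r chr1
+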
+        arg_error = False
+        if (args.bam is None):
+            log.error("Please provide the BAM file.")
+            arg_error = True
+
+        if (args.reference is None):
+            log.error("Please provide the reference genome.")
+            arg_error = True
+
+        if (args.snps is None):
+            log.error("Please provide the SNPs file.")
+            arg_error = True
+
+        # Exit with error code 1 if any required input is missing.
+        if (arg_error):
+            sys.exit(1)
+
+        # Log the provided arguments, and set all unset optional arguments to
+        # empty strings before passing them to the C++ library.
+        for key, value in vars(args).items():
+            if value is None:
+                setattr(args, key, "")
+            else:
+                log.info("Setting %s to %s", key, value)
+
+        log.info("Chromosome mean coverage: %s", args.chr_cov)
+        log.info("PFB filepath: %s", args.pfb)
+
+        # Set input parameters.
+        input_data = contextsv.InputData()
+        input_data.setBAMFilepath(args.bam)
+        input_data.setRefGenome(args.reference)
+        input_data.setSNPFilepath(args.snps)
+        input_data.setRegion(region)  # Use the normalized region string.
+        input_data.setThreadCount(args.threads)
+        input_data.setChrCov(args.chr_cov)
+        input_data.setPFBFilepath(args.pfb)
+        input_data.setHMMFilepath(args.hmm)
+        input_data.setOutputDir(args.output)
+        input_data.setDisableCIGAR(args.disable_cigar)
+        input_data.setCNVFilepath(args.cnv)
+
+        # Run the analysis.
+        contextsv.run(input_data)
+
+        # Determine the data paths for downstream analysis.
+        vcf_path = os.path.join(args.output, "sv_calls.vcf")
+        output_dir = args.output
+
+        if (args.cnv == ""):
+            cnv_data_path = os.path.join(args.output, "cnv_data.tsv")
+        else:
+            cnv_data_path = args.cnv
+
+    # Run the python-based analysis.
+    log.info("Running python-based analysis...")
+    cnv_plots.run(vcf_path, cnv_data_path, output_dir, region)
+    log.info("Done.")

 if __name__ == '__main__':
diff --git a/data/chr_cov.txt b/data/chr_cov.txt
new file mode 100644
index 00000000..89fe909d
--- /dev/null
+++ b/data/chr_cov.txt
@@ -0,0 +1,3 @@
+# This text file includes chromosome coverage values for speeding up runtime
+chr3=39.561
+chr6=39.4096
diff --git a/data/hhall.hmm b/data/hhall.hmm
new file mode 100644
index 00000000..139e6361
--- /dev/null
+++ b/data/hhall.hmm
@@ -0,0 +1,36 @@
+M=6
+N=6
+A:
+0.936719716 0.006332139 0.048770575 0.000000001 0.008177573 0.000000001
+0.000801036 0.949230924 0.048770575 0.000000001 0.001168245 0.000029225
+0.000004595 0.000047431 0.999912387 0.000000001 0.000034971 0.000000621
+0.000049998 0.000049998 0.000049998 0.999750015 0.000049998 0.000049998
+0.000916738 0.001359036 0.048770575 0.000000001 0.948953653 0.000000002
+0.000000001 0.000000001 0.027257213 0.000000001 0.000000004 0.972742785
+B:
+0.950000 0.000001 0.050000 0.000001 0.000001 0.000001
+0.000001 0.950000 0.050000 0.000001 0.000001 0.000001
+0.000001 0.000001 0.999995 0.000001 0.000001 0.000001
+0.000001 0.000001 0.050000 0.950000 0.000001 0.000001
+0.000001 0.000001 0.050000 0.000001 0.950000 0.000001
+0.000001 0.000001 0.050000 0.000001 0.000001 0.950000
+pi:
+0.000001 0.000500 0.999000 0.000001 0.000500 0.000001
+B1_mean:
+-3.527211 -0.664184 0.000000 100.000000 0.395621 0.678345
+B1_sd:
+1.329152 0.284338 0.159645 0.211396 0.209089 0.191579
+B1_uf:
+0.010000
+B2_mean:
+0.000000 0.250000 0.333333 0.500000 0.500000
+B2_sd:
+0.016372 0.042099 0.045126 0.034982 0.304243
+B2_uf:
+0.010000
+B3_mean:
+-2.051407 -0.572210 0.000000 0.000000 0.361669 0.626711
+B3_sd:
+2.132843 0.382025 0.184001 0.200297 0.253551 0.353183
+B3_uf:
+0.010000
diff --git a/data/hhall_loh.hmm b/data/hhall_loh.hmm
new file
mode 100644
index 00000000..2c7a2ff7
--- /dev/null
+++ b/data/hhall_loh.hmm
@@ -0,0 +1,36 @@
+M=6
+N=6
+A:
+0.936719716 0.006332139 0.048770575 0.000000001 0.008177573 0.000000001
+0.000801036 0.949230924 0.048770575 0.000000001 0.001168245 0.000029225
+0.000004595 0.000047431 0.999912387 0.000000001 0.000034971 0.000000621
+0.000049998 0.000049998 0.000049998 0.999750015 0.000049998 0.000049998
+0.000916738 0.001359036 0.048770575 0.000000001 0.948953653 0.000000002
+0.000000001 0.000000001 0.027257213 0.000000001 0.000000004 0.972742785
+B:
+0.950000 0.000001 0.050000 0.000001 0.000001 0.000001
+0.000001 0.950000 0.050000 0.000001 0.000001 0.000001
+0.000001 0.000001 0.999995 0.000001 0.000001 0.000001
+0.000001 0.000001 0.050000 0.950000 0.000001 0.000001
+0.000001 0.000001 0.050000 0.000001 0.950000 0.000001
+0.000001 0.000001 0.050000 0.000001 0.000001 0.950000
+pi:
+0.000001 0.000500 0.999000 0.000001 0.000500 0.000001
+B1_mean:
+-3.527211 -0.664184 0.000000 0.000000 0.395621 0.678345
+B1_sd:
+1.329152 0.284338 0.159645 0.211396 0.209089 0.191579
+B1_uf:
+0.010000
+B2_mean:
+0.000000 0.250000 0.333333 0.500000 0.500000
+B2_sd:
+0.016372 0.042099 0.045126 0.034982 0.304243
+B2_uf:
+0.010000
+B3_mean:
+-2.051407 -0.572210 0.000000 0.000000 0.361669 0.626711
+B3_sd:
+2.132843 0.382025 0.184001 0.200297 0.253551 0.353183
+B3_uf:
+0.010000
diff --git a/docs/genome-assembly-guide.md b/docs/genome-assembly-guide.md
new file mode 100644
index 00000000..c37821b9
--- /dev/null
+++ b/docs/genome-assembly-guide.md
@@ -0,0 +1,43 @@
+# Hybrid Optical Map-Guided Assembly
+
+## Download Test Data
+Input data is from GIAB: Oxford Nanopore ultralong (guppy-V3.2.4_2020-01-22)
+
+https://github.com/genome-in-a-bottle/giab_data_indexes
+
+## De novo Assembly
+We will perform de novo assembly of the HG002 data using miniasm (instructions from https://github.com/lh3/miniasm).
+
+- Also see Kermit's optical map-guided assembly script, which uses the
+same tools and approach (https://github.com/Denopia/kermit-optical-maps/blob/master/genome-assembly.sh)
+
+### Set up the software environment:
+
+```
+export PATH="$software_dir"/minimap2:$PATH
+export PATH="$software_dir"/miniasm:$PATH
+```
+
+### Use minimap2 to self-align the HG002 data:
+
+```
+echo "Running minimap2..."
+minimap2 -x ava-ont -t 12 input_data/HG002_hs37d5_ONT_GIAB.fastq input_data/HG002_hs37d5_ONT_GIAB.fastq | gzip -1 > outputs/HG002_hs37d5_ONT_GIAB.paf.gz
+```
+
+### Use miniasm to assemble the self-aligned data:
+
+```
+echo "Running miniasm..."
+miniasm -f input_data/HG002_hs37d5_ONT_GIAB.fastq outputs/HG002_hs37d5_ONT_GIAB.paf.gz > outputs/HG002_hs37d5_ONT_GIAB.gfa
+```
+
+### Convert GFA file to FASTA for downstream analysis
+(instructions from https://www.biostars.org/p/169516/)
+
+```
+echo "Converting GFA to FASTA..."
+awk '/^S/{print ">"$2"\n"$3}' outputs/HG002_hs37d5_ONT_GIAB.gfa > outputs/HG002_hs37d5_ONT_GIAB.fasta
+echo "Complete."
+```
+
+This yields an assembly which can be used as input to BioNano's hybrid scaffolding pipeline to generate a more accurate hybrid assembly using the BioNano data.
diff --git a/doc/OUTLINE.md b/docs/project-overview.md
similarity index 80%
rename from doc/OUTLINE.md
rename to docs/project-overview.md
index f10c4872..995ea731
--- a/doc/OUTLINE.md
+++ b/docs/project-overview.md
@@ -1,44 +1,40 @@
-# ContextSV Outline
-
-## External dependencies
-
-| Dependency | Use | Installation |
-| - | - | - |
-| HTSLib | Reading BAM files.
| [https://www.biostars.org/p/328831/](https://www.biostars.org/p/328831/)<br>
http://www.htslib.org/download/ | -| Tensorflow | Importing the SV scoring model. | [https://www.tensorflow.org/install/lang_c](https://www.tensorflow.org/install/lang_c)| - -## SV scoring model -The SV scoring model is trained and saved as a Tensorflow weighted graph in PB file format. It can be trained via the [Python API](https://www.tensorflow.org/api_docs/python/tf). The [C++ API](https://www.tensorflow.org/api_docs/cc) is used to import and utilize this model for scoring SVs. - -### Training the model - -## Project structure - -``` -ContextSV -├── include -│ └── ContextSV.h -├── src -│ └── ContextSV.cpp -└── tests -│ ├── BasicTests.cpp -│ ├── AccuracyTests.cpp -│ └── ScoringModelTests.cpp -└── lib - ├── tensorflow - | ├── LICENSE - │ ├── include - │ │ └── ... - │ └── lib - │ └── ... - └── htslib - ├── LICENSE - ├── include - │ └── ... - └── lib - └── ... -``` - -## Tasks by order -- [x] Draft project structure -- [ ] CLI with BAM file input +# Project Overview + +## External dependencies + +| Dependency | Use | Installation | +| - | - | - | +| HTSLib | Reading BAM files. | [https://www.biostars.org/p/328831/](https://www.biostars.org/p/328831/)
http://www.htslib.org/download/ |
+| scikit-learn | Training and scoring the SV classification model. | [https://scikit-learn.org/stable/install.html](https://scikit-learn.org/stable/install.html)|
+
+## SV scoring model
+The SV scoring model is a binary classifier trained with scikit-learn and saved in PKL format with joblib (see `python/train_model.py`). The trained model is then loaded to score candidate structural variants (see the `python/scoring_model.py` stub).
+
+### Training the model
+
+## Project structure
+
+```
+ContextSV
+├── include
+│   └── ContextSV.h
+├── src
+│   └── ContextSV.cpp
+├── tests
+│   ├── BasicTests.cpp
+│   ├── AccuracyTests.cpp
+│   └── ScoringModelTests.cpp
+└── lib
+    ├── tensorflow
+    │   ├── LICENSE
+    │   ├── include
+    │   │   └── ...
+    │   └── lib
+    │       └── ...
+    └── htslib
+        ├── LICENSE
+        ├── include
+        │   └── ...
+        └── lib
+            └── ...
+```
diff --git a/environment.yml b/environment.yml
index 726c3c89..8820b657 100644
--- a/environment.yml
+++ b/environment.yml
@@ -10,3 +10,7 @@ dependencies:
 - htslib
 - swig
 - pytest
+ - plotly
+ - pandas
+ - scikit-learn
+ - joblib
diff --git a/include/cli.h b/include/cli.h
deleted file mode 100644
index 716c8cb1..00000000
--- a/include/cli.h
+++ /dev/null
@@ -1,15 +0,0 @@
-//
-// cli.h:
-// Command-line interface entrypoint.
-//
-
-#ifndef CONTEXTSV_CLI_H
-#define CONTEXTSV_CLI_H
-
-#include "common.h"
-
-#include
-
-int run(std::string bam, std::string snps, std::string outdir, std::string region);
-
-#endif //CONTEXTSV_CLI_H
diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 2672e86d..4ad68927 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -1,37 +1,34 @@
-//
-// cnv_caller.h:
-// Detect CNVs
-//
+// CNVCaller: Detect CNVs and return the state sequence by SNP position
+// (key = [chromosome, SNP position], value = state)

#ifndef CNV_CALLER_H
#define CNV_CALLER_H

#include "khmm.h"
-#include "common.h"
+#include "input_data.h"
+#include "cnv_data.h"
+#include "types.h"

+/// @cond
#include
#include
+/// @endcond

-struct RegionCoverage {
-    bool valid = false;
-    uint64_t length = 0;
-    double mean = 0;
-    double baf = 0;
-};

class CNVCaller {
    private:
-        Common common;
+        InputData* input_data;
        double chr_mean_coverage = 0;

    public:
-        CNVCaller(Common common);
+        CNVCaller(InputData& input_data);

-        // Detect CNVs
-        std::vector run();
+        // Detect CNVs and return the state sequence by SNP position
+        // (key = [chromosome, SNP position], value = state)
+        CNVData run();

-        // Calculate Log R Ratios
-        std::vector calculateLogRRatiosAtSNPS(std::vector snp_positions);
+        // Calculate coverage log2 ratios at SNP positions
+        std::vector calculateLog2RatioAtSNPS(std::vector snp_positions);

        // Calculate the mean chromosome coverage
        double calculateMeanChromosomeCoverage();

@@ -40,10 +37,17 @@ class CNVCaller {
        double calculateWindowLogRRatio(double mean_chr_cov, int start_pos, int end_pos);

        // Read SNP positions and BAF values from the VCF file
-        std::pair, std::vector> readSNPBAFs();
+        SNPData readSNPBAFs(std::string filepath);
+
+        // Read SNP population frequencies from the PFB file and return a vector
+        // of population frequencies for each SNP location
+        std::vector getSNPPopulationFrequencies(std::vector snp_locations);
+
+        // Save a BED file with predicted copy number states
+        void saveToBED(SNPData& snp_data, std::string filepath);

-        // Save a CSV with SNP positions, BAF values and Log R Ratios
-        void saveSNPLRRBAFCSV(std::string filepath, std::vector snp_positions, std::vector bafs, std::vector logr_ratios,
std::vector state_sequence); + // Save a TSV with B-allele frequencies, log 2 ratios, and copy number predictions + void saveToTSV(SNPData& snp_data, std::string filepath); }; #endif // CNV_CALLER_H diff --git a/include/cnv_data.h b/include/cnv_data.h new file mode 100644 index 00000000..39e33cd4 --- /dev/null +++ b/include/cnv_data.h @@ -0,0 +1,26 @@ +#ifndef CNV_DATA_H +#define CNV_DATA_H + +#include "types.h" + +/// @cond +#include +/// @endcond + + +class CNVData { + private: + SNPToCNVMap cnv_calls; // Map of SNP positions to CNV types + + public: + // Add a CNV call to the map + void addCNVCall(std::string chr, int snp_pos, int cnv_type); + + // Get the most common CNV type within the SV region start and end positions + int getMostCommonCNV(std::string chr, int start, int end); + + // Load CNV calls from file + void loadFromFile(std::string filepath); +}; + +#endif // CNV_DATA_H diff --git a/include/common.h b/include/common.h deleted file mode 100644 index 24689b2a..00000000 --- a/include/common.h +++ /dev/null @@ -1,45 +0,0 @@ -// -// common.h: -// Manage common parameters and functions - -#ifndef CONTEXTSV_COMMON_H -#define CONTEXTSV_COMMON_H - -#include -#include -#include - -class Common { - public: - std::string get_bam_filepath(); - void set_bam_filepath(std::string filepath); - std::string get_ref_filepath(); - void set_ref_filepath(std::string filepath); - std::string get_output_dir(); - void set_output_dir(std::string dirpath); - std::string get_region(); - void set_region(std::string region); - int get_window_size(); - void set_window_size(int window_size); - std::string get_snp_vcf_filepath(); - void set_snp_vcf_filepath(std::string filepath); - std::string get_region_chr(); - int get_region_start(); - int get_region_end(); - bool get_region_set(); - void print_progress(int progress, int total); - - private: - std::string bam_filepath = ""; - std::string ref_filepath = ""; - std::string output_dir = ""; - std::string region = ""; - int window_size = 10000; - std::string snp_vcf_filepath = ""; - std::string region_chr = ""; - int region_start = 0; - int region_end = 0; - bool region_set = false; -}; - -#endif //CONTEXTSV_COMMON_H diff --git a/include/contextsv.h b/include/contextsv.h index d5304c1e..b90877ce 100644 --- a/include/contextsv.h +++ b/include/contextsv.h @@ -1,15 +1,28 @@ // -// contextsv.h -// Main header file for the contextsv library. +// contextsv.h: +// Main class for ContextSV // #ifndef CONTEXTSV_H #define CONTEXTSV_H -#include "khmm.h" -#include "common.h" +#include "input_data.h" +#include "cnv_data.h" +#include "sv_data.h" -#include -#include -#endif // CONTEXTSV_H +class ContextSV { + private: + InputData* input_data; + + // Label SVs based on CNV calls + void labelCNVs(CNVData cnv_calls, SVData& sv_calls); + + public: + ContextSV(InputData& input_data); + + // Entry point + int run(); +}; + +#endif // CONTEXTSV_H diff --git a/include/fasta_query.h b/include/fasta_query.h new file mode 100644 index 00000000..4050aff5 --- /dev/null +++ b/include/fasta_query.h @@ -0,0 +1,25 @@ +// FASTAQuery: A class for querying a FASTA file. 
+
+#ifndef FASTA_QUERY_H
+#define FASTA_QUERY_H
+
+/// @cond
+#include
+#include
+/// @endcond
+
+class FASTAQuery {
+    private:
+        std::string fasta_filepath;
+        std::map chr_to_seq;
+
+    public:
+        int setFilepath(std::string fasta_filepath);
+        std::string getFilepath();
+        std::string query(std::string chr, int64_t pos_start, int64_t pos_end);
+
+        // Get the chromosome contig lengths in VCF header format
+        std::string getContigHeader();
+};
+
+#endif // FASTA_QUERY_H
diff --git a/include/input_data.h b/include/input_data.h
new file mode 100644
index 00000000..9af673a4
--- /dev/null
+++ b/include/input_data.h
@@ -0,0 +1,93 @@
+//
+// input_data.h:
+// Manage common types, parameters, and functions
+
+#ifndef INPUT_DATA_H
+#define INPUT_DATA_H
+
+#include "fasta_query.h"
+
+/// @cond
+#include
+#include
+#include
+/// @endcond
+
+class InputData {
+    public:
+        InputData();
+
+        // Set the filepath to the BAM file with long read alignments for SV detection.
+        void setBAMFilepath(std::string filepath);
+        std::string getBAMFilepath();
+
+        // Set the filepath to the HMM parameters.
+        void setHMMFilepath(std::string filepath);
+        std::string getHMMFilepath();
+
+        // Set the filepath to the reference genome FASTA file.
+        void setRefGenome(std::string fasta_filepath);
+        FASTAQuery getRefGenome();
+
+        // Set the filepath to the tab-delimited file with SNP population frequencies.
+        void setPFBFilepath(std::string filepath);
+        std::string getPFBFilepath();
+
+        // Set the filepath to the VCF file with SNP calls used for CNV
+        // detection with the HMM.
+        void setSNPFilepath(std::string filepath);
+        std::string getSNPFilepath();
+
+        // Set the genomic region to analyze.
+        void setRegion(std::string region);
+        std::string getRegion();
+        std::string getRegionChr();
+        int getRegionStart();
+        int getRegionEnd();
+        bool getRegionSet();
+
+        // Set the window size for the log2 ratio calculation.
+        void setWindowSize(int window_size);
+        int getWindowSize();
+
+        // Set entire-chromosome mean coverage values to speed up the log2 ratio calculations.
+        void setChrCov(std::string chr_cov);
+        int getChrCov(std::string chr, double& cov);
+
+        // Set the output directory where the results will be written.
+        void setOutputDir(std::string dirpath);
+        std::string getOutputDir();
+
+        // Set the number of threads to use when parallelization is possible.
+        void setThreadCount(int thread_count);
+        int getThreadCount();
+
+        // Disable CIGAR string SV calling. This is useful for testing.
+        void setDisableCIGAR(bool disable_cigar);
+        bool getDisableCIGAR();
+
+        // Set the filepath to the TSV file with the CNV predictions.
+ void setCNVFilepath(std::string filepath); + std::string getCNVFilepath(); + + private: + std::string bam_filepath; + std::string ref_filepath; + std::string snp_vcf_filepath; + std::string pfb_filepath; + FASTAQuery fasta_query; + std::string output_dir; + std::string region; + int window_size; + std::string region_chr; + int region_start; + int region_end; + bool region_set; + std::map chr_cov; + int thread_count; + std::string hmm_filepath; + bool disable_cigar; + std::string cnv_filepath; +}; + +#endif // INPUT_DATA_H diff --git a/include/integrative_caller.h b/include/integrative_caller.h deleted file mode 100644 index 4f6e23ea..00000000 --- a/include/integrative_caller.h +++ /dev/null @@ -1,36 +0,0 @@ -// -// integrative_caller.h: -// Integrate SV calls -// - -#ifndef CONTEXTSV_INTEGRATIVE_CALLER_H -#define CONTEXTSV_INTEGRATIVE_CALLER_H - -#include "common.h" - -#include - -class IntegrativeCaller { - private: - Common common; - - public: - IntegrativeCaller(Common common); - - // Entry point - int run(); - - // Check if the bam file uses chr prefix notation - int bamHasChrPrefix(std::string filepath, bool& uses_chr_prefix); - - // Set the bam file path - void set_bam_filepath(std::string bam_filepath); - - // Set the reference file path - void set_ref_filepath(std::string ref_filepath); - void set_output_dir(std::string output_dir); - void set_region(std::string region); - void set_window_size(int window_size); -}; - -#endif diff --git a/include/kc.h b/include/kc.h index 0c9bedbc..a47e393b 100644 --- a/include/kc.h +++ b/include/kc.h @@ -1,5 +1,10 @@ +#ifndef KC_H +#define KC_H +/// @cond #include +/// @endcond + // #include "EXTERN.h" /* std perl include */ // #include "perl.h" /* std perl include */ // #include "XSUB.h" /* XSUB include */ @@ -234,7 +239,7 @@ double beta(double z, double w); double betai(double a, double b, double x); double betacf(double a, double b, double x); double inverff (double x); -double erf(double x); +double errorf (double x); /**************** random number @@ -249,3 +254,5 @@ double random_stdnormal (int *idum); 3rd party code ******************/ double SNPHWE(int obs_hets, int obs_hom1, int obs_hom2); + +#endif /* KC_H */ diff --git a/include/khmm.h b/include/khmm.h index 179abf80..416c543d 100644 --- a/include/khmm.h +++ b/include/khmm.h @@ -1,11 +1,14 @@ #ifndef _KHMM_H #define _KHMM_H +/// @cond #include #include #include -#include +#include #include +#include +/// @endcond typedef struct { int N; /* number of states; Q={1,2,...,N} */ @@ -38,47 +41,19 @@ typedef struct { /// Read an HMM from a file CHMM ReadCHMM (const char *filename); -/// Generate an HMM with default parameters -CHMM GetDefaultCHMM(); - -void PrintCHMM(FILE *fp, CHMM *phmm); -void CopyCHMM(CHMM *phmm1, CHMM *phmm2); +/// Free the memory allocated for an HMM void FreeCHMM(CHMM *phmm); +/// Run the main HMM algorithm std::vector testVit_CHMM (CHMM hmm, int T, double *O1, double *O2, double *pfb, int *snpdist, double *plogproba); -void tumorVit_CHMM (CHMM hmm, int T, double *O1, double *O2, double *pfb, int *snpdist, double *plogproba, double stroma); -std::vector ViterbiLogNP_CHMM(CHMM *phmm, int T, double *O1, double *O2, double *pfb, int *snpdist, double **delta, int **psi, double *pprob); -void ViterbiLogNPStroma_CHMM(CHMM *phmm, int T, double *O1, double *O2, double *pfb, int *snpdist, double **delta, int **psi, int *q, double *pprob, double stroma); -void estHMMFromFile_CHMM (CHMM hmm, int T, FILE *fp, int *niter, double *logprobinit, double *logprobfinal); -void 
BaumWelchNP_CHMM(CHMM *phmm, int T, double *O1, double *O2, double *pfb, int *snpdist, double **alpha, double **beta, double **gamma, int *pniter, double *plogprobinit, double *plogprobfinal); -void ComputeXi_CHMM (CHMM* phmm, int T, double *O1, double *O2, double **biot, int *snpdist, double **alpha, double **beta, double ***xi); -void ComputeGamma_CHMM (CHMM *phmm, int T, double **alpha, double **beta, double **gamma); -void BackwardWithScale_CHMM (CHMM *phmm, int T, double *O1, double *O2, double **biot, int *snpdist, double **beta, double *scale, double *pprob); -void ForwardWithScale_CHMM (CHMM *phmm, int T, double *O1, double *O2, double **biot, int *snpdist, double **alpha, double *scale, double *pprob); - -void GetStateProb_CHMM(CHMM *phmm, int T, double *O1, double *O2, double *pfb, int *snpdist, double *pprob, int state); -void convertHMMTransition (CHMM *phmm, double **A1, int dist); -void adjustBSD (CHMM *phmm, double sdo); +/// Viterbi algorithm +std::vector ViterbiLogNP_CHMM(CHMM *phmm, int T, double *O1, double *O2, double *pfb, int *snpdist, double **delta, int **psi, double *pprob); +/// O1 emission probability double b1iot (int state, double *mean, double *sd, double uf, double o); -double b2iot (int state, double *mean, double *sd, double uf, double pfb, double b); -double b5iot (int state, double *mean, double *sd, double uf, double o, double stroma); -double b6iot (int state, double *mean, double *sd, double uf, double pfb, double b, double stroma); - -void testVitTrio_CHMM (CHMM *phmm, int T, double *Of1, double *Of2, double *Om1, double *Om2, double *Oo1, double *Oo2, double *pfb, int *snpdist, double *plogproba, double *trio_lrr_sd, int osex, int chrx_flag); -double trioTransition (double cimcell, double chit, double **A1, double e1, double e2, int dist, int pdn, int pifat, int pimot, int pioff, int dn, int ifat, int imot, int ioff); -void calculateCHIT (double a, double M[5][5][5][5][5][5]); -void calculateCHIT_male_x (double a, double M[5][5][5][5][5][5]); -void calculateCHIT_female_x (double a, double M[5][5][5][5][5][5]); -double *** AllocXi(int T, int N); -void FreeXi(double *** xi, int T, int N); - -void callCNVFromFile_SEQ (CHMM hmm, int T, FILE *fp, int *mlstate, double gamma_k, double gamma_theta); -void ViterbiLogNP_SEQ (CHMM *phmm, int T, double k, double theta, double *O1, int *O2, double *pfb, int *length, double **delta, int **psi, int *q); -double b4iot (int state, double *mean, double *sd, double uf, double pfb, int o1, int o2); -double b3iot (int state, double *mean, double *sd, double uf, double o1, int length); -void adjustHMMExpected (CHMM *phmm, double coverage); +/// O2 emission probability +double b2iot (int state, double *mean, double *sd, double uf, double pfb, double b); #endif // _HMM_H_ diff --git a/include/sv_caller.h b/include/sv_caller.h new file mode 100644 index 00000000..e9aaebfd --- /dev/null +++ b/include/sv_caller.h @@ -0,0 +1,38 @@ +// SVCaller: Detect SVs from long read alignments + +#ifndef SV_CALLER_H +#define SV_CALLER_H + +#include "input_data.h" +#include "cnv_data.h" +#include "sv_data.h" +#include "types.h" + +#include + +class SVCaller { + private: + //int max_indel_dist = 1000; // Maximum distance between two indels to + //be considered as a single SV + int max_indel_dist = 10; // Maximum distance between two indels to be considered as a single SV + //int min_sv_size = 50; // Minimum SV size to be considered + //int min_sv_size = 30; // Minimum SV size to be considered + int min_sv_size = 50; // Minimum SV size to be 
considered + int min_mapq = 20; // Minimum mapping quality to be considered + InputData* input_data; + + // Detect SVs from long read alignments in the CIGAR string + void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls); + //void detectSVsFromCIGAR(SVData& sv_calls, std::string chr, int32_t pos, uint32_t* cigar, int cigar_len, bool debug_mode); + + // Detect SVs from split-read alignments (primary and supplementary) + SVData detectSVsFromSplitReads(); + + public: + SVCaller(InputData& input_data); + + // Detect SVs and predict SV type from long read alignments and CNV calls + SVData run(); +}; + +#endif // SV_CALLER_H diff --git a/include/sv_data.h b/include/sv_data.h new file mode 100644 index 00000000..73145fb5 --- /dev/null +++ b/include/sv_data.h @@ -0,0 +1,54 @@ +#ifndef SV_DATA_H +#define SV_DATA_H + +#include "fasta_query.h" // For querying the reference genome +#include "types.h" + +/// @cond +#include +#include +/// @endcond + + +class SVData { + private: + // SV candidate to read depth map + SVDepthMap sv_calls; + + // Store a reference to the reference genome + FASTAQuery *ref_genome; + + // SV type to string map (DEL, DUP, INV, INS, BND) + // DUPs [1] are INS with INFO/REPTYPE=DUP + std::map sv_type_map = { + {0, "DEL"}, + {1, "INS"}, + {2, "INV"}, + {3, "INS"}, + {4, "BND"} + }; + + public: + SVData(FASTAQuery& ref_genome); + void add(std::string chr, int64_t start, int64_t end, int sv_type, std::string alt_allele, std::string data_type); + + std::string getRefGenome(); + + // Query the reference genome for a given sequence + std::string getSequence(std::string chr, int64_t pos_start, int64_t pos_end); + + // Update the SV type for a given SV candidate + void updateSVType(SVCandidate key, int sv_type); + + // Save SV calls to VCF + void saveToVCF(FASTAQuery& ref_genome, std::string output_dir); + + // Begin and end iterators for the SV candidate map + SVDepthMap::iterator begin() { return this->sv_calls.begin(); } + SVDepthMap::iterator end() { return this->sv_calls.end(); } + + // Define the size of the SV candidate map + int size() { return this->sv_calls.size(); } +}; + +#endif // SV_DATA_H diff --git a/include/swig_interface.h b/include/swig_interface.h new file mode 100644 index 00000000..c7f163ae --- /dev/null +++ b/include/swig_interface.h @@ -0,0 +1,17 @@ +// +// swig_interface.h: +// Declare the C++ functions that will be wrapped by SWIG +// + +#ifndef SWIG_INTERFACE_H +#define SWIG_INTERFACE_H + +#include "input_data.h" + +/// @cond +#include +/// @endcond + +int run(InputData input_data); + +#endif // SWIG_INTERFACE_H diff --git a/include/types.h b/include/types.h new file mode 100644 index 00000000..46896f09 --- /dev/null +++ b/include/types.h @@ -0,0 +1,63 @@ +// This file contains all the type definitions used in the project + +#ifndef TYPES_H +#define TYPES_H + +/// @cond +#include +#include +#include +#include +/// @endcond + +// SNP data is a struct containing vectors used in predicting copy number states +struct SNPData { + std::vector locations; + std::vector pfbs; + std::vector bafs; + std::vector log2_ratios; + std::vector state_sequence; + + SNPData(): + locations({}),\ + pfbs({}), \ + bafs({}), \ + log2_ratios({}), \ + state_sequence({}) {} +}; + +// CNV candidate location map +// (chr, snp_pos) : cnv_type +using SNPLocation = std::pair; +using SNPToCNVMap = std::map; + +// SV candidate read depth map. An SV is defined by its location, type, and +// alternate allele. 
+// (chr, start, end, sv_type, alt_allele) : (ref_allele, num_reads)
+using SVCandidate = std::tuple; // chr, start, end, alt_allele
+
+// Create a struct for storing SV information
+struct SVInfo {
+    int sv_type;
+    int read_depth;
+    std::set data_type;
+    int sv_length;
+
+    SVInfo() :
+        sv_type(-1), read_depth(0), data_type({}), sv_length(0) {}
+
+    SVInfo(int sv_type, int read_depth, std::string data_type, int sv_length) :
+        sv_type(sv_type), read_depth(read_depth), data_type({data_type}), sv_length(sv_length) {}
+};
+
+using SVDepthMap = std::map; // Map for getting type and read depth of SV candidates
+
+// SV calling:
+// Alignment location (chr, start, end, depth)
+using AlignmentData = std::tuple;
+using AlignmentVector = std::vector;
+
+// Query map (query name, alignment vector)
+using QueryMap = std::map;
+
+#endif // TYPES_H
diff --git a/include/utils.h b/include/utils.h
index e566781b..516018a6 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -1,10 +1,10 @@
-//
-// utils.h:
// Utility functions
-//
-#ifndef CONTEXTSV_UTILS_H
-#define CONTEXTSV_UTILS_H
+#ifndef UTILS_H
+#define UTILS_H
-#endif //CONTEXTSV_UTILS_H
\ No newline at end of file
+// Print the progress of a task
+void printProgress(int progress, int total);
+
+#endif // UTILS_H
diff --git a/include/vcf_writer.h b/include/vcf_writer.h
new file mode 100644
index 00000000..800df144
--- /dev/null
+++ b/include/vcf_writer.h
@@ -0,0 +1,23 @@
+#ifndef VCF_WRITER_H
+#define VCF_WRITER_H
+
+/// @cond
+#include
+#include
+#include
+/// @endcond
+
+class VcfWriter {
+public:
+    // Constructor
+    VcfWriter(const std::string& filename);
+    void writeHeader(const std::vector& headerLines);
+    void writeRecord(const std::string& chrom, int pos, const std::string& id,
+                     const std::string& ref, const std::string& alt,
+                     const std::string& qual, const std::string& filter,
+                     const std::string& info, const std::string& format,
+                     const std::vector& samples);
+
+    // Close the VCF file
+    void close();
+
+private:
+    std::ofstream file_stream;
+};
+
+#endif // VCF_WRITER_H
diff --git a/python/cnv_plots.py b/python/cnv_plots.py
new file mode 100644
index 00000000..df1b1ca7
--- /dev/null
+++ b/python/cnv_plots.py
@@ -0,0 +1,263 @@
+"""Plot the copy number variants and their log2_ratio, BAF values."""
+
+import os
+import sys
+import logging as log
+import plotly
+from plotly.subplots import make_subplots
+import pandas as pd
+
+from .utils import parse_region, get_info_field_column, get_info_field_value
+
+# Set up logging.
+log.basicConfig(
+    level=log.INFO,
+    format="%(asctime)s [%(levelname)s] %(message)s",
+    handlers=[
+        log.StreamHandler(sys.stdout)
+    ]
+)
+
+def run(vcf_file, cnv_data_file, output_path, region):
+    """
+    Saves a plot of the CNVs and their log2 ratio and B-allele frequency
+    values.
+
+    Args:
+        vcf_file (str): The path to the VCF file.
+        cnv_data_file (str): The path to the CNV data file.
+        output_path (str): The path to the output directory.
+        region (str): The region to plot (e.g. chr1 or chr1:1000-2000).
+
+    Returns:
+        None
+    """
+    # Set the maximum number of CNVs to plot.
+    max_cnvs = 10
+
+    # Set the plot range for each CNV to 3 times its length on each side.
+    plot_range = 3
+
+    # Parse the region.
+    chromosome, start_position, end_position = parse_region(region)
+
+    if start_position is not None and end_position is not None:
+        log.info("Plotting CNVs in region %s:%d-%d.", chromosome, start_position, end_position)
+    else:
+        log.info("Plotting CNVs in region %s.", chromosome)
+
+    # Set up the CNV type string list for the 6 CNV states.
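+    # The HMM states are 1-based, so index 0 is a placeholder; states 1-2 map
+    # to DEL, 3-4 to copy-neutral (NEUT), and 5-6 to DUP, matching the
+    # red/black/blue marker colors assigned further below.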
+ cnv_types = ["NAN", "DEL", "DEL", "NEUT", "NEUT", "DUP", "DUP"] + + # Filter the VCF file to the region using pandas, and make the chromosome + # column a string. + vcf_data = pd.read_csv(vcf_file, sep="\t", comment="#", header=None, dtype={0: str}) + if start_position is not None and end_position is not None: + vcf_data = vcf_data[(vcf_data[0] == chromosome) & (vcf_data[1] >= start_position) & (vcf_data[1] <= end_position)] + else: + vcf_data = vcf_data[(vcf_data[0] == chromosome)] + + log.info("Loaded %d variants from %s", len(vcf_data), vcf_file) + + # Filter the CNV data to the region using pandas, and make the chromosome + # column a string. + log.info("Loading CNV data from %s", cnv_data_file) + cnv_data = pd.read_csv(cnv_data_file, sep="\t", header=0, dtype={"chromosome": str}) + if start_position is not None and end_position is not None: + cnv_data = cnv_data[(cnv_data["chromosome"] == chromosome) & (cnv_data["position"] >= start_position) & (cnv_data["position"] <= end_position)] + else: + cnv_data = cnv_data[(cnv_data["chromosome"] == chromosome)] + + log.info("Loaded %d CNVs.", len(cnv_data)) + + # Create an output html file where we will append the CNV plots. + if start_position is not None and end_position is not None: + html_filename = f"cnv_plots_{chromosome}_{start_position}_{end_position}.html" + else: + html_filename = f"cnv_plots_{chromosome}.html" + + # Create the output directory if it doesn't exist. + if not os.path.exists(output_path): + os.makedirs(output_path) + + # Create the output html file. + output_html_filepath = os.path.join(output_path, html_filename) + if os.path.exists(output_html_filepath): + os.remove(output_html_filepath) + + with open(output_html_filepath, "w", encoding="utf-8") as output_html_file: + + # Get the INFO field column by searching for the first column that + # contains SVTYPE=. + info_column = get_info_field_column(vcf_data) + + # Loop through the VCF data and plot each CNV (DEL or DUP) along with log2 + # ratio and BAF values for the SNPs in the CNV. + cnv_count = 0 + for _, sv_data in vcf_data.iterrows(): + + # Get the INFO field + info_field = sv_data[info_column] + + # Get the SVTYPE field value. + svtype = get_info_field_value(info_field, "SVTYPE") + if svtype == "INS" and get_info_field_value(info_field, "REPTYPE") == "DUP": + svtype = "DUP" + + # Analyze the CNV if it is a DEL or DUP (=INS with INFO/REPTYE=DUP) + if svtype in ("DEL", "DUP"): + + # Get the read depth (DP) value. + read_depth = int(get_info_field_value(info_field, "DP")) + + # Get the start position. + start_position = int(sv_data[1]) + + # Get the SV length. + cnv_length = int(get_info_field_value(info_field, "SVLEN")) + + # Get the end position using the start position and SV length. + end_position = start_position + cnv_length - 1 + + # Get the chromosome. + chromosome = sv_data[0] + + # Get the plot range. + plot_start_position = start_position - (plot_range * cnv_length) + plot_end_position = end_position + (plot_range * cnv_length) + + # Get the CNV state, log2 ratio, and BAF values for all SNPs in the + # plot range. + sv_data = cnv_data[(cnv_data["position"] >= plot_start_position) & (cnv_data["position"] <= plot_end_position)] + + # Get the marker colors for the state sequence. + marker_colors = [] + for state in sv_data["cnv_state"]: + if state in [1, 2]: + marker_colors.append("red") + elif state in [3, 4]: + marker_colors.append("black") + elif state in [5, 6]: + marker_colors.append("blue") + + # Get the hover text for the state sequence markers. 
+                hover_text = []
+                for _, row in sv_data.iterrows():
+                    hover_text.append(f"TYPE: {cnv_types[row['cnv_state']]}<br>CHR: {row['chromosome']}<br>POS: {row['position']}<br>L2R: {row['log2_ratio']}<br>BAF: {row['b_allele_freq']}<br>
PFB: {row['population_freq']}") + + # Create the log2 ratio trace. + log2_ratio_trace = plotly.graph_objs.Scatter( + x = sv_data["position"], + y = sv_data["log2_ratio"], + mode = "markers+lines", + name = r'Log2 Ratio', + text = hover_text, + hoverinfo = "text", + marker = dict( + color = marker_colors, + size = 10, + ), + line = dict( + color = "black", + width = 0 + ), + showlegend = False + ) + + # Create the B-allele frequency trace. + baf_trace = plotly.graph_objs.Scatter( + x = sv_data["position"], + y = sv_data["b_allele_freq"], + mode = "markers+lines", + name = "B-Allele Frequency", + text = hover_text, + marker = dict( + color = marker_colors, + size = 10, + ), + line = dict( + color = "black", + width = 0 + ), + showlegend = False + ) + + # Create a subplot for the CNV plot and the BAF plot. + fig = make_subplots( + rows=2, + cols=1, + shared_xaxes=True, + vertical_spacing=0.05, + subplot_titles=(r"SNP Log2 Ratio", "SNP B-Allele Frequency") + ) + + # Add the traces to the figure. + fig.append_trace(log2_ratio_trace, 1, 1) + fig.append_trace(baf_trace, 2, 1) + + # Set the x-axis title. + fig.update_xaxes( + title_text = "Chromosome Position", + row = 2, + col = 1 + ) + + # Set the y-axis titles. + fig.update_yaxes( + title_text = r"Log2 Ratio", + row = 1, + col = 1 + ) + + fig.update_yaxes( + title_text = "B-Allele Frequency", + row = 2, + col = 1 + ) + + # Set the figure title. + fig.update_layout( + title_text = f"{svtype} (DP={read_depth}, LEN={cnv_length}bp) at {chromosome}:{start_position}-{end_position}", + ) + + # Create a shaded rectangle for the CNV, layering it below the CNV + # trace and labeling it with the CNV type. + fig.add_vrect( + x0 = start_position, + x1 = end_position, + fillcolor = "Black", + layer = "below", + line_width = 0, + opacity = 0.1, + annotation_text = svtype, + annotation_position = "top left", + annotation_font_size = 20, + annotation_font_color = "black" + ) + + # Add vertical lines at the start and end positions of the CNV. + fig.add_vline( + x = start_position, + line_width = 2, + line_color = "black", + layer = "below" + ) + + fig.add_vline( + x = end_position, + line_width = 2, + line_color = "black", + layer = "below" + ) + + # Append the figure to the output html file. + output_html_file.write(fig.to_html(full_html=False, include_plotlyjs="cdn")) + log.info("Plotted CNV %s %s:%d-%d.", svtype, chromosome, start_position, end_position) + + # Increment the CNV count. + cnv_count += 1 + + # Break if the maximum number of CNVs has been reached. + if cnv_count == max_cnvs: + break + + log.info("Saved CNV plots to %s.", output_html_filepath) diff --git a/python/scoring_model.py b/python/scoring_model.py new file mode 100644 index 00000000..d38f22fa --- /dev/null +++ b/python/scoring_model.py @@ -0,0 +1,6 @@ +""" +scoring_model.py: Score the structural variants using the binary classification model. +""" + +import os +import sys diff --git a/python/train_model.py b/python/train_model.py new file mode 100644 index 00000000..e5408313 --- /dev/null +++ b/python/train_model.py @@ -0,0 +1,69 @@ +""" +train_model.py: Train the binary classification model. +""" + +import os +import sys +import joblib +from sklearn.linear_model import LogisticRegression +import pandas as pd + + +def train(true_positives_filepath, false_positives_filepath): + """Train the binary classification model.""" + # Read in the true positive and false positive data. 
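+    # Note: both TSV files are expected to provide "lrr" and "baf" columns,
+    # since those are the only features selected for training below.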
+ tp_data = pd.read_csv(true_positives_filepath, sep="\t") + fp_data = pd.read_csv(false_positives_filepath, sep="\t") + + # Get the training data. + tp_data["label"] = 1 + fp_data["label"] = 0 + training_data = pd.concat([tp_data, fp_data]) + + # Get the features and labels. + features = training_data[["lrr", "baf"]] + labels = training_data["label"] + + # Train the model. + model = LogisticRegression() + model.fit(features, labels) + + # Return the model. + return model + +# Run the program. +def run(true_positives_filepath, false_positives_filepath, output_directory): + """Run the program.""" + # Train the model. + model = train(true_positives_filepath, false_positives_filepath) + + # Save the model + model_path = os.path.join(output_directory, "model.pkl") + joblib.dump(model, model_path) + + # Print the model. + print(model) + + # Return the model. + return model + +def score(model, cnv_data): + """Score the structural variants.""" + # Get the features. + features = cnv_data[["lrr", "baf"]] + + # Score the structural variants. + scores = model.predict_proba(features) + + # Return the scores. + return scores + + +if __name__ == '__main__': + # Get the command line arguments. + tp_filepath = sys.argv[1] + fp_filepath = sys.argv[2] + output_dir = sys.argv[3] + + # Run the program. + run(tp_filepath, fp_filepath, output_dir) diff --git a/python/utils.py b/python/utils.py new file mode 100644 index 00000000..9176dc9f --- /dev/null +++ b/python/utils.py @@ -0,0 +1,44 @@ +"""Utility functions for genome data analysis.""" + +def parse_region(region): + """Parse a region string into its chromosome and start and end positions.""" + region_parts = region.split(":") + chromosome = str(region_parts[0]) + + try: + start_position = int(region_parts[1].split("-")[0]) + end_position = int(region_parts[1].split("-")[1]) + except IndexError: + start_position, end_position = None, None + + return chromosome, start_position, end_position + +def get_info_field_column(vcf_data): + """Return the column index of the INFO field in a VCF file.""" + index = vcf_data.apply(lambda col: col.astype(str).str.contains("SVTYPE=").any(), axis=0).idxmax() + return index + +def get_info_field_value(info_field, field_name): + """ + Get the value of a field in the INFO field of a VCF file. + + Args: + info_field (str): The INFO field. + field_name (str): The name of the field to get the value of. + + Returns: + str: The value of the field. + """ + + # Split the INFO field into its parts. + info_field_parts = info_field.split(";") + + # Get the field value. + field_value = "" + for info_field_part in info_field_parts: + if info_field_part.startswith("{}=".format(field_name)): + field_value = info_field_part.split("=")[1] + break + + # Return the field value. + return field_value diff --git a/samtools/htslib b/samtools/htslib deleted file mode 160000 index 4e61c128..00000000 --- a/samtools/htslib +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 4e61c128238f3e7cbb3b1f4e9c0fdb4880aa9a10 diff --git a/scripts/find_cigar_deletions.py b/scripts/find_cigar_deletions.py new file mode 100644 index 00000000..f361570c --- /dev/null +++ b/scripts/find_cigar_deletions.py @@ -0,0 +1,58 @@ +import re + + +def find_deletions(cigar_string, ref_pos): + """ + Given a cigar string and a reference position, returns a list of all deletions in reference coordinates. 
+ """ + query_pos = 0 + deletions = [] + + # Parse CIGAR string + for length, operation in parse_cigar(cigar_string): + + if operation == 'D': # Deletion + # deletions.append((ref_pos, ref_pos + length, length)) + + # Print out the current operation and length + print(operation, length, ref_pos, ref_pos + length) + + if operation in ['M', 'D', 'N', '=', 'X']: + ref_pos += length + if operation not in ['I', 'S', 'H', 'P']: # Not Insertion, Soft Clip, Hard Clip, Padding + query_pos += length + + return deletions + +# Function to parse CIGAR string into (length, operation) tuples +def parse_cigar(cigar): + import re + cigar_tuples = re.findall(r'(\d+)([MIDNSHP=X])', cigar) + return [(int(length), operation) for length, operation in cigar_tuples] + +# Take user input (cigar string and reference position) +# cigar_string = input("Enter cigar string: ") +# ref_pos = int(input("Enter reference position: ")) +# # Call function +# deletions = find_deletions(cigar_string, ref_pos) + +# cigar_string = "9S13M1D27M1I389M1I171M1I178M1D3M1D79M1D25M7D248M1D310M1D10M2D7M1D147M12I25M1D24M1D148M1I51M1D8M2D127M3D47M2I4M1D22M1D119M1D390M4D177M1I9M1D34M1D74M1D84M1I1M1I62M2I160M1D56M1D20M3D403M2D12M1I77M1D12M2D72M1I135M1I18M4D14M4I36M1I36M1D177M1D61M4I347M2D36M1D42M1D82M1D4M1I41M1I198M1I160M1D83M1D32M1D78M1D340M1D106M5D180M2D120M1D23M2D47M1D232M2D1M1D198M1D107M1D98M1D282M3I8M2D216M1D41M2D5M1D8M1I205M1I18M2I244M1D74M1D5M1D310M3D45M2I79M1D204M1I29M1I33M1I164M1I97M1D295M2I42M3I69M1D473M2D22M1I59M4I36M1D20M2D...1I2M4I1M1I8M1D25M2I37M1I23M3I8M1D56M1I66M3D4M1D12M1D55M1D22M1D16M2I25M1I27M2I14M1D5M2D13M1I7M1I64M1I16M1D13M1D52M2I45M1D38M1I2M1I53M1D1M2D5M2D82M1D28M3I26M1I57M2I83M1D85M1D161M1I11M2I20M4I14M1D16M5I70M2D51M1D13M3D53M1D19M1I150M1D101M2I22M1D2M1D3M2D176M2I118M3D75M1D7M1I34M2I9M3I68M2I57M4I102M1I38M1I1M4D8M2D35M1D100M1D4M1D76M1D7M1D143M1I69M1D133M2D68M1D149M1D90M1D81M3D1M1D5M1D1M2D12M2I5M1I45M1D75M1D78M1D5M2D68M2I22M1I7M2D299M1I5M2I142M2D17M1D113M62D12M1D140M1D4M3D78M1D17M1D443M1D5M2D136M1I19M28S" +# ref_pos = 41191391 # 0-based leftmost coordinate of the reference sequence +# deletions = find_deletions(cigar_string, ref_pos) + +# cigar_string= "6S150M1I11M2D127M1D25M2D11M1D29M1D30M2I100M2I7M2I54M1I15M1D12M5D30M1D8M2I57M1D29M1D61M1I88M1D42M1D8M1I52M1D7M5I148M2I31M1D33M3D15M1D324M2I42M3I4M1D26M5I30M1D18M1I55M1D88M1D41M3D1M2D12M1D37M1D7M1D31M1I26M1D31M4I111M3D5M2D4M4D37M1D3M1D16M2I47M1D2M3D160M2D7M1I70M1D142M3D30M4I76M1D25M1I25M1I15M4D115M1D56M1D26M1D21M1I69M2I6M1I29M1D63M2I1M3D182M3D30M1D14M1D16M2D3M2D30M1I22M3D2M2D46M2D81M1D6M2I114M2D10M1D66M3D2M2D142M2I112M1D143M1D39M2D3M1D78M1D21M1D54M1D29M2D84M1I13M4I51M4D76M66D148M1D39M1D6M1D4M1D53M1I40M3I32M1D204M1D45M2I15M1I190M1D71M2D12M2D53M1D55M1I54M31S" +# ref_pos=41186795 +# deletions = find_deletions(cigar_string, ref_pos) + 
+cigar_string="4S68M1D157M2D36M1I174M1D7M1D64M1D35M4I98M2D212M1D54M2I102M1I139M1D331M1D81M1I12M1I211M2D102M1D126M1I270M1D148M1D164M1I16M1D58M1D212M1D16M1I75M1I62M1I39M2D43M1D144M3I45M1I73M2I79M2I428M1D8M1I28M1D73M1I407M1D81M1D242M2I68M2D175M2I6M3I15M4D85M1D39M1D152M1I54M1D55M3D223M3I4M3D564M1I3M1I4M1D20M1D77M1D348M1D104M1D20M3D48M1D120M1I22M1D3M1I61M2D69M1I1930M1I135M3D41M2I4M5D91M1D179M1I122M3D339M2I66M1D123M1D99M1I42M1D182M1I23M1D139M1D107M1D219M1I65M1I22M1D107M1D36M1I21M1D152M1I26M1D33M1I107M1D5M1D13M1D56M...1I37M1I43M1D2M1D124M1I37M1D251M1D100M2I22M1D2M2D4M1D20M4D4M2I320M1I97M1D21M2I10M3I6M1I142M1I39M2I145M1D76M2I175M1D273M1D74M1D21M2D8M1D32M1D94M1I47M2I397M1D318M2D264M2D49M2I32M2D254M2I95M1I50M1D100M2D11M3I32M1I121M2D38M1I6M1D52M1I45M1D277M2I200M1D98M1I73M1I33M1I52M1I72M2I233M1D22M3D64M2I139M3D260M1D497M1D1M2D91M22I136M2D27M1D292M1I409M1I1M1I132M1I3M1D112M2I18M1D117M1D8M2D30M1I63M1D8M1D16M4I8M2I12M1D42M1D19M1D50M1I189M1D160M1I178M1I125M3I2M2D63M3D240M1I123M1I14M1I304M1I151M1D3M1D14M1D238M1D6M27S" +ref_pos=41149857 +deletions = find_deletions(cigar_string, ref_pos) + +# Print output +# print("Deletions:") +# for deletion in deletions: +# print(deletion) + +# print("Mismatches:") +# for mismatch in mismatches: +# print(mismatch) \ No newline at end of file diff --git a/setup.py b/setup.py index 7b354654..13477948 100644 --- a/setup.py +++ b/setup.py @@ -3,44 +3,40 @@ This file is used to install the package. """ -print("Running setup.py...") - import os import glob from setuptools import setup, find_packages, Extension +print("Running setup.py...") + # Set the project metadata -name = "contextsv" -version = "0.0.1" -author = "WGLab" -description = "ContextSV: A tool for integrative structural variant detection." +NAME = "contextsv" +VERSION = "0.0.1" +AUTHOR = "WGLab" +DESCRIPTION = "ContextSV: A tool for integrative structural variant detection." 
# Get the conda environment's include path conda_prefix = os.environ.get("CONDA_PREFIX") if conda_prefix is None: - raise Exception("CONDA_PREFIX is not set.") -conda_include_dir = os.path.join(conda_prefix, "include") + raise AssertionError("CONDA_PREFIX is not set.") -print("CONDA_PREFIX: {}".format(conda_prefix)) # DEBUG -print("include_dir: {}".format(conda_include_dir)) # DEBUG +conda_include_dir = os.path.join(conda_prefix, "include") # Get the conda environment's lib path conda_lib_dir = os.path.join(conda_prefix, "lib") -print("lib_dir: {}".format(conda_lib_dir)) # DEBUG - # Set the project dependencies -src_dir = "src" -src_files = glob.glob(os.path.join(src_dir, "*.cpp")) -include_dir = "include" -include_files = glob.glob(os.path.join(include_dir, "*.h")) +SRC_DIR = "src" +SRC_FILES = glob.glob(os.path.join(SRC_DIR, "*.cpp")) +INCLUDE_DIR = "include" +INCLUDE_FILES = glob.glob(os.path.join(INCLUDE_DIR, "*.h")) # Set up the extension ext = Extension( - name="_" + name, - sources=src_files, - include_dirs=[include_dir, conda_include_dir], + name="_" + NAME, + sources=SRC_FILES, + include_dirs=[INCLUDE_DIR, conda_include_dir], extra_compile_args=["-std=c++11"], language="c++", libraries=["hts"], @@ -49,12 +45,12 @@ # Set up the module setup( - name=name, - version=version, - author=author, - description=description, + name=NAME, + version=VERSION, + author=AUTHOR, + description=DESCRIPTION, ext_modules=[ext], - py_modules=[name], + py_modules=[NAME], packages=find_packages(), test_suite="tests", entry_points={ diff --git a/src/cli.cpp b/src/cli.cpp deleted file mode 100644 index 86a21399..00000000 --- a/src/cli.cpp +++ /dev/null @@ -1,40 +0,0 @@ -// -// Created by jperdomo on 1/8/2023. -// - -#include "cli.h" -#include "integrative_caller.h" - -#include -#include -#include -#include -#include -#include - - -// Run the CLI with the given parameters -int run(std::string bam, std::string snps, std::string outdir, std::string region) -{ - // Create the common parameters - Common common; - common.set_bam_filepath(bam); - common.set_snp_vcf_filepath(snps); - common.set_output_dir(outdir); - common.set_region(region); - - // Run the integrative caller - IntegrativeCaller caller_obj(common); - try - { - caller_obj.run(); - } - - catch (std::exception& e) - { - std::cerr << e.what() << std::endl; - return -1; - } - - return 0; -} diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 7c02a2b9..f6c69967 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -1,62 +1,76 @@ -// #include "khmm.h" #include "cnv_caller.h" +#include "utils.h" +#include + +/// @cond #include #include #include #include #include #include /* log2 */ -#include #include #include #include #include // Progress bar -#include "common.h" #define BUFFER_SIZE 1024 +// #define MIN_PFB 0.00001 // Minimum PFB value +// #define MAX_PFB 0.99999 // Maximum PFB value +#define MIN_PFB 0.01 +#define MAX_PFB 0.99 +/// @endcond -CNVCaller::CNVCaller(Common common) + +CNVCaller::CNVCaller(InputData& input_data) { - this->common = common; + this->input_data = &input_data; } -std::vector CNVCaller::run() +CNVData CNVCaller::run() { // Read SNP positions and BAF values from the VCF file std::cout << "Reading SNP positions and BAF values from the VCF file..." << std::endl; - std::pair, std::vector> bafs_by_pos = readSNPBAFs(); - std::cout << "Complete." 
<< std::endl; + std::string snp_filepath = this->input_data->getSNPFilepath(); + SNPData snp_data = readSNPBAFs(snp_filepath); + int snp_count = (int) snp_data.locations.size(); + std::cout << "Complete. Number of SNPs: " << snp_count << std::endl; - // Extract the SNP positions and BAF values - std::vector snp_positions = bafs_by_pos.first; - std::vector baf = bafs_by_pos.second; + // Get the population frequencies for each SNP + if (this->input_data->getPFBFilepath() == "") + { + std::cout << "No PFB file provided. Using the minimum PFB value of " << MIN_PFB << " for all SNPs." << std::endl; + snp_data.pfbs = std::vector(snp_count, MIN_PFB); + } else { + std::cout << "Using PFB file: " << this->input_data->getPFBFilepath() << std::endl; + std::cout << "Getting population frequencies for each SNP..." << std::endl; + snp_data.pfbs = getSNPPopulationFrequencies(snp_data.locations); + std::cout << "Population frequencies retrieved." << std::endl; + } // Calculate LRRs - std::cout << "Calculating LRRs at SNP positions..." << std::endl; - std::vector lrr = calculateLogRRatiosAtSNPS(snp_positions); - std::cout << "LRRs calculated." << std::endl; + std::cout << "Calculating log2 ratio of coverage at SNP positions..." << std::endl; + snp_data.log2_ratios = calculateLog2RatioAtSNPS(snp_data.locations); + std::cout << "Log2 ratios calculated." << std::endl; // Read the HMM from file //std::string hmm_filepath = "data/wgs.hmm"; - std::string hmm_filepath = "data/hh550.hmm"; + std::string hmm_filepath = this->input_data->getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; CHMM hmm = ReadCHMM(hmm_filepath.c_str()); std::cout << "HMM read from file." << std::endl; - // Set up the input variables - int num_probes = lrr.size(); - double *lrr_ptr = lrr.data(); - double *baf_ptr = baf.data(); - - // Create a double array for pop. frequency and snp distance (not used), and log probabilities + // Set up the data for the Viterbi algorithm + double *lrr_ptr = snp_data.log2_ratios.data(); + double *baf_ptr = snp_data.bafs.data(); + double *pfb_ptr = snp_data.pfbs.data(); int *snpdist = NULL; - double *pfb = NULL; double *logprob = NULL; // Run the Viterbi algorithm - // Each of the 6 states corresponds to a CNV call: + // Each of the 6 state predictions corresponds to a copy number state: // 1: 0/0 (Two copy loss) // 2: 1/0 (One copy loss) // 3: 1/1 (Normal) @@ -65,47 +79,75 @@ std::vector CNVCaller::run() // 6: 2/2 (Two copy gain) std::cout << "Running the Viterbi algorithm..." << std::endl; std::vector state_sequence; // Create the output state sequence - state_sequence = testVit_CHMM(hmm, num_probes, lrr_ptr, baf_ptr, pfb, snpdist, logprob); - std::cout << "Viterbi algorithm complete." << std::endl; + state_sequence = testVit_CHMM(hmm, snp_count, lrr_ptr, baf_ptr, pfb_ptr, snpdist, logprob); + snp_data.state_sequence = state_sequence; + std::cout << "Complete." << std::endl; + + // Save a TSV of the positions, LRRs, BAFs, and state sequence + std::cout << "Saving TSV of copy number prediction data..." << std::endl; + std::string output_tsv = this->input_data->getOutputDir() + "/cnv_data.tsv"; + saveToTSV(snp_data, output_tsv); + std::cout << "Saved to: " << output_tsv << std::endl; + + // Save a BED file of the CNV state sequence + std::cout << "Saving BED of CNV states..." 
<< std::endl; + std::string output_bed = this->input_data->getOutputDir() + "/cnv_states.bed"; + saveToBED(snp_data, output_bed); + std::cout << "Saved to: " << output_bed << std::endl; + + // Get the chromosome as a C string + std::string chr = this->input_data->getRegionChr(); + char *chr_cstr = new char[chr.length() + 1]; + strcpy(chr_cstr, chr.c_str()); + + // Create a map of the state sequence by position + CNVData cnv_data; + for (int i = 0; i < snp_count; i++) + { + cnv_data.addCNVCall(chr_cstr, snp_data.locations[i], snp_data.state_sequence[i]); + } - // Save a CSV of the positions, LRRs, BAFs, and state sequence - std::cout << "Saving CSV of positions, LRRs, and BAFs..." << std::endl; - std::string output_filepath = this->common.get_output_dir() + "/snp_lrr_baf.csv"; - saveSNPLRRBAFCSV(output_filepath, snp_positions, baf, lrr, state_sequence); - std::cout << "CSV saved to: " << output_filepath << std::endl; + // Free the C string + delete[] chr_cstr; - return std::vector(); + return cnv_data; } -std::vector CNVCaller::calculateLogRRatiosAtSNPS(std::vector snp_positions) +std::vector CNVCaller::calculateLog2RatioAtSNPS(std::vector snp_locations) { - std::string target_chr = this->common.get_region_chr(); - - // Calculate mean chromosome coverage - std::string input_filepath = this->common.get_bam_filepath(); - std::cout << "\nCalculating coverage for chromosome: " << target_chr.c_str() << std::endl; - //double mean_chr_cov = calculateMeanChromosomeCoverage(); // Commented out for testing - // double mean_chr_cov = 39.4096; // Chr6 mean coverage from test data - double mean_chr_cov = 39.561; // Chr3 mean coverage from test data + std::string input_filepath = this->input_data->getBAMFilepath(); + std::string chr = this->input_data->getRegionChr(); - std::cout << "Mean coverage: " << mean_chr_cov << std::endl; + // Check if the chromosome coverage was passed in + double mean_chr_cov = -1; + if (this->input_data->getChrCov(chr, mean_chr_cov) == -1) + { + // Calculate the mean chromosome coverage + std::cout << "\nCalculating coverage for chromosome: " << chr << std::endl; + mean_chr_cov = calculateMeanChromosomeCoverage(); + std::cout << "Mean coverage for chromosome " << chr << ": " << mean_chr_cov << std::endl; + } else { + std::cout << "Using user-provided mean coverage for chromosome " << chr << ": " << mean_chr_cov << std::endl; + } // Set the region start and end from the first and last SNPs - int region_start = snp_positions.front(); - int region_end = snp_positions.back(); - if (this->common.get_region_set()) { - region_start = std::max(region_start, this->common.get_region_start()); - region_end = std::min(region_end, this->common.get_region_end()); + std::cout << "Getting region start and end..." << std::endl; + int region_start = snp_locations.front(); + int region_end = snp_locations.back(); + std::cout << "Fixing region start and end..." 
<< std::endl; + if (this->input_data->getRegionSet()) { + region_start = std::max(region_start, this->input_data->getRegionStart()); + region_end = std::min(region_end, this->input_data->getRegionEnd()); } - std::cout << "Beginning analysis of region: " << target_chr << ":" << region_start << "-" << region_end << std::endl; + std::cout << "Predicting CNV states for SNPs in region: " << chr << ":" << region_start << "-" << region_end << std::endl; // Loop through each SNP and calculate the LRR std::vector snp_lrr; - int window_size = this->common.get_window_size(); - int snp_count = (int) snp_positions.size(); + int window_size = this->input_data->getWindowSize(); + int snp_count = (int) snp_locations.size(); for (int i = 0; i < snp_count; i++) { - int pos = snp_positions[i]; + int pos = snp_locations[i]; // Skip SNPs outside of the region if (pos < region_start || pos > region_end) { @@ -120,9 +162,13 @@ std::vector CNVCaller::calculateLogRRatiosAtSNPS(std::vector snp_po // Set the LRR value snp_lrr.push_back(lrr); - // Update the progress bar - this->common.print_progress(i, snp_positions.size()-1); + // Update the progress bar every 1000 SNPs + if (i % 1000 == 0) { + printProgress(i, snp_count-1); + } + //printProgress(i, snp_count-1); } + std::cout << std::endl; // End the progress bar return snp_lrr; } @@ -130,36 +176,27 @@ std::vector CNVCaller::calculateLogRRatiosAtSNPS(std::vector snp_po /// Calculate the mean chromosome coverage double CNVCaller::calculateMeanChromosomeCoverage() { - std::string chr = this->common.get_region_chr(); - std::string input_filepath = this->common.get_bam_filepath(); + std::string chr = this->input_data->getRegionChr(); + std::string input_filepath = this->input_data->getBAMFilepath(); char cmd[BUFFER_SIZE]; FILE *fp; char line[BUFFER_SIZE]; - RegionCoverage cov; // Open a SAMtools process to calculate cumulative read depth and position // counts (non-zero depths only) for a single chromosome // Run the entire chromosome - snprintf(cmd, BUFFER_SIZE,\ - "samtools depth -r %s %s | awk '{c++;s+=$3}END{print c, s}'",\ - chr.c_str(), input_filepath.c_str()); // Remove '-a' for debugging + snprintf(cmd, BUFFER_SIZE,\ + "samtools depth -r %s %s | awk '{c++;s+=$3}END{print c, s}'",\ + chr.c_str(), input_filepath.c_str()); // Remove '-a' for debugging + + std::cout << "Running command: " << cmd << std::endl; // Parse the output fp = popen(cmd, "r"); if (fp == NULL) { - fprintf(stderr, "Failed to run command\n"); - exit(1); - } - - fprintf(stdout, "%s\n", cmd); // Print the command - fflush(stdout); - - fp = popen(cmd, "r"); - if (fp == NULL) - { - fprintf(stderr, "Failed to run command\n"); + std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl; exit(EXIT_FAILURE); } @@ -172,6 +209,9 @@ double CNVCaller::calculateMeanChromosomeCoverage() { // Calculate the mean chromosome coverage mean_chr_cov = (double) cum_depth / (double) pos_count; + } else { + std::cerr << "ERROR: Could not parse output from command: " << cmd << std::endl; + exit(EXIT_FAILURE); } } pclose(fp); // Close the process @@ -181,8 +221,8 @@ double CNVCaller::calculateMeanChromosomeCoverage() double CNVCaller::calculateWindowLogRRatio(double mean_chr_cov, int start_pos, int end_pos) { - std::string chr = this->common.get_region_chr(); - std::string input_filepath = this->common.get_bam_filepath(); + std::string chr = this->input_data->getRegionChr(); + std::string input_filepath = this->input_data->getBAMFilepath(); char cmd[BUFFER_SIZE]; FILE *fp; @@ -192,13 +232,11 @@ double 
CNVCaller::calculateWindowLogRRatio(double mean_chr_cov, int start_pos, int end_pos)
     // counts (non-zero depths only) for a single region
 
     // Run the entire chromosome
-    snprintf(cmd, BUFFER_SIZE,\
-        "samtools depth -r %s:%d-%d %s | awk '{c++;s+=$3}END{print c, s}'",\
+    snprintf(cmd, BUFFER_SIZE,\
+        "samtools depth -r %s:%d-%d %s | awk '{c++;s+=$3}END{print c, s}'",\
         chr.c_str(), start_pos, end_pos, input_filepath.c_str());
-    //fprintf(stdout, "%s\n", cmd);  // Print the command
-    //fflush(stdout);
-
-    fp = popen(cmd, "r");
+
+    fp = popen(cmd, "r");
     if (fp == NULL)
     {
         fprintf(stderr, "Failed to run command\n");
@@ -222,18 +260,15 @@ double CNVCaller::calculateWindowLogRRatio(double mean_chr_cov, int start_pos, int end_pos)
     return region_lrr;
 }
 
-std::pair<std::vector<int>, std::vector<double>> CNVCaller::readSNPBAFs()
+SNPData CNVCaller::readSNPBAFs(std::string snp_filepath)
 {
-    // Get the VCF filepath with SNPs
-    std::string snp_filepath = this->common.get_snp_vcf_filepath();
-
     // Create a VCF filepath of filtered SNPs
-    std::string filtered_snp_vcf_filepath = this->common.get_output_dir() + "/filtered_snps.vcf";
+    std::string filtered_snp_vcf_filepath = this->input_data->getOutputDir() + "/filtered_snps.vcf";
 
     std::cout << "Parsing SNPs from " << snp_filepath << std::endl;
 
     // Filter variants by depth and quality and SNPs only
-    std::string region = this->common.get_region();
+    std::string region = this->input_data->getRegion();
     std::string cmd;
     if (region == "")
     {
@@ -246,10 +281,9 @@ std::pair<std::vector<int>, std::vector<double>> CNVCaller::readSNPBAFs()
     std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl;
 
-    // Extract all BAFs from the filtered SNPs and store in the pair vector
-    std::cout << "Extracting BAFs from filtered SNPs" << std::endl;
-    cmd = "bcftools query -f '%CHROM,%POS,[%VAF]\n' " + filtered_snp_vcf_filepath;
-    std::cout << "Command: " << cmd << std::endl;
+    // Extract B-allele frequency data from the VCF file
+    std::cout << "Extracting allelic depth data from filtered SNPs..." << std::endl;
+    cmd = "bcftools query -f '%CHROM,%POS,[%AD]\n' " + filtered_snp_vcf_filepath;
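The hunk above switches from querying a precomputed VAF tag to deriving BAF from the AD (allelic depth) field. A minimal Python sketch of the same computation on one line of the bcftools output (editorial illustration; not part of the patch):

    # One output line of: bcftools query -f '%CHROM,%POS,[%AD]\n' ...
    # has the comma-separated form "chrom,pos,ref_depth,alt_depth".
    def baf_from_query_line(line):
        chrom, pos, ref_ad, alt_ad = line.strip().split(",")
        # B-allele frequency = alternate-allele depth / total depth at the SNP
        return chrom, int(pos), int(alt_ad) / (int(ref_ad) + int(alt_ad))

    print(baf_from_query_line("chr3,12345,10,8\n"))  # ('chr3', 12345, 0.444...)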
     FILE *fp = popen(cmd.c_str(), "r");
     if (fp == NULL)
     {
@@ -257,12 +291,10 @@ std::pair<std::vector<int>, std::vector<double>> CNVCaller::readSNPBAFs()
         exit(1);
     }
 
-    // Read the BAFs
+    // Read the AD (allelic depth) values for the reference and alternate alleles
     char line[BUFFER_SIZE];
-
-    // Loop through the lines
-    std::vector<int> snp_positions;
-    std::vector<double> snp_bafs;
+    std::vector<int> locations;
+    std::vector<double> bafs;
     while (fgets(line, BUFFER_SIZE, fp) != NULL)
     {
         // Parse the line
@@ -270,7 +302,9 @@ std::pair<std::vector<int>, std::vector<double>> CNVCaller::readSNPBAFs()
         int col = 0;  // Column index
         std::string chr = "";
         uint64_t pos = 0;
-        double baf = 0.0;
+        int ref_ad = 0;
+        int alt_ad = 0;
         while (tok != NULL)
         {
             // Get the chromosome from column 1
@@ -285,45 +319,291 @@ std::pair<std::vector<int>, std::vector<double>> CNVCaller::readSNPBAFs()
                 pos = atoi(tok);
             }
 
-            // Get the BAF from column 3
+            // Get the AD for the reference allele from column 3
             else if (col == 2)
             {
-                baf = atof(tok);
+                ref_ad = atoi(tok);
             }
 
+            // Get the AD for the non-reference allele from column 4
+            else if (col == 3)
+            {
+                alt_ad = atoi(tok);
+            }
+
+            // Move to the next token
             tok = strtok(NULL, ",");
             col++;
         }
 
-        // Store the position and BAF
-        snp_positions.push_back(pos);
-        snp_bafs.push_back(baf);
+        // Calculate the BAF
+        double baf = (double) alt_ad / (double) (ref_ad + alt_ad);
+
+        // Store the position and BAF
+        locations.push_back(pos);
+        bafs.push_back(baf);
     }
 
     // Close the pipe
     pclose(fp);
 
-    // Create the pair vector
-    std::pair<std::vector<int>, std::vector<double>> snp_data = std::make_pair(snp_positions, snp_bafs);
+    // Exit if no SNPs were found
+    if (locations.size() == 0)
+    {
+        std::cerr << "ERROR: No SNPs found in the VCF file: " << snp_filepath << std::endl;
+        exit(1);
+    }
+
+    // Create the SNP data struct
+    SNPData snp_data;
+    snp_data.locations = locations;
+    snp_data.bafs = bafs;
 
     return snp_data;
 }
 
-void CNVCaller::saveSNPLRRBAFCSV(std::string filepath, std::vector<int> snp_positions, std::vector<double> bafs, std::vector<double> logr_ratios, std::vector<int> state_sequence)
+std::vector<double> CNVCaller::getSNPPopulationFrequencies(std::vector<int> snp_locations)
 {
-    // Open the CSV file for writing
-    std::ofstream csv_file(filepath);
+    // Get the PFB filepath
+    std::string pfb_filepath = this->input_data->getPFBFilepath();
+
+    // Open the PFB file
+    FILE *fp = fopen(pfb_filepath.c_str(), "r");
+    if (fp == NULL)
+    {
+        std::cerr << "ERROR: Could not open PFB file: " << pfb_filepath << std::endl;
+        exit(1);
+    }
+
+    // Read the AF values (AF for population frequency) into a map
+    std::map<std::string, std::map<uint64_t, double>> pfb_map;  // Map of population frequencies by SNP position (chr -> pos -> pfb)
+    char line[BUFFER_SIZE];
+    int pfb_count = 0;
+    int pfb_min_hit = 0;
+    int pfb_max_hit = 0;
+    while (fgets(line, BUFFER_SIZE, fp) != NULL)
+    {
+        // Parse the tab-delimited line
+        char *tok = strtok(line, "\t");  // Tokenize the line
+        int col = 0;  // Column index
+        std::string chr = "";
+        uint64_t pos = 0;
+        double pfb = 0;
+        while (tok != NULL)
+        {
+            // Get the chromosome from column 1
+            if (col == 0)
+            {
+                chr = tok;
+            }
+
+            // Get the position from column 2
+            else if (col == 1)
+            {
+                pos = atoi(tok);
+            }
+
+            // Get the PFB from column 3
+            else if (col == 2)
+            {
+                pfb = atof(tok);
+            }
+
+            tok = strtok(NULL, "\t");
+            col++;
+        }
+
+        // Store the PFB value in the map, and fix within the range
+        //std::cout <<
"PFB: " << std::fixed << std::setprecision(6) << pfb << std::endl; + //std::cout << "PFB at " << chr << ":" << pos << " -> " << std::fixed << std::setprecision(6) << pfb << std::endl; + if (pfb < MIN_PFB) + { + pfb_map[chr][pos] = MIN_PFB; + pfb_min_hit++; + } else if (pfb > MAX_PFB) { + pfb_map[chr][pos] = MAX_PFB; + pfb_max_hit++; + } else { + pfb_map[chr][pos] = pfb; + + // Print values for debugging + //std::cout << "PFB at " << chr << ":" << pos << " -> " << std::fixed << std::setprecision(6) << pfb << std::endl; + } + pfb_count++; + } + + // Close the file + fclose(fp); + + // Log the percentage of PFB values that were fixed + std::cout << "PFB values fixed: " << (double) (pfb_min_hit + pfb_max_hit) / (double) pfb_count * 100 << "%" << std::endl; + std::cout << "PFB min hit: " << (double) pfb_min_hit / (double) pfb_count * 100 << "%" << std::endl; + std::cout << "PFB max hit: " << (double) pfb_max_hit / (double) pfb_count * 100 << "%" << std::endl; + + // Determine whether the chromosome is in chr notation (e.g. chr1) or not (e.g. 1) + bool chr_notation = false; + std::string first_chr = pfb_map.begin()->first; + if (first_chr.find("chr") != std::string::npos) + { + chr_notation = true; + } + + // Create a vector of population frequencies for each SNP + std::cout << "Getting SNPs population frequencies..." << std::endl; + std::vector snp_pfb; + + // Get the chromosome + std::string chr = this->input_data->getRegionChr(); + + // Make sure the chromosome follows the same format as the PFB file + if (chr_notation) + { + // Add "chr" to the chromosome if it is not already there + if (chr.find("chr") == std::string::npos) + { + chr = "chr" + chr; + } + } else { + // Remove "chr" from the chromosome if it is there + if (chr.find("chr") != std::string::npos) + { + chr = chr.substr(3); + } + } + + // Loop through each SNP position and get the population frequency + int snp_count = (int) snp_locations.size(); + int found_count = 0; + for (int i = 0; i < snp_count; i++) + { + // Get the SNP position + int pos = snp_locations[i]; + + // Get the population frequency from the map + auto it = pfb_map[chr].find(pos); + if (it != pfb_map[chr].end()) + { + // Store the population frequency + snp_pfb.push_back(it->second); + found_count++; + } + else + { + // Use 0.5 as the population frequency if it is not found + snp_pfb.push_back(0.5); + } + } + + // Print the percentage of SNPs with population frequencies + std::cout << "Found population frequencies for " << found_count << " of " << snp_count << " SNPs (" << (double) found_count / (double) snp_count * 100 << "%)" << std::endl; + + return snp_pfb; +} + +void CNVCaller::saveToTSV(SNPData& snp_data, std::string filepath) +{ + // Open the TSV file for writing + std::ofstream tsv_file(filepath); // Write the header - csv_file << "position,baf,log2_ratio,cnv_state" << std::endl; + tsv_file << "chromosome\tposition\tb_allele_freq\tlog2_ratio\tcnv_state\tpopulation_freq" << std::endl; + + // Write the data + std::string chr = this->input_data->getRegionChr(); + int snp_count = (int) snp_data.locations.size(); + for (int i = 0; i < snp_count; i++) + { + // Get the SNP data + int64_t pos = snp_data.locations[i]; + double pfb = snp_data.pfbs[i]; + double baf = snp_data.bafs[i]; + double log2_ratio = snp_data.log2_ratios[i]; + int cn_state = snp_data.state_sequence[i]; + + // Write the TSV line (chrom, pos, baf, lrr, state) + tsv_file << \ + chr << "\t" << \ + pos << "\t" << \ + baf << "\t" << \ + log2_ratio << "\t" << \ + cn_state << "\t" << \ + pfb << \ + 
std::endl; + } + + // Close the file + tsv_file.close(); +} + +void CNVCaller::saveToBED(SNPData& snp_data, std::string filepath) +{ + // Each of the 6 copy number states corresponds to the following: + // 1: 0/0 (Two copy loss) + // 2: 1/0 (One copy loss) + // 3: 1/1 (Normal) + // 4: 1/1 (Copy neutral LOH) + // 5: 2/1 (One copy gain) + // 6: 2/2 (Two copy gain) + + // Create a map of descriptions and RGB colors for each CNV state + // Descriptions: + std::map description_map = { + {1, "DEL"}, // DEL + {2, "DEL"}, // DEL + {3, "NEUT"}, // NEUTRAL + {4, "NEUT"}, // NEUTRAL + {5, "DUP"}, // DUP + {6, "DUP"} // DUP + }; + + // RGB colors: + std::map color_map = { + {1, "255,0,0"}, // DEL + {2, "255,0,0"}, // DEL + {3, "0,0,0"}, // NEUTRAL + {4, "0,0,0"}, // NEUTRAL + {5, "0,0,255"}, // DUP + {6, "0,0,255"} // DUP + }; + + // Open the BED file for writing + std::ofstream bed_file(filepath); + + // Write the track line (track type, name, description) + bed_file << "track name=\"CNV States\" description=\"CNV state predictions from ContextSV\" itemRgb=\"On\" visibility=\"2\" useScore=\"0\"" << std::endl; // Write the data - int snp_count = (int) snp_positions.size(); + std::string chr = this->input_data->getRegionChr(); + int snp_count = (int) snp_data.locations.size(); for (int i = 0; i < snp_count; i++) { - csv_file << snp_positions[i] << "," << bafs[i] << "," << logr_ratios[i] << "," << state_sequence[i] << std::endl; + // Get the SNP position and state + int64_t pos = snp_data.locations[i]; + int state = snp_data.state_sequence[i]; + + // Get the state description + std::string state_str = description_map[state]; + + // Get the RGB color + std::string rgb_color = color_map[state]; + + // Write the BED line (chrom, start, end, name (state), score, strand, thickStart, thickEnd, itemRgb) + bed_file << \ + chr << "\t" << \ + pos << "\t" << \ + pos << "\t" << \ + state_str << "\t" << \ + "0" << "\t" << \ + "." 
<< "\t" << \ + pos << "\t" << \ + pos << "\t" << \ + rgb_color << \ + std::endl; } // Close the file - csv_file.close(); + bed_file.close(); } diff --git a/src/cnv_data.cpp b/src/cnv_data.cpp new file mode 100644 index 00000000..9e869faa --- /dev/null +++ b/src/cnv_data.cpp @@ -0,0 +1,114 @@ +#include "cnv_data.h" + +/// @cond +#include +#include +#include +#include +/// @endcond + +void CNVData::addCNVCall(std::string chr, int snp_pos, int cnv_type) +{ + // Add the CNV call to the map + SNPLocation key(chr, snp_pos); + this->cnv_calls[key] = cnv_type; +} + +int CNVData::getMostCommonCNV(std::string chr, int start, int end) +{ + // Get the majority CNV type within the SV region start and end positions + // (0 = deletion, 1 = duplication, -1 = no CNV call) + int dup_count = 0; + int del_count = 0; + int no_call_count = 0; + int total_count = 0; + //int sv_len = end - start + 1; + + //std::cout << "Checking for CNV calls in " << chr << ":" << start << "-" << end << " (SVLEN=" << sv_len << ")" << std::endl; + + for (int pos = start; pos <= end; pos++) { + SNPLocation key(chr, pos); + //std::cout << "Checking for CNV call at " << chr << ":" << pos << + //std::endl; + + if (this->cnv_calls.find(key) != this->cnv_calls.end()) { + //std::cout << "State = " << this->cnv_calls[key] << std::endl; + if (this->cnv_calls[key] == 5 || this->cnv_calls[key] == 6) { + dup_count++; + } else if (this->cnv_calls[key] == 1 || this->cnv_calls[key] == 2) { + del_count++; + } else { + no_call_count++; + } + total_count++; + } + } + + // Check if the SV region is mostly covered by CNV calls (at least 50%) and + // if the majority CNV type is an insertion or deletion + int cnv_type = -1; + if (total_count > 0) { + if (dup_count > del_count && (double) dup_count / total_count > 0.5) { + cnv_type = 1; + //std::cout << "CNV type is DUP, SVLEN=" << sv_len << std::endl; + } else if (del_count > dup_count && (double) del_count / total_count > 0.5) { + cnv_type = 0; + //std::cout << "CNV type is DEL, SVLEN=" << sv_len << std::endl; + } else { + //std::cout << "CNV type is no call, SVLEN=" << sv_len << std::endl; + } + } + + return cnv_type; +} + +void CNVData::loadFromFile(std::string filepath) +{ + // Load CNV calls from file + std::ifstream cnv_file(filepath); + std::string line; + std::string chr; + int snp_pos; + int cnv_type; + + // Check if the file was opened successfully + if (!cnv_file.is_open()) { + std::cerr << "Error: Could not open CNV file " << filepath << std::endl; + exit(1); + } + + // Skip the first line (header) + std::getline(cnv_file, line); + + // Read the file line by line + int line_num = 1; + while (std::getline(cnv_file, line)) { + + // Parse the line + std::istringstream iss(line); + + // Get columns 1, 2, and 5 (chr, pos, cnv_type) + std::string chr; + std::getline(iss, chr, '\t'); + + std::string pos_str; + std::getline(iss, pos_str, '\t'); + snp_pos = std::stoi(pos_str); + + std::string skip_str; + std::getline(iss, skip_str, '\t'); + std::getline(iss, skip_str, '\t'); + + std::string cnv_type_str; + std::getline(iss, cnv_type_str, '\t'); + cnv_type = std::stoi(cnv_type_str); + + // Add the CNV call to the map + this->addCNVCall(chr, snp_pos, cnv_type); + + line_num++; + } + cnv_file.close(); + + std::cout << "Loaded " << line_num << " CNV calls" << std::endl; +} diff --git a/src/common.cpp b/src/common.cpp deleted file mode 100644 index e5cc3e04..00000000 --- a/src/common.cpp +++ /dev/null @@ -1,169 +0,0 @@ -#include "common.h" - -#include -#include -#include - -#define BUFFER_SIZE 1024 - 
-std::string Common::get_bam_filepath() -{ - return this->bam_filepath; -} - -void Common::set_bam_filepath(std::string filepath) -{ - this->bam_filepath = filepath; -} - -std::string Common::get_ref_filepath() -{ - return this->ref_filepath; -} - -void Common::set_ref_filepath(std::string filepath) -{ - this->ref_filepath = filepath; -} - -std::string Common::get_output_dir() -{ - return this->output_dir; -} - -void Common::set_output_dir(std::string dirpath) -{ - this->output_dir = dirpath; - - // Create the output directory - std::string cmd = "mkdir -p " + output_dir; - system(cmd.c_str()); -} - -std::string Common::get_region() -{ - return this->region; -} - -void Common::set_region(std::string region) -{ - this->region = region; - - // Parse the region - char *tok = strtok((char *)region.c_str(), ":"); - int col = 0; - while (tok != NULL) - { - // Get the chromosome - if (col == 0) - { - this->region_chr = tok; - } - - // Get the start and end positions - else if (col == 1) - { - // Check if empty - if (strcmp(tok, "") == 0) - { - break; - } - - // Split the start and end positions - char *start_tok = strtok(tok, "-"); - char *end_tok = strtok(NULL, "-"); - - // Get the start position - if (start_tok != NULL) - { - this->region_start = atoi(start_tok); - } - - // Get the end position - if (end_tok != NULL) - { - this->region_end = atoi(end_tok); - } - } - tok = strtok(NULL, ":"); - col++; - } - - // Print the region - if (this->region_start == 0 && this->region_end == 0) - { - this->region_set = false; - std::cout << "Parsed region = " << this->region_chr << std::endl; - } else { - this->region_set = true; - std::cout << "Parsed region = " << this->region_chr << ":" << this->region_start << "-" << this->region_end << std::endl; - } -} - -int Common::get_window_size() -{ - return this->window_size; -} - -void Common::set_window_size(int window_size) -{ - this->window_size = window_size; -} - -std::string Common::get_snp_vcf_filepath() -{ - return this->snp_vcf_filepath; -} - -void Common::set_snp_vcf_filepath(std::string filepath) -{ - this->snp_vcf_filepath = filepath; -} - -std::string Common::get_region_chr() -{ - return this->region_chr; -} - -int Common::get_region_start() -{ - return this->region_start; -} - -int Common::get_region_end() -{ - return this->region_end; -} - -bool Common::get_region_set() -{ - return this->region_set; -} - -void Common::print_progress(int progress, int total) -{ - // Get the percentage - float percent = (float)progress / (float)total * 100.0; - - // Get the number of hashes - int num_hashes = (int)(percent / 2.0); - - // Print the progress bar - printf("\r["); - for (int i = 0; i < num_hashes; i++) - { - printf("#"); - } - for (int i = 0; i < 50 - num_hashes; i++) - { - printf(" "); - } - printf("] %3.2f%%", percent); - fflush(stdout); - - // Print a new line if finished - if (progress == total) - { - printf("\n"); - } -} diff --git a/src/contextsv.cpp b/src/contextsv.cpp new file mode 100644 index 00000000..b3bceddb --- /dev/null +++ b/src/contextsv.cpp @@ -0,0 +1,87 @@ +#include "contextsv.h" +#include "cnv_caller.h" +#include "sv_caller.h" + +#include + +/// @cond +#include +#include +#include +/// @endcond + +ContextSV::ContextSV(InputData& input_data) +{ + this->input_data = &input_data; +} + +// Entry point +int ContextSV::run() +{ + // Check if a file with CNV data was provided + CNVData cnv_calls; + if (this->input_data->getCNVFilepath() != "") { + // Load CNV data + std::cout << "Loading CNV data..." 
<< std::endl; + cnv_calls.loadFromFile(this->input_data->getCNVFilepath()); + } else { + // Call CNVs using the SNP positions + std::cout << "Calling CNVs..." << std::endl; + CNVCaller cnv_caller(*this->input_data); + cnv_calls = cnv_caller.run(); + } + + // Call SVs from long read alignments and CNV calls + std::cout << "Calling SVs..." << std::endl; + SVCaller sv_caller(*this->input_data); + SVData sv_calls = sv_caller.run(); + + // Check if the ref genome was set + if (sv_calls.getRefGenome() == "") { + std::cout << "Warning: Reference genome not set in SVData" << std::endl; + } + + // Classify SVs based on CNV calls + std::cout << "Labeling CNVs..." << std::endl; + this->labelCNVs(cnv_calls, sv_calls); + + // Write SV calls to file + std::cout << "Writing SV calls to file..." << std::endl; + // Get the reference genome + FASTAQuery ref_genome = this->input_data->getRefGenome(); + std::string output_dir = this->input_data->getOutputDir(); + sv_calls.saveToVCF(ref_genome, output_dir); + + std::cout << "Done!" << std::endl; + + return 0; +} + +// Label SVs based on CNV calls +void ContextSV::labelCNVs(CNVData cnv_calls, SVData& sv_calls) +{ + // Iterate over SV calls + for (auto const& sv_call : sv_calls) { + + SVCandidate candidate = sv_call.first; + + // Get the SV coordinates + std::string chr = std::get<0>(candidate); + int start_pos = std::get<1>(candidate); + int end_pos = std::get<2>(candidate); + + // Get CNV calls within the SV coordinate range and identify the most + // common call + int cnv_call = cnv_calls.getMostCommonCNV(chr, start_pos, end_pos); + + // Update the SV call's type if the CNV call is not unknown + if (cnv_call != -1) { + sv_calls.updateSVType(candidate, cnv_call); + } + + // // Print the updated SV call if the type was changed, and if it is not unknown + // if (cnv_call != -1 && cnv_call != sv_type) { + // std::cout << "Updated SV call from " << sv_type << " to " << cnv_call << std::endl; + // } + } +} diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp new file mode 100644 index 00000000..1ca2d732 --- /dev/null +++ b/src/fasta_query.cpp @@ -0,0 +1,168 @@ +#include "fasta_query.h" + +/// @cond +#include +#include +#include +#include +#include +#include +#include +#include +/// @endcond + + +int FASTAQuery::setFilepath(std::string fasta_filepath) +{ + if (fasta_filepath == "") + { + std::cout << "No FASTA filepath provided" << std::endl; + return 1; + } + + this->fasta_filepath = fasta_filepath; + + std::cout << "Reading FASTA file " << fasta_filepath << std::endl; + + // Parse the FASTA file + std::ifstream fasta_file(fasta_filepath); + if (!fasta_file.is_open()) + { + std::cout << "Could not open FASTA file " << fasta_filepath << std::endl; + exit(1); + } + + // Get the sequence + std::map chr_to_seq; + std::string current_chr = ""; + std::string sequence = ""; + std::string line_str = ""; + + while (std::getline(fasta_file, line_str)) + { + // Check if the line is a header + if (line_str[0] == '>') + { + // Header line, indicating a new chromosome + // Store the previous chromosome and sequence + if (current_chr != "") + { + chr_to_seq[current_chr] = sequence; + //std::cout << "Read chromosome " << current_chr << " with length " << sequence.length() << std::endl; + sequence = ""; // Reset the sequence + } + + // Get the new chromosome + current_chr = line_str.substr(1); + + // Remove the description + size_t space_pos = current_chr.find(" "); + if (space_pos != std::string::npos) + { + current_chr.erase(space_pos); + } + + // Check if the chromosome is 
already in the map
+            if (chr_to_seq.find(current_chr) != chr_to_seq.end())
+            {
+                std::cout << "Duplicate chromosome " << current_chr << std::endl;
+                exit(1);
+            }
+        } else {
+            // Sequence line
+            sequence += line_str;
+        }
+    }
+
+    // Add the last chromosome at the end of the file
+    if (current_chr != "")
+    {
+        chr_to_seq[current_chr] = sequence;
+    }
+
+    // Close the file
+    std::cout << "Closing FASTA file..." << std::endl;
+    fasta_file.close();
+
+    // Set the map
+    this->chr_to_seq = chr_to_seq;
+
+    std::cout << "Done." << std::endl;
+
+    return 0;
+}
+
+std::string FASTAQuery::getFilepath()
+{
+    return this->fasta_filepath;
+}
+
+// Function to get the reference sequence at a given position range
+std::string FASTAQuery::query(std::string chr, int64_t pos_start, int64_t pos_end)
+{
+    // Check if a FASTA file has been set
+    if (this->fasta_filepath == "")
+    {
+        std::cout << "No FASTA file set" << std::endl;
+        return "";
+    }
+
+    // Check if the chromosome is in the map
+    if (this->chr_to_seq.find(chr) == this->chr_to_seq.end())
+    {
+        std::cout << "Chromosome " << chr << " not found in FASTA file" << std::endl;
+        return "";
+    }
+
+    // Get the sequence
+    std::string sequence = this->chr_to_seq[chr];
+
+    // Get the substring
+    std::string subsequence = sequence.substr(pos_start, pos_end - pos_start);  // (start, length)
+
+    // If the subsequence is empty, return the VCF missing data character
+    if (subsequence == "")
+    {
+        return ".";
+    }
+
+    return subsequence;
+}
+
+// Function to get the chromosome contig lengths in VCF header format
+std::string FASTAQuery::getContigHeader()
+{
+    std::string contig_header = "";
+
+    // Sort the chromosomes
+    std::vector<std::string> chromosomes;
+    for (auto const& chr_seq : this->chr_to_seq)
+    {
+        chromosomes.push_back(chr_seq.first);
+    }
+    std::sort(chromosomes.begin(), chromosomes.end());
+
+    // Iterate over the chromosomes and add them to the contig header
+    for (auto const& chr : chromosomes)
+    {
+        // Add the contig header line
+        contig_header += "##contig=<ID=" + chr + ",length=" + std::to_string(this->chr_to_seq[chr].length()) + ">\n";
+    }
+
+    return contig_header;
+}
\ No newline at end of file
diff --git a/src/input_data.cpp b/src/input_data.cpp
new file mode 100644
index 00000000..65ec3b78
--- /dev/null
+++ b/src/input_data.cpp
@@ -0,0 +1,324 @@
+#include "input_data.h"
+
+/// @cond
+#include
+#include
+#include
+#include
+/// @endcond
+
+#define BUFFER_SIZE 1024
+
+// Constructor
+InputData::InputData()
+{
+    this->bam_filepath = "";
+    this->ref_filepath = "";
+    this->snp_vcf_filepath = "";
+    this->pfb_filepath = "";
+    this->output_dir = "";
+    this->region = "";
+    this->window_size = 10000;
+    this->region_chr = "";
+    this->region_start = 0;
+    this->region_end = 0;
+    this->region_set = false;
+    this->thread_count = 1;
+    this->hmm_filepath = "data/wgs.hmm";
+}
+
+std::string InputData::getBAMFilepath()
+{
+    return this->bam_filepath;
+}
+
+void InputData::setBAMFilepath(std::string filepath)
+{
+    this->bam_filepath = filepath;
+}
+
+void InputData::setRefGenome(std::string fasta_filepath)
+{
+    // Set the reference genome
+
this->fasta_query.setFilepath(fasta_filepath); +} + +FASTAQuery InputData::getRefGenome() +{ + return this->fasta_query; +} + +std::string InputData::getOutputDir() +{ + return this->output_dir; +} + +void InputData::setOutputDir(std::string dirpath) +{ + this->output_dir = dirpath; + + // Create the output directory + std::string cmd = "mkdir -p " + output_dir; + system(cmd.c_str()); +} + +std::string InputData::getRegion() +{ + return this->region; +} + +void InputData::setRegion(std::string region) +{ + this->region = region; + + // Parse the region + char *tok = strtok((char *)region.c_str(), ":"); + int col = 0; + while (tok != NULL) + { + // Get the chromosome + if (col == 0) + { + this->region_chr = tok; + } + + // Get the start and end positions + else if (col == 1) + { + // Check if empty + if (strcmp(tok, "") == 0) + { + break; + } + + // Split the start and end positions + char *start_tok = strtok(tok, "-"); + char *end_tok = strtok(NULL, "-"); + + // Get the start position + if (start_tok != NULL) + { + this->region_start = atoi(start_tok); + } + + // Get the end position + if (end_tok != NULL) + { + this->region_end = atoi(end_tok); + } + } + tok = strtok(NULL, ":"); + col++; + } + + // Check if a valid chromosome was parsed + if (this->region_chr == "") + { + std::cerr << "Error: Region chromosome not set" << std::endl; + exit(1); + } + + // Check if only a chromosome was parsed + if (this->region_start == 0 && this->region_end == 0) + { + // Use the entire chromosome as the region + this->region_set = false; + std::cout << "Parsed region = " << this->region_chr << std::endl; + } else { + // Check if a valid chromosome start and end position were parsed + if (this->region_start == 0 || this->region_end == 0 || this->region_start > this->region_end) + { + std::cerr << "Error: Region start and end positions not set" << std::endl; + exit(1); + } else { + // Set the region + this->region_set = true; + std::cout << "Parsed region = " << this->region_chr << ":" << this->region_start << "-" << this->region_end << std::endl; + } + } +} + +int InputData::getWindowSize() +{ + return this->window_size; +} + +void InputData::setWindowSize(int window_size) +{ + this->window_size = window_size; +} + +std::string InputData::getSNPFilepath() +{ + return this->snp_vcf_filepath; +} + +void InputData::setSNPFilepath(std::string filepath) +{ + this->snp_vcf_filepath = filepath; +} + +std::string InputData::getRegionChr() +{ + return this->region_chr; +} + +int InputData::getRegionStart() +{ + return this->region_start; +} + +int InputData::getRegionEnd() +{ + return this->region_end; +} + +bool InputData::getRegionSet() +{ + return this->region_set; +} + +void InputData::setChrCov(std::string chr_cov) +{ + // Update the chromosome coverage map if the string is not empty + if (chr_cov != "") + { + // Split the string by commas + std::istringstream ss(chr_cov); + std::string token; + std::vector chr_cov_pairs; + + while (std::getline(ss, token, ',')) + { + chr_cov_pairs.push_back(token); + } + + // Iterate over the pairs + for (auto const &pair : chr_cov_pairs) + { + // Split the pair by colon + std::istringstream ss(pair); + std::string token; + std::vector chr_cov; + + while (std::getline(ss, token, ':')) + { + chr_cov.push_back(token); + } + + // Check if the pair is valid + if (chr_cov.size() == 2) + { + // Get the chromosome and coverage + std::string chr = chr_cov[0]; + double cov = std::stod(chr_cov[1]); + + // Add the pair to the map + this->chr_cov[chr] = cov; + + // Print the pair + 
std::cout << "Set mean coverage for " << chr << " to " << cov << std::endl; + } + } + } +} + +int InputData::getChrCov(std::string chr, double &cov) +{ + // Check if the chromosome is in the map + if (this->chr_cov.find(chr) != this->chr_cov.end()) + { + // Get the coverage + cov = this->chr_cov[chr]; + + // Return 0 if the chromosome is in the map + return 0; + } + else + { + // Return -1 if the chromosome is not in the map + return -1; + } +} + +std::string InputData::getPFBFilepath() +{ + return this->pfb_filepath; +} + +void InputData::setPFBFilepath(std::string filepath) +{ + this->pfb_filepath = filepath; + + // Check if empty string + if (filepath == "") + { + return; + + } else { + // Check if the file exists + FILE *fp = fopen(filepath.c_str(), "r"); + if (fp == NULL) + { + std::cerr << "PFB file does not exist: " << filepath << std::endl; + exit(1); + } + } +} + +void InputData::setThreadCount(int thread_count) +{ + this->thread_count = thread_count; +} + +int InputData::getThreadCount() +{ + return this->thread_count; +} + +std::string InputData::getHMMFilepath() +{ + return this->hmm_filepath; +} + +void InputData::setHMMFilepath(std::string filepath) +{ + // Check if empty string + if (filepath == "") + { + std::cout << "Using default HMM file: " << this->hmm_filepath << std::endl; + return; + + } else { + // Check if the file exists + FILE *fp = fopen(filepath.c_str(), "r"); + if (fp == NULL) + { + std::cerr << "HMM file does not exist: " << filepath << std::endl; + exit(1); + } else { + this->hmm_filepath = filepath; + std::cout << "Using HMM file: " << this->hmm_filepath << std::endl; + } + } +} + +void InputData::setDisableCIGAR(bool disable_cigar) +{ + this->disable_cigar = disable_cigar; +} + +bool InputData::getDisableCIGAR() +{ + return this->disable_cigar; +} + +void InputData::setCNVFilepath(std::string filepath) +{ + this->cnv_filepath = filepath; +} + +std::string InputData::getCNVFilepath() +{ + return this->cnv_filepath; +} diff --git a/src/integrative_caller.cpp b/src/integrative_caller.cpp deleted file mode 100644 index 37b326f6..00000000 --- a/src/integrative_caller.cpp +++ /dev/null @@ -1,40 +0,0 @@ - -#include "integrative_caller.h" -#include "cnv_caller.h" -#include "common.h" - -#include -#include -#include - - -IntegrativeCaller::IntegrativeCaller(Common common) -{ - this->common = common; -} - -/// Entry point -int IntegrativeCaller::run() -{ - // Call SNVs - // SNVCaller snv_obj(this->common); - // snv_obj.run(); - - // Get the SNP VCF file - std::string snp_vcf_filename = this->common.get_snp_vcf_filepath(); - std::cout << "SNP VCF file = " << snp_vcf_filename << std::endl; - - // Get the positions of SNPs in the region - //snv_obj.run(filepath); - - // Print the size of the SNP positions vector - // std::cout << "SNP positions vector size = " << snv_obj.get_snp_positions().size() << std::endl; - - // Call CNVs - //CNVCaller cnv_obj(this->common, snv_obj.get_snp_positions()); - //std::vector snp_positions; - CNVCaller cnv_obj(this->common); - cnv_obj.run(); - - return 0; -} diff --git a/src/kc.cpp b/src/kc.cpp index abffdb14..453af803 100644 --- a/src/kc.cpp +++ b/src/kc.cpp @@ -1,8 +1,10 @@ +#include "kc.h" + +/// @cond #include #include #include - -#include "kc.h" +/// @endcond /************************ DESCRIPTION OF THE kc.c FILE ******************* @@ -2570,13 +2572,13 @@ cumulative normal distribution -inf */ { - return (1+erf((x-mu)/(sigma*sqrt(2))))/2; + return (1+errorf((x-mu)/(sigma*sqrt(2))))/2; } double cdf_stdnormal (double x) /*Returns 
the cumulative density function for a standard normal distribution (see http://en.wikipedia.org/wiki/Normal_distribution)*/ { - return (1+erf(x/sqrt(2)))/2; + return (1+errorf(x/sqrt(2)))/2; } @@ -3698,18 +3700,18 @@ the formula is adapted from http://www.theorie.physik.uni-muenchen.de/~serge/erf return y; } -double erf(double x) +double errorf(double x) /* calculate the error function (http://en.wikipedia.org/wiki/Error_function) The error function is defined by x 2 2 / -t - erf(x) = -------- | e dt + errorf(x) = -------- | e dt sqrt(pi) / 0 */ { - double gammp(double a, double x); - return (x < 0.0) ? (-gammp(0.5,x*x)) : gammp(0.5,x*x); + double gammp(double a, double x); + return (x < 0.0) ? (-gammp(0.5,x*x)) : gammp(0.5,x*x); } diff --git a/src/khmm.cpp b/src/khmm.cpp index b4a421d7..88075dd9 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -1,8 +1,14 @@ -#include - #include "khmm.h" #include "kc.h" +/// @cond +#include +#include +#include +#include +/// @endcond + +#define BUFFER_SIZE 1024 #define STATE_CHANGE 100000.0 /*this is the expected changes (D value) in the transition matrix*/ #define VITHUGE 100000000000.0 #define FLOAT_MINIMUM 1.175494351e-38; /*this is indeed machine dependent*/ @@ -27,26 +33,23 @@ std::vector testVit_CHMM(CHMM hmm, int T, double *O1, double *O2, double *p // Note: PLOGPROBA is replaced with DELTA (delta matrix) after running the // HMM, which is used to calculate the probability of the most likely state - // int *q; /* Array for the state sequence q[1..T] */ double **delta; // Matrix int **psi; // Matrix - - // These 3 variables will be updated with the HMM results - // q = ivector(1, T); // Allocate an int vector for each - // probe delta = dmatrix(1, T, 1, hmm.N); // Allocate a TxN double matrix (N=6 states) psi = imatrix(1, T, 1, hmm.N); // Allocate a TxN int matrix (N=6 states) - - // Allocate pfb and plogproba - pfb = dvector(1, T); + + // Set initial log probability for each state plogproba = dvector(1, hmm.N); + for (int i = 1; i <= hmm.N; i++) + { + plogproba[i] = -VITHUGE; + } // Run the HMM std::vector q; // State sequence q = ViterbiLogNP_CHMM(&hmm, T, O1, O2, pfb, snpdist, delta, psi, plogproba); // Free the variables - // free_ivector(q, 1, T); free_imatrix(psi, 1, T, 1, hmm.N); free_dmatrix(delta, 1, T, 1, hmm.N); @@ -57,7 +60,9 @@ std::vector testVit_CHMM(CHMM hmm, int T, double *O1, double *O2, double *p return q; } -// Emission probability: Calculate the state observationn likelihood b_j(O_t) of the observation symbol O_t given the current state j +// Emission probability: Calculate the state observationn likelihood b_j(O_t) of +// the observation symbol O_t given the current state j +// O_t is the LRR value double b1iot(int state, double *mean, double *sd, double uf, double o) { // UF = previous alpha @@ -65,7 +70,6 @@ double b1iot(int state, double *mean, double *sd, double uf, double o) p = uf; // PDF normal is the transition probability distrubution a_ij (initialized as pi_n) from state i to j - // P += (1-alpha_t-1) * p += (1 - uf) * pdf_normal(o, mean[state], sd[state]); // Prevent divide by zero error @@ -76,21 +80,32 @@ double b1iot(int state, double *mean, double *sd, double uf, double o) return log(p); } +// Emission probability: Calculate the state observationn likelihood b_j(O_t) of +// the observation symbol O_t given the current state j +// O_t is the BAF value double b2iot(int state, double *mean, double *sd, double uf, double pfb, double b) { double p = 0; - double mean0 = mean[1]; - double mean25 = mean[2]; - double mean33 
diff --git a/src/khmm.cpp b/src/khmm.cpp
index b4a421d7..88075dd9 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -1,8 +1,14 @@
-#include <stdio.h>
-
 #include "khmm.h"
 #include "kc.h"
 
+/// @cond
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <string>
+/// @endcond
+
+#define BUFFER_SIZE 1024
 #define STATE_CHANGE 100000.0 /*this is the expected changes (D value) in the transition matrix*/
 #define VITHUGE 100000000000.0
 #define FLOAT_MINIMUM 1.175494351e-38; /*this is indeed machine dependent*/
@@ -27,26 +33,23 @@ std::vector<int> testVit_CHMM(CHMM hmm, int T, double *O1, double *O2, double *p
     // Note: PLOGPROBA is replaced with DELTA (delta matrix) after running the
     // HMM, which is used to calculate the probability of the most likely state
 
-    // int *q; /* Array for the state sequence q[1..T] */
     double **delta; // Matrix
     int **psi;      // Matrix
-
-    // These 3 variables will be updated with the HMM results
-    // q = ivector(1, T); // Allocate an int vector for each
-    // probe
     delta = dmatrix(1, T, 1, hmm.N); // Allocate a TxN double matrix (N=6 states)
     psi = imatrix(1, T, 1, hmm.N);   // Allocate a TxN int matrix (N=6 states)
-
-    // Allocate pfb and plogproba
-    pfb = dvector(1, T);
+
+    // Set the initial log probability for each state
    plogproba = dvector(1, hmm.N);
+    for (int i = 1; i <= hmm.N; i++)
+    {
+        plogproba[i] = -VITHUGE;
+    }
 
     // Run the HMM
     std::vector<int> q; // State sequence
     q = ViterbiLogNP_CHMM(&hmm, T, O1, O2, pfb, snpdist, delta, psi, plogproba);
 
     // Free the variables
-    // free_ivector(q, 1, T);
     free_imatrix(psi, 1, T, 1, hmm.N);
     free_dmatrix(delta, 1, T, 1, hmm.N);
@@ -57,7 +60,9 @@ std::vector<int> testVit_CHMM(CHMM hmm, int T, double *O1, double *O2, double *p
     return q;
 }
 
-// Emission probability: Calculate the state observationn likelihood b_j(O_t) of the observation symbol O_t given the current state j
+// Emission probability: Calculate the state observation likelihood b_j(O_t) of
+// the observation symbol O_t given the current state j.
+// O_t is the LRR value.
 double b1iot(int state, double *mean, double *sd, double uf, double o)
 {
     // UF = previous alpha
-
     double p;
     p = uf;
 
-    // PDF normal is the transition probability distrubution a_ij (initialized as pi_n) from state i to j
-    // P += (1-alpha_t-1) *
+    // PDF normal is the transition probability distribution a_ij (initialized as pi_n) from state i to j
     p += (1 - uf) * pdf_normal(o, mean[state], sd[state]);
 
     // Prevent divide by zero error
@@ -76,21 +80,32 @@
     return log(p);
 }
 
+// Emission probability: Calculate the state observation likelihood b_j(O_t) of
+// the observation symbol O_t given the current state j.
+// O_t is the BAF value.
 double b2iot(int state, double *mean, double *sd, double uf, double pfb, double b)
 {
     double p = 0;
-    double mean0 = mean[1];
-    double mean25 = mean[2];
-    double mean33 = mean[3];
-    double mean50 = mean[4];
-    double mean50_state1 = mean[5];
-    double sd0 = sd[1];
-    double sd25 = sd[2];
-    double sd33 = sd[3];
-    double sd50 = sd[4];
-    double sd50_state1 = sd[5];
+    double mean0 = mean[1];         // mean[1] = 0
+    double mean25 = mean[2];        // mean[2] = 0.25
+    double mean33 = mean[3];        // mean[3] = 0.33
+    double mean50 = mean[4];        // mean[4] = 0.5
+    double mean50_state1 = mean[5]; // mean[5] = 0.5
+    double sd0 = sd[1];             // sd[1] = 0
+    double sd25 = sd[2];            // sd[2] = 0.25
+    double sd33 = sd[3];            // sd[3] = 0.33
+    double sd50 = sd[4];            // sd[4] = 0.5
+    double sd50_state1 = sd[5];     // sd[5] = 0.5
+
+    p = uf; // UF = previous alpha (transition probability)
+
+    // PDF normal is the transition probability distribution a_ij (initialized
+    // as pi_n) from state i to j.
+    // Here, we calculate the probability of the observation symbol O_t given
+    // the current state j, where the observation symbol is the BAF value b:
+    // P += (1-alpha_t-1) * pdf_normal(b, mean[state], sd[state]);
 
-    p = uf;
     if (state == 1)
     {
         if (b == 0)
@@ -192,9 +207,10 @@ double b2iot(int state, double *mean, double *sd, double uf, double pfb, double
             p += (1 - uf) * pfb * pfb * pfb * pfb * pdf_normal(b, 1 - mean0, sd0);
         }
     }
-    if (p == 0)
+    if (p == 0) // Prevent divide by zero error
         p = FLOAT_MINIMUM;
-    return log(p);
+
+    return log(p); // Return the log probability of the observation symbol O_t
 }
 
 // SV calling with the HMM via the Viterbi algorithm
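For context on the emission functions above: both b1iot and b2iot build a mixture density, starting from a uniform-noise weight `uf` and adding `(1 - uf)` times normal densities centered on the expected signal clusters, weighted for BAF by the population B-allele frequency `pfb`. A self-contained toy sketch of the diploid BAF case (all names and numbers illustrative; `pdf_normal` is re-implemented here so the sketch compiles on its own):

    #include <cmath>
    #include <cstdio>

    // Toy stand-in for kc.cpp's pdf_normal()
    static double pdf_normal(double x, double mean, double sd)
    {
        const double PI = 3.14159265358979323846;
        double z = (x - mean) / sd;
        return std::exp(-0.5 * z * z) / (sd * std::sqrt(2.0 * PI));
    }

    int main()
    {
        double uf = 0.01;  // uniform-noise weight (the "previous alpha" above)
        double pfb = 0.4;  // population frequency of the B allele
        double sd = 0.03;  // BAF cluster standard deviation
        double b = 0.52;   // observed BAF

        // Diploid state: Hardy-Weinberg mixture over genotypes AA, AB, BB,
        // with BAF clusters at 0, 0.5 and 1
        double p = uf;
        p += (1 - uf) * ((1 - pfb) * (1 - pfb) * pdf_normal(b, 0.0, sd)
                       + 2 * pfb * (1 - pfb)   * pdf_normal(b, 0.5, sd)
                       + pfb * pfb             * pdf_normal(b, 1.0, sd));
        std::printf("log emission = %f\n", std::log(p));
        return 0;
    }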
@@ -204,7 +220,6 @@ std::vector<int> ViterbiLogNP_CHMM(CHMM *phmm, int T, double *O1, double *O2, do
     int t; /* time index */
 
     int snp_count = 0;
-    int cn_count = 0;
 
     int maxvalind;
     double maxval, val;
@@ -232,58 +247,63 @@ std::vector<int> ViterbiLogNP_CHMM(CHMM *phmm, int T, double *O1, double *O2, do
     {
         if (phmm->pi[i] == 0)
             phmm->pi[i] = 1e-9; /*eliminate problems with zero probability*/
-        phmm->pi[i] = log(phmm->pi[i]);
+        phmm->pi[i] = log(phmm->pi[i]); // Convert to log probability to avoid underflow
     }
 
     // Bi(Ot) is an NxT matrix of emission probabilities (observation likelihoods)
-    // expressing the probability of an observation Ot being generated from a state i
-    biot = dmatrix(1, phmm->N, 1, T);
+    // expressing the probability of an observation Ot being generated from a
+    // state i. Ot is the observation symbol at time t (in this case, the LRR
+    // and BAF values).
+
+    // Initialize the emission probability matrix
+    biot = dmatrix(1, phmm->N, 1, T); // Allocate an NxT double matrix
+
+    std::cout << "[HMM] Running Viterbi algorithm with " << phmm->N << " states and " << T << " probes\n";
 
     // Loop through each state N
+    // (start at 1 because states are 1-based: 1-6)
     for (i = 1; i <= phmm->N; i++)
     {
-        // Loop through each probe T
+        // Loop through each probe T. The biot/delta/psi matrices are 1-based,
+        // while the observation arrays O1, O2 and pfb are 0-based, hence the
+        // t-1 indexing below.
         for (t = 1; t <= T; t++)
         {
-            // If it's non-polymorphic (BAF > 1)
-            if (O2[t] > 1)
-            { /*normally BAF>=0 and BAF<=1; we use BAF>1 to indicate a Non-Polymorphic marker*/
-                if (!phmm->NP_flag)
-                    std::cerr << "FATAL ERROR: CN probe detected but HMM model does not contain parameters for them\n";
-
-                // Update emissions based on LRR
-                biot[i][t] = b1iot(i, phmm->B3_mean, phmm->B3_sd, phmm->B3_uf, O1[t]);
-                cn_count++;
-
-                // If it's a regular SNP
-            }
-            else
-            { /*a regular SNP marker; we use both LRR and BAF information to calculate logProb for the marker*/
-                // Update emissions based on LRR (O1)
-                biot[i][t] = b1iot(i, phmm->B1_mean, phmm->B1_sd, phmm->B1_uf, O1[t]);
-
-                // Update emissions based on BAF (O2)
-                // *Set pfb to all 1's to ignore BAF population frequency in
-                // this implementation
-                biot[i][t] += b2iot(i, phmm->B2_mean, phmm->B2_sd, phmm->B2_uf, pfb[t], O2[t]);
-                snp_count++;
-            }
+            // A regular SNP marker; we use both LRR and BAF information to
+            // calculate the joint probability of the marker being in state i
+
+            // Get the state observation likelihood b_j(O_t) of the observation
+            // symbol O_t given the current state j.
+            // B1_uf is the previous alpha (transition probability).
+            double O1_val = O1[t - 1];
+            double O1_logprob = b1iot(i, phmm->B1_mean, phmm->B1_sd, phmm->B1_uf, O1_val);
+
+            double O2_val = O2[t - 1];
+            double O2_logprob = b2iot(i, phmm->B2_mean, phmm->B2_sd, phmm->B2_uf, pfb[t - 1], O2_val);
+
+            // Update the emission probability matrix with the joint probability
+            // of the marker being in state i at time t (log probability) based
+            // on both LRR and BAF values
+            biot[i][t] = O1_logprob + O2_logprob;
+
+            // Update the SNP count
+            snp_count++;
         }
     }
     /*fprintf(stderr, "NOTICE: Encountered %i snp probes and %i cn probes\n", snp_count, cn_count);*/
 
     /* 1. Initialization */
     for (i = 1; i <= phmm->N; i++)
     {
-        delta[1][i] = phmm->pi[i] + biot[i][1];
-        psi[1][i] = 0;
+        delta[1][i] = phmm->pi[i] + biot[i][1]; // Initialize the delta matrix (log probability)
+        psi[1][i] = 0; // Initialize the psi matrix (backpointers) to 0 (no state)
 
-        pprob[1] = -VITHUGE;
+        pprob[i] = -VITHUGE; // Initialize the log probability for each state to -inf
     }
 
     /* 2. Recursion */
-
     for (t = 2; t <= T; t++)
     {
@@ -298,16 +318,16 @@ std::vector<int> ViterbiLogNP_CHMM(CHMM *phmm, int T, double *O1, double *O2, do
             maxvalind = 1;
             for (i = 1; i <= phmm->N; i++)
             {
-                val = delta[t - 1][i] + log(A1[i][j]);
-                if (val > maxval)
+                val = delta[t - 1][i] + log(A1[i][j]); // Path log probability through previous state i
+                if (val > maxval) // Track the most likely previous state
                 {
                     maxval = val;
                     maxvalind = i;
                 }
             }
 
-            delta[t][j] = maxval + biot[j][t];
-            psi[t][j] = maxvalind;
+            delta[t][j] = maxval + biot[j][t]; // Update the delta matrix (log probability)
+            psi[t][j] = maxvalind;             // Update the psi matrix (backpointer to the best previous state)
         }
     }
@@ -519,4 +539,3 @@ void FreeCHMM(CHMM *phmm)
         free_dvector(phmm->B3_sd, 1, phmm->N);
     }
 }
-
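The delta/psi recursion above is the standard log-space Viterbi update, `delta[t][j] = max_i(delta[t-1][i] + log A[i][j]) + log b_j(O_t)`, with psi recording the argmax for backtracking. A minimal standalone sketch on a toy two-state model (all values illustrative):

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
        const int N = 2, T = 4;
        // Toy transition and emission log probabilities (emissions play the
        // role of biot[j][t] above)
        double logA[2][2] = {{std::log(0.9), std::log(0.1)},
                             {std::log(0.2), std::log(0.8)}};
        double logB[2][4] = {{-0.1, -2.0, -2.0, -0.1},
                             {-2.0, -0.1, -0.1, -2.0}};
        std::vector<std::vector<double>> delta(T, std::vector<double>(N));
        std::vector<std::vector<int>> psi(T, std::vector<int>(N));

        // 1. Initialization (uniform prior)
        for (int j = 0; j < N; j++) delta[0][j] = std::log(0.5) + logB[j][0];

        // 2. Recursion: best previous state for each (t, j)
        for (int t = 1; t < T; t++) {
            for (int j = 0; j < N; j++) {
                double maxval = -1e300; int argmax = 0;
                for (int i = 0; i < N; i++) {
                    double val = delta[t - 1][i] + logA[i][j];
                    if (val > maxval) { maxval = val; argmax = i; }
                }
                delta[t][j] = maxval + logB[j][t];
                psi[t][j] = argmax;
            }
        }

        // 3. Backtrack from the best final state
        int q = delta[T - 1][0] > delta[T - 1][1] ? 0 : 1;
        std::vector<int> path = {q};
        for (int t = T - 1; t > 0; t--) { q = psi[t][q]; path.insert(path.begin(), q); }
        for (int s : path) std::printf("%d ", s);
        std::printf("\n");
        return 0;
    }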
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
new file mode 100644
index 00000000..483801f9
--- /dev/null
+++ b/src/sv_caller.cpp
@@ -0,0 +1,350 @@
+//
+// sv_caller.cpp:
+// Detect SVs from long read alignments
+//
+
+#include "sv_caller.h"
+
+#include <htslib/sam.h>
+
+/// @cond
+#include <iostream>
+#include <string>
+#include <vector>
+#include <cstdlib>
+/// @endcond
+
+SVCaller::SVCaller(InputData& input_data)
+{
+    this->input_data = &input_data;
+}
+
+// Main function for SV detection
+SVData SVCaller::run()
+{
+    // Get SV calls from split read alignments (primary and supplementary) and
+    // directly from the CIGAR string
+    SVData sv_calls = this->detectSVsFromSplitReads();
+
+    // Return the SV calls
+    return sv_calls;
+}
+
+// Detect SVs from the CIGAR string of a read alignment
+void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls)
+{
+    // Get the chromosome
+    std::string chr = header->target_name[alignment->core.tid];
+
+    // Get the position
+    int32_t pos = alignment->core.pos;
+
+    // Get the CIGAR string
+    uint32_t* cigar = bam_get_cigar(alignment);
+
+    // Get the CIGAR length
+    int cigar_len = alignment->core.n_cigar;
+
+    // Track the query position
+    int query_pos = 0;
+
+    // Loop through the CIGAR string and detect insertions and deletions in
+    // reference coordinates.
+    // POS is the leftmost position of where the alignment maps to the reference:
+    // https://genome.sph.umich.edu/wiki/SAM
+    for (int i = 0; i < cigar_len; i++) {
+
+        // Get the CIGAR operation
+        int op = bam_cigar_op(cigar[i]);
+
+        // Get the CIGAR operation length
+        int op_len = bam_cigar_oplen(cigar[i]);
+
+        // Check if the CIGAR operation is an insertion
+        if (op == BAM_CINS) {
+
+            // Add the SV if greater than the minimum SV size
+            if (op_len >= this->min_sv_size) {
+
+                // Get the sequence of the insertion from the query
+                std::string ins_seq_str = "";
+                uint8_t* seq_ptr = bam_get_seq(alignment);
+                for (int j = 0; j < op_len; j++) {
+                    ins_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
+                }
+
+                // Add the insertion to the SV calls
+                sv_calls.add(chr, pos, pos + op_len, 3, ins_seq_str, "CIGARINS");
+            }
+
+        // Check if the CIGAR operation is a deletion
+        } else if (op == BAM_CDEL) {
+
+            // Add the SV if greater than the minimum SV size
+            if (op_len >= this->min_sv_size) {
+
+                // Add the deletion to the SV calls
+                sv_calls.add(chr, pos, pos + op_len, 0, ".", "CIGARDEL");
+            }
+        }
+
+        // Update the reference coordinate based on the CIGAR operation (M, D,
+        // N, =, X):
+        // https://samtools.github.io/hts-specs/SAMv1.pdf
+        if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            pos += op_len;
+        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
+            // Do nothing
+        } else {
+            std::cerr << "ERROR: Unknown CIGAR operation " << op << std::endl;
+            exit(1);
+        }
+
+        // Update the query position based on the CIGAR operation (M, I, S, =, X)
+        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            query_pos += op_len;
+        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
+            // Do nothing
+        } else {
+            std::cerr << "ERROR: Unknown CIGAR operation " << op << std::endl;
+            exit(1);
+        }
+    }
+}
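The two explicit operation lists above (reference-consuming: M, D, N, =, X; query-consuming: M, I, S, =, X) follow the SAM specification table. htslib encodes the same table in `bam_cigar_type()`, so the coordinate bookkeeping could equivalently be written as in the sketch below (shown for comparison only, not how the patch does it):

    #include <cstdint>
    #include <htslib/sam.h>

    // Equivalent coordinate bookkeeping via htslib's CIGAR type lookup.
    // bam_cigar_type(op): bit 0 set if the op consumes the query,
    // bit 1 set if it consumes the reference.
    static void advance_coordinates(const bam1_t *aln, int64_t &ref_pos, int64_t &query_pos)
    {
        const uint32_t *cigar = bam_get_cigar(aln);
        for (uint32_t i = 0; i < aln->core.n_cigar; i++) {
            int op = bam_cigar_op(cigar[i]);
            int op_len = bam_cigar_oplen(cigar[i]);
            int type = bam_cigar_type(op);
            if (type & 2) ref_pos += op_len;    // M, D, N, =, X
            if (type & 1) query_pos += op_len;  // M, I, S, =, X
        }
    }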
".", "CIGARDEL"); + } + } + + // Update the reference coordinate based on the CIGAR operation (M, D, + // N, =, X) + // https://samtools.github.io/hts-specs/SAMv1.pdf + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + pos += op_len; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { + // Do nothing + } else { + std::cerr << "ERROR: Unknown CIGAR operation " << op << std::endl; + exit(1); + } + + // Update the query position based on the CIGAR operation (M, I, S, H) + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + query_pos += op_len; + } else if (op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { + // Do nothing + } else { + std::cerr << "ERROR: Unknown CIGAR operation " << op << std::endl; + exit(1); + } + } +} + +// Detect SVs from split read alignments (primary and supplementary) and +// directly from the CIGAR string +SVData SVCaller::detectSVsFromSplitReads() +{ + bool debug_cigar = false; + + // Open the BAM file + samFile *fp_in = sam_open(this->input_data->getBAMFilepath().c_str(), "r"); + if (fp_in == NULL) { + std::cerr << "ERROR: failed to open " << this->input_data->getBAMFilepath() << std::endl; + exit(1); + } + + // Get the header + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); + if (bamHdr == NULL) { + std::cerr << "ERROR: failed to read header for " << this->input_data->getBAMFilepath() << std::endl; + exit(1); + } + + // Get the region + std::string region = this->input_data->getRegion(); + + // Get the index + hts_idx_t *idx = sam_index_load(fp_in, this->input_data->getBAMFilepath().c_str()); + if (idx == NULL) { + std::cerr << "ERROR: failed to load index for " << this->input_data->getBAMFilepath() << std::endl; + exit(1); + } + + // Set up the iterator + hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); + + // Set up the read + bam1_t *bam1 = bam_init1(); + + // Create a map of primary and supplementary alignments by QNAME (query template name) + int num_alignments = 0; + //int num_sv_calls = 0; + FASTAQuery ref_genome = this->input_data->getRefGenome(); + SVData sv_calls(ref_genome); + QueryMap primary_alignments; // TODO: Add depth to primary alignments + QueryMap supplementary_alignments; + std::cout << "Reading alignments..." 
+    while (sam_itr_next(fp_in, itr, bam1) >= 0) {
+
+        // Get the QNAME (query template name) for associating split reads
+        std::string qname = bam_get_qname(bam1);
+
+        // Skip secondary and unmapped alignments
+        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP) {
+            // Do nothing
+
+        // Skip alignments with low mapping quality
+        } else if (bam1->core.qual < this->min_mapq) {
+            // Do nothing
+        } else {
+
+            // Process primary alignments
+            if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
+
+                // Add the primary alignment to the map
+                std::string chr = bamHdr->target_name[bam1->core.tid];
+                int64_t start = bam1->core.pos;
+                int64_t end = bam_endpos(bam1);
+                int depth = 0; // Placeholder for now
+                AlignmentData alignment(chr, start, end, depth);
+                primary_alignments[qname].push_back(alignment);
+
+                // Print the entire CIGAR string
+                if (debug_cigar) {
+                    uint32_t *cigar = bam_get_cigar(bam1);
+                    std::cout << "Full CIGAR string: " << std::endl;
+                    for (uint32_t i = 0; i < bam1->core.n_cigar; i++) {
+                        std::cout << bam_cigar_opchr(cigar[i]) << bam_cigar_oplen(cigar[i]) << " ";
+                    }
+                    std::cout << std::endl;
+                }
+
+                // Finally, call SVs directly from the CIGAR string
+                if (!this->input_data->getDisableCIGAR()) {
+                    this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls);
+                }
+
+            // Process supplementary alignments
+            } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
+
+                // Add the supplementary alignment to the map
+                std::string chr = bamHdr->target_name[bam1->core.tid];
+                int32_t start = bam1->core.pos;
+                int32_t end = bam_endpos(bam1);
+                int32_t depth = 0; // Placeholder for now
+                AlignmentData alignment(chr, start, end, depth);
+
+                // Add the supplementary alignment to the map
+                supplementary_alignments[qname].push_back(alignment);
+            }
+        }
+
+        // Increment the number of alignment records processed
+        num_alignments++;
+
+        // Print the number of alignments processed every 10 thousand
+        if (num_alignments % 10000 == 0) {
+            std::cout << num_alignments << " alignments processed" << std::endl;
+        }
+    }
+
+    // Print the number of alignments processed
+    std::cout << num_alignments << " alignments processed" << std::endl;
+
+    // Loop through the map of primary alignments by QNAME and find gaps and
+    // overlaps with the supplementary alignments
+    std::cout << "Running split read analysis..." << std::endl;
+    for (const auto& entry : primary_alignments) {
+
+        // Get the QNAME
+        std::string qname = entry.first;
+
+        // Get the first primary alignment
+        AlignmentData primary_alignment = entry.second[0];
+
+        // Get the primary alignment chromosome
+        std::string primary_chr = std::get<0>(primary_alignment);
+
+        // Get the start and end positions of the primary alignment
+        int32_t primary_start = std::get<1>(primary_alignment);
+        int32_t primary_end = std::get<2>(primary_alignment);
+
+        // Loop through the supplementary alignments and find gaps and overlaps
+        AlignmentVector supp_alignments = supplementary_alignments[qname];
+        for (const auto& supp_alignment : supp_alignments) {
+
+            // Get the supplementary alignment chromosome
+            std::string supp_chr = std::get<0>(supp_alignment);
+
+            // Skip supplementary alignments that are on a different chromosome
+            // for now (TODO: Use for translocations)
+            if (primary_chr != supp_chr) {
+                std::cout << "Supplementary alignment on different chromosome" << std::endl;
+                continue;
+            }
+
+            // Get the start and end positions of the supplementary alignment
+            int32_t supp_start = std::get<1>(supp_alignment);
+            int32_t supp_end = std::get<2>(supp_alignment);
+
+            // Determine the type of gap between the alignments
+            if (supp_start < primary_start && supp_end < primary_start) {
+                // Gap with the supplementary alignment before the primary:
+                // [supp_start] [supp_end] -- [primary_start] [primary_end]
+
+                // Use the gap ends as the SV endpoints (deletion)
+                if (primary_start - supp_end >= this->min_sv_size) {
+                    sv_calls.add(supp_chr, supp_end, primary_start, -1, ".", "GAPINNER_1");
+                }
+
+                // Use the alignment ends as the SV endpoints (insertion)
+                if (primary_end - supp_start >= this->min_sv_size) {
+                    sv_calls.add(supp_chr, supp_start, primary_end, -1, ".", "GAPOUTER_1");
+                }
+
+            } else if (supp_start > primary_end && supp_end > primary_end) {
+                // Gap with the supplementary alignment after the primary:
+                // [primary_start] [primary_end] -- [supp_start] [supp_end]
+
+                // Use the gap ends as the SV endpoints (deletion)
+                if (supp_start - primary_end >= this->min_sv_size) {
+                    sv_calls.add(supp_chr, primary_end, supp_start, -1, ".", "GAPINNER_2");
+                }
+
+                // Use the alignment ends as the SV endpoints (insertion)
+                if (supp_end - primary_start >= this->min_sv_size) {
+                    sv_calls.add(supp_chr, primary_start, supp_end, -1, ".", "GAPOUTER_2");
+                }
+
+            } else {
+                // Overlap between the alignments:
+                // [supp_start] [primary_start] [supp_end] [primary_end]
+                // [primary_start] [supp_start] [primary_end] [supp_end]
+
+                // Use the overlap ends as the SV endpoints (deletion)
+                if (supp_end - primary_start >= this->min_sv_size) {
+                    sv_calls.add(supp_chr, primary_start, supp_end, -1, ".", "OVERLAP_1");
+                } else if (primary_end - supp_start >= this->min_sv_size) {
+                    sv_calls.add(supp_chr, supp_start, primary_end, -1, ".", "OVERLAP_2");
+                }
+
+                // Use the alignment ends as the SV endpoints (insertion)
+                if (primary_end - supp_start >= this->min_sv_size) {
+                    sv_calls.add(supp_chr, supp_start, primary_end, -1, ".", "OVERLAP_3");
+                } else if (supp_end - primary_start >= this->min_sv_size) {
+                    sv_calls.add(supp_chr, primary_start, supp_end, -1, ".", "OVERLAP_4");
+                }
+            }
+        }
+    }
+
+    // Destroy objects and close files
+    bam_destroy1(bam1);      // Destroy the read
+    hts_itr_destroy(itr);    // Destroy the iterator
+    sam_close(fp_in);        // Close the BAM file
+    bam_hdr_destroy(bamHdr); // Destroy the header
+    hts_idx_destroy(idx);    // Destroy the index
+
+    // Print the number of SV calls
+    std::cout << sv_calls.size() << " SV calls" << std::endl;
+
+    // Return the SV calls
+    return sv_calls;
+}
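The split-read branch structure above reduces to a three-way classification of each primary/supplementary pair; the patch then applies `min_sv_size` thresholds and records both inner (gap) and outer (alignment-span) endpoints for each case. The core classification, as a standalone sketch with invented coordinates:

    #include <cstdint>
    #include <cstdio>

    // The three cases above, reduced to a pure classification (size
    // thresholds and endpoint choices omitted).
    static const char* classify(int64_t p_start, int64_t p_end, int64_t s_start, int64_t s_end)
    {
        if (s_start < p_start && s_end < p_start) return "gap, supplementary before primary";
        if (s_start > p_end && s_end > p_end)     return "gap, supplementary after primary";
        return "overlap";
    }

    int main() {
        std::printf("%s\n", classify(1000, 2000, 100, 500));    // gap before
        std::printf("%s\n", classify(1000, 2000, 2500, 3000));  // gap after
        std::printf("%s\n", classify(1000, 2000, 1500, 2500));  // overlap
        return 0;
    }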
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
new file mode 100644
index 00000000..a79d82c7
--- /dev/null
+++ b/src/sv_data.cpp
@@ -0,0 +1,194 @@
+#include "sv_data.h"
+#include "vcf_writer.h"
+
+/// @cond
+#include <iostream>
+#include <algorithm>
+/// @endcond
+
+void SVData::add(std::string chr, int64_t start, int64_t end, int sv_type, std::string alt_allele, std::string data_type)
+{
+    // Add the SV call to the map of candidate locations
+    SVCandidate candidate(chr, start, end, alt_allele);
+    if (this->sv_calls.find(candidate) != this->sv_calls.end()) {
+
+        // Update the read depth
+        SVInfo& sv_info = this->sv_calls[candidate];
+        sv_info.read_depth += 1;
+
+        // Update the SV type if it is unknown
+        if (sv_info.sv_type == -1) {
+            sv_info.sv_type = sv_type;
+        }
+
+        // Update the alignment type used to call the SV
+        sv_info.data_type.insert(data_type);
+
+    } else {
+        // Determine the SV length
+        int sv_length = end - start;
+
+        // Create a new SVInfo object
+        SVInfo sv_info(sv_type, 1, data_type, sv_length);
+
+        // Add the SV candidate to the map
+        this->sv_calls[candidate] = sv_info;
+    }
+}
+
+SVData::SVData(FASTAQuery &ref_genome)
+{
+    // Set the reference genome
+    this->ref_genome = &ref_genome;
+}
+
+std::string SVData::getRefGenome()
+{
+    return this->ref_genome->getFilepath();
+}
+
+std::string SVData::getSequence(std::string chr, int64_t pos_start, int64_t pos_end)
+{
+    // Query the reference genome
+    return this->ref_genome->query(chr, pos_start, pos_end);
+}
+
+void SVData::updateSVType(SVCandidate candidate, int sv_type)
+{
+    // Update the SV type for a given SV candidate
+    if (this->sv_calls.find(candidate) != this->sv_calls.end()) {
+        SVInfo& sv_info = this->sv_calls[candidate];
+        sv_info.sv_type = sv_type;
+    } else {
+        std::cerr << "Error: Unable to update SV type for SV candidate (" << std::get<0>(candidate) << ", " << std::get<1>(candidate) << ", " << std::get<2>(candidate) << ", " << std::get<3>(candidate) << ")" << std::endl;
+    }
+}
+
+void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
+{
+    // Create a VCF writer
+    std::string output_vcf = output_dir + "/sv_calls.vcf";
+    VcfWriter vcf_writer(output_vcf);
+
+    // Set the sample name
+    std::string sample_name = "SAMPLE";
+
+    // Set the header lines
+    std::vector<std::string> header_lines = {
+        std::string("##reference=") + ref_genome.getFilepath(),
+        ref_genome.getContigHeader(),
+        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant\">",
+        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
+        "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Length of the structural variant\">",
+        "##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">",
+        "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"SV calling method\">",
+        "##INFO=<ID=ALN,Number=.,Type=String,Description=\"Alignment evidence used to call the SV\">",
+        "##INFO=<ID=REPTYPE,Number=1,Type=String,Description=\"Repeat type\">",
+        "##FILTER=<ID=PASS,Description=\"All filters passed\">",
+        "##FILTER=<ID=LowQual,Description=\"Low quality\">",
+        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
+        "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">"
+    };
+
+    // Write the header lines
+    vcf_writer.writeHeader(header_lines);
+
+    // Iterate over the SV calls
+    std::cout << "Saving SV calls to " << output_vcf << "..." << std::endl;
+    std::string sv_method = "CONTEXTSVv0.1";
+    for (auto const& sv_call : this->sv_calls) {
+
+        // Get the SV candidate and SV info
+        SVCandidate candidate = sv_call.first;
+        SVInfo info = sv_call.second;
+        int sv_type = info.sv_type;
+        int depth = info.read_depth;
+        int sv_length = info.sv_length;
+        std::set<std::string> data_type = info.data_type;
+
+        // Convert the data type set to a comma-separated string
+        std::string data_type_str = "";
+        for (auto const& type : data_type) {
+            data_type_str += type + ",";
+        }
+
+        // Get the CHROM, POS, END, and ALT
+        std::string chr = std::get<0>(candidate);
+        int pos = std::get<1>(candidate);
+        int end = std::get<2>(candidate);
+
+        // If the SV type is unknown, skip it
+        if (sv_type == -1) {
+            continue;
+        }
+
+        // Process by SV type
+        std::string ref_allele = ".";
+        std::string alt_allele = ".";
+        std::string repeat_type = "NA";
+
+        // Deletion
+        if (sv_type == 0) {
+            // Get the reference allele from the reference genome as well as
+            // the base preceding the SV
+            int preceding_pos = std::max(1, pos - 1); // Make sure the position is not negative
+            ref_allele = ref_genome.query(chr, preceding_pos, end);
+
+            // Use the preceding base as the alternate allele
+            alt_allele = ref_allele.substr(0, 1);
+
+            // Make the SV length negative
+            sv_length = -1 * sv_length;
+
+            repeat_type = "CONTRAC"; // Deletion
+
+        // Duplications and insertions
+        } else if (sv_type == 1 || sv_type == 3) {
+            // Use the preceding base as the reference allele
+            int preceding_pos = std::max(1, pos - 1); // Make sure the position is not negative
+            ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos + 1);
+
+            // Use the insertion sequence as the alternate allele
+            alt_allele = std::get<3>(candidate);
+
+            // Prepend the reference base to the alternate allele
+            alt_allele.insert(0, ref_allele);
+
+            // Update the end position to the start position to change from
+            // query to reference coordinates
+            end = pos;
+
+            if (sv_type == 1) {
+                repeat_type = "DUP"; // Duplication
+            }
+        }
+
+        // Get the SV type string
+        std::string sv_type_str = this->sv_type_map[sv_type];
+
+        // For now, set the QUAL and FILTER as unknown
+
+        // Set the genotype as unspecified for now (homozygous and
+        // heterozygous calls are not yet distinguished)
+        std::string genotype = "./.";
+
+        // Create the INFO string
+        std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";DP=" + std::to_string(depth) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";REPTYPE=" + repeat_type;
+
+        // Create the FORMAT string
+        std::string format_str = "GT:DP";
+
+        // Create the sample string
+        std::string sample_str = genotype + ":" + std::to_string(depth);
+        std::vector<std::string> samples = {sample_str};
+
+        // Write the SV call to the file
+        vcf_writer.writeRecord(chr, pos, ".", ref_allele, alt_allele, ".", ".", info_str, format_str, samples);
+    }
+
+    // Close the output stream
+    vcf_writer.close();
+    std::cout << "Saved SV calls to " << output_vcf << std::endl;
+}
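The deletion branch above follows the usual VCF convention for explicit SV records: anchor on the base preceding the event, put the anchor plus the deleted sequence in REF, the anchor base alone in ALT, and report a negative SVLEN. A toy standalone sketch (reference sequence and coordinates invented; `FASTAQuery::query()` is replaced by `substr` so the sketch runs on its own):

    #include <cstdio>
    #include <string>

    int main() {
        // Toy 1-based reference fragment standing in for the genome query
        std::string chrom_seq = "ACGTACGTACGT";

        int pos = 5, len = 3;  // deletion of the 3 bases at positions 5-7
        int anchor = pos - 1;  // left anchor base (VCF convention)

        // REF = anchor base + deleted sequence; ALT = anchor base alone
        std::string ref_allele = chrom_seq.substr(anchor - 1, len + 1);
        std::string alt_allele = ref_allele.substr(0, 1);
        int svlen = -len;
        int end = anchor + (int)ref_allele.size() - 1; // last deleted base

        // Prints: chr1  4  .  TACG  T  .  .  END=7;SVTYPE=DEL;SVLEN=-3
        std::printf("chr1\t%d\t.\t%s\t%s\t.\t.\tEND=%d;SVTYPE=DEL;SVLEN=%d\n",
                    anchor, ref_allele.c_str(), alt_allele.c_str(), end, svlen);
        return 0;
    }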
diff --git a/src/swig_interface.cpp b/src/swig_interface.cpp
new file mode 100644
index 00000000..87334fec
--- /dev/null
+++ b/src/swig_interface.cpp
@@ -0,0 +1,31 @@
+//
+// Created by jperdomo on 1/8/2023.
+//
+
+#include "swig_interface.h"
+#include "contextsv.h"
+
+/// @cond
+#include <iostream>
+/// @endcond
+
+// Run the CLI with the given parameters
+int run(InputData input_data)
+{
+    // Run ContextSV
+    ContextSV contextsv(input_data);
+    try
+    {
+        contextsv.run();
+    }
+    catch (std::exception& e)
+    {
+        std::cerr << e.what() << std::endl;
+        return -1;
+    }
+
+    return 0;
+}
diff --git a/src/swig_wrapper.i b/src/swig_wrapper.i
index f42ae4ed..d398087c 100644
--- a/src/swig_wrapper.i
+++ b/src/swig_wrapper.i
@@ -6,7 +6,8 @@ SWIG wrapper for C++ code.
 // Include header
 %{
-#include "cli.h"
+#include "swig_interface.h"
+#include "input_data.h"
 %}
 
 // Set up types
@@ -21,5 +22,9 @@ SWIG wrapper for C++ code.
     }
 }
 
+// Expose the InputData class
+%include "input_data.h"
+
 // Include functions
-int run(const std::string& bam, const std::string& snps, const std::string& outdir, const std::string& region);
+int run(InputData input_data);
+
diff --git a/src/utils.cpp b/src/utils.cpp
index e69de29b..a307a663 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -0,0 +1,36 @@
+#include "utils.h"
+
+/// @cond
+#include <cstdio>
+#include <iostream>
+#include <string>
+/// @endcond
+
+// Print a progress bar
+void printProgress(int progress, int total)
+{
+    // Get the percentage
+    float percent = (float)progress / (float)total * 100.0;
+
+    // Get the number of hashes
+    int num_hashes = (int)(percent / 2.0);
+
+    // Print the progress bar
+    printf("\r[");
+    for (int i = 0; i < num_hashes; i++)
+    {
+        printf("#");
+    }
+    for (int i = 0; i < 50 - num_hashes; i++)
+    {
+        printf(" ");
+    }
+    printf("] %3.2f%%", percent);
+    fflush(stdout);
+
+    // Print a new line if finished
+    if (progress == total)
+    {
+        printf("\n");
+    }
+}
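For completeness, `printProgress()` above renders a 50-character bar, one `#` per two percent. A minimal driver (assumes `printProgress` is declared in `utils.h`, which this patch suggests but does not show; the sleep merely simulates work):

    #include <chrono>
    #include <thread>

    #include "utils.h"  // assumed to declare printProgress(int, int)

    int main() {
        const int total = 200;
        for (int i = 1; i <= total; i++) {
            std::this_thread::sleep_for(std::chrono::milliseconds(5)); // simulated work
            printProgress(i, total);
        }
        return 0;
    }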
diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp
new file mode 100644
index 00000000..013816f2
--- /dev/null
+++ b/src/vcf_writer.cpp
@@ -0,0 +1,73 @@
+#include "vcf_writer.h"
+
+/// @cond
+#include <iostream>
+#include <ctime>
+/// @endcond
+
+VcfWriter::VcfWriter(const std::string &filename)
+{
+    // Remove the file if it already exists
+    std::cout << "Removing previous VCF..." << std::endl;
+    std::remove(filename.c_str());
+
+    // Open the VCF file
+    std::cout << "Opening VCF file..." << std::endl;
+    this->file_stream.open(filename);
+    if (!this->file_stream.is_open()) {
+        std::cerr << "Error: Unable to open " << filename << std::endl;
+        exit(1);
+    } else {
+        std::cout << "Done." << std::endl;
+    }
+}
+
+void VcfWriter::writeHeader(const std::vector<std::string> &headerLines)
+{
+    std::cout << "Writing VCF header..." << std::endl;
+
+    // Add the file format
+    std::string file_format = "##fileformat=VCFv4.2";
+    this->file_stream << file_format << std::endl;
+
+    // Add the date
+    time_t rawtime;
+    struct tm *timeinfo;
+    char buffer[80];
+    time(&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo);
+    file_stream << "##fileDate=" << buffer << std::endl;
+
+    // Add the source
+    std::string source = "##source=ContextSV";
+    this->file_stream << source << std::endl;
+
+    // Loop over the header metadata lines
+    for (auto &line : headerLines) {
+        this->file_stream << line << std::endl;
+    }
+
+    // Add the header line
+    std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE";
+    this->file_stream << header_line << std::endl;
+
+    // Flush the stream to ensure that the header is written
+    this->file_stream.flush();
+
+    std::cout << "Done." << std::endl;
+}
+
+void VcfWriter::writeRecord(const std::string &chrom, int pos, const std::string &id, const std::string &ref, const std::string &alt, const std::string &qual, const std::string &filter, const std::string &info, const std::string &format, const std::vector<std::string> &samples)
+{
+    // Write a record to the VCF file
+    this->file_stream << chrom << "\t" << pos << "\t" << id << "\t" << ref << "\t" << alt << "\t" << qual << "\t" << filter << "\t" << info << "\t" << format << "\t" << samples[0] << std::endl;
+}
+
+void VcfWriter::close()
+{
+    // Close the VCF file
+    std::cout << "Closing VCF file..." << std::endl;
+    this->file_stream.close();
+    std::cout << "Done." << std::endl;
+}
diff --git a/tests/data/ref.fa b/tests/data/ref.fa
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/test_general.py b/tests/test_general.py
index f71cfa29..a268df3b 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -16,24 +16,33 @@
 # Get the path to the test data files.
 TEST_BAM_FILE = os.path.join(TEST_DATA_DIR, 'test.bam')
+TEST_REF_FILE = ""  # Empty string for now.
 TEST_SNPS_FILE = os.path.join(TEST_DATA_DIR, 'snps.vcf.gz')
+TEST_PFB_FILE = ""
 
 # Set the output directory.
 TEST_OUTDIR = os.path.join(os.path.dirname(__file__), 'output')
 
-
+# Test the main function.
 def test_run():
-    """Test the run function."""
-    # Run the program.
-    contextsv.run(
-        TEST_BAM_FILE,
-        TEST_SNPS_FILE,
-        TEST_OUTDIR,
-        "chr3:60380533-60390533"
-    )
+
+    # Set the input parameters.
+    input_data = contextsv.InputData()
+    input_data.setBAMFilepath(TEST_BAM_FILE)
+    input_data.setRefGenome(TEST_REF_FILE)
+    input_data.setSNPFilepath(TEST_SNPS_FILE)
+    input_data.setRegion("chr3:60380533-60390533")
+    input_data.setThreadCount(1)
+    input_data.setChrCov("chr3:39.561,chr6:39.4096")
+    input_data.setPFBFilepath("")
+    input_data.setHMMFilepath("")
+    input_data.setOutputDir(TEST_OUTDIR)
+
+    # Run the analysis.
+    contextsv.run(input_data)
 
     # Check that the output file exists.
-    output_file = os.path.join(TEST_OUTDIR, 'snp_lrr_baf.csv')
+    output_file = os.path.join(TEST_OUTDIR, 'cnv_data.tsv')
     assert os.path.exists(output_file)
 
     # Check that the output file is not empty.
@@ -45,9 +54,14 @@ def test_run():
 
     # Check that the output file has the correct header.
     with open(output_file, 'r') as f:
-        assert f.readline().strip() == "position,baf,log2_ratio,cnv_state"
+        assert f.readline().strip() == "chromosome\tposition\tb_allele_freq\tlog2_ratio\tcnv_state\tpopulation_freq"
 
-    # Check that the output file has the correct last line.
+    # Check that the output file has the correct SNP values in the last line.
     with open(output_file, 'r') as f:
-        last_line = f.readlines()[-1].strip()
-        assert last_line == "60389325,0.590909,-0.048852,1"
+        last_line = f.readlines()[-1].strip('\n')
+        print(last_line)
+        expected_line = "chr3\t60389325\t0.590909\t-0.048852\t6\t0.01"
+        print(expected_line)
+        assert last_line == expected_line
+
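Finally, the new VcfWriter is small enough to exercise on its own. A minimal usage sketch (path and record values illustrative only; signatures taken from vcf_writer.cpp above):

    #include "vcf_writer.h"

    int main() {
        VcfWriter writer("example.vcf");
        writer.writeHeader({
            "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">"
        });
        writer.writeRecord("chr1", 1000, ".", "A", "<DEL>", ".", ".",
                           "END=2000;SVTYPE=DEL;SVLEN=-1000", "GT:DP", {"./.:10"});
        writer.close();
        return 0;
    }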