From 0a70641742344418c0f26d3a37a16e2603b67426 Mon Sep 17 00:00:00 2001 From: Tim Dunn <me.timd1@gmail.com> Date: Mon, 5 Jun 2023 17:36:53 -0400 Subject: [PATCH] Version bump (v2.0.0) - also updated README and demo --- README.md | 72 +++++++++++++++---------------------------------- demo/demo.sh | 7 +++++ demo/output.txt | 29 ++++++++++++++++++++ src/README.md | 63 ++++++++++++++++++++++++------------------- src/globals.h | 2 +- 5 files changed, 94 insertions(+), 79 deletions(-) create mode 100755 demo/demo.sh create mode 100644 demo/output.txt diff --git a/README.md b/README.md index 21ed980..a847fc4 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# vcfdist: benchmarking phased small variant calls +# vcfdist: benchmarking phased variant calls data:image/s3,"s3://crabby-images/52d6c/52d6c1478b99a26542cf5b531864ba6f18194faa" alt="build" <!-- data:image/s3,"s3://crabby-images/aac95/aac95e06dd387b7aa74dd59c5353d82147f33ef8" alt="Github All Releases" --> @@ -14,14 +14,15 @@ ## Introduction -vcfdist is a distance-based small variant calling evaluator that: -- gives partial credit to variant calls which are mostly correct -- standardizes query and truth VCF variants to a consistent representation -- **requires local phasing of both input VCFs** and enforces correct local phasing of variants -- discovers long-range variant representation dependencies using a novel clustering algorithm -- works on monoploid and diploid VCF contigs +vcfdist is a distance-based **variant calling evaluation tool** that: +- simultaneously evaluates **SNPs, small INDELs, complex, tandem repeat, and structural variants** +- gives **partial credit** to variant calls which are mostly correct +- **standardizes** query and truth VCF **variants** to a consistent representation +- **requires local phasing** of both input VCFs and enforces correct local phasing of variants +- **discovers** long-range variant **representation dependencies** using a novel clustering algorithm +- works on 
**monoploid and diploid** VCF contigs -This results in more stable SNP and INDEL precision-recall curves than vcfeval, particularly for complex variants. vcfdist also reports alignment distance based metrics for evaluation which are entirely independent of variant representation, providing greater insight into variant calling performance. +This results in more stable SNP and INDEL precision-recall curves than previous work, particularly for complex variants. vcfdist also reports alignment distance based metrics for evaluation which are entirely independent of variant representation, providing greater insight into variant calling performance. This project is currently under active development. We welcome the submission of any feedback, issues, or suggestions for improvement! @@ -65,61 +66,31 @@ vcfdist is developed for Linux and its only dependencies are GCC v8+ and HTSlib. If you do already have HTSlib installed elsewhere, make sure you've added it to your `LD_LIBRARY_PATH`. At this point, installation is as simple as cloning the repository and building the executable. It should compile in less than one minute. ```bash -> git clone --branch v1.3.1 https://github.com/timd1/vcfdist +> git clone https://github.com/timd1/vcfdist > cd vcfdist/src > make > ./vcfdist --version -vcfdist v1.3.1 +vcfdist v2.0.0 ``` ## Usage -The `demo` directory contains all input files required to run `vcfdist`. This demonstration operates on the first 5 million bases on `chr1`, and should run in about 3 seconds. +The <a href="./demo">`demo`</a> directory contains a <a href="./demo/demo.sh">demo script</a> (shown below) and all required inputs. It operates on the first 5 million bases on `chr1`, and should run in about 3 seconds. 
```bash -./vcfdist \ - ../demo/query.vcf \ - ../demo/nist-v4.2.1_chr1_5Mb.vcf.gz \ - ../demo/GRCh38_chr1_5Mb.fa \ - -b ../demo/nist-v4.2.1_chr1_5Mb.bed \ - -p ../demo/results/ \ +../src/vcfdist \ + query.vcf \ + nist-v4.2.1_chr1_5Mb.vcf.gz \ + GRCh38_chr1_5Mb.fa \ + -b nist-v4.2.1_chr1_5Mb.bed \ + -p results/ \ -v 0 ``` -You can expect to see the following output: - -``` -PRECISION-RECALL SUMMARY - -TYPE MIN_QUAL TRUTH_TP QUERY_TP TRUTH_FN QUERY_FP PREC RECALL F1_SCORE F1_QSCORE -SNP Q >= 0 8220 8220 5 6 0.999149 0.999271 0.999210 31.023401 -SNP Q >= 0 8220 8220 5 6 0.999149 0.999271 0.999210 31.023401 - -TYPE MIN_QUAL TRUTH_TP QUERY_TP TRUTH_FN QUERY_FP PREC RECALL F1_SCORE F1_QSCORE -INDEL Q >= 0 932 932 3 3 0.996793 0.996261 0.996527 24.592749 -INDEL Q >= 0 932 932 3 3 0.996793 0.996261 0.996527 24.592749 - - -ALIGNMENT DISTANCE SUMMARY - -TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE ALN_QSCORE -ALL Q >= 0 26 16 26.566509 27.579178 27.154125 -ALL Q >= 0 26 16 26.566509 27.579178 27.154125 -ALL Q >= 61 11793 9163 0.000000 0.000000 0.000000 - -TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE -SNP Q >= 0 10 10 29.151360 29.151360 -SNP Q >= 0 10 10 29.151360 29.151360 -SNP Q >= 61 8225 8225 0.000000 0.000000 - -TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE -INDEL Q >= 0 16 6 23.483049 21.940516 -INDEL Q >= 0 16 6 23.483049 21.940516 -INDEL Q >= 61 3568 938 0.000000 0.000000 -``` +You can expect to see <a href="./demo/output.txt">this output</a>. To include more details on intermediate results, run it again at higher verbosity by removing the `-v 0` flag. -Please note that your results may not be identical to the example shown, since vcfdist is under active development and handling of certain edge cases may differ between versions. +Please note that your results may not be identical, since vcfdist is under active development and handling of edge cases may differ between versions. 
Please see additional options documented <a href="./src/README.md">here</a>, or run `./vcfdist --help`. @@ -152,8 +123,7 @@ In order to use `qfy.py` please install <a href="https://github.com/Illumina/hap --o results/qfy-output-prefix \ output-prefix/summary.vcf.gz ``` -Ensure that `strat.tsv` contains one stratification region per line. -Each line must contain a region name and BED file name, separated by a tab (`\t`). +Ensure that `strat.tsv` contains one stratification region per line; each line consists of a region name and BED file name separated by a tab. GIAB stratification regions for GRCh38 can be found <a href="https://github.com/genome-in-a-bottle/genome-stratifications/tree/master/GRCh38">here</a>. diff --git a/demo/demo.sh b/demo/demo.sh new file mode 100755 index 0000000..c361c8b --- /dev/null +++ b/demo/demo.sh @@ -0,0 +1,7 @@ +../src/vcfdist \ + query.vcf \ + nist-v4.2.1_chr1_5Mb.vcf.gz \ + GRCh38_chr1_5Mb.fa \ + -b nist-v4.2.1_chr1_5Mb.bed \ + -p results/ \ + -v 0 diff --git a/demo/output.txt b/demo/output.txt new file mode 100644 index 0000000..4b46f2f --- /dev/null +++ b/demo/output.txt @@ -0,0 +1,29 @@ + +PRECISION-RECALL SUMMARY + + TYPE MIN_QUAL TRUTH_TP QUERY_TP TRUTH_FN QUERY_FP PREC RECALL F1_SCORE F1_QSCORE + SNP Q >= 0 8220 8220 5 6 0.999149 0.999271 0.999210 31.023401 + SNP Q >= 0 8220 8220 5 6 0.999149 0.999271 0.999210 31.023401 + + TYPE MIN_QUAL TRUTH_TP QUERY_TP TRUTH_FN QUERY_FP PREC RECALL F1_SCORE F1_QSCORE + INDEL Q >= 0 932 932 3 3 0.996793 0.996261 0.996527 24.592749 + INDEL Q >= 0 932 932 3 3 0.996793 0.996261 0.996527 24.592749 + + + ALIGNMENT DISTANCE SUMMARY + + TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE ALN_QSCORE + ALL Q >= 0 26 16 26.565773 27.579651 27.154125 + ALL Q >= 0 26 16 26.565773 27.579651 27.154125 + ALL Q >= 61 11791 9164 0.000000 0.000000 0.000000 + + TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE + SNP Q >= 0 10 10 29.130186 29.130186 + SNP Q >= 0 10 10 29.130186 29.130186 + SNP Q 
>= 61 8185 8185 0.000000 0.000000 + + TYPE MIN_QUAL EDIT_DIST DISTINCT_EDITS ED_QSCORE DE_QSCORE + INDEL Q >= 0 16 6 23.529057 22.126314 + INDEL Q >= 0 16 6 23.529057 22.126314 + INDEL Q >= 61 3606 979 0.000000 0.000000 + diff --git a/src/README.md b/src/README.md index 2f47f1d..424907d 100644 --- a/src/README.md +++ b/src/README.md @@ -3,64 +3,73 @@ Usage: vcfdist <query.vcf> <truth.vcf> <ref.fasta> [options] Required: - <STRING> query.vcf phased VCF file containing variant calls to evaluate - <STRING> truth.vcf phased VCF file containing ground truth variant calls - <STRING> ref.fasta FASTA file containing draft reference sequence + <STRING> query.vcf phased VCF file containing variant calls to evaluate + <STRING> truth.vcf phased VCF file containing ground truth variant calls + <STRING> ref.fasta FASTA file containing draft reference sequence Options: -b, --bed <STRING> - BED file containing regions to evaluate - + BED file containing regions to evaluate + -p, --prefix <STRING> [./] - prefix for output files (directory needs a trailing slash) - + prefix for output files (directory needs a trailing slash) + -v, --verbosity <INTEGER> [1] - printing verbosity (0: succinct, 1: default, 2:verbose) - + printing verbosity (0: succinct, 1: default, 2:verbose) + -r, --realign-only - standardize truth and query variant representations, then exit - + standardize truth and query variant representations, then exit + -q, --keep-query - do not realign query variants, keep original representation - + do not realign query variants, keep original representation + -t, --keep-truth - do not realign truth variants, keep original representation + do not realign truth variants, keep original representation -x, --mismatch-penalty <INTEGER> [3] - Smith-Waterman mismatch (substitution) penalty + Smith-Waterman mismatch (substitution) penalty -o, --gap-open-penalty <INTEGER> [2] - Smith-Waterman gap opening penalty + Smith-Waterman gap opening penalty -e, --gap-extend-penalty <INTEGER> 
[1] - Smith-Waterman gap extension penalty + Smith-Waterman gap extension penalty --min-qual <INTEGER> [0] - minimum variant quality, lower qualities ignored + minimum variant quality, lower qualities ignored --max-qual <INTEGER> [60] - maximum variant quality, higher qualities kept but thresholded + maximum variant quality, higher qualities kept but thresholded -s, --smallest-variant <INTEGER> [1] - minimum variant size, smaller variants ignored (SNPs are size 1) + minimum variant size, smaller variants ignored (SNPs are size 1) -l, --largest-variant <INTEGER> [5000] - maximum variant size, larger variants ignored + maximum variant size, larger variants ignored -i, --max-iterations <INTEGER> [4] - maximum iterations for expanding/merging clusters + maximum iterations for expanding/merging clusters -g, --supercluster-gap <INTEGER> [50] - minimum base gap between independent superclusters + minimum base gap between independent superclusters + + --max-threads <INTEGER> [64] + maximum threads to use for precision/recall alignment + (haps*contigs used for wavefront clustering) + + --max-ram <FLOAT> [64.000GB] + maximum RAM to use for precision/recall alignment + (work in-progress, more may be used in other steps) -h, --help - show this help message + show this help message -a, --advanced - show advanced options + show advanced options -c, --citation - please cite vcfdist if used in your analyses + please cite vcfdist if used in your analyses -v, --version - print vcfdist version (v1.3.1) + print vcfdist version (v2.0.0) +``` diff --git a/src/globals.h b/src/globals.h index 363626b..55a32c0 100644 --- a/src/globals.h +++ b/src/globals.h @@ -73,7 +73,7 @@ class Globals { void init_timers(std::vector<std::string> timer_strs); // program data - const std::string VERSION = "1.3.1"; + const std::string VERSION = "2.0.0"; const std::string PROGRAM = "vcfdist"; };