Skip to content

Commit

Permalink
vcfdist now reports an error on nonexistent BED
Browse files Browse the repository at this point in the history
 - improved alignment of output printing after renaming columns
 - fixed constant name in `defs.h`
 - added analysis script update for plotting
  • Loading branch information
TimD1 committed Mar 15, 2023
1 parent d5e5b30 commit a4068de
Show file tree
Hide file tree
Showing 7 changed files with 27 additions and 18 deletions.
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,7 @@ If you do already have HTSlib installed elsewhere, make sure you've added it to
> git clone https://github.com/timd1/vcfdist
> cd vcfdist/src
> make
> ./vcfdist --version
vcfdist 1.0.0
> ./vcfdist -h
```


Expand Down
22 changes: 12 additions & 10 deletions analysis/9_f1_pr_plot.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@
data = "/home/timdunn/vcfdist/data"
datasets = ["nist", "cmrg"]
tools = ["vcfeval", "vcfdist"]
names = ["original", "A", "B", "C", "D"]
colors = ["k", "C0", "C1", "C2", "C3"]
# names = ["original", "A", "B", "C", "D"]
# colors = ["k", "C0", "C1", "C2", "C3"]
names = ["original", "B", "C", "D"]
colors = ["k", "C1", "C2", "C3"]

sub_ids = [
"0GOOR", "23O09", "4GKR1", "61YRJ", "8H0ZB", "B1S5A", "C6JUX", "EIUT6", "H9OJ3", "IA789",
Expand Down Expand Up @@ -75,7 +77,7 @@
except FileNotFoundError:
print(f"File '{data}/pfda-v2/{dataset}_vcfeval/{sub_id}_HG002_{filename}.summary.csv' not found.")
continue
if len(snp_f1_list) != 5: continue
if len(snp_f1_list) != len(names): continue
snp_f1_mean = sum(snp_f1_list) / len(snp_f1_list)
indel_f1_mean = sum(indel_f1_list) / len(indel_f1_list)
for ni, name in enumerate(names):
Expand All @@ -84,10 +86,10 @@
ax[ti][di*2+1].plot(indel_f1_mean, indel_f1_list[ni], linestyle='',
marker=markers[si], color=colors[ni], label=label)
all_snp_f1_y.extend(snp_f1_list)
all_snp_f1_x.extend([snp_f1_mean]*5)
all_snp_f1_x.extend([snp_f1_mean]*len(names))
all_snp_f1_meds.append(statistics.median(snp_f1_list))
all_indel_f1_y.extend(indel_f1_list)
all_indel_f1_x.extend([indel_f1_mean]*5)
all_indel_f1_x.extend([indel_f1_mean]*len(names))
all_indel_f1_meds.append(statistics.median(indel_f1_list))

m, b, r, p, std_err = scipy.stats.linregress(all_snp_f1_x, all_snp_f1_y)
Expand Down Expand Up @@ -130,7 +132,7 @@
indel_f1_list.append(indel_f1q)
except FileNotFoundError:
break
if len(snp_f1_list) != 5: continue
if len(snp_f1_list) != len(names): continue
# remove this median from list before doing rank calcs
snp_f1_meds = all_snp_f1_meds[:]
snp_f1_med = statistics.median(snp_f1_list)
Expand Down Expand Up @@ -177,7 +179,7 @@
except FileNotFoundError:
print(f"File '{data}/pfda-v2/{dataset}_vcfdist/{sub_id}_HG002_{filename}.precision-recall-summary.tsv' not found.")
continue
if len(snp_f1_list) != 5: continue
if len(snp_f1_list) != len(names): continue
snp_f1_mean = sum(snp_f1_list) / len(snp_f1_list)
indel_f1_mean = sum(indel_f1_list) / len(indel_f1_list)
for ni, name in enumerate(names):
Expand All @@ -186,10 +188,10 @@
ax[ti][di*2+1].plot(indel_f1_mean, indel_f1_list[ni], linestyle='',
marker=markers[si], color=colors[ni], label=label)
all_snp_f1_y.extend(snp_f1_list)
all_snp_f1_x.extend([snp_f1_mean]*5)
all_snp_f1_x.extend([snp_f1_mean]*len(names))
all_snp_f1_meds.append(statistics.median(snp_f1_list))
all_indel_f1_y.extend(indel_f1_list)
all_indel_f1_x.extend([indel_f1_mean]*5)
all_indel_f1_x.extend([indel_f1_mean]*len(names))
all_indel_f1_meds.append(statistics.median(indel_f1_list))

m, b, r, p, std_err = scipy.stats.linregress(all_snp_f1_x, all_snp_f1_y)
Expand Down Expand Up @@ -224,7 +226,7 @@
indel_f1_list.append(indel_f1q)
except FileNotFoundError:
break
if len(snp_f1_list) != 5: continue
if len(snp_f1_list) != len(names): continue
# remove this median from list before doing rank calcs
snp_f1_meds = all_snp_f1_meds[:]
snp_f1_med = statistics.median(snp_f1_list)
Expand Down
8 changes: 8 additions & 0 deletions src/bed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,14 @@
#include "print.h"

bedData::bedData(const std::string & bed_fn) {

// fail if file doesn't exist
auto bed_fp = fopen(bed_fn.data(), "r");
if (bed_fp == NULL) {
ERROR("Failed to open BED file '%s'", bed_fn.data());
}
fclose(bed_fp);

std::ifstream bed(bed_fn);
std::string region;
while (getline(bed, region)) {
Expand Down
2 changes: 1 addition & 1 deletion src/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
#define TYPE_SUB 1
#define TYPE_INS 2
#define TYPE_DEL 3
#define TYPE_GRP 4
#define TYPE_CPX 4
#define TYPE_INDEL 4
#define TYPES 5

Expand Down
2 changes: 1 addition & 1 deletion src/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ class Globals {
void print_usage() const;

// program data
const std::string VERSION = "1.0.1";
const std::string VERSION = "1.0.2";
const std::string PROGRAM = "vcfdist";
};

Expand Down
6 changes: 3 additions & 3 deletions src/print.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ void write_precision_recall(std::unique_ptr<phaseData> & phasedata_ptr) {
for (int type = 0; type < VARTYPES; type++) {
std::vector<int> quals = {g.min_qual, max_f1_qual[type]};
INFO(" ");
INFO("VAR_TYPE\tMIN_QUAL\tTRUTH_TP\tQUERY_TP\tTRUTH_FN\tQUERY_FP\tPREC\t\tRECALL\t\tF1_SCORE\tF1_QSCORE");
INFO("TYPE\tMIN_QUAL\tTRUTH_TP\tQUERY_TP\tTRUTH_FN\tQUERY_FP\tPREC\t\tRECALL\t\tF1_SCORE\tF1_QSCORE");

for (int qual : quals) {
// redo calculations for these two
Expand Down Expand Up @@ -513,9 +513,9 @@ void write_distance(const editData & edits) {

INFO(" ");
if (type == TYPE_ALL) {
INFO("VAR_TYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE");
INFO("TYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE\tALN_QSCORE");
} else {
INFO("VAR_TYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE");
INFO("TYPE\tMIN_QUAL\tEDIT_DIST\tDISTINCT_EDITS\tED_QSCORE\tDE_QSCORE");
}
std::vector<int> quals = {g.min_qual, best_qual[type], g.max_qual+1};
for (auto q : quals) {
Expand Down
2 changes: 1 addition & 1 deletion src/run
Original file line number Diff line number Diff line change
Expand Up @@ -10,5 +10,5 @@
../test/query.vcf.gz \
../test/truth.vcf.gz \
/x/gm24385/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
-b ../scripts/data/bench_20.bed \
-b ../test/bench20.bed \
-p ../out/

0 comments on commit a4068de

Please sign in to comment.