Skip to content

Commit

Permalink
feat: allele count errors are now printed
Browse files Browse the repository at this point in the history
 - bugfix: swapping 1|1 results is now conditional on block state
  • Loading branch information
TimD1 committed Nov 29, 2024
1 parent a286190 commit 991c247
Show file tree
Hide file tree
Showing 9 changed files with 181 additions and 93 deletions.
22 changes: 13 additions & 9 deletions src/defs.h
Original file line number Diff line number Diff line change
Expand Up @@ -63,15 +63,19 @@ class idx4;
#define ERRTYPE_TP 0 // true positive
#define ERRTYPE_FP 1 // false positive
#define ERRTYPE_FN 2 // false negative
#define ERRTYPE_PE 3 // phase error (0|1 -> 1|0)
#define ERRTYPE_GE 4 // genotype error (0|1 -> 1|1)
#define ERRTYPE_UN 5 // unknown
#define ERRTYPES 6

#define AC_ERR_NONE 0 // e.g. 0|1 -> 1|0 or 1|1 -> 1|1
#define AC_ERR_1_TO_2 1 // e.g. 0|1 -> 1|1
#define AC_ERR_2_TO_1 2 // e.g. 1|1 -> 0|1
#define AC_ERRTYPES 3 // allele count error types
#define ERRTYPE_UN 3 // unknown
#define ERRTYPES 4

#define AC_ERR_0_TO_1 0 // e.g. 0/0 -> 0/1 (1 QUERY_FP)
#define AC_ERR_0_TO_2 1 // e.g. 0/0 -> 1/1 (2 QUERY_FP)
#define AC_ERR_1_TO_0 2 // e.g. 0/1 -> 0/0 (1 TRUTH_FN)
#define AC_ERR_1_TO_1 3 // e.g. 0/1 -> 0/1 (1 QUERY_TP, 1 TRUTH_TP)
#define AC_ERR_1_TO_2 4 // e.g. 0/1 -> 1/1 (1 QUERY_TP, 1 TRUTH_TP, 1 QUERY_FP)
#define AC_ERR_2_TO_0 5 // e.g. 1/1 -> 0/0 (2 TRUTH_FN)
#define AC_ERR_2_TO_1 6 // e.g. 1/1 -> 0/1 (1 QUERY_TP, 1 TRUTH_TP, 1 TRUTH_FN)
#define AC_ERR_2_TO_2 7 // e.g. 1/1 -> 1/1 (2 QUERY_TP, 2 TRUTH_TP)
#define AC_UNKNOWN 8
#define AC_ERRTYPES 8 // allele count error types

#define SWITCHTYPE_FLIP 0
#define SWITCHTYPE_SWITCH 1
Expand Down
71 changes: 0 additions & 71 deletions src/dist.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,77 +876,6 @@ void precision_recall_wrapper(
}


/******************************************************************************/

/* The precision-recall calculation allows the calculated GT to be anything (including 0|0 or 1|1).
We need to do some post-processing to fix this and force the allele counts to be unchanged.
*/
void fix_genotype_allele_counts(std::shared_ptr<ctgSuperclusters> sc, int sc_idx) {
int cluster_beg = sc->superclusters[QUERY][sc_idx];
int cluster_end = sc->superclusters[QUERY][sc_idx+1];
for (int ci = cluster_beg; ci < cluster_end; ci++) {
std::shared_ptr<ctgVariants> vars = sc->callset_vars[QUERY];
for (int vi = vars->clusters[ci]; vi < vars->clusters[ci+1]; vi++) {

// force 1|1 query variants to be evaluated as such
if (vars->orig_gts[vi] == GT_ALT1_ALT1) {
if (vars->calc_gts[vi] == GT_REF_ALT1 || vars->calc_gts[vi] == GT_ALT1_REF) {
vars->ac_errtype[vi] = AC_ERR_1_TO_2;
}
vars->calc_gts[vi] = vars->orig_gts[vi];
// if we're in a PHASE_SWAP phase block, we should swap data here since otherwise
// there's no way based on calc_gt 1|1 to know to look at data from other hap
std::swap(vars->errtypes[HAP1][vi], vars->errtypes[HAP2][vi]);
std::swap(vars->sync_group[HAP1][vi], vars->sync_group[HAP2][vi]);
std::swap(vars->callq[HAP1][vi], vars->callq[HAP2][vi]);
std::swap(vars->ref_ed[HAP1][vi], vars->ref_ed[HAP2][vi]);
std::swap(vars->query_ed[HAP1][vi], vars->query_ed[HAP2][vi]);
std::swap(vars->credit[HAP1][vi], vars->credit[HAP2][vi]);
}
// force 0|1 and 1|0 query variants to be evaluated as such
else if (vars->orig_gts[vi] == GT_REF_ALT1 || vars->orig_gts[vi] == GT_ALT1_REF) {

// for called 1|1 variants, keep variant call with better calculated credit
if (vars->calc_gts[vi] == GT_ALT1_ALT1) {

vars->ac_errtype[vi] = AC_ERR_2_TO_1;
// try to use phasing with max credit
if (vars->credit[HAP1][vi] > vars->credit[HAP2][vi]) {
vars->set_var_calcgt_on_hap(vi, HAP2, false);
} else if (vars->credit[HAP1][vi] < vars->credit[HAP2][vi]) {
vars->set_var_calcgt_on_hap(vi, HAP1, false);
} else { // default to current phasing
if (vars->pb_phases[vi] == PHASE_ORIG) {
vars->calc_gts[vi] = vars->orig_gts[vi];
} else { // PHASE_SWAP
vars->calc_gts[vi] = (vars->orig_gts[vi] == GT_REF_ALT1) ?
GT_ALT1_REF : GT_REF_ALT1;
}
}
}

// for called 0|0 variants
if (vars->calc_gts[vi] == GT_REF_REF) {
// try to use hap with max credit
if (vars->credit[HAP1][vi] > vars->credit[HAP2][vi]) {
vars->set_var_calcgt_on_hap(vi, HAP1, true);
} else if (vars->credit[HAP1][vi] < vars->credit[HAP2][vi]) {
vars->set_var_calcgt_on_hap(vi, HAP2, true);
} else { // default to current phasing
if (vars->pb_phases[vi] == PHASE_ORIG) {
vars->calc_gts[vi] = vars->orig_gts[vi];
} else { // PHASE_SWAP
vars->calc_gts[vi] = (vars->orig_gts[vi] == GT_REF_ALT1) ?
GT_ALT1_REF : GT_REF_ALT1;
}
}
}
}
}
}
}


/******************************************************************************/


Expand Down
2 changes: 0 additions & 2 deletions src/dist.h
Original file line number Diff line number Diff line change
Expand Up @@ -171,8 +171,6 @@ void calc_prec_recall(
bool print = false
);

void fix_genotype_allele_counts(std::shared_ptr<ctgSuperclusters> sc, int sc_idx);

/******************************************************************************/

int wf_swg_max_reach(
Expand Down
6 changes: 2 additions & 4 deletions src/globals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,12 @@
#include "timer.h"

Globals g;
std::vector<std::string> aln_strs =
{"QUERY1-TRUTH1", "QUERY1-TRUTH2", "QUERY2-TRUTH1", "QUERY2-TRUTH2"};
std::vector<std::string> callset_strs = {"QUERY", "TRUTH"};
std::vector<std::string> error_strs = {"TP", "FP", "FN", "PE", "GE", "??"};
std::vector<std::string> error_strs = {"TP", "FP", "FN", "??"};
std::vector<std::string> gt_strs =
{"0", "1", "0|0", "0|1", "1|0", "1|1", "1|2", "2|1", ".|.", "M|N" };
std::vector<std::string> phase_strs = {"0", "1", "."};
std::vector<std::string> ac_strs = {".", "+", "-"};
std::vector<std::string> ac_strs = {".", ".", ".", ".", "+", ".", "-", ".", "."};
std::vector<std::string> region_strs = {"OUTSIDE", "INSIDE ", "BORDER ", "OFF CTG"};
std::vector<std::string> switch_strs =
{"FLIP", "SWITCH", "SWITCH+FLIP", "SWITCH_ERR", "FLIP_BEG", "FLIP_END", "NONE"};
Expand Down
1 change: 0 additions & 1 deletion src/globals.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ void create_directory(const std::string & dir);
extern Globals g;

// defined in main.cpp
extern std::vector<std::string> aln_strs;
extern std::vector<std::string> callset_strs;
extern std::vector<std::string> error_strs;
extern std::vector<std::string> gt_strs;
Expand Down
133 changes: 128 additions & 5 deletions src/phase.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,14 +247,137 @@ phaseblockData::phaseblockData(std::shared_ptr<superclusterData> clusterdata_ptr

// calculate phasings, flip, and switch errors
this->phase();
// calculate and fix allele count errors
this->fix_allele_counts();
}


/******************************************************************************/


// fix genotypes, with ties defaulting to current phase
/* The precision-recall calculation allows the calculated GT to be anything (including 0|0 or 1|1).
We need to do some post-processing to fix this and force the allele counts to be unchanged.
*/
void phaseblockData::fix_allele_counts() {
std::vector<int> allele_error_counts(AC_ERRTYPES, 0);
for (const std::string & ctg : this->contigs) {
std::shared_ptr<ctgSuperclusters> scs = clusterdata_ptr->superclusters[ctg];
for (int sc_idx = 0; sc_idx < scs->n; sc_idx++) {
fix_genotype_allele_counts(scs, sc_idx);
std::shared_ptr<ctgVariants> qvars =
this->phase_blocks[ctg]->ctg_superclusters->callset_vars[QUERY];

for (int vi = 0; vi < qvars->n; vi++) {
int allele_count_errtype = qvars->set_allele_errtype(vi);
if (allele_count_errtype == AC_UNKNOWN) {
ERROR("Unknown variant allele count at %s:%d, %s -> %s", ctg.data(), qvars->poss[vi],
gt_strs[qvars->calc_gts[vi]].data(), gt_strs[qvars->orig_gts[vi]].data());
}
allele_error_counts[allele_count_errtype]++;

// force 1|1 query variants to be evaluated as such
if (qvars->orig_gts[vi] == GT_ALT1_ALT1) {
qvars->calc_gts[vi] = qvars->orig_gts[vi];

// if we're in a PHASE_SWAP phase block, we should swap data here since otherwise
// there's no way based on calc_gt 1|1 to know to look at data from other hap
if (qvars->pb_phases[vi] == PHASE_SWAP) {
std::swap(qvars->errtypes[HAP1][vi], qvars->errtypes[HAP2][vi]);
std::swap(qvars->sync_group[HAP1][vi], qvars->sync_group[HAP2][vi]);
std::swap(qvars->callq[HAP1][vi], qvars->callq[HAP2][vi]);
std::swap(qvars->ref_ed[HAP1][vi], qvars->ref_ed[HAP2][vi]);
std::swap(qvars->query_ed[HAP1][vi], qvars->query_ed[HAP2][vi]);
std::swap(qvars->credit[HAP1][vi], qvars->credit[HAP2][vi]);
}
}

// force original 0|1 and 1|0 query variants to be evaluated as such, though truth differs
// (calc_gt has allele count 2, orig_gt has allele count 1)
else if (allele_count_errtype == AC_ERR_2_TO_1) {

// for called 1|1 variants, keep variant call with better calculated credit
if (qvars->credit[HAP1][vi] > qvars->credit[HAP2][vi]) {
qvars->set_var_calcgt_on_hap(vi, HAP2, false);
} else if (qvars->credit[HAP1][vi] < qvars->credit[HAP2][vi]) {
qvars->set_var_calcgt_on_hap(vi, HAP1, false);
} else { // default to current phasing
if (qvars->pb_phases[vi] == PHASE_ORIG) {
qvars->calc_gts[vi] = qvars->orig_gts[vi];
} else { // PHASE_SWAP
qvars->calc_gts[vi] = (qvars->orig_gts[vi] == GT_REF_ALT1) ?
GT_ALT1_REF : GT_REF_ALT1;
}
}
// (calc_gt has allele count 0, orig_gt has allele count 1)
} else if (allele_count_errtype == AC_ERR_0_TO_1) {
// try to use hap with max credit
if (qvars->credit[HAP1][vi] > qvars->credit[HAP2][vi]) {
qvars->set_var_calcgt_on_hap(vi, HAP1, true);
} else if (qvars->credit[HAP1][vi] < qvars->credit[HAP2][vi]) {
qvars->set_var_calcgt_on_hap(vi, HAP2, true);
} else { // default to current phasing
if (qvars->pb_phases[vi] == PHASE_ORIG) {
qvars->calc_gts[vi] = qvars->orig_gts[vi];
} else { // PHASE_SWAP
qvars->calc_gts[vi] = (qvars->orig_gts[vi] == GT_REF_ALT1) ?
GT_ALT1_REF : GT_REF_ALT1;
}
}
}
}

// false negative errors can only be counted from the truth VCF
std::shared_ptr<ctgVariants> tvars =
this->phase_blocks[ctg]->ctg_superclusters->callset_vars[TRUTH];
for (int vi = 0; vi < qvars->n; vi++) {
if (tvars->orig_gts[vi] == GT_ALT1_ALT1) {
if (tvars->errtypes[HAP1][vi] == ERRTYPE_FN &&
tvars->errtypes[HAP2][vi] == ERRTYPE_FN) {
allele_error_counts[AC_ERR_2_TO_0]++;
}
} else if (tvars->orig_gts[vi] == GT_REF_ALT1) {
if (tvars->errtypes[HAP2][vi] == ERRTYPE_FN) {
allele_error_counts[AC_ERR_1_TO_0]++;
}
} else if (tvars->orig_gts[vi] == GT_ALT1_REF) {
if (tvars->errtypes[HAP1][vi] == ERRTYPE_FN) {
allele_error_counts[AC_ERR_1_TO_0]++;
}
}
}
}

if (g.verbosity >= 1) INFO(" ");
if (g.verbosity >= 1) INFO(" Genotype Error Summary:");
if (g.verbosity >= 1) INFO(" 0/0 -> 0/1: %-8d 1 FP", allele_error_counts[AC_ERR_0_TO_1]);
if (g.verbosity >= 1) INFO(" 0/0 -> 1/1: %-8d 2 FP", allele_error_counts[AC_ERR_0_TO_2]);
if (g.verbosity >= 1) INFO(" 0/1 -> 0/0: %-8d 1 FN", allele_error_counts[AC_ERR_1_TO_0]);
if (g.verbosity >= 1) INFO(" 0/1 -> 0/1: %-8d 1 TP", allele_error_counts[AC_ERR_1_TO_1]);
if (g.verbosity >= 1) INFO(" 0/1 -> 1/1: %-8d 1 TP, 1 FP (False Homozygous)", allele_error_counts[AC_ERR_1_TO_2]);
if (g.verbosity >= 1) INFO(" 1/1 -> 0/0: %-8d 2 FN", allele_error_counts[AC_ERR_2_TO_0]);
if (g.verbosity >= 1) INFO(" 1/1 -> 0/1: %-8d 1 TP, 1 FN (False Heterozygous)", allele_error_counts[AC_ERR_2_TO_1]);
if (g.verbosity >= 1) INFO(" 1/1 -> 1/1: %-8d 2 TP", allele_error_counts[AC_ERR_2_TO_2]);
if (g.verbosity >= 1) INFO(" ");
this->write_genotype_error_summary(allele_error_counts);
}


/*******************************************************************************/


void phaseblockData::write_genotype_error_summary(const std::vector<int> & allele_error_counts) {
std::string out_genotype_errors_fn = g.out_prefix + "genotype-errors.tsv";
FILE* out_genotype_errors = 0;
if (g.verbosity >= 1) INFO(" Writing genotype error results to '%s'", out_genotype_errors_fn.data());
out_genotype_errors = fopen(out_genotype_errors_fn.data(), "w");
fprintf(out_genotype_errors, "ALLELE_COUNT_0_TO_1\tALLELE_COUNT_0_TO_2\tALLELE_COUNT_1_TO_0\tALLELE_COUNT_1_TO_1\tALLELE_COUNT_1_TO_2\tALLELE_COUNT_2_TO_0\tALLELE_COUNT_2_TO_1\tALLELE_COUNT_2_TO_2\n");
fprintf(out_genotype_errors, "%d\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
allele_error_counts[AC_ERR_0_TO_1],
allele_error_counts[AC_ERR_0_TO_2],
allele_error_counts[AC_ERR_1_TO_0],
allele_error_counts[AC_ERR_1_TO_1],
allele_error_counts[AC_ERR_1_TO_2],
allele_error_counts[AC_ERR_2_TO_0],
allele_error_counts[AC_ERR_2_TO_1],
allele_error_counts[AC_ERR_2_TO_2]);
fclose(out_genotype_errors);
}


Expand Down Expand Up @@ -548,7 +671,7 @@ int phaseblockData::calculate_ng50(bool break_on_switch, bool break_on_flip) {

// get sizes of each correct phase block (split on flips, not just switch)
std::vector<int> correct_blocks;
for (std::string ctg: this->contigs) {
for (const std::string & ctg: this->contigs) {

std::shared_ptr<ctgPhaseblocks> ctg_pbs = this->phase_blocks[ctg];
std::shared_ptr<ctgVariants> qvars = ctg_pbs->ctg_superclusters->callset_vars[QUERY];
Expand Down
2 changes: 2 additions & 0 deletions src/phase.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ class phaseblockData {
int flip_errors, int variants, int ng50, int s_ngc50, int sf_ngc50);
void write_switchflips();
void phase();
void fix_allele_counts();
void write_genotype_error_summary(const std::vector<int> & allele_count_errors);
int calculate_ng50(bool break_on_switch = false, bool break_on_flip = false);

std::shared_ptr<fastaData> ref;
Expand Down
36 changes: 35 additions & 1 deletion src/variant.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,45 @@ void ctgVariants::add_var(int pos, int rlen, uint8_t type, uint8_t loc,
// added during phase()
this->phases.push_back(PHASE_NONE);
this->pb_phases.push_back(PHASE_NONE);
this->ac_errtype.push_back(AC_ERR_NONE);
this->ac_errtype.push_back(AC_UNKNOWN);
}

/******************************************************************************/

/* Precision/Recall calculation allows dynamically adjusting the variant genotype.
* Afterwards, this functions records how the genotype allele count has changed.
*/
int ctgVariants::set_allele_errtype(int vi) {
if (this->calc_gts[vi] == GT_ALT1_REF || this->calc_gts[vi] == GT_REF_ALT1) {
if (this->orig_gts[vi] == GT_ALT1_ALT1) {
return this->ac_errtype[vi] = AC_ERR_1_TO_2;
} else if (this->orig_gts[vi] == GT_ALT1_REF || this->orig_gts[vi] == GT_REF_ALT1) {
return this->ac_errtype[vi] = AC_ERR_1_TO_1;
} else {
return this->ac_errtype[vi] = AC_ERR_1_TO_0;
}
} else if (this->calc_gts[vi] == GT_ALT1_ALT1) {
if (this->orig_gts[vi] == GT_ALT1_ALT1) {
return this->ac_errtype[vi] = AC_ERR_2_TO_2;
} else if (this->orig_gts[vi] == GT_ALT1_REF || this->orig_gts[vi] == GT_REF_ALT1) {
return this->ac_errtype[vi] = AC_ERR_2_TO_1;
} else {
return this->ac_errtype[vi] = AC_ERR_2_TO_0;
}
} else if (this->calc_gts[vi] == GT_REF_REF) {
if (this->orig_gts[vi] == GT_ALT1_ALT1) {
return this->ac_errtype[vi] = AC_ERR_0_TO_2;
} else if (this->orig_gts[vi] == GT_ALT1_REF || this->orig_gts[vi] == GT_REF_ALT1) {
return this->ac_errtype[vi] = AC_ERR_0_TO_1;
}
}
return AC_UNKNOWN;
}


/******************************************************************************/


void variantData::print_phase_info(int callset) {

int total_phase_sets = 0;
Expand Down
1 change: 1 addition & 0 deletions src/variant.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class ctgVariants {
int phase_block, bool phase_switch, bool phase_flip, bool query = false);
bool var_on_hap(int var_idx, int hap_idx, bool calc = false) const;
void set_var_calcgt_on_hap(int var_idx, int hap, bool set = true);
int set_allele_errtype(int var_idx);
bool calcgt_is_swapped(int var_idx) const;

// originally parsed data (size n)
Expand Down

0 comments on commit 991c247

Please sign in to comment.