Skip to content

Commit dd67202

Browse files
committed
feat: track genotype errors, print to summary.vcf
1 parent a1a2852 commit dd67202

File tree

8 files changed

+77
-34
lines changed

8 files changed

+77
-34
lines changed

src/defs.h

+5
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ class idx4;
6868
#define ERRTYPE_UN 5 // unknown
6969
#define ERRTYPES 6
7070

71+
#define AC_ERR_NONE 0 // e.g. 0|1 -> 1|0 or 1|1 -> 1|1
72+
#define AC_ERR_1_TO_2 1 // e.g. 0|1 -> 1|1
73+
#define AC_ERR_2_TO_1 2 // e.g. 1|1 -> 0|1
74+
#define AC_ERRTYPES 3 // allele count error types
75+
7176
#define SWITCHTYPE_FLIP 0
7277
#define SWITCHTYPE_SWITCH 1
7378
#define SWITCHTYPE_SWITCH_AND_FLIP 2

src/dist.cpp

+43-22
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,6 @@ void calc_prec_recall(
280280
graph->qtypes[prev.qni] != TYPE_REF) { // node is variant
281281
if (print) printf("new query variant\n");
282282
int qvar_idx = graph->qidxs[prev.qni];
283-
qvars->set_var_calcgt_on_hap(qvar_idx, truth_hap, true);
284283
sync_qvars.push_back(qvar_idx);
285284
}
286285
// if we move into a query variant, include it in sync group and ref dist calc
@@ -352,6 +351,8 @@ void calc_prec_recall(
352351
qvars->ref_ed[truth_hap][qvar_idx] = ref_dist;
353352
qvars->query_ed[truth_hap][qvar_idx] = query_dist;
354353
qvars->credit[truth_hap][qvar_idx] = credit;
354+
if (errtype == ERRTYPE_TP)
355+
qvars->set_var_calcgt_on_hap(qvar_idx, truth_hap, true);
355356
}
356357
if (errtype == ERRTYPE_FP) errtype = ERRTYPE_FN;
357358
for (int tvar_idx : sync_tvars) {
@@ -805,7 +806,7 @@ void precision_recall_wrapper(
805806
int sc_idx = sc_groups[thread_step][SC_IDX][supclust_idx];
806807

807808
// set superclusters pointer
808-
std::shared_ptr<ctgSuperclusters> sc =
809+
std::shared_ptr<ctgSuperclusters> scs =
809810
clusterdata_ptr->superclusters[ctg];
810811

811812
////////////////////
@@ -815,13 +816,13 @@ void precision_recall_wrapper(
815816
// print cluster info
816817
printf("\n\nSupercluster: %d\n", sc_idx);
817818
for (int c = 0; c < CALLSETS; c++) {
818-
int cluster_beg = sc->superclusters[c][sc_idx];
819-
int cluster_end = sc->superclusters[c][sc_idx+1];
819+
int cluster_beg = scs->superclusters[c][sc_idx];
820+
int cluster_end = scs->superclusters[c][sc_idx+1];
820821
printf("%s: %d clusters (%d-%d)\n", callset_strs[c].data(),
821822
cluster_end-cluster_beg, cluster_beg, cluster_end);
822823

823824
for (int j = cluster_beg; j < cluster_end; j++) {
824-
std::shared_ptr<ctgVariants> vars = sc->callset_vars[c];
825+
std::shared_ptr<ctgVariants> vars = scs->callset_vars[c];
825826
int variant_beg = vars->clusters[j];
826827
int variant_end = vars->clusters[j+1];
827828
printf("\tCluster %d: %d variants (%d-%d)\n", j,
@@ -845,46 +846,66 @@ void precision_recall_wrapper(
845846
// query1/query2 graph to truth1, query1/query2 graph to truth2
846847
for (int hi = 0; hi < HAPS; hi++) {
847848
std::shared_ptr<Graph> graph(
848-
new Graph(sc, sc_idx, clusterdata_ptr->ref, ctg, hi));
849+
new Graph(scs, sc_idx, clusterdata_ptr->ref, ctg, hi));
849850
if (print) graph->print();
850851

851852
std::unordered_map<idx4, idx4> ptrs;
852853
calc_prec_recall_aln(graph, ptrs, print);
853854
calc_prec_recall(graph, ptrs, hi, print);
854855
}
855-
856-
// don't allow query 0|1 -> 1|1 if second allele is a FP
857-
fix_prec_recall_genotype(sc, sc_idx);
858856
}
859857
}
860858

861859

862860
/******************************************************************************/
863861

864862
/* The precision-recall calculation allows the calculated GT to be anything (including 0|0 or 1|1).
865-
We need to do some post-processing to fix this and set it to the most reasonable value.
863+
We need to do some post-processing to fix this and force the allele counts to be unchanged.
866864
*/
867-
void fix_prec_recall_genotype(std::shared_ptr<ctgSuperclusters> sc, int sc_idx) {
865+
void fix_genotype_allele_counts(std::shared_ptr<ctgSuperclusters> sc, int sc_idx) {
868866
int cluster_beg = sc->superclusters[QUERY][sc_idx];
869867
int cluster_end = sc->superclusters[QUERY][sc_idx+1];
870868
for (int ci = cluster_beg; ci < cluster_end; ci++) {
871869
std::shared_ptr<ctgVariants> vars = sc->callset_vars[QUERY];
872870
for (int vi = vars->clusters[ci]; vi < vars->clusters[ci+1]; vi++) {
873871

874-
// don't allow calculated GT to be 1|1 if either is a FP (convert to 0/1)
875-
if (vars->calc_gts[vi] == GT_ALT1_ALT1
876-
&& (vars->orig_gts[vi] == GT_REF_ALT1 || vars->orig_gts[vi] == GT_ALT1_REF)) {
877-
if (vars->errtypes[HAP1][vi] == ERRTYPE_FP) {
878-
vars->set_var_calcgt_on_hap(vi, HAP1, false);
879-
} else if (vars->errtypes[HAP2][vi] == ERRTYPE_FP) {
880-
vars->set_var_calcgt_on_hap(vi, HAP2, false);
872+
// force 1|1 query variants to be evaluated as such
873+
if (vars->orig_gts[vi] == GT_ALT1_ALT1) {
874+
if (vars->calc_gts[vi] == GT_REF_ALT1 || vars->calc_gts[vi] == GT_ALT1_REF) {
875+
vars->ac_errtype[vi] = AC_ERR_1_TO_2;
881876
}
882-
}
883-
// if the variant was a complete FP, don't mark it as a GT error. instead, set the
884-
// calculated GT to be equal to what was expected
885-
if (vars->calc_gts[vi] == GT_REF_REF) {
886877
vars->calc_gts[vi] = vars->orig_gts[vi];
887878
}
879+
// force 0|1 and 1|0 query variants to be evaluated as such
880+
else if (vars->orig_gts[vi] == GT_REF_ALT1 || vars->orig_gts[vi] == GT_ALT1_REF) {
881+
882+
// for called 1|1 variants, keep variant call with better calculated credit
883+
if (vars->calc_gts[vi] == GT_ALT1_ALT1) {
884+
885+
vars->ac_errtype[vi] = AC_ERR_2_TO_1;
886+
if (vars->credit[HAP1][vi] > vars->credit[HAP2][vi]) {
887+
vars->set_var_calcgt_on_hap(vi, HAP2, false);
888+
} else if (vars->credit[HAP1][vi] < vars->credit[HAP2][vi]) {
889+
vars->set_var_calcgt_on_hap(vi, HAP1, false);
890+
} else { // default to current phasing
891+
if (vars->pb_phases[vi] == PHASE_ORIG) {
892+
vars->calc_gts[vi] = vars->orig_gts[vi];
893+
} else { // PHASE_SWAP
894+
vars->calc_gts[vi] = (vars->orig_gts[vi] == GT_REF_ALT1) ?
895+
GT_ALT1_REF : GT_REF_ALT1;
896+
}
897+
}
898+
}
899+
900+
// for called 0|0 variants, don't upset the current phasing
901+
if (vars->calc_gts[vi] == GT_REF_REF) {
902+
if (vars->pb_phases[vi] == PHASE_ORIG) {
903+
vars->calc_gts[vi] = vars->orig_gts[vi];
904+
} else { // PHASE_SWAP
905+
vars->calc_gts[vi] = (vars->orig_gts[vi] == GT_REF_ALT1) ? GT_ALT1_REF : GT_REF_ALT1;
906+
}
907+
}
908+
}
888909
}
889910
}
890911
}

src/dist.h

+1-1
Original file line numberDiff line numberDiff line change
@@ -168,7 +168,7 @@ void calc_prec_recall(
168168
bool print = false
169169
);
170170

171-
void fix_prec_recall_genotype(std::shared_ptr<ctgSuperclusters> sc, int sc_idx);
171+
void fix_genotype_allele_counts(std::shared_ptr<ctgSuperclusters> sc, int sc_idx);
172172

173173
/******************************************************************************/
174174

src/globals.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@ std::vector<std::string> callset_strs = {"QUERY", "TRUTH"};
1414
std::vector<std::string> error_strs = {"TP", "FP", "FN", "PE", "GE", "??"};
1515
std::vector<std::string> gt_strs =
1616
{"0", "1", "0|0", "0|1", "1|0", "1|1", "1|2", "2|1", ".|.", "M|N" };
17-
std::vector<std::string> phase_strs = {"=", "X", "?"};
17+
std::vector<std::string> phase_strs = {"0", "1", "."};
18+
std::vector<std::string> ac_strs = {".", "+", "-"};
1819
std::vector<std::string> region_strs = {"OUTSIDE", "INSIDE ", "BORDER ", "OFF CTG"};
1920
std::vector<std::string> switch_strs =
2021
{"FLIP", "SWITCH", "SWITCH+FLIP", "SWITCH_ERR", "FLIP_BEG", "FLIP_END", "NONE"};

src/globals.h

+1
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ extern std::vector<std::string> aln_strs;
8080
extern std::vector<std::string> callset_strs;
8181
extern std::vector<std::string> error_strs;
8282
extern std::vector<std::string> gt_strs;
83+
extern std::vector<std::string> ac_strs;
8384
extern std::vector<std::string> phase_strs;
8485
extern std::vector<std::string> region_strs;
8586
extern std::vector<std::string> switch_strs;

src/phase.cpp

+12-1
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,10 @@ void phaseblockData::write_summary_vcf(std::string out_vcf_fn) {
3535
fprintf(out_vcf, "##FORMAT=<ID=SG,Number=1,Type=Integer,Description=\"Sync Group (index in supercluster, for credit assignment)\">\n");
3636
fprintf(out_vcf, "##FORMAT=<ID=PS,Number=1,Type=Integer,Description=\"Phase Set identifier (input, per-variant)\">\n");
3737
fprintf(out_vcf, "##FORMAT=<ID=PB,Number=1,Type=Integer,Description=\"Phase Block (output, per-supercluster, index in contig)\">\n");
38-
fprintf(out_vcf, "##FORMAT=<ID=BS,Number=1,Type=Integer,Description=\"Block State (phaseblock truth-to-query mapping state; 0 = T1Q1:T2Q2, 1 = T1Q2:T2Q1)\">\n");
38+
fprintf(out_vcf, "##FORMAT=<ID=BP,Number=1,Type=Integer,Description=\"Block Phase: 0 = PHASE_KEEP, 1 = PHASE_SWAP)\">\n");
39+
fprintf(out_vcf, "##FORMAT=<ID=VP,Number=1,Type=Integer,Description=\"Variant Phase: 0 = PHASE_ORIG, 1 = PHASE_SWAP, . = PHASE_NONE)\">\n");
3940
fprintf(out_vcf, "##FORMAT=<ID=FE,Number=1,Type=Integer,Description=\"Flip Error (a per-supercluster error)\">\n");
41+
fprintf(out_vcf, "##FORMAT=<ID=GE,Number=1,Type=String,Description=\"Genotype Error ('+' if 0/1 -> 1/1, '-' if 1/1 -> 0/1, '.' otherwise)\">\n");
4042
fprintf(out_vcf, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tTRUTH\tQUERY\n");
4143

4244
// write variants
@@ -228,7 +230,16 @@ phaseblockData::phaseblockData(std::shared_ptr<superclusterData> clusterdata_ptr
228230
ctg_pbs->phase_blocks.push_back(qvars->n);
229231
}
230232

233+
// calculate phasings, flip, and switch errors
231234
this->phase();
235+
236+
// fix genotypes, with ties defaulting to current phase
237+
for (const std::string & ctg : this->contigs) {
238+
std::shared_ptr<ctgSuperclusters> scs = clusterdata_ptr->superclusters[ctg];
239+
for (int sc_idx = 0; sc_idx < scs->n; sc_idx++) {
240+
fix_genotype_allele_counts(scs, sc_idx);
241+
}
242+
}
232243
}
233244

234245

src/variant.cpp

+10-7
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ void ctgVariants::add_var(int pos, int rlen, uint8_t type, uint8_t loc,
5252
// added during phase()
5353
this->phases.push_back(PHASE_NONE);
5454
this->pb_phases.push_back(PHASE_NONE);
55+
this->ac_errtype.push_back(AC_ERR_NONE);
5556
}
5657

5758
/******************************************************************************/
@@ -431,14 +432,14 @@ void ctgVariants::print_var_info(FILE* out_fp, std::shared_ptr<fastaData> ref,
431432
char ref_base;
432433
switch (this->types[idx]) {
433434
case TYPE_SUB:
434-
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:RD:QD:BK:QQ:SC:SG:PS:PB:BS:FE",
435+
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:RD:QD:BK:QQ:SC:SG:PS:PB:BP:VP:FE:GE",
435436
ctg.data(), this->poss[idx]+1, this->refs[idx].data(),
436437
this->alts[idx].data());
437438
break;
438439
case TYPE_INS:
439440
case TYPE_DEL:
440441
ref_base = ref->fasta.at(ctg)[this->poss[idx]-1];
441-
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:RD:QD:BK:QQ:SC:SG:PS:PB:BS:FE", ctg.data(),
442+
fprintf(out_fp, "%s\t%d\t.\t%s\t%s\t.\tPASS\t.\tGT:BD:BC:RD:QD:BK:QQ:SC:SG:PS:PB:BP:VP:FE:GE", ctg.data(),
442443
this->poss[idx], (ref_base + this->refs[idx]).data(),
443444
(ref_base + this->alts[idx]).data());
444445
break;
@@ -450,7 +451,7 @@ void ctgVariants::print_var_info(FILE* out_fp, std::shared_ptr<fastaData> ref,
450451

451452
void ctgVariants::print_var_empty(FILE* out_fp, int sc_idx,
452453
int phase_block, bool query /* = false */) {
453-
fprintf(out_fp, "\t.:.:.:.:.:.:.:%d:.:.:%d:.:.%s", sc_idx, phase_block, query ? "\n" : "");
454+
fprintf(out_fp, "\t.:.:.:.:.:.:.:%d:.:.:%d:.:.:.:.%s", sc_idx, phase_block, query ? "\n" : "");
454455
}
455456

456457

@@ -470,7 +471,7 @@ void ctgVariants::print_var_sample(FILE* out_fp, int vi, int hi, const std::stri
470471
errtype = query ? "FP" : "FN"; match_type = "lm";
471472
}
472473

473-
fprintf(out_fp, "\t%s:%s:%f:%s:%s:%s:%d:%d:%d:%d:%d:%s:%s%s", gt.data(), errtype.data(),
474+
fprintf(out_fp, "\t%s:%s:%f:%s:%s:%s:%d:%d:%d:%d:%d:%s:%s:%s:%s%s", gt.data(), errtype.data(),
474475
this->credit[hi][vi],
475476
this->ref_ed[hi][vi] == 0 ? "." :
476477
std::to_string(this->ref_ed[hi][vi]).data(),
@@ -479,16 +480,18 @@ void ctgVariants::print_var_sample(FILE* out_fp, int vi, int hi, const std::stri
479480
match_type.data(), int(this->var_quals[vi]), sc_idx,
480481
int(this->sync_group[hi][vi]), this->phase_sets[vi], phase_block,
481482
query ? (phase_switch ? "1" : "0") : "." ,
483+
phase_strs[this->phases[vi]].data(),
482484
query ? (phase_flip ? "1" : "0") : "." ,
485+
ac_strs[this->ac_errtype[vi]].data(),
483486
query ? "\n" : "");
484487
}
485488

486489

487490
/*******************************************************************************/
488491

489492

490-
void variantData::print_variant(FILE* out_fp, std::string ctg, int pos, int type,
491-
std::string ref, std::string alt, float qual, std::string gt) {
493+
void variantData::print_variant(FILE* out_fp, const std::string & ctg, int pos, int type,
494+
const std::string & ref, const std::string & alt, float qual, const std::string & gt) {
492495

493496
char ref_base;
494497
switch (type) {
@@ -519,7 +522,7 @@ void variantData::set_header(const std::shared_ptr<variantData> vcf) {
519522
this->lengths = vcf->lengths;
520523
this->ploidy = vcf->ploidy;
521524
this->ref = vcf->ref;
522-
for (std::string ctg : this->contigs)
525+
for (const std::string & ctg : this->contigs)
523526
for (int hap = 0; hap < 2; hap++)
524527
this->variants[hap][ctg] =
525528
std::shared_ptr<ctgVariants>(new ctgVariants(ctg));

src/variant.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class ctgVariants {
6161
// set during phase()
6262
std::vector<int> phases; // variant keep/swap/unknown, from alignment (calc_gt relative to orig_gt)
6363
std::vector<int> pb_phases; // phaseblock keep/swap, from phasing algorithm
64+
std::vector<int> ac_errtype; // allele count error type (e.g. 0|1 -> 1|1)
6465
};
6566

6667
class variantData {
@@ -72,8 +73,8 @@ class variantData {
7273

7374
// functions
7475
void write_vcf(std::string vcf_fn);
75-
void print_variant(FILE* out_fp, std::string ctg, int pos, int type,
76-
std::string ref, std::string alt, float qual, std::string gt);
76+
void print_variant(FILE* out_fp, const std::string & ctg, int pos, int type,
77+
const std::string & ref, const std::string & alt, float qual, const std::string & gt);
7778
void set_header(const std::shared_ptr<variantData> vcf);
7879
void add_variants( const std::vector<int> & cigar, int hap,
7980
int ref_pos, const std::string & ctg, const std::string & query,

0 commit comments

Comments
 (0)