@@ -280,7 +280,6 @@ void calc_prec_recall(
280
280
graph->qtypes [prev.qni ] != TYPE_REF) { // node is variant
281
281
if (print) printf (" new query variant\n " );
282
282
int qvar_idx = graph->qidxs [prev.qni ];
283
- qvars->set_var_calcgt_on_hap (qvar_idx, truth_hap, true );
284
283
sync_qvars.push_back (qvar_idx);
285
284
}
286
285
// if we move into a query variant, include it in sync group and ref dist calc
@@ -352,6 +351,8 @@ void calc_prec_recall(
352
351
qvars->ref_ed [truth_hap][qvar_idx] = ref_dist;
353
352
qvars->query_ed [truth_hap][qvar_idx] = query_dist;
354
353
qvars->credit [truth_hap][qvar_idx] = credit;
354
+ if (errtype == ERRTYPE_TP)
355
+ qvars->set_var_calcgt_on_hap (qvar_idx, truth_hap, true );
355
356
}
356
357
if (errtype == ERRTYPE_FP) errtype = ERRTYPE_FN;
357
358
for (int tvar_idx : sync_tvars) {
@@ -805,7 +806,7 @@ void precision_recall_wrapper(
805
806
int sc_idx = sc_groups[thread_step][SC_IDX][supclust_idx];
806
807
807
808
// set superclusters pointer
808
- std::shared_ptr<ctgSuperclusters> sc =
809
+ std::shared_ptr<ctgSuperclusters> scs =
809
810
clusterdata_ptr->superclusters [ctg];
810
811
811
812
// //////////////////
@@ -815,13 +816,13 @@ void precision_recall_wrapper(
815
816
// print cluster info
816
817
printf (" \n\n Supercluster: %d\n " , sc_idx);
817
818
for (int c = 0 ; c < CALLSETS; c++) {
818
- int cluster_beg = sc ->superclusters [c][sc_idx];
819
- int cluster_end = sc ->superclusters [c][sc_idx+1 ];
819
+ int cluster_beg = scs ->superclusters [c][sc_idx];
820
+ int cluster_end = scs ->superclusters [c][sc_idx+1 ];
820
821
printf (" %s: %d clusters (%d-%d)\n " , callset_strs[c].data (),
821
822
cluster_end-cluster_beg, cluster_beg, cluster_end);
822
823
823
824
for (int j = cluster_beg; j < cluster_end; j++) {
824
- std::shared_ptr<ctgVariants> vars = sc ->callset_vars [c];
825
+ std::shared_ptr<ctgVariants> vars = scs ->callset_vars [c];
825
826
int variant_beg = vars->clusters [j];
826
827
int variant_end = vars->clusters [j+1 ];
827
828
printf (" \t Cluster %d: %d variants (%d-%d)\n " , j,
@@ -845,46 +846,66 @@ void precision_recall_wrapper(
845
846
// query1/query2 graph to truth1, query1/query2 graph to truth2
846
847
for (int hi = 0 ; hi < HAPS; hi++) {
847
848
std::shared_ptr<Graph> graph (
848
- new Graph (sc , sc_idx, clusterdata_ptr->ref , ctg, hi));
849
+ new Graph (scs , sc_idx, clusterdata_ptr->ref , ctg, hi));
849
850
if (print) graph->print ();
850
851
851
852
std::unordered_map<idx4, idx4> ptrs;
852
853
calc_prec_recall_aln (graph, ptrs, print);
853
854
calc_prec_recall (graph, ptrs, hi, print);
854
855
}
855
-
856
- // don't allow query 0|1 -> 1|1 if second allele is a FP
857
- fix_prec_recall_genotype (sc, sc_idx);
858
856
}
859
857
}
860
858
861
859
862
860
/* *****************************************************************************/
863
861
864
862
/* The precision-recall calculation allows the calculated GT to be anything (including 0|0 or 1|1).
865
- We need to do some post-processing to fix this and set it to the most reasonable value .
863
+ We need to do some post-processing to fix this and force each query variant's allele count to remain unchanged.
866
864
*/
867
- void fix_prec_recall_genotype (std::shared_ptr<ctgSuperclusters> sc, int sc_idx) {
865
+ void fix_genotype_allele_counts (std::shared_ptr<ctgSuperclusters> sc, int sc_idx) {
868
866
int cluster_beg = sc->superclusters [QUERY][sc_idx];
869
867
int cluster_end = sc->superclusters [QUERY][sc_idx+1 ];
870
868
for (int ci = cluster_beg; ci < cluster_end; ci++) {
871
869
std::shared_ptr<ctgVariants> vars = sc->callset_vars [QUERY];
872
870
for (int vi = vars->clusters [ci]; vi < vars->clusters [ci+1 ]; vi++) {
873
871
874
- // don't allow calculated GT to be 1|1 if either is a FP (convert to 0/1)
875
- if (vars->calc_gts [vi] == GT_ALT1_ALT1
876
- && (vars->orig_gts [vi] == GT_REF_ALT1 || vars->orig_gts [vi] == GT_ALT1_REF)) {
877
- if (vars->errtypes [HAP1][vi] == ERRTYPE_FP) {
878
- vars->set_var_calcgt_on_hap (vi, HAP1, false );
879
- } else if (vars->errtypes [HAP2][vi] == ERRTYPE_FP) {
880
- vars->set_var_calcgt_on_hap (vi, HAP2, false );
872
+ // force 1|1 query variants to be evaluated as such
873
+ if (vars->orig_gts [vi] == GT_ALT1_ALT1) {
874
+ if (vars->calc_gts [vi] == GT_REF_ALT1 || vars->calc_gts [vi] == GT_ALT1_REF) {
875
+ vars->ac_errtype [vi] = AC_ERR_1_TO_2;
881
876
}
882
- }
883
- // if the variant was a complete FP, don't mark it as a GT error. instead, set the
884
- // calculated GT to be equal to what was expected
885
- if (vars->calc_gts [vi] == GT_REF_REF) {
886
877
vars->calc_gts [vi] = vars->orig_gts [vi];
887
878
}
879
+ // force 0|1 and 1|0 query variants to be evaluated as such
880
+ else if (vars->orig_gts [vi] == GT_REF_ALT1 || vars->orig_gts [vi] == GT_ALT1_REF) {
881
+
882
+ // for called 1|1 variants, keep variant call with better calculated credit
883
+ if (vars->calc_gts [vi] == GT_ALT1_ALT1) {
884
+
885
+ vars->ac_errtype [vi] = AC_ERR_2_TO_1;
886
+ if (vars->credit [HAP1][vi] > vars->credit [HAP2][vi]) {
887
+ vars->set_var_calcgt_on_hap (vi, HAP2, false );
888
+ } else if (vars->credit [HAP1][vi] < vars->credit [HAP2][vi]) {
889
+ vars->set_var_calcgt_on_hap (vi, HAP1, false );
890
+ } else { // default to current phasing
891
+ if (vars->pb_phases [vi] == PHASE_ORIG) {
892
+ vars->calc_gts [vi] = vars->orig_gts [vi];
893
+ } else { // PHASE_SWAP
894
+ vars->calc_gts [vi] = (vars->orig_gts [vi] == GT_REF_ALT1) ?
895
+ GT_ALT1_REF : GT_REF_ALT1;
896
+ }
897
+ }
898
+ }
899
+
900
+ // for called 0|0 variants, don't upset the current phasing
901
+ if (vars->calc_gts [vi] == GT_REF_REF) {
902
+ if (vars->pb_phases [vi] == PHASE_ORIG) {
903
+ vars->calc_gts [vi] = vars->orig_gts [vi];
904
+ } else { // PHASE_SWAP
905
+ vars->calc_gts [vi] = (vars->orig_gts [vi] == GT_REF_ALT1) ? GT_ALT1_REF : GT_REF_ALT1;
906
+ }
907
+ }
908
+ }
888
909
}
889
910
}
890
911
}
0 commit comments