diff --git a/analysis-v2/phab_cm/1_phab_eval_vcfs.sh b/analysis-v2/phab_cm/1_phab_eval_vcfs.sh index 32c16b0..1effc65 100755 --- a/analysis-v2/phab_cm/1_phab_eval_vcfs.sh +++ b/analysis-v2/phab_cm/1_phab_eval_vcfs.sh @@ -2,15 +2,15 @@ source globals.sh -# # truvari phab VCF normalization v4.0 -# source ~/software/Truvari-v4.0.0/venv3.10/bin/activate +# # truvari phab VCF normalization v4.2.1 +# source ~/software/Truvari-4.2.1/venv3.10/bin/activate # mkdir -p $out/phab # for i in "${!query_names[@]}"; do -# echo "truvari v4.0: phab mafft for '${query_names[i]}'" +# echo "phab: mafft for '${query_names[i]}'" # truvari phab \ # -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ -# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ -# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ +# -b $data/${truth_name}-${truth_version}/split/${truth_name}.most.vcf.gz \ +# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.most.vcf.gz \ # --bSamples HG002 \ # --cSamples HG002 \ # -f $data/refs/$ref_name \ @@ -55,55 +55,51 @@ for i in "${!query_names[@]}"; do $out/phab/${query_names[i]}.TRUTH.vcf.gz \ $data/refs/$ref_name \ --bed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ - -l 1000 -v 2 \ + -l 1000 \ -p $out/phab-vcfdist/${query_names[i]}. \ 2> $out/phab-vcfdist/${query_names[i]}.log done -# # vcfeval evaluation -# mkdir -p $out/phab-vcfeval -# for i in "${!query_names[@]}"; do -# echo "phab-vcfeval: evaluating '${query_names[i]}'" -# rm -r $out/phab-vcfeval/${query_names[i]} -# $timer -v $rtg vcfeval \ -# -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ -# -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ -# -t $data/refs/${ref_name}.sdf \ -# -m ga4gh \ -# -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ -# --threads 64 \ -# --ref-overlap \ -# --all-records \ -# -o $out/phab-vcfeval/${query_names[i]} \ -# 2> $out/phab-vcfeval/${query_names[i]}.log -# gunzip $out/phab-vcfeval/${query_names[i]}/output.vcf.gz -# done - -# awk 'BEGIN{OFS="\t"} {print $1,$2,$3-1}' \ -# $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ -# > $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.minus1.bed +# vcfeval evaluation +mkdir -p $out/phab-vcfeval +for i in "${!query_names[@]}"; do + echo "phab-vcfeval: evaluating '${query_names[i]}'" + rm -r $out/phab-vcfeval/${query_names[i]} + $timer -v $rtg vcfeval \ + -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ + -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ + -t $data/refs/${ref_name}.sdf \ + --bed-regions $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ + --evaluation-regions $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ + --threads 64 \ + -m ga4gh \ + --ref-overlap \ + --all-records \ + -o $out/phab-vcfeval/${query_names[i]} \ + 2> $out/phab-vcfeval/${query_names[i]}.log + gunzip $out/phab-vcfeval/${query_names[i]}/output.vcf.gz +done -# # Truvari evaluation -# mkdir -p $out/phab-truvari -# source ~/software/Truvari-4.1.0/venv3.10/bin/activate -# for i in "${!query_names[@]}"; do -# echo "Truvari: evaluating '${query_names[i]}'" -# rm -r $out/phab-truvari/${query_names[i]} -# $timer -v truvari bench \ -# -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ -# -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ -# -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \ -# --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.minus1.bed \ -# --no-ref a \ -# --sizemin 1 --sizefilt 1 --sizemax 1000 \ -# --pick single \ -# --typeignore \ -# --dup-to-ins \ -# -o $out/phab-truvari/${query_names[i]} \ -# 2> $out/phab-truvari/${query_names[i]}.log -# laytr tru2ga \ -# -i $out/phab-truvari/${query_names[i]}/ \ -# -o $out/phab-truvari/${query_names[i]}/result -# gunzip $out/phab-truvari/${query_names[i]}/result*.gz -# done -# deactivate +# Truvari evaluation +mkdir -p $out/phab-truvari +source ~/software/Truvari-4.2.1/venv3.10/bin/activate +for i in "${!query_names[@]}"; do + echo "phab-truvari: evaluating '${query_names[i]}'" + rm -r $out/phab-truvari/${query_names[i]} + $timer -v truvari bench \ + -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \ + -c $out/phab/${query_names[i]}.QUERY.vcf.gz \ + --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ + --no-ref a \ + --sizemin 1 --sizefilt 1 --sizemax 1000 \ + --pick single \ + --typeignore \ + --dup-to-ins \ + -o $out/phab-truvari/${query_names[i]} \ + 2> $out/phab-truvari/${query_names[i]}.log + truvari ga4gh \ + -i $out/phab-truvari/${query_names[i]}/ \ + -o $out/phab-truvari/${query_names[i]}/result + gunzip $out/phab-truvari/${query_names[i]}/result*.gz +done +deactivate diff --git a/analysis-v2/phab_cm/2_eval_vcfs.sh b/analysis-v2/phab_cm/2_eval_vcfs.sh deleted file mode 100755 index 9fd1aa5..0000000 --- a/analysis-v2/phab_cm/2_eval_vcfs.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/bash - -source globals.sh - -# vcfdist evaluation -mkdir -p $out/vcfdist -for i in "${!query_names[@]}"; do - echo "vcfdist: evaluating '${query_names[i]}'" - $timer -v /home/timdunn/vcfdist/src/vcfdist \ - $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ - $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ - $data/refs/$ref_name \ - --bed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ - -t 64 -r 128 \ - -l 1000 \ - -p $out/vcfdist/${query_names[i]}. \ - 2> $out/vcfdist/${query_names[i]}.log -done - -# vcfeval evaluation -mkdir -p $out/vcfeval -for i in "${!query_names[@]}"; do - echo "vcfeval: evaluating '${query_names[i]}'" - rm -r $out/vcfeval/${query_names[i]} - $timer -v $rtg vcfeval \ - -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ - -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ - -t $data/refs/${ref_name}.sdf \ - -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \ - --threads 64 \ - --ref-overlap \ - --vcf-score-field=QUAL \ - -o $out/vcfeval/${query_names[i]} \ - 2> $out/vcfeval/${query_names[i]}.log - gunzip $out/vcfeval/${query_names[i]}/*.vcf.gz -done - -source ~/software/Truvari-4.1.0/venv3.10/bin/activate - -# # truvari evaluation -# mkdir -p $out/truvari -# for i in "${!query_names[@]}"; do -# echo "truvari: evaluating '${query_names[i]}'" -# rm -r $out/truvari/${query_names[i]} -# $timer -v truvari bench \ -# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ -# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ -# -f $data/refs/$ref_name \ -# --bSample HG002 \ -# --cSample HG002 \ -# --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.minus1.bed \ -# --no-ref a \ -# --sizemin 1 --sizefilt 1 --sizemax 1000 \ -# --pick single \ -# --typeignore \ -# --dup-to-ins \ -# -o $out/truvari/${query_names[i]} \ -# 2> $out/truvari/${query_names[i]}.log -# laytr tru2ga \ -# -i $out/truvari/${query_names[i]}/ \ -# -o $out/truvari/${query_names[i]}/result -# gunzip $out/truvari/${query_names[i]}/result*.gz -# done - -# # wfa/mafft refine -# for aln in "${truvari_refine_options[@]}"; do -# for i in "${!query_names[@]}"; do -# echo "\ntruvari: ${aln} refine '${query_names[i]}'" -# mkdir -p $out/truvari-${aln} -# rm -rf $out/truvari-${aln}/${query_names[i]} -# cp -r $out/truvari/${query_names[i]} \ -# $out/truvari-${aln}/${query_names[i]} -# $timer -v truvari refine \ -# -f $data/refs/$ref_name \ -# -t 64 \ -# --use-original-vcfs \ -# --recount \ -# --align ${aln} \ -# $out/truvari-${aln}/${query_names[i]} \ -# 2> $out/truvari-${aln}/${query_names[i]}.log -# echo "laytr: ${aln} refine '${query_names[i]}'" -# laytr tru2ga \ -# -i $out/truvari-${aln}/${query_names[i]}/phab_bench/ \ -# -o $out/truvari-${aln}/${query_names[i]}/phab_bench/result -# gunzip $out/truvari-${aln}/${query_names[i]}/phab_bench/result*.gz -# rm -f $out/truvari-${aln}/${query_names[i]}/*.gz -# rm -f $out/truvari-${aln}/${query_names[i]}/*.vcf -# rm -f $out/truvari-${aln}/${query_names[i]}/*.tbi -# echo "bedtools: exclude phab '${query_names[i]}'" -# bedtools subtract \ -# -a $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.minus1.bed \ -# -b $out/truvari-${aln}/${query_names[i]}/candidate.refine.bed \ -# > $out/truvari-${aln}/${query_names[i]}/no-refine.bed -# echo "truvari: bench excluded phab '${query_names[i]}'" -# rm -rf $out/truvari-${aln}/${query_names[i]}/no_phab_bench -# truvari bench \ -# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \ -# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \ -# -f $data/refs/$ref_name \ -# --bSample HG002 \ -# --cSample HG002 \ -# --includebed $out/truvari-${aln}/${query_names[i]}/no-refine.bed \ -# --no-ref a \ -# --sizemin 1 --sizefilt 1 --sizemax 1000 \ -# --pick single \ -# --typeignore \ -# --dup-to-ins \ -# -o $out/truvari-${aln}/${query_names[i]}/no_phab_bench \ -# 2> $out/truvari-${aln}/${query_names[i]}/no_phab_bench.log -# echo "laytr: ${aln} refine '${query_names[i]}'" -# laytr tru2ga \ -# -i $out/truvari-${aln}/${query_names[i]}/no_phab_bench/ \ -# -o $out/truvari-${aln}/${query_names[i]}/no_phab_bench/result -# gunzip $out/truvari-${aln}/${query_names[i]}/no_phab_bench/result*.gz -# done -# done - -deactivate diff --git a/analysis-v2/phab_cm/3_gen_cm.py b/analysis-v2/phab_cm/2_gen_cm.py similarity index 98% rename from analysis-v2/phab_cm/3_gen_cm.py rename to analysis-v2/phab_cm/2_gen_cm.py index ec74d09..0e78ab1 100644 --- a/analysis-v2/phab_cm/3_gen_cm.py +++ b/analysis-v2/phab_cm/2_gen_cm.py @@ -160,7 +160,7 @@ def match(record1, record2): if var in truvari_records: (size, tv_type, gts) = truvari_records[var] if vd_type and ve_type and tv_type: counts[size][vd_type-1][(ve_type-1)*2 + tv_type-1] += gts - if size == SZ_SNP and vd_type and ve_type and not tv_type: + elif size == SZ_SNP and vd_type and ve_type and not tv_type: counts[size][vd_type-1][(ve_type-1)*2] += gts for size_idx in range(SZ_DIMS): diff --git a/analysis-v2/phab_cm/3_sum_cm.py b/analysis-v2/phab_cm/3_sum_cm.py index d0f280d..463a7d8 100644 --- a/analysis-v2/phab_cm/3_sum_cm.py +++ b/analysis-v2/phab_cm/3_sum_cm.py @@ -106,7 +106,7 @@ def hap2(var): -am = pp = am_diff = snp = 0 +am = pp = am_diff = snps = dels = 0 for ds in datasets: truvari_records = {} vcfdist_records = {} @@ -166,14 +166,20 @@ def hap2(var): if var in vcfeval_records: (size, ve_type, ve_bk) = vcfeval_records[var] if var in truvari_records: (size, tv_type, tv_bk) = truvari_records[var] skips[0 if vd_type else 1][0 if ve_type else 1][0 if tv_type else 1] += 1 - if vd_type and ve_type and tv_type: + + loc, ref, alt, gt = var.split() + + if vd_type and not ve_type and not tv_type: + print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk}") + + if vd_type and ve_type and tv_type: # all tools compare i = vd_type-1 j = (ve_type-1)*2 + tv_type-1 counts[size][i][j] += 1 if i == 0 and j == 1: if tv_bk == "am": # truvari allele match am += 1 - print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}") + # print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}") else: # print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}") pass @@ -201,25 +207,31 @@ def hap2(var): elif size == SZ_SNP and vd_type and ve_type and not tv_type: # Truvari skips SNPs counts[size][vd_type-1][(ve_type-1)*2] += 1 - snp += 1 - elif ve_type and tv_type and not vd_type: + snps += 1 + + elif vd_type and ve_type and not tv_type: # Truvari skip # print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}") + # if len(ref) > len(alt): + # dels += 1 pass + elif ve_type and tv_type and not vd_type: + print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}") + sum_counts = counts[SZ_INDEL][:][:] + counts[SZ_SV][:][:] print("[truvari-only TP] more lenient allele match:", am) print("[vcfeval-only FP] inexact match:", pp) print("[vcfeval-only FP] allele match stringency diff:", am_diff) -# print(" all keep:", skips[0][0][0]) -# print(" all skip:", skips[1][1][1]) -# print("vcfdist-only skip:", skips[1][0][0]) -# print("vcfeval-only skip:", skips[0][1][0]) -# print("truvari-only skip:", skips[0][0][1], f" of which {snp} are SNPs") -# print("vcfdist-only keep:", skips[0][1][1]) -# print("vcfeval-only keep:", skips[1][0][1]) -# print("truvari-only keep:", skips[1][1][0]) +print(" all keep:", skips[0][0][0]) +print(" all skip:", skips[1][1][1]) +print("vcfdist-only skip:", skips[1][0][0]) +print("vcfeval-only skip:", skips[0][1][0]) +print("truvari-only skip:", skips[0][0][1], f" of which {snps} are SNPs and {dels} are DELs") +print("vcfdist-only keep:", skips[0][1][1]) +print("vcfeval-only keep:", skips[1][0][1]) +print("truvari-only keep:", skips[1][1][0]) fig, ax = plt.subplots(figsize=(3,6)) ax.matshow(np.log(sum_counts + 0.1), cmap='Greys') diff --git a/analysis-v2/phab_cm/globals.sh b/analysis-v2/phab_cm/globals.sh index f589bc9..48a102a 100755 --- a/analysis-v2/phab_cm/globals.sh +++ b/analysis-v2/phab_cm/globals.sh @@ -27,8 +27,3 @@ query_versions=( "v4" "v4.20" ) - -truvari_refine_options=( - "wfa" - "mafft" -) diff --git a/analysis-v2/phab_cm/vcfdist_truvari_compare.py b/analysis-v2/phab_cm/vcfdist_truvari_compare.py deleted file mode 100644 index 9fe4acd..0000000 --- a/analysis-v2/phab_cm/vcfdist_truvari_compare.py +++ /dev/null @@ -1,241 +0,0 @@ -import vcf -import numpy as np -import matplotlib.pyplot as plt - -# chr20, with phab -truvari_prefix = "./truvari/giab-tr/result_" -vcfdist_prefix = "./vcfdist/giab-tr." -do_print = False - -SIZE = 0 -SZ_SNP = 0 -SZ_INDEL_1_50 = 1 -SZ_INDEL_50_500 = 2 -SZ_INDEL_500PLUS = 3 -SZ_DIMS = 4 -sizes = ["SNP", "INDEL 1-50", "INDEL 50-500", "INDEL 500+"] - -VD_NONE = 0 -VD_FP = 1 -VD_PP = 2 -VD_TP = 3 -VD_DIMS = 4 -vd_cats = ["None", "FP", "PP", "TP"] - -TV_NONE = 0 -TV_FP = 1 -TV_PP = 2 -TV_TP = 3 -TV_DIMS = 4 -tv_cats = ["None", "FP", "PP", "TP"] - -tv_min_seq_pct = 0.7 -tv_min_size_pct = 0.7 -tv_min_ovlp_pct = 0.0 - -counts = np.zeros((SZ_DIMS, VD_DIMS, TV_DIMS)) - -def get_size(ref : str, alt : str): - if len(ref) == 1 and len(alt) == 1: - return SZ_SNP - else: - size_diff = abs(len(ref) - len(alt)) - if size_diff == 0: - print("ERROR: size 0 INDEL") - exit(1) - elif size_diff <= 50: - return SZ_INDEL_1_50 - elif size_diff <= 500: - return SZ_INDEL_50_500 - else: - return SZ_INDEL_500PLUS - -def get_vd_type(credit : float): - if credit >= 0 and credit < 0.7: - return VD_FP - elif credit >= 0.7 and credit < 1: - return VD_PP - elif credit == 1: - return VD_TP - else: - print("ERROR: credit out of range") - exit(1) - - -def get_tv_type(tv_type : str, seqsim : float): - if seqsim is None: - return TV_FP - if tv_type == "TP" and seqsim < 1 and seqsim >= 0.7: - return TV_PP - if tv_type == "TP": - return TV_TP - if tv_type == "FP": - return TV_FP - - -# for callset in ["query", "truth"]: -for callset in ["query"]: - - # parse vcfdist and truvari summary VCFs - vcfdist_vcf = vcf.Reader(open(f"{vcfdist_prefix}summary.vcf", "r")) - truvari_vcf = vcf.Reader(open(f"{truvari_prefix}{callset}.vcf", "r")) - name = callset.upper() - print(name) - tv_used, vd_used, vd_2used = True, True, False - this_ctg = "chr1" - print(this_ctg) - - while True: - - # get next valid records for each - if tv_used: tv_rec = next(truvari_vcf, None) - if vd_used: vd_rec = next(vcfdist_vcf, None) - if vd_2used: vd_rec = next(vcfdist_vcf, None) - while vd_rec != None and vd_rec.genotype(name)['BD'] == None: # skip other callset - vd_rec = next(vcfdist_vcf, None) - while tv_rec != None and tv_rec.ALT[0] == "*": # skip nested var - tv_rec = next(truvari_vcf, None) - tv_used, vd_used, vd_2used = False, False, False - - if do_print: - print("============================================================") - print("Truvari:", tv_rec) - print("vcfdist:", vd_rec) - - # we've finished iterating through both VCFs - if tv_rec == None and vd_rec == None: break - - # we've finished this contig for both VCFs - if (tv_rec == None or tv_rec.CHROM != this_ctg) and \ - (vd_rec == None or vd_rec.CHROM != this_ctg): - if tv_rec == None: - this_ctg = vd_rec.CHROM - print(this_ctg) - elif vd_rec == None: - this_ctg = tv_rec.CHROM - print(this_ctg) - elif tv_rec.CHROM == vd_rec.CHROM: - this_ctg = tv_rec.CHROM - print(this_ctg) - else: - print("ERROR: different contigs up next") - - # if we've finished only one VCF, set high position - tv_pos = 2_000_000_000 if tv_rec == None else ( - 1_000_000_000 if tv_rec.CHROM != this_ctg else tv_rec.POS) - vd_pos = 2_000_000_000 if vd_rec == None else ( - 1_000_000_000 if vd_rec.CHROM != this_ctg else vd_rec.POS) - - if tv_pos == vd_pos: # position match - if do_print: print(f"TV VD {name} {tv_rec.genotype(name)['GT']} {vd_rec.genotype(name)['GT']} {tv_rec.CHROM}:{tv_rec.POS}\t{tv_rec.REF}\t{tv_rec.ALT[0]}") - if tv_rec.REF == vd_rec.REF and tv_rec.ALT[0] == vd_rec.ALT[0]: # full match - tv_used, vd_used = True, True - if tv_rec.genotype(name)['GT'] == '1|1' or tv_rec.genotype(name)['GT'] == '1/1': - vd_2used = True # skip second split of this GT - size = get_size(tv_rec.REF, tv_rec.ALT[0]) - vd_type = get_vd_type(float(vd_rec.genotype(name)['BC'])) - tv_type = get_tv_type(tv_rec.genotype(name)['BD'], tv_rec.INFO['PctSeqSimilarity']) - counts[size][vd_type][tv_type] += 2 if vd_2used else 1 - if vd_type == VD_TP and tv_type == TV_FP: - if do_print: print("vcfdist TP, Truvari FP") - if vd_type == VD_FP and tv_type == TV_TP: - if do_print: print("vcfdist FP, Truvari TP") - - # skip if info unavailable - if tv_rec.INFO['PctSeqSimilarity'] == None or \ - tv_rec.INFO['PctSizeSimilarity'] == None or \ - tv_rec.INFO['PctRecOverlap'] == None: - counts[size][vd_type][TV_FP] += 1 - continue - - # vcfdist SNP adjacent to INS - elif len(vd_rec.REF) == 1 and len(vd_rec.ALT[0]) == 1: - vd_used = True - size = get_size(vd_rec.REF, vd_rec.ALT[0]) - vd_type = get_vd_type(float(vd_rec.genotype(name)['BC'])) - tv_type = TV_NONE - counts[size][vd_type][tv_type] += 1 - - else: # position but not allele match, likely different phasing order - - # get next record for each - tv_rec_next = next(truvari_vcf, None) - vd_rec_next = next(vcfdist_vcf, None) - while vd_rec_next != None and vd_rec_next.genotype(name)['BD'] == None: # skip other callset - vd_rec_next = next(vcfdist_vcf, None) - while tv_rec_next != None and tv_rec_next.ALT[0] == "*": # skip nested var - tv_rec_next = next(truvari_vcf, None) - - # test if swapping causes matches - if tv_rec.REF == vd_rec_next.REF and \ - tv_rec.ALT[0] == vd_rec_next.ALT[0] and \ - tv_rec_next.REF == vd_rec.REF and \ - tv_rec_next.ALT[0] == vd_rec.ALT[0]: - # 1: tv_rec, vd_rec_next - size1 = get_size(tv_rec.REF, tv_rec.ALT[0]) - vd_type1 = get_vd_type(float(vd_rec_next.genotype(name)['BC'])) - tv_type1 = get_tv_type(tv_rec_next.genotype(name)['BD'], tv_rec.INFO['PctSeqSimilarity']) - counts[size1][vd_type1][tv_type1] += 1 - # 2: tv_rec_next, vd_rec - size2 = get_size(tv_rec_next.REF, tv_rec_next.ALT[0]) - vd_type2 = get_vd_type(float(vd_rec.genotype(name)['BC'])) - tv_type2 = get_tv_type(tv_rec_next.genotype(name)['BD'], tv_rec.INFO['PctSeqSimilarity']) - counts[size2][vd_type2][tv_type2] += 1 - tv_used, vd_used = True, True - else: - print("ERROR: failed to match") - - # discard current two, pretend we haven't looked at next two - counts[size][VD_NONE][TV_NONE] += 2 - tv_rec = tv_rec_next - vd_rec = vd_rec_next - - - elif vd_pos < tv_pos: # vcfdist only - vd_used = True - size = get_size(vd_rec.REF, vd_rec.ALT[0]) - if do_print: - print(f" VD {name} {vd_rec.genotype(name)['GT']} {vd_rec.CHROM}:{vd_rec.POS}\t{vd_rec.REF}\t{vd_rec.ALT[0]}\t") - if size != SZ_SNP: - print(f" VD {name} {vd_rec.genotype(name)['GT']} {vd_rec.CHROM}:{vd_rec.POS}\t{vd_rec.REF}\t{vd_rec.ALT[0]}\t") - print("WARN: vcfdist only, not SNP") - vd_type = get_vd_type(float(vd_rec.genotype(name)['BC'])) - tv_type = TV_NONE - counts[size][vd_type][tv_type] += 1 - - elif tv_pos < vd_pos: # truvari only - print(f"TV {name} {tv_rec.genotype(name)['GT']} {tv_rec.CHROM}:{tv_rec.POS}\t{tv_rec.REF}\t{tv_rec.ALT[0]}") - print("WARN: Truvari only") - tv_used = True - - # skip unphased truvari variants - gt = tv_rec.genotype(name)['GT'] - if gt[1] == '/' and gt[0] != gt[2]: - continue - - size = get_size(tv_rec.REF, tv_rec.ALT[0]) - vd_type = VD_NONE - tv_type = get_tv_type(tv_rec.genotype(name)['BD'], tv_rec.INFO['PctSeqSimilarity']) - counts[size][vd_type][tv_type] += 1 - - # skip if info unavailable - if tv_rec.INFO['PctSeqSimilarity'] == None or \ - tv_rec.INFO['PctSizeSimilarity'] == None or \ - tv_rec.INFO['PctRecOverlap'] == None: - counts[size][vd_type][TV_FP] += 1 - continue - - for size_idx in range(SZ_DIMS): - fig, ax = plt.subplots() - ax.matshow(np.log(counts[size_idx,1:,1:] + 0.1), cmap="Blues") - plt.title(f"{name} {sizes[size_idx]} Confusion Matrix") - plt.ylabel("vcfdist") - ax.set_yticks(list(range(VD_DIMS-1))) - ax.set_yticklabels(vd_cats[1:]) - plt.xlabel("Truvari") - ax.set_xticks(list(range(TV_DIMS-1))) - ax.set_xticklabels(tv_cats[1:]) - for (i,j), z in np.ndenumerate(counts[size_idx,1:,1:]): - ax.text(j, i, f"{int(z)}", ha='center', va='center', - bbox=dict(boxstyle='round', facecolor='white', edgecolor='0.3')) - plt.savefig(f"img/tv_vd_{callset}_{size_idx}_cm.png", dpi=200)