Skip to content

Commit

Permalink
Updated phab analysis.
Browse files Browse the repository at this point in the history
  • Loading branch information
TimD1 committed Feb 19, 2024
1 parent 80e53f8 commit 35dd9a3
Show file tree
Hide file tree
Showing 6 changed files with 74 additions and 430 deletions.
100 changes: 48 additions & 52 deletions analysis-v2/phab_cm/1_phab_eval_vcfs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,15 @@

source globals.sh

# # truvari phab VCF normalization v4.0
# source ~/software/Truvari-v4.0.0/venv3.10/bin/activate
# # truvari phab VCF normalization v4.2.1
# source ~/software/Truvari-4.2.1/venv3.10/bin/activate
# mkdir -p $out/phab
# for i in "${!query_names[@]}"; do
# echo "truvari v4.0: phab mafft for '${query_names[i]}'"
# echo "phab: mafft for '${query_names[i]}'"
# truvari phab \
# -r $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
# -b $data/${truth_name}-${truth_version}/split/${truth_name}.all.vcf.gz \
# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.all.vcf.gz \
# -b $data/${truth_name}-${truth_version}/split/${truth_name}.most.vcf.gz \
# -c $data/${query_names[i]}-${query_versions[i]}/split/${query_names[i]}.most.vcf.gz \
# --bSamples HG002 \
# --cSamples HG002 \
# -f $data/refs/$ref_name \
Expand Down Expand Up @@ -55,55 +55,51 @@ for i in "${!query_names[@]}"; do
$out/phab/${query_names[i]}.TRUTH.vcf.gz \
$data/refs/$ref_name \
--bed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
-l 1000 -v 2 \
-l 1000 \
-p $out/phab-vcfdist/${query_names[i]}. \
2> $out/phab-vcfdist/${query_names[i]}.log
done

# # vcfeval evaluation
# mkdir -p $out/phab-vcfeval
# for i in "${!query_names[@]}"; do
# echo "phab-vcfeval: evaluating '${query_names[i]}'"
# rm -r $out/phab-vcfeval/${query_names[i]}
# $timer -v $rtg vcfeval \
# -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
# -c $out/phab/${query_names[i]}.QUERY.vcf.gz \
# -t $data/refs/${ref_name}.sdf \
# -m ga4gh \
# -e $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
# --threads 64 \
# --ref-overlap \
# --all-records \
# -o $out/phab-vcfeval/${query_names[i]} \
# 2> $out/phab-vcfeval/${query_names[i]}.log
# gunzip $out/phab-vcfeval/${query_names[i]}/output.vcf.gz
# done

# awk 'BEGIN{OFS="\t"} {print $1,$2,$3-1}' \
# $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
# > $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.minus1.bed
# vcfeval evaluation
mkdir -p $out/phab-vcfeval
for i in "${!query_names[@]}"; do
echo "phab-vcfeval: evaluating '${query_names[i]}'"
rm -r $out/phab-vcfeval/${query_names[i]}
$timer -v $rtg vcfeval \
-b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
-c $out/phab/${query_names[i]}.QUERY.vcf.gz \
-t $data/refs/${ref_name}.sdf \
--bed-regions $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
--evaluation-regions $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
--threads 64 \
-m ga4gh \
--ref-overlap \
--all-records \
-o $out/phab-vcfeval/${query_names[i]} \
2> $out/phab-vcfeval/${query_names[i]}.log
gunzip $out/phab-vcfeval/${query_names[i]}/output.vcf.gz
done

# # Truvari evaluation
# mkdir -p $out/phab-truvari
# source ~/software/Truvari-4.1.0/venv3.10/bin/activate
# for i in "${!query_names[@]}"; do
# echo "Truvari: evaluating '${query_names[i]}'"
# rm -r $out/phab-truvari/${query_names[i]}
# $timer -v truvari bench \
# -b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
# -c $out/phab/${query_names[i]}.QUERY.vcf.gz \
# -f $data/refs/GCA_000001405.15_GRCh38_no_alt_analysis_set.fasta \
# --includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.minus1.bed \
# --no-ref a \
# --sizemin 1 --sizefilt 1 --sizemax 1000 \
# --pick single \
# --typeignore \
# --dup-to-ins \
# -o $out/phab-truvari/${query_names[i]} \
# 2> $out/phab-truvari/${query_names[i]}.log
# laytr tru2ga \
# -i $out/phab-truvari/${query_names[i]}/ \
# -o $out/phab-truvari/${query_names[i]}/result
# gunzip $out/phab-truvari/${query_names[i]}/result*.gz
# done
# deactivate
# Truvari evaluation
mkdir -p $out/phab-truvari
source ~/software/Truvari-4.2.1/venv3.10/bin/activate
for i in "${!query_names[@]}"; do
echo "phab-truvari: evaluating '${query_names[i]}'"
rm -r $out/phab-truvari/${query_names[i]}
$timer -v truvari bench \
-b $out/phab/${query_names[i]}.TRUTH.vcf.gz \
-c $out/phab/${query_names[i]}.QUERY.vcf.gz \
--includebed $data/giab-tr-v4.20/GIABTR.HG002.benchmark.chr20.bed \
--no-ref a \
--sizemin 1 --sizefilt 1 --sizemax 1000 \
--pick single \
--typeignore \
--dup-to-ins \
-o $out/phab-truvari/${query_names[i]} \
2> $out/phab-truvari/${query_names[i]}.log
truvari ga4gh \
-i $out/phab-truvari/${query_names[i]}/ \
-o $out/phab-truvari/${query_names[i]}/result
gunzip $out/phab-truvari/${query_names[i]}/result*.gz
done
deactivate
118 changes: 0 additions & 118 deletions analysis-v2/phab_cm/2_eval_vcfs.sh

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def match(record1, record2):
if var in truvari_records: (size, tv_type, gts) = truvari_records[var]
if vd_type and ve_type and tv_type:
counts[size][vd_type-1][(ve_type-1)*2 + tv_type-1] += gts
if size == SZ_SNP and vd_type and ve_type and not tv_type:
elif size == SZ_SNP and vd_type and ve_type and not tv_type:
counts[size][vd_type-1][(ve_type-1)*2] += gts

for size_idx in range(SZ_DIMS):
Expand Down
38 changes: 25 additions & 13 deletions analysis-v2/phab_cm/3_sum_cm.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,7 @@ def hap2(var):



am = pp = am_diff = snp = 0
am = pp = am_diff = snps = dels = 0
for ds in datasets:
truvari_records = {}
vcfdist_records = {}
Expand Down Expand Up @@ -166,14 +166,20 @@ def hap2(var):
if var in vcfeval_records: (size, ve_type, ve_bk) = vcfeval_records[var]
if var in truvari_records: (size, tv_type, tv_bk) = truvari_records[var]
skips[0 if vd_type else 1][0 if ve_type else 1][0 if tv_type else 1] += 1
if vd_type and ve_type and tv_type:

loc, ref, alt, gt = var.split()

if vd_type and not ve_type and not tv_type:
print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk}")

if vd_type and ve_type and tv_type: # all tools compare
i = vd_type-1
j = (ve_type-1)*2 + tv_type-1
counts[size][i][j] += 1
if i == 0 and j == 1:
if tv_bk == "am": # truvari allele match
am += 1
print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}")
# print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}")
else:
# print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}")
pass
Expand Down Expand Up @@ -201,25 +207,31 @@ def hap2(var):

elif size == SZ_SNP and vd_type and ve_type and not tv_type: # Truvari skips SNPs
counts[size][vd_type-1][(ve_type-1)*2] += 1
snp += 1
elif ve_type and tv_type and not vd_type:
snps += 1

elif vd_type and ve_type and not tv_type: # Truvari skip
# print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}")
# if len(ref) > len(alt):
# dels += 1
pass

elif ve_type and tv_type and not vd_type:
print(f"{ds} {var} VD={cats[vd_type]}:{vd_bk} VE={cats[ve_type]}:{ve_bk} TV={cats[tv_type]}:{tv_bk}")


sum_counts = counts[SZ_INDEL][:][:] + counts[SZ_SV][:][:]
print("[truvari-only TP] more lenient allele match:", am)
print("[vcfeval-only FP] inexact match:", pp)
print("[vcfeval-only FP] allele match stringency diff:", am_diff)

# print(" all keep:", skips[0][0][0])
# print(" all skip:", skips[1][1][1])
# print("vcfdist-only skip:", skips[1][0][0])
# print("vcfeval-only skip:", skips[0][1][0])
# print("truvari-only skip:", skips[0][0][1], f" of which {snp} are SNPs")
# print("vcfdist-only keep:", skips[0][1][1])
# print("vcfeval-only keep:", skips[1][0][1])
# print("truvari-only keep:", skips[1][1][0])
print(" all keep:", skips[0][0][0])
print(" all skip:", skips[1][1][1])
print("vcfdist-only skip:", skips[1][0][0])
print("vcfeval-only skip:", skips[0][1][0])
print("truvari-only skip:", skips[0][0][1], f" of which {snps} are SNPs and {dels} are DELs")
print("vcfdist-only keep:", skips[0][1][1])
print("vcfeval-only keep:", skips[1][0][1])
print("truvari-only keep:", skips[1][1][0])

fig, ax = plt.subplots(figsize=(3,6))
ax.matshow(np.log(sum_counts + 0.1), cmap='Greys')
Expand Down
5 changes: 0 additions & 5 deletions analysis-v2/phab_cm/globals.sh
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,3 @@ query_versions=(
"v4"
"v4.20"
)

truvari_refine_options=(
"wfa"
"mafft"
)
Loading

0 comments on commit 35dd9a3

Please sign in to comment.