#!/usr/bin/env bash

# List all positions (to stdout) at which VarCA called a variant but there were conflicts for the alternative allele.
# Note that this script will only work for the chosen subset of variant callers.
#
# arg1: merge.tsv.gz file
# arg2: results.tsv.gz file
# arg3: 'snp' or 'indel'
#       (any value other than 'indel', including an omitted one, selects the
#       SNP column set; an unrecognized value additionally prints a warning)
#
# Ex: scripts/allele_conflicts.bash out/merged_indel/SRR891269/merge.tsv.gz out/new-classify/classify-indel/SRR891269_even_test/results.tsv.gz indel
#
# Requirements that follow from the commands used below:
#   - the working directory must be one from which scripts/cgrep.bash resolves
#   - GNU awk (PROCINFO["sorted_in"]), GNU sed ('\t' in patterns),
#     GNU join (-o auto, --nocheck-order) and GNU sort (-k...V)
#
# Exit status: 2 when fewer than two arguments are given; otherwise the status
# of the final sort (pipefail is deliberately not enabled, as in the original).

# Print a one-line synopsis to stderr.
usage() {
  printf 'Usage: %s <merge.tsv.gz> <results.tsv.gz> <snp|indel>\n' "${0##*/}" >&2
}

#######################################
# Emit the rows of the merged table on which the callers' ALT alleles disagree.
# Arguments:
#   $1 - path to merge.tsv.gz
#   $2 - extended regex handed to scripts/cgrep.bash to select columns
#   $3 - awk predicate that is true when the caller columns are not all equal
# Outputs:
#   Tab-separated rows, sorted on a first field in which the first tab of each
#   line (between columns 1 and 2, i.e. CHROM and POS given the regex) has been
#   replaced by a comma so it can serve as a single join key. Two columns are
#   appended to each row: the most frequent value among columns 3..NF-1 and
#   the number of columns holding that value.
#######################################
conflicting_sites() {
  local merged=$1
  local column_regex=$2
  local conflict_expr=$3

  # The conflict predicate runs BEFORE the header is dropped by tail, so the
  # header line itself has to satisfy the predicate (its column names differ
  # from one another); otherwise tail would remove a data row instead.
  # The last column (NF) is excluded from the vote; per the regexes used by
  # main() this is presumably the pg (Platinum Genomes) column -- this depends
  # on the column order produced by cgrep.bash, which is not visible here.
  zcat "$merged" \
    | scripts/cgrep.bash - -E "$column_regex" \
    | awk -F $'\t' -v 'OFS=\t' "$conflict_expr" \
    | tail -n+2 \
    | awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' \
    | sed 's/\t/,/' \
    | LC_ALL=C sort -t $'\t' -k1,1
}

#######################################
# Emit the first line of the results table plus every row whose last column
# equals 1, keyed the same way as conflicting_sites (first tab -> comma) and
# sorted on that key.
# Arguments:
#   $1 - path to results.tsv.gz
#######################################
varca_calls() {
  local results=$1

  zcat "$results" \
    | awk -F $'\t' -v 'OFS=\t' 'NR == 1 || $NF == 1' \
    | sed 's/\t/,/' \
    | LC_ALL=C sort -t $'\t' -k1,1
}

#######################################
# Entry point: validate arguments, pick the caller column set, join the
# conflicting sites with the VarCA calls and print them sorted by position.
# Arguments: see the header of this file.
# Returns:   2 on missing arguments, else the status of the final sort.
#######################################
main() {
  if (( $# < 2 )); then
    usage
    return 2
  fi

  local merged=$1
  local results=$2
  local variant_type=${3-}
  local column_regex conflict_expr

  case "$variant_type" in
    indel)
      # echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tpg\tprob0\tprob1\tvarca"
      # (the column labelled majority_idx above is filled with count[val],
      # i.e. how many caller columns carry the majority allele)
      column_regex='^(CHROM|POS)$|(gatk|varscan|vardict|pindel|illumina-strelka|pg-indel).*~(ALT)$'
      conflict_expr='$3 != $4 || $4 != $5 || $5 != $6 || $6 != $7 || $7 != $3'
      ;;
    *)
      # Historical behaviour: everything that is not 'indel' is processed as
      # SNP data. Keep that, but make a probable typo visible.
      if [[ -n "$variant_type" && "$variant_type" != 'snp' ]]; then
        printf "warning: unrecognized variant type '%s'; treating it as 'snp'\n" \
          "$variant_type" >&2
      fi
      # echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpg\tpg\tprob0\tprob1\tvarca"
      column_regex='^(CHROM|POS)$|(gatk|varscan|vardict|pg-snp).*~(ALT)$'
      conflict_expr='$3 != $4 || $4 != $5 || $5 != $3'
      ;;
  esac

  # Join on the "CHROM,POS" key, then turn the key's comma back into a tab
  # and order the result by chromosome (version sort) and numeric position.
  conflicting_sites "$merged" "$column_regex" "$conflict_expr" \
    | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <(varca_calls "$results") \
    | sed 's/,/\t/' \
    | LC_ALL=C sort -t $'\t' -k1,1V -k2,2n
}

main "$@"

# To make a table containing alternative alleles for the following rules (as columns)
# 1) caller priority rule
# 2) majority rule
# 3) platinum genomes
# (in the awk step, $$NF is intentional: it selects the field whose number is
# stored in the last field, i.e. the caller index produced by the sed mapping)
# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-indel/SRR891269_even_test/final.vcf.gz | sed 's/gatk-indel/1/; s/varscan-indel/2/; s/vardict-indel/3/; s/pindel/4/; s/illumina-strelka/5/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-indel/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $7, $6;}' | less
# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-snp/SRR891269_even_test/final.vcf.gz | sed 's/gatk-snp/1/; s/varscan-snp/2/; s/vardict-snp/3/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-snp/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $5, $4;}' | less