Skip to content

Commit ff5bb00

Browse files
authored
Merge pull request #31 from aryarm/fix/2vcf-exit-code
remove non-zero exit code from 2vcf.py snakemake rule
2 parents 6121d43 + abfc6b7 commit ff5bb00

File tree

5 files changed

+66
-5
lines changed

5 files changed

+66
-5
lines changed

rules/classify.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -242,7 +242,7 @@ rule tsv2vcf:
242242
output: temp(config['out']+"/{sample}/results.vcf.gz")
243243
conda: "../envs/prc.yml"
244244
shell:
245-
"zcat {input.merge} | scripts/cgrep.bash - -E '^(CHROM|POS|REF)$|.*~(REF|ALT)$' | scripts/2vcf.py -o {output} {params.callers} {input.results}"
245+
"scripts/2vcf.py -o {output} {params.callers} {input.results} {input.merge}"
246246

247247
rule fix_vcf_header:
248248
""" add contigs to the header of the vcf """

rules/prepare.smk

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ rule bed_peaks:
144144
# to convert to BED, we must extract the first three columns (chr, start, stop)
145145
"cut -f -3 \"{input.peaks}\" | "
146146
"bedtools getfasta -fi {input.ref} -bedOut -bed stdin | "
147-
"sort -t $'\t' -k1,1V -k2,2n > \"{output}\" && "
147+
"sort -t $'\\t' -k1,1V -k2,2n > \"{output}\" && "
148148
"test -s \"{output}\""
149149

150150

scripts/2vcf.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,12 @@
1414

1515
def phred(vals):
1616
""" apply the phred scale to the vals provided """
17-
return -10*np.log10(1-vals)
18-
return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30
17+
with np.errstate(divide='raise'):
18+
try:
19+
return -10*np.log10(1-vals)
20+
except FloatingPointError:
21+
return np.float64(30)
22+
return -10*np.ma.log10(1-vals).filled(-3) # fill all infinite values with a phred scale of 30
1923

2024
def plot_line(lst, show_discards=False):
2125
plt.clf()
@@ -214,7 +218,8 @@ def main(prepared, classified, callers=None, cs=1000, all_sites=False, pretty=Fa
214218
prepared = get_calls(
215219
pd.read_csv(
216220
prepared, sep="\t", header=0, index_col=["CHROM", "POS"],
217-
dtype=str, chunksize=cs, na_values="."
221+
dtype=str, chunksize=cs, na_values=".",
222+
usecols=lambda col: col in ['CHROM', 'POS', 'REF'] or col.endswith('~REF') or col.endswith('~ALT')
218223
), callers, pretty
219224
)
220225
# flush out the first item in the generator: the vartype
@@ -334,6 +339,8 @@ def write_vcf(out, records):
334339
callers = args.callers.split(",")
335340

336341
if not args.internal:
342+
import matplotlib
343+
matplotlib.use('Agg')
337344
vcf = write_vcf(args.out, main(args.prepared, args.classified, callers, args.chunksize, args.all, args.pretty, args.type))
338345
else:
339346
if not sys.flags.interactive:

scripts/README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@ All python scripts implement the `--help` argument. For bash, R, and awk scripts
77
### [2vcf.py](2vcf.py)
88
A python script that uses files from the `prepare` and `classify` pipelines to create a VCF with the final, predicted variants. This script also has a special internal mode, which can be used for recalibrating the QUAL scores output in the VCF.
99

10+
### [allele_conflicts.bash](allele_conflicts.bash)
11+
A bash script for identifying sites at which the variant callers in our ensemble outputted conflicting alleles.
12+
1013
### [cgrep.bash](cgrep.bash)
1114
A bash script for extracting columns from TSVs via `grep`. Every argument besides the first is passed directly to `grep`.
1215

scripts/allele_conflicts.bash

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/usr/bin/env bash
2+
3+
# List all positions (to stdout) at which VarCA called a variant but there were conflicts for the alternative allele.
4+
# Note that this script will only work for the chosen subset of variant callers.
5+
6+
# arg1: merge.tsv.gz file
7+
# arg2: results.tsv.gz file
8+
# arg3: 'snp' or 'indel'
9+
10+
# Ex: scripts/allele_conflicts.bash out/merged_indel/SRR891269/merge.tsv.gz out/new-classify/classify-indel/SRR891269_even_test/results.tsv.gz indel
11+
12+
if [ "$3" = 'indel' ]; then
13+
# echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpindel\tstrelka\tpg\tpg\tprob0\tprob1\tvarca"
14+
zcat "$1" | \
15+
scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pindel|illumina-strelka|pg-indel).*~(ALT)$' | \
16+
awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $6 || $6 != $7 || $7 != $3' | \
17+
tail -n+2 | \
18+
awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \
19+
sed 's/\t/,/' | \
20+
LC_ALL=C sort -t $'\t' -k1,1 | \
21+
LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <(
22+
zcat "$2" | \
23+
awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \
24+
sed 's/\t/,/' | \
25+
LC_ALL=C sort -t $'\t' -k1,1
26+
)
27+
else
28+
# echo -e "CHROM\tPOS\tgatk\tvarscan\tvardict\tpg\tmajority\tmajority_idx\tgatk\tvarscan\tvardict\tpg\tpg\tprob0\tprob1\tvarca"
29+
zcat "$1" | \
30+
scripts/cgrep.bash - -E '^(CHROM|POS)$|(gatk|varscan|vardict|pg-snp).*~(ALT)$' | \
31+
awk -F $'\t' -v 'OFS=\t' '$3 != $4 || $4 != $5 || $5 != $3' | \
32+
tail -n+2 | \
33+
awk -F $'\t' -v 'OFS=\t' '{ { for(i=3; i<=NF-1; i++) count[$i]++; } PROCINFO["sorted_in"] = "@val_num_desc"; { for (val in count) { print $0, val, count[val]; break; } } delete count; }' | \
34+
sed 's/\t/,/' | \
35+
LC_ALL=C sort -t $'\t' -k1,1 | \
36+
LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order - <(
37+
zcat "$2" | \
38+
awk -F $"\t" -v 'OFS=\t' 'NR == 1 || $NF == 1' | \
39+
sed 's/\t/,/' | \
40+
LC_ALL=C sort -t $'\t' -k1,1
41+
)
42+
fi | \
43+
sed 's/,/\t/' | \
44+
LC_ALL=C sort -t $'\t' -k1,1V -k2,2n
45+
46+
# To make a table containing alternative alleles for the following rules (as columns)
47+
# 1) caller priority rule
48+
# 2) majority rule
49+
# 3) platinum genomes
50+
# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-indel/SRR891269_even_test/final.vcf.gz | sed 's/gatk-indel/1/; s/varscan-indel/2/; s/vardict-indel/3/; s/pindel/4/; s/illumina-strelka/5/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-indel/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $7, $6;}' | less
51+
# bcftools query -f '%CHROM\t%POS\t%INFO/CALLER\n' out-original/new-classify/classify-snp/SRR891269_even_test/final.vcf.gz | sed 's/gatk-snp/1/; s/varscan-snp/2/; s/vardict-snp/3/' | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1 | LC_ALL=C join -t $'\t' -e '' -j1 -o auto --nocheck-order <(zcat out-original/new-classify/classify-snp/SRR891269_even_test/allele_conflicts.tsv.gz | sed 's/\t/,/' | LC_ALL=C sort -t $'\t' -k1,1) - | cut -f2- | awk -F $'\t' -v 'OFS=\t' '{print $$NF, $5, $4;}' | less

0 commit comments

Comments
 (0)