-
Notifications
You must be signed in to change notification settings - Fork 118
/
Copy pathMTR.pm
151 lines (109 loc) · 3.9 KB
/
MTR.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
=head1 CONTACT
Slave Petrovski <[email protected]>
Michael Silk <[email protected]>
=cut
=head1 NAME
MTR (Missense Tolerance Ratio)
=head1 SYNOPSIS
mv MTR.pm ~/.vep/Plugins
./vep -i variations.vcf --plugin MTR,mtrflatfile_2.0.tsv.gz
=head1 DESCRIPTION
A VEP plugin that retrieves Missense Tolerance Ratio (MTR) scores for
variants from a tabix-indexed flat file.
MTR scores quantify the amount of purifying selection acting
specifically on missense variants in a given window of protein-coding
sequence. It is estimated across a sliding window of 31 codons and uses
observed standing variation data from the WES component of the Exome
Aggregation Consortium Database (ExAC), version 2.0
(http://gnomad.broadinstitute.org).
Please cite the MTR publication alongside the VEP if you use this resource:
http://genome.cshlp.org/content/27/10/1715
The Bio::DB::HTS perl library or tabix utility must be installed in your path to use this plugin.
MTR flat files can be downloaded from http://biosig.unimelb.edu.au/mtr-viewer/downloads
The following steps are necessary before running the plugin
gzip -d mtrflatfile_2.0.txt.gz # to unzip the text file
cat mtrflatfile_2.0.txt | tr " " "\t" > mtrflatfile_2.00.tsv # to change the file to a tabbed delimited file
sed '1s/.*/#&/' mtrflatfile_2.00.tsv > mtrflatfile_2.0.tsv # to add # to the first line of the file
bgzip mtrflatfile_2.0.tsv
tabix -f -s 1 -b 2 -e 2 mtrflatfile_2.0.tsv.gz
NB: Data are available for GRCh37 only
=cut
package MTR;
use strict;
use warnings;
use Bio::EnsEMBL::Utils::Sequence qw(reverse_comp);
use Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin;
use base qw(Bio::EnsEMBL::Variation::Utils::BaseVepTabixPlugin);
sub new {
my $class = shift;
my $self = $class->SUPER::new(@_);
# get MTR file
my $file = $self->params->[0];
$self->add_file($file);
# to check for assembly
my $assembly = $self->{config}->{assembly};
if ($assembly ne "GRCh37") {
die "Assembly is not GRCh37, MTR only works with GRCh37. \n";
}
# get headers and store on self
open HEAD, "tabix -fh $file 1:1-1 2>&1 | ";
while(<HEAD>) {
next unless /^\#/;
chomp;
$_ =~ s/^\#//;
$self->{headers} = [split];
}
close HEAD;
if(! grep(/^Genomic_position$/, @{$self->{headers}})) {
die "Required header Genomic_position not found.\n";
}
return $self;
}
sub feature_types {
return ['Transcript'];
}
sub variation_feature_types {
return ['VariationFeature'];
}
sub get_header_info {
my $self = shift;
return {
MTR => 'MTR score',
FDR => 'MTR false discovery rate adjusted binomial exact test.',
MTR_centile => 'MTR gene-specific percentile'
}
}
sub run {
my ($self, $tva) = @_;
my $vf = $tva->variation_feature;
# get allele, reverse comp if needed
my $allele = $tva->variation_feature_seq;
reverse_comp(\$allele) if $vf->{strand} < 0;
return {} unless $allele =~ /^[ACGT]$/;
my $tr_id = $tva->transcript->stable_id;
# data is written by pos, allele, transcript ID (feature)
# grep lines read in matched on position so that they also are matched on allele and transcript ID
my ($res) = grep {
$_->{Genomic_position} == $vf->{start} &&
$_->{Genomic_position} == $vf->{end} &&
$_->{alt} eq $allele &&
$_->{Feature} eq $tr_id
} @{$self->get_data($vf->{chr}, $vf->{start}, $vf->{end})};
# return only the keys defined by get_header_info()
return $res ? { map {$_ => $res->{$_}} grep {defined($res->{$_}) && $res->{$_} ne '.'} keys %{$self->get_header_info} } : {};
}
sub parse_data {
my ($self, $line) = @_;
$line =~ s/\r$//g;
my @split = split /\t/, $line;
# parse data into hash of col names and values
my %data = map {$self->{headers}->[$_] => $split[$_]} (0..(scalar @{$self->{headers}} - 1));
return \%data;
}
sub get_start {
return $_[1]->{'Genomic_position'};
}
sub get_end {
return $_[1]->{'Genomic_position'};
}
1;