Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add new option to fetch clinvar somatic classification + patch db for 115 #1794

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 56 additions & 2 deletions modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,13 @@ sub get_features_by_regions_uncached {

my $adaptor = $self->get_adaptor('variation', 'phenotypefeature');
my $source_id = $self->clinvar_source_id_cache;
my $attribs = $adaptor->get_clinsig_alleles_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id) if defined($adaptor) && defined($source_id);
my $attribs;
my $attribs_clinical_impact;

if (defined($adaptor) && defined($source_id)) {
$attribs = $adaptor->get_clinsig_alleles_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id);
$attribs_clinical_impact = $adaptor->get_somatic_clin_impact_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id);
}

my $phenotype_attrib_id = $self->phenotype_attrib_id || 0;

Expand Down Expand Up @@ -185,9 +191,14 @@ sub get_features_by_regions_uncached {
while($sth->fetch) {
my %v_copy = %v;
$v_copy{allele_string} =~ s/\s+/\_/g;
my $v_clinsigs = $attribs->{($chr_is_seq_region ? $chr : $sr_cache->{$chr}) . ':' . $v_copy{start} . '-' . $v_copy{end}};

my $key = ($chr_is_seq_region ? $chr : $sr_cache->{$chr}) . ':' . $v_copy{start} . '-' . $v_copy{end};
# Clinical significance (germline)
my $v_clinsigs = $attribs->{$key};

my @pfas_by_allele;
my %clin_sigs;

foreach my $pfa(@{$v_clinsigs})
{
if(defined($pfa->{clinvar_clin_sig}) && $v_copy{variation_name} eq $pfa->{id})
Expand All @@ -198,6 +209,14 @@ sub get_features_by_regions_uncached {
}
my @array = keys(%clin_sigs);
$v_copy{clin_sig_allele} = join ';', @array if scalar(@array);

# somatic clinical impact
my $v_clin_impact = $attribs_clinical_impact->{$key};

if($v_clin_impact){
$v_copy{clinical_impact} = _format_clinical_impact($v_clin_impact);
}

$v_copy{variation_id} = $var_id;
## fix for e!94 alleles
$v_copy{allele_string} =~ s/\/$//g;
Expand All @@ -216,6 +235,41 @@ sub get_features_by_regions_uncached {
return \@return;
}

=head2 _format_clinical_impact

Example : $clinical_impact = _format_clinical_impact($v_clin_impact)
Description: Internal method to format the clinvar somatic classification
Returntype : string
Exceptions : none
Caller : get_features_by_regions_uncached()
Status : Stable

=cut

sub _format_clinical_impact {
my $v_clin_impact = shift;

my @somatic_clin_sig_list = ();

for my $pheno_feat (@{$v_clin_impact}) {
# Get the somatic clinical impact
# Example: Tier IV - Benign/Likely benign
if($pheno_feat->{somatic_clin_sig}) {
$pheno_feat->{somatic_clin_sig} =~ s/,/;/g;
my $tmp = $pheno_feat->{phenotype} . "(" . $pheno_feat->{somatic_clin_sig} . ")";
push @somatic_clin_sig_list, $tmp;
}

if($pheno_feat->{oncogenic_clin_sig}) {
my $tmp = $pheno_feat->{phenotype} . "(oncogenicity:" . $pheno_feat->{oncogenic_clin_sig} . ")";
push @somatic_clin_sig_list, $tmp;
}
}

my $classification = join(";", @somatic_clin_sig_list);

return $classification;
}

=head2 seq_region_cache

Expand Down
7 changes: 4 additions & 3 deletions modules/Bio/EnsEMBL/VEP/Config.pm
Original file line number Diff line number Diff line change
Expand Up @@ -149,9 +149,10 @@ our @VEP_PARAMS = (
'nearest=s', # get nearest transcript, gene or symbol (for gene)
'distance=s', # set up/downstream distance
'clin_sig_allele=i', # use allele specific clinical significance data where it exists
'overlaps', # report length and percent of a transcript or regulatory feature overlaped with a SV
'max_sv_size=i', # modify the size of structural variant to be handled (limited by default to reduce memory requirements)
'remove_hgvsp_version', # removes translation version from hgvs_protein output
'clinvar_somatic_classification', # report the somatic classification of a variant as reported by ClinVar
'overlaps', # report length and percent of a transcript or regulatory feature overlaped with a SV
'max_sv_size=i', # modify the size of structural variant to be handled (limited by default to reduce memory requirements)
'remove_hgvsp_version', # removes translation version from hgvs_protein output


# verbosity options
Expand Down
4 changes: 3 additions & 1 deletion modules/Bio/EnsEMBL/VEP/Constants.pm
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ our @FLAG_FIELDS = (
{ flag => 'minimal', fields => ['MINIMISED']},
{ flag => 'spdi', fields => ['SPDI']},
{ flag => 'ga4gh_vrs', fields => ['GA4GH_VRS']},
{ flag => 'clinvar_somatic_classification', fields => ['CLINVAR_SOMATIC_CLASSIFICATION']},

# gene-related
{ flag => 'symbol', fields => ['SYMBOL','SYMBOL_SOURCE','HGNC_ID'] },
Expand Down Expand Up @@ -236,6 +237,7 @@ our %FIELD_DESCRIPTIONS = (
'MAX_AF_POPS' => 'Populations in which maximum allele frequency was observed',
'DISTANCE' => 'Shortest distance from variant to transcript',
'CLIN_SIG' => 'ClinVar clinical significance of the dbSNP variant',
'CLINVAR_SOMATIC_CLASSIFICATION' => 'ClinVar somatic classification of the dbSNP variant',
'BIOTYPE' => 'Biotype of transcript or regulatory feature',
'PUBMED' => 'Pubmed ID(s) of publications that cite existing variant',
'ALLELE_NUM' => 'Allele number from input; 0 is reference, 1 is first alternate etc',
Expand All @@ -257,7 +259,7 @@ our %FIELD_DESCRIPTIONS = (
'AMBIGUITY' => 'Allele ambiguity code',
'OverlapBP' => 'Number of base pairs overlapping with the corresponding structural variation feature',
'OverlapPC' => 'Percentage of corresponding structural variation feature overlapped by the given input',
'CHECK_REF' => 'Reports variants where the input reference does not match the expected reference',
'CHECK_REF' => 'Reports variants where the input reference does not match the expected reference',
'UPLOADED_ALLELE' => 'The variant allele uploaded',
'SHIFT_LENGTH' => 'Reports the number of bases the insertion or deletion has been shifted relative to the underlying transcript due to right alignment before consequence calculation',
'GENCODE_PRIMARY' => 'Reports if transcript is GENCODE primary'
Expand Down
9 changes: 7 additions & 2 deletions modules/Bio/EnsEMBL/VEP/OutputFactory.pm
Original file line number Diff line number Diff line change
Expand Up @@ -174,6 +174,7 @@ sub new {
max_af
pubmed
clin_sig_allele
clinvar_somatic_classification

numbers
domains
Expand Down Expand Up @@ -1019,7 +1020,7 @@ sub add_colocated_variant_info {
);

my %clin_sigs;

foreach my $ex(
sort {
($a->{somatic} || 0) <=> ($b->{somatic} || 0) ||
Expand All @@ -1043,7 +1044,11 @@ sub add_colocated_variant_info {
# VEP fetches the variation synonyms from the cache
push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr};

# Find allele specific clin_sig data if it exists
# ClinVar somatic classification
if(defined($ex->{clinical_impact}) && $self->{clinvar_somatic_classification}) {
$hash->{CLINVAR_SOMATIC_CLASSIFICATION} = $ex->{clinical_impact};
}

if(defined($ex->{clin_sig_allele}) && $self->{clin_sig_allele} )
{

Expand Down
11 changes: 10 additions & 1 deletion modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ my %RENAME_KEYS = (
'chr' => 'seq_region_name',
'variation_name' => 'id',
'sv' => 'colocated_structural_variants',
'clinical_impact' => 'clinvar_somatic_classification'
);

my %NUMBERIFY_EXEMPT = (
Expand All @@ -123,6 +124,7 @@ my @LIST_FIELDS = qw(
clin_sig
pubmed
var_synonyms
clinical_impact
);

my @FREQ_FIELDS = qw(
Expand Down Expand Up @@ -407,7 +409,7 @@ sub add_colocated_variant_info_JSON {
my $hash = shift;
my $frequency_hashes = shift;
my $ex_orig = shift;

# work on a copy as we're going to modify/delete things
my $ex;
%$ex = %$ex_orig;
Expand Down Expand Up @@ -456,6 +458,13 @@ sub add_colocated_variant_info_JSON {
}
$ex->{$field} = \%output_hash;
}
elsif($field eq 'clinical_impact') {
my @final_clinical_impact;
for my $clinical_impact (@{$ex->{$field}}){
push(@final_clinical_impact, $clinical_impact->{somatic_clin_sig}) if(defined $clinical_impact->{somatic_clin_sig});
}
$ex->{$field} = join(',', @final_clinical_impact);
}
else{
$ex->{$field} = [split(',', $ex->{$field})];
}
Expand Down
14 changes: 10 additions & 4 deletions modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ sub run {
}
$self->{freq_vcf} = $vep_params->{freq_vcf};
}

$self->dump_chrs($as, $cache);

# bgzip and tabix-index all_vars files
Expand Down Expand Up @@ -181,6 +181,7 @@ sub _generic_dump_info {
my @cols = (
@{$as->get_cache_columns()},
'clin_sig_allele',
'clinical_impact',
'pubmed',
'var_synonyms',
);
Expand Down Expand Up @@ -223,10 +224,15 @@ sub dump_obj {

# get freqs from VCFs?
$self->freqs_from_vcf($obj, $chr) if $self->{freq_vcf};

my $pubmed = $self->pubmed;
my $var_synonyms = $self->var_synonyms;
foreach my $v(@$obj) {
my $tmp_clinical_impact = '';
if($v->{clinical_impact}) {
$tmp_clinical_impact = $v->{clinical_impact};
}

my @tmp = (
$v->{variation_name},
$v->{failed} == 0 ? '' : $v->{failed},
Expand All @@ -238,8 +244,9 @@ sub dump_obj {
$v->{clin_sig} || '',
$v->{phenotype_or_disease} == 0 ? '' : $v->{phenotype_or_disease},
$v->{clin_sig_allele} || '',
$tmp_clinical_impact|| '',
);

push @tmp, $pubmed->{$v->{variation_name}} || '';
push @tmp, $var_synonyms->{$v->{variation_id}} || '';

Expand All @@ -257,7 +264,6 @@ sub dump_obj {
print $all_vars_fh "\n";
}
}

close DUMP;
}

Expand Down
1 change: 1 addition & 0 deletions t/Runner.t
Original file line number Diff line number Diff line change
Expand Up @@ -235,6 +235,7 @@ is_deeply($runner->get_OutputFactory, bless( {
'clin_sig_allele' => 1,
'var_synonyms' => undef,
'ga4gh_vrs' => undef,
'clinvar_somatic_classification' => undef,
}, 'Bio::EnsEMBL::VEP::OutputFactory::VEP_output' ), 'get_OutputFactory');


Expand Down
10 changes: 3 additions & 7 deletions t/test-genome-DBs/homo_vepiens/variation/meta.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
1 \N schema_type variation
2 \N schema_version 114
2 \N schema_version 115
3 \N patch patch_73_74_a.sql|schema version
4 \N patch patch_73_74_b.sql|Add doi and UCSC id to publication table
5 \N patch patch_73_74_c.sql|Add clinical_significance to variation_feature table
Expand Down Expand Up @@ -133,13 +133,9 @@
164 \N patch patch_107_108_a.sql|schema version
165 \N patch patch_107_108_b.sql|fix SAS population description
166 \N patch patch_108_109_a.sql|schema version
167 \N patch patch_109_110_a.sql|schema version
168 \N patch patch_109_110_b.sql|Add DDG2P data_source_attrib to variation_citation
169 \N patch patch_109_110_c.sql|Add new clinical_significance values to variation, variation_feature and structural_variation
170 \N patch patch_110_111_a.sql|schema version
171 \N patch patch_110_111_b.sql|Update transcript_variation primary key
172 \N patch patch_111_112_a.sql|schema version
173 \N patch patch_111_112_a.sql|schema version
174 \N patch patch_112_113_a.sql|schema version
175 \N patch patch_112_113_b.sql|Update meta_key length
176 \N patch patch_113_114_a.sql|schema version
177 \N patch patch_114_115_a.sql|schema version
178 \N patch patch_114_115_b.sql|Add column DNA_type to phenotype_feature
3 changes: 2 additions & 1 deletion t/test-genome-DBs/homo_vepiens/variation/table.sql
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ CREATE TABLE `meta` (
PRIMARY KEY (`meta_id`),
UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`),
KEY `species_value_idx` (`species_id`,`meta_value`)
) ENGINE=MyISAM AUTO_INCREMENT=177 DEFAULT CHARSET=latin1;
) ENGINE=MyISAM AUTO_INCREMENT=179 DEFAULT CHARSET=latin1;

CREATE TABLE `meta_coord` (
`table_name` varchar(40) NOT NULL,
Expand Down Expand Up @@ -239,6 +239,7 @@ CREATE TABLE `phenotype_feature` (
`seq_region_start` int(11) unsigned DEFAULT NULL,
`seq_region_end` int(11) unsigned DEFAULT NULL,
`seq_region_strand` tinyint(4) DEFAULT NULL,
`DNA_type` enum('Germline','Somatic') DEFAULT NULL,
PRIMARY KEY (`phenotype_feature_id`),
KEY `phenotype_idx` (`phenotype_id`),
KEY `object_idx` (`object_id`,`type`),
Expand Down