diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm index ee6b2d307..b320de367 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm @@ -154,7 +154,13 @@ sub get_features_by_regions_uncached { my $adaptor = $self->get_adaptor('variation', 'phenotypefeature'); my $source_id = $self->clinvar_source_id_cache; - my $attribs = $adaptor->get_clinsig_alleles_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id) if defined($adaptor) && defined($source_id); + my $attribs; + my $attribs_clinical_impact; + + if (defined($adaptor) && defined($source_id)) { + $attribs = $adaptor->get_clinsig_alleles_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id); + $attribs_clinical_impact = $adaptor->get_somatic_clin_impact_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id); + } my $phenotype_attrib_id = $self->phenotype_attrib_id || 0; @@ -185,9 +191,14 @@ sub get_features_by_regions_uncached { while($sth->fetch) { my %v_copy = %v; $v_copy{allele_string} =~ s/\s+/\_/g; - my $v_clinsigs = $attribs->{($chr_is_seq_region ? $chr : $sr_cache->{$chr}) . ':' . $v_copy{start} . '-' . $v_copy{end}}; + + my $key = ($chr_is_seq_region ? $chr : $sr_cache->{$chr}) . ':' . $v_copy{start} . '-' . $v_copy{end}; + # Clinical significance (germline) + my $v_clinsigs = $attribs->{$key}; + my @pfas_by_allele; my %clin_sigs; + foreach my $pfa(@{$v_clinsigs}) { if(defined($pfa->{clinvar_clin_sig}) && $v_copy{variation_name} eq $pfa->{id}) @@ -198,6 +209,14 @@ sub get_features_by_regions_uncached { } my @array = keys(%clin_sigs); $v_copy{clin_sig_allele} = join ';', @array if scalar(@array); + + # somatic clinical impact + my $v_clin_impact = $attribs_clinical_impact->{$key}; + + if($v_clin_impact){ + $v_copy{clinical_impact} = _format_clinical_impact($v_clin_impact); + } + $v_copy{variation_id} = $var_id; ## fix for e!94 alleles $v_copy{allele_string} =~ s/\/$//g; @@ -216,6 +235,41 @@ sub get_features_by_regions_uncached { return \@return; } +=head2 _format_clinical_impact + + Example : $clinical_impact = _format_clinical_impact($v_clin_impact) + Description: Internal method to format the clinvar somatic classification + Returntype : string + Exceptions : none + Caller : get_features_by_regions_uncached() + Status : Stable + +=cut + +sub _format_clinical_impact { + my $v_clin_impact = shift; + + my @somatic_clin_sig_list = (); + + for my $pheno_feat (@{$v_clin_impact}) { + # Get the somatic clinical impact + # Example: Tier IV - Benign/Likely benign + if($pheno_feat->{somatic_clin_sig}) { + $pheno_feat->{somatic_clin_sig} =~ s/,/;/g; + my $tmp = $pheno_feat->{phenotype} . "(" . $pheno_feat->{somatic_clin_sig} . ")"; + push @somatic_clin_sig_list, $tmp; + } + + if($pheno_feat->{oncogenic_clin_sig}) { + my $tmp = $pheno_feat->{phenotype} . "(oncogenicity:" . $pheno_feat->{oncogenic_clin_sig} . ")"; + push @somatic_clin_sig_list, $tmp; + } + } + + my $classification = join(";", @somatic_clin_sig_list); + + return $classification; +} =head2 seq_region_cache diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm index 6a37d7a33..8855afbcd 100755 --- a/modules/Bio/EnsEMBL/VEP/Config.pm +++ b/modules/Bio/EnsEMBL/VEP/Config.pm @@ -149,9 +149,10 @@ our @VEP_PARAMS = ( 'nearest=s', # get nearest transcript, gene or symbol (for gene) 'distance=s', # set up/downstream distance 'clin_sig_allele=i', # use allele specific clinical significance data where it exists - 'overlaps', # report length and percent of a transcript or regulatory feature overlaped with a SV - 'max_sv_size=i', # modify the size of structural variant to be handled (limited by default to reduce memory requirements) - 'remove_hgvsp_version', # removes translation version from hgvs_protein output + 'clinvar_somatic_classification', # report the somatic classification of a variant as reported by ClinVar + 'overlaps', # report length and percent of a transcript or regulatory feature overlaped with a SV + 'max_sv_size=i', # modify the size of structural variant to be handled (limited by default to reduce memory requirements) + 'remove_hgvsp_version', # removes translation version from hgvs_protein output # verbosity options diff --git a/modules/Bio/EnsEMBL/VEP/Constants.pm b/modules/Bio/EnsEMBL/VEP/Constants.pm index 369d09cbd..d09fa9a8e 100755 --- a/modules/Bio/EnsEMBL/VEP/Constants.pm +++ b/modules/Bio/EnsEMBL/VEP/Constants.pm @@ -79,6 +79,7 @@ our @FLAG_FIELDS = ( { flag => 'minimal', fields => ['MINIMISED']}, { flag => 'spdi', fields => ['SPDI']}, { flag => 'ga4gh_vrs', fields => ['GA4GH_VRS']}, + { flag => 'clinvar_somatic_classification', fields => ['CLINVAR_SOMATIC_CLASSIFICATION']}, # gene-related { flag => 'symbol', fields => ['SYMBOL','SYMBOL_SOURCE','HGNC_ID'] }, @@ -236,6 +237,7 @@ our %FIELD_DESCRIPTIONS = ( 'MAX_AF_POPS' => 'Populations in which maximum allele frequency was observed', 'DISTANCE' => 'Shortest distance from variant to transcript', 'CLIN_SIG' => 'ClinVar clinical significance of the dbSNP variant', + 'CLINVAR_SOMATIC_CLASSIFICATION' => 'ClinVar somatic classification of the dbSNP variant', 'BIOTYPE' => 'Biotype of transcript or regulatory feature', 'PUBMED' => 'Pubmed ID(s) of publications that cite existing variant', 'ALLELE_NUM' => 'Allele number from input; 0 is reference, 1 is first alternate etc', @@ -257,7 +259,7 @@ our %FIELD_DESCRIPTIONS = ( 'AMBIGUITY' => 'Allele ambiguity code', 'OverlapBP' => 'Number of base pairs overlapping with the corresponding structural variation feature', 'OverlapPC' => 'Percentage of corresponding structural variation feature overlapped by the given input', - 'CHECK_REF' => 'Reports variants where the input reference does not match the expected reference', + 'CHECK_REF' => 'Reports variants where the input reference does not match the expected reference', 'UPLOADED_ALLELE' => 'The variant allele uploaded', 'SHIFT_LENGTH' => 'Reports the number of bases the insertion or deletion has been shifted relative to the underlying transcript due to right alignment before consequence calculation', 'GENCODE_PRIMARY' => 'Reports if transcript is GENCODE primary' diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm index 70cd5456e..44ec885a4 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm @@ -174,6 +174,7 @@ sub new { max_af pubmed clin_sig_allele + clinvar_somatic_classification numbers domains @@ -1019,7 +1020,7 @@ sub add_colocated_variant_info { ); my %clin_sigs; - + foreach my $ex( sort { ($a->{somatic} || 0) <=> ($b->{somatic} || 0) || @@ -1043,7 +1044,11 @@ sub add_colocated_variant_info { # VEP fetches the variation synonyms from the cache push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr}; - # Find allele specific clin_sig data if it exists + # ClinVar somatic classification + if(defined($ex->{clinical_impact}) && $self->{clinvar_somatic_classification}) { + $hash->{CLINVAR_SOMATIC_CLASSIFICATION} = $ex->{clinical_impact}; + } + if(defined($ex->{clin_sig_allele}) && $self->{clin_sig_allele} ) { diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm index 49a92af21..0d4fb610f 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm @@ -108,6 +108,7 @@ my %RENAME_KEYS = ( 'chr' => 'seq_region_name', 'variation_name' => 'id', 'sv' => 'colocated_structural_variants', + 'clinical_impact' => 'clinvar_somatic_classification' ); my %NUMBERIFY_EXEMPT = ( @@ -123,6 +124,7 @@ my @LIST_FIELDS = qw( clin_sig pubmed var_synonyms + clinical_impact ); my @FREQ_FIELDS = qw( @@ -407,7 +409,7 @@ sub add_colocated_variant_info_JSON { my $hash = shift; my $frequency_hashes = shift; my $ex_orig = shift; - + # work on a copy as we're going to modify/delete things my $ex; %$ex = %$ex_orig; @@ -456,6 +458,13 @@ sub add_colocated_variant_info_JSON { } $ex->{$field} = \%output_hash; } + elsif($field eq 'clinical_impact') { + my @final_clinical_impact; + for my $clinical_impact (@{$ex->{$field}}){ + push(@final_clinical_impact, $clinical_impact->{somatic_clin_sig}) if(defined $clinical_impact->{somatic_clin_sig}); + } + $ex->{$field} = join(',', @final_clinical_impact); + } else{ $ex->{$field} = [split(',', $ex->{$field})]; } diff --git a/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm b/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm index eec199fbd..f26b4c8db 100644 --- a/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm +++ b/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm @@ -92,7 +92,7 @@ sub run { } $self->{freq_vcf} = $vep_params->{freq_vcf}; } - + $self->dump_chrs($as, $cache); # bgzip and tabix-index all_vars files @@ -181,6 +181,7 @@ sub _generic_dump_info { my @cols = ( @{$as->get_cache_columns()}, 'clin_sig_allele', + 'clinical_impact', 'pubmed', 'var_synonyms', ); @@ -223,10 +224,15 @@ sub dump_obj { # get freqs from VCFs? $self->freqs_from_vcf($obj, $chr) if $self->{freq_vcf}; - + my $pubmed = $self->pubmed; my $var_synonyms = $self->var_synonyms; foreach my $v(@$obj) { + my $tmp_clinical_impact = ''; + if($v->{clinical_impact}) { + $tmp_clinical_impact = $v->{clinical_impact}; + } + my @tmp = ( $v->{variation_name}, $v->{failed} == 0 ? '' : $v->{failed}, @@ -238,8 +244,9 @@ sub dump_obj { $v->{clin_sig} || '', $v->{phenotype_or_disease} == 0 ? '' : $v->{phenotype_or_disease}, $v->{clin_sig_allele} || '', + $tmp_clinical_impact|| '', ); - + push @tmp, $pubmed->{$v->{variation_name}} || ''; push @tmp, $var_synonyms->{$v->{variation_id}} || ''; @@ -257,7 +264,6 @@ sub dump_obj { print $all_vars_fh "\n"; } } - close DUMP; } diff --git a/t/Runner.t b/t/Runner.t index 02adc0aff..025964021 100755 --- a/t/Runner.t +++ b/t/Runner.t @@ -235,6 +235,7 @@ is_deeply($runner->get_OutputFactory, bless( { 'clin_sig_allele' => 1, 'var_synonyms' => undef, 'ga4gh_vrs' => undef, + 'clinvar_somatic_classification' => undef, }, 'Bio::EnsEMBL::VEP::OutputFactory::VEP_output' ), 'get_OutputFactory'); diff --git a/t/test-genome-DBs/homo_vepiens/variation/meta.txt b/t/test-genome-DBs/homo_vepiens/variation/meta.txt index fedcad1f2..2500e96d8 100644 --- a/t/test-genome-DBs/homo_vepiens/variation/meta.txt +++ b/t/test-genome-DBs/homo_vepiens/variation/meta.txt @@ -1,5 +1,5 @@ 1 \N schema_type variation -2 \N schema_version 114 +2 \N schema_version 115 3 \N patch patch_73_74_a.sql|schema version 4 \N patch patch_73_74_b.sql|Add doi and UCSC id to publication table 5 \N patch patch_73_74_c.sql|Add clinical_significance to variation_feature table @@ -133,13 +133,9 @@ 164 \N patch patch_107_108_a.sql|schema version 165 \N patch patch_107_108_b.sql|fix SAS population description 166 \N patch patch_108_109_a.sql|schema version -167 \N patch patch_109_110_a.sql|schema version -168 \N patch patch_109_110_b.sql|Add DDG2P data_source_attrib to variation_citation -169 \N patch patch_109_110_c.sql|Add new clinical_significance values to variation, variation_feature and structural_variation -170 \N patch patch_110_111_a.sql|schema version -171 \N patch patch_110_111_b.sql|Update transcript_variation primary key -172 \N patch patch_111_112_a.sql|schema version 173 \N patch patch_111_112_a.sql|schema version 174 \N patch patch_112_113_a.sql|schema version 175 \N patch patch_112_113_b.sql|Update meta_key length 176 \N patch patch_113_114_a.sql|schema version +177 \N patch patch_114_115_a.sql|schema version +178 \N patch patch_114_115_b.sql|Add column DNA_type to phenotype_feature diff --git a/t/test-genome-DBs/homo_vepiens/variation/table.sql b/t/test-genome-DBs/homo_vepiens/variation/table.sql index fb67a4931..f40ad2ba0 100644 --- a/t/test-genome-DBs/homo_vepiens/variation/table.sql +++ b/t/test-genome-DBs/homo_vepiens/variation/table.sql @@ -186,7 +186,7 @@ CREATE TABLE `meta` ( PRIMARY KEY (`meta_id`), UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`), KEY `species_value_idx` (`species_id`,`meta_value`) -) ENGINE=MyISAM AUTO_INCREMENT=177 DEFAULT CHARSET=latin1; +) ENGINE=MyISAM AUTO_INCREMENT=179 DEFAULT CHARSET=latin1; CREATE TABLE `meta_coord` ( `table_name` varchar(40) NOT NULL, @@ -239,6 +239,7 @@ CREATE TABLE `phenotype_feature` ( `seq_region_start` int(11) unsigned DEFAULT NULL, `seq_region_end` int(11) unsigned DEFAULT NULL, `seq_region_strand` tinyint(4) DEFAULT NULL, + `DNA_type` enum('Germline','Somatic') DEFAULT NULL, PRIMARY KEY (`phenotype_feature_id`), KEY `phenotype_idx` (`phenotype_id`), KEY `object_idx` (`object_id`,`type`),