Ensembl · dglemos · Sep 11, 2024 · Oct 24, 2024 · Oct 31, 2024 · Nov 18, 2024
diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm
@@ -154,7 +154,13 @@ sub get_features_by_regions_uncached {
 
     my $adaptor = $self->get_adaptor('variation', 'phenotypefeature');
     my $source_id = $self->clinvar_source_id_cache;
-    my $attribs = $adaptor->get_clinsig_alleles_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id) if defined($adaptor) && defined($source_id);
+    my $attribs;
+    my $attribs_clinical_impact;
+
+    if (defined($adaptor) && defined($source_id)) {
+      $attribs = $adaptor->get_clinsig_alleles_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id);
+      $attribs_clinical_impact = $adaptor->get_somatic_clin_impact_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id);
+    }
 
     my $phenotype_attrib_id = $self->phenotype_attrib_id || 0;
 
@@ -185,9 +191,14 @@ sub get_features_by_regions_uncached {
     while($sth->fetch) {
       my %v_copy = %v;
       $v_copy{allele_string} =~ s/\s+/\_/g;
-      my $v_clinsigs = $attribs->{($chr_is_seq_region ? $chr : $sr_cache->{$chr}) . ':' . $v_copy{start} . '-' . $v_copy{end}};
+
+      my $key = ($chr_is_seq_region ? $chr : $sr_cache->{$chr}) . ':' . $v_copy{start} . '-' . $v_copy{end};
+      # Clinical significance (germline)
+      my $v_clinsigs = $attribs->{$key};
+
       my @pfas_by_allele;
       my %clin_sigs;
+
       foreach my $pfa(@{$v_clinsigs})
       {
         if(defined($pfa->{clinvar_clin_sig}) && $v_copy{variation_name} eq $pfa->{id})
@@ -198,6 +209,14 @@ sub get_features_by_regions_uncached {
       }
       my @array = keys(%clin_sigs);
       $v_copy{clin_sig_allele} = join ';', @array if scalar(@array);
+
+      # somatic clinical impact
+      my $v_clin_impact = $attribs_clinical_impact->{$key};
+
+      if($v_clin_impact){
+        $v_copy{clinical_impact} = _format_clinical_impact($v_clin_impact);
+      }
+
       $v_copy{variation_id} = $var_id;
       ## fix for e!94 alleles
       $v_copy{allele_string} =~ s/\/$//g;
@@ -216,6 +235,41 @@ sub get_features_by_regions_uncached {
   return \@return;
 }
 
+=head2 _format_clinical_impact
+
+  Example    : $clinical_impact = _format_clinical_impact($v_clin_impact)
+  Description: Internal method to format the clinvar somatic classification
+  Returntype : string
+  Exceptions : none
+  Caller     : get_features_by_regions_uncached()
+  Status     : Stable
+
+=cut
+
+sub _format_clinical_impact {
+  my $v_clin_impact = shift;
+
+  my @somatic_clin_sig_list = ();
+
+  for my $pheno_feat (@{$v_clin_impact}) {
+    # Get the somatic clinical impact
+    # Example: Tier IV - Benign/Likely benign
+    if($pheno_feat->{somatic_clin_sig}) {
+      $pheno_feat->{somatic_clin_sig} =~ s/,/;/g;
+      my $tmp = $pheno_feat->{phenotype} . "(" . $pheno_feat->{somatic_clin_sig} . ")";
+      push @somatic_clin_sig_list, $tmp;
+    }
+
+    if($pheno_feat->{oncogenic_clin_sig}) {
+      my $tmp = $pheno_feat->{phenotype} . "(oncogenicity:" . $pheno_feat->{oncogenic_clin_sig} . ")";
+      push @somatic_clin_sig_list, $tmp;
+    }
+  }
+
+  my $classification = join(";", @somatic_clin_sig_list);
+
+  return $classification;
+}
 
 =head2 seq_region_cache
 

diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm
@@ -149,9 +149,10 @@ our @VEP_PARAMS = (
   'nearest=s',               # get nearest transcript, gene or symbol (for gene)
   'distance=s',              # set up/downstream distance
   'clin_sig_allele=i',       # use allele specific clinical significance data where it exists
-  'overlaps',                # report length and percent of a transcript or regulatory feature overlaped with a SV
-  'max_sv_size=i',           # modify the size of structural variant to be handled (limited by default to reduce memory requirements)
-  'remove_hgvsp_version',    # removes translation version from hgvs_protein output
+  'clinvar_somatic_classification',  # report the somatic classification of a variant as reported by ClinVar
+  'overlaps',                        # report length and percent of a transcript or regulatory feature overlaped with a SV
+  'max_sv_size=i',                   # modify the size of structural variant to be handled (limited by default to reduce memory requirements)
+  'remove_hgvsp_version',            # removes translation version from hgvs_protein output
 
 
   # verbosity options

diff --git a/modules/Bio/EnsEMBL/VEP/Constants.pm b/modules/Bio/EnsEMBL/VEP/Constants.pm
@@ -79,6 +79,7 @@ our @FLAG_FIELDS = (
   { flag => 'minimal',         fields => ['MINIMISED']},
   { flag => 'spdi',            fields => ['SPDI']},
   { flag => 'ga4gh_vrs',       fields => ['GA4GH_VRS']},
+  { flag => 'clinvar_somatic_classification', fields => ['CLINVAR_SOMATIC_CLASSIFICATION']},
 
   # gene-related
   { flag => 'symbol',          fields => ['SYMBOL','SYMBOL_SOURCE','HGNC_ID'] },
@@ -236,6 +237,7 @@ our %FIELD_DESCRIPTIONS = (
   'MAX_AF_POPS'        => 'Populations in which maximum allele frequency was observed',
   'DISTANCE'           => 'Shortest distance from variant to transcript',
   'CLIN_SIG'           => 'ClinVar clinical significance of the dbSNP variant',
+  'CLINVAR_SOMATIC_CLASSIFICATION' => 'ClinVar somatic classification of the dbSNP variant',
   'BIOTYPE'            => 'Biotype of transcript or regulatory feature',
   'PUBMED'             => 'Pubmed ID(s) of publications that cite existing variant',
   'ALLELE_NUM'         => 'Allele number from input; 0 is reference, 1 is first alternate etc',
@@ -257,7 +259,7 @@ our %FIELD_DESCRIPTIONS = (
   'AMBIGUITY'          => 'Allele ambiguity code',
   'OverlapBP'          => 'Number of base pairs overlapping with the corresponding structural variation feature',
   'OverlapPC'          => 'Percentage of corresponding structural variation feature overlapped by the given input',
-  'CHECK_REF'	       => 'Reports variants where the input reference does not match the expected reference',
+  'CHECK_REF'	         => 'Reports variants where the input reference does not match the expected reference',
   'UPLOADED_ALLELE'    => 'The variant allele uploaded',
   'SHIFT_LENGTH'       => 'Reports the number of bases the insertion or deletion has been shifted relative to the underlying transcript due to right alignment before consequence calculation',
   'GENCODE_PRIMARY'    => 'Reports if transcript is GENCODE primary'

diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm
@@ -174,6 +174,7 @@ sub new {
     max_af
     pubmed
     clin_sig_allele
+    clinvar_somatic_classification
 
     numbers
     domains
@@ -1019,7 +1020,7 @@ sub add_colocated_variant_info {
   );
 
   my %clin_sigs;
-  
+
   foreach my $ex(
     sort {
       ($a->{somatic} || 0) <=> ($b->{somatic} || 0) ||
@@ -1043,7 +1044,11 @@ sub add_colocated_variant_info {
     # VEP fetches the variation synonyms from the cache
     push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr};
 
-    # Find allele specific clin_sig data if it exists
+    # ClinVar somatic classification
+    if(defined($ex->{clinical_impact}) && $self->{clinvar_somatic_classification}) {
+      $hash->{CLINVAR_SOMATIC_CLASSIFICATION} = $ex->{clinical_impact};
+    }
+
     if(defined($ex->{clin_sig_allele}) && $self->{clin_sig_allele} )
     {
 

diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm
@@ -108,6 +108,7 @@ my %RENAME_KEYS = (
   'chr' => 'seq_region_name',
   'variation_name' => 'id',
   'sv' => 'colocated_structural_variants',
+  'clinical_impact' => 'clinvar_somatic_classification'
 );
 
 my %NUMBERIFY_EXEMPT = (
@@ -123,6 +124,7 @@ my @LIST_FIELDS = qw(
   clin_sig
   pubmed
   var_synonyms
+  clinical_impact
 );
 
 my @FREQ_FIELDS = qw(
@@ -407,7 +409,7 @@ sub add_colocated_variant_info_JSON {
   my $hash = shift;
   my $frequency_hashes = shift;
   my $ex_orig = shift;
-  
+
   # work on a copy as we're going to modify/delete things
   my $ex;
   %$ex = %$ex_orig;
@@ -456,6 +458,13 @@ sub add_colocated_variant_info_JSON {
       }
       $ex->{$field} = \%output_hash;
     }
+    elsif($field eq 'clinical_impact') {
+      my @final_clinical_impact;
+      for my $clinical_impact (@{$ex->{$field}}){
+        push(@final_clinical_impact, $clinical_impact->{somatic_clin_sig}) if(defined $clinical_impact->{somatic_clin_sig});
+      }
+      $ex->{$field} = join(',', @final_clinical_impact);
+    }
     else{
       $ex->{$field} = [split(',', $ex->{$field})];
     }

diff --git a/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm b/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm
@@ -92,7 +92,7 @@ sub run {
     }
     $self->{freq_vcf} = $vep_params->{freq_vcf};
   }
-  
+
   $self->dump_chrs($as, $cache);
 
   # bgzip and tabix-index all_vars files
@@ -181,6 +181,7 @@ sub _generic_dump_info {
   my @cols = (
     @{$as->get_cache_columns()},
     'clin_sig_allele',
+    'clinical_impact',
     'pubmed',
     'var_synonyms',
   );
@@ -223,10 +224,15 @@ sub dump_obj {
 
   # get freqs from VCFs?
   $self->freqs_from_vcf($obj, $chr) if $self->{freq_vcf};
-  
+
   my $pubmed = $self->pubmed;
   my $var_synonyms = $self->var_synonyms;
   foreach my $v(@$obj) {
+    my $tmp_clinical_impact = '';
+    if($v->{clinical_impact}) {
+      $tmp_clinical_impact = $v->{clinical_impact};
+    }
+
     my @tmp = (
       $v->{variation_name},
       $v->{failed} == 0 ? '' : $v->{failed},
@@ -238,8 +244,9 @@ sub dump_obj {
       $v->{clin_sig} || '',
       $v->{phenotype_or_disease} == 0 ? '' : $v->{phenotype_or_disease},
       $v->{clin_sig_allele} || '',
+      $tmp_clinical_impact|| '',
     );
-  
+
     push @tmp, $pubmed->{$v->{variation_name}} || '';
     push @tmp, $var_synonyms->{$v->{variation_id}} || '';
 
@@ -257,7 +264,6 @@ sub dump_obj {
       print $all_vars_fh "\n";
     }
   }
-
   close DUMP;
 }
 

diff --git a/t/Runner.t b/t/Runner.t
@@ -235,6 +235,7 @@ is_deeply($runner->get_OutputFactory, bless( {
   'clin_sig_allele' => 1,
   'var_synonyms' => undef,
   'ga4gh_vrs' => undef,
+  'clinvar_somatic_classification' => undef,
 }, 'Bio::EnsEMBL::VEP::OutputFactory::VEP_output' ), 'get_OutputFactory');
 
 

diff --git a/t/test-genome-DBs/homo_vepiens/variation/meta.txt b/t/test-genome-DBs/homo_vepiens/variation/meta.txt
@@ -1,5 +1,5 @@
 1	\N	schema_type	variation
-2	\N	schema_version	114
+2	\N	schema_version	115
 3	\N	patch	patch_73_74_a.sql|schema version
 4	\N	patch	patch_73_74_b.sql|Add doi and UCSC id to publication table
 5	\N	patch	patch_73_74_c.sql|Add clinical_significance to variation_feature table
@@ -133,13 +133,9 @@
 164	\N	patch	patch_107_108_a.sql|schema version
 165	\N	patch	patch_107_108_b.sql|fix SAS population description
 166	\N	patch	patch_108_109_a.sql|schema version
-167 \N  patch   patch_109_110_a.sql|schema version
-168 \N  patch   patch_109_110_b.sql|Add DDG2P data_source_attrib to variation_citation
-169 \N  patch   patch_109_110_c.sql|Add new clinical_significance values to variation, variation_feature and structural_variation
-170 \N  patch   patch_110_111_a.sql|schema version
-171 \N  patch   patch_110_111_b.sql|Update transcript_variation primary key
-172 \N  patch   patch_111_112_a.sql|schema version
 173	\N	patch	patch_111_112_a.sql|schema version
 174	\N	patch	patch_112_113_a.sql|schema version
 175	\N	patch	patch_112_113_b.sql|Update meta_key length
 176	\N	patch	patch_113_114_a.sql|schema version
+177	\N	patch	patch_114_115_a.sql|schema version
+178	\N	patch	patch_114_115_b.sql|Add column DNA_type to phenotype_feature
diff --git a/t/test-genome-DBs/homo_vepiens/variation/table.sql b/t/test-genome-DBs/homo_vepiens/variation/table.sql
@@ -186,7 +186,7 @@ CREATE TABLE `meta` (
   PRIMARY KEY (`meta_id`),
   UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`),
   KEY `species_value_idx` (`species_id`,`meta_value`)
-) ENGINE=MyISAM AUTO_INCREMENT=177 DEFAULT CHARSET=latin1;
+) ENGINE=MyISAM AUTO_INCREMENT=179 DEFAULT CHARSET=latin1;
 
 CREATE TABLE `meta_coord` (
   `table_name` varchar(40) NOT NULL,
@@ -239,6 +239,7 @@ CREATE TABLE `phenotype_feature` (
   `seq_region_start` int(11) unsigned DEFAULT NULL,
   `seq_region_end` int(11) unsigned DEFAULT NULL,
   `seq_region_strand` tinyint(4) DEFAULT NULL,
+  `DNA_type` enum('Germline','Somatic') DEFAULT NULL,
   PRIMARY KEY (`phenotype_feature_id`),
   KEY `phenotype_idx` (`phenotype_id`),
   KEY `object_idx` (`object_id`,`type`),