From 5eadc9f75b53f82ed2407a0e9eb23e496dfec4f2 Mon Sep 17 00:00:00 2001 From: dglemos Date: Wed, 11 Sep 2024 12:43:46 +0100 Subject: [PATCH 1/5] Add option to return somatic classification from clinvar --- .../AnnotationSource/Database/Variation.pm | 64 ++++++++++++++++++- modules/Bio/EnsEMBL/VEP/Config.pm | 1 + modules/Bio/EnsEMBL/VEP/Constants.pm | 4 +- modules/Bio/EnsEMBL/VEP/OutputFactory.pm | 21 ++++++ modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm | 15 ++++- 5 files changed, 101 insertions(+), 4 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm index ee6b2d307..0899b51f3 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm @@ -62,6 +62,8 @@ use Digest::MD5 qw(md5_hex); use Bio::EnsEMBL::Utils::Exception qw(throw warning); +use Data::Dumper; + use base qw( Bio::EnsEMBL::VEP::AnnotationSource::Database Bio::EnsEMBL::VEP::AnnotationType::Variation @@ -154,7 +156,13 @@ sub get_features_by_regions_uncached { my $adaptor = $self->get_adaptor('variation', 'phenotypefeature'); my $source_id = $self->clinvar_source_id_cache; - my $attribs = $adaptor->get_clinsig_alleles_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id) if defined($adaptor) && defined($source_id); + my $attribs; + my $attribs_clinical_impact; + + if (defined($adaptor) && defined($source_id)) { + $attribs = $adaptor->get_clinsig_alleles_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id); + $attribs_clinical_impact = $adaptor->get_somatic_clin_impact_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id); + } my $phenotype_attrib_id = $self->phenotype_attrib_id || 0; @@ -185,9 +193,14 @@ sub get_features_by_regions_uncached { while($sth->fetch) { my %v_copy = %v; $v_copy{allele_string} =~ s/\s+/\_/g; - my $v_clinsigs = $attribs->{($chr_is_seq_region ? $chr : $sr_cache->{$chr}) . ':' . $v_copy{start} . '-' . $v_copy{end}}; + + my $key = ($chr_is_seq_region ? $chr : $sr_cache->{$chr}) . ':' . $v_copy{start} . '-' . $v_copy{end}; + # Clinical significance (germline) + my $v_clinsigs = $attribs->{$key}; + my @pfas_by_allele; my %clin_sigs; + foreach my $pfa(@{$v_clinsigs}) { if(defined($pfa->{clinvar_clin_sig}) && $v_copy{variation_name} eq $pfa->{id}) @@ -198,6 +211,15 @@ sub get_features_by_regions_uncached { } my @array = keys(%clin_sigs); $v_copy{clin_sig_allele} = join ';', @array if scalar(@array); + + # somatic clinical impact + my $v_clin_impact = $attribs_clinical_impact->{$key}; + + if($v_clin_impact){ + # TODO: edit the clinical impact format + $v_copy{clinical_impact} = _format_clinical_impact($v_clin_impact); + } + $v_copy{variation_id} = $var_id; ## fix for e!94 alleles $v_copy{allele_string} =~ s/\/$//g; @@ -216,6 +238,44 @@ sub get_features_by_regions_uncached { return \@return; } +sub _format_clinical_impact { + my $v_clin_impact = shift; + + my @somatic_clin_sig_list; + my @impact_assertion_list; + my @impact_clin_sig_list; + + for my $pheno_feat (@{$v_clin_impact}) { + my @tmp; + my $classification; + + # Get the somatic clinical impact + if($pheno_feat->{somatic_clin_sig}) { + @somatic_clin_sig_list = split(",", $pheno_feat->{somatic_clin_sig}); + } + if($pheno_feat->{impact_assertion}) { + @impact_assertion_list = split(",", $pheno_feat->{impact_assertion}); + } + if($pheno_feat->{impact_clin_sig}) { + @impact_clin_sig_list = split(",", $pheno_feat->{impact_clin_sig}); + } + + for (my $i=0; $i<(scalar @somatic_clin_sig_list); $i++) { + my $somatic_clin_sig = $somatic_clin_sig_list[$i]; + if(scalar @impact_assertion_list && scalar @impact_clin_sig_list) { + $somatic_clin_sig .= " (" . $impact_assertion_list[$i] . ":" . $impact_clin_sig_list[$i] . ")"; + } + push @tmp, $somatic_clin_sig; + } + + if(scalar @tmp) { + $classification = join(",", @tmp); + $pheno_feat->{somatic_clin_sig} = $classification; + } + } + + return $v_clin_impact; +} =head2 seq_region_cache diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm index 6a37d7a33..a5d894cbe 100755 --- a/modules/Bio/EnsEMBL/VEP/Config.pm +++ b/modules/Bio/EnsEMBL/VEP/Config.pm @@ -149,6 +149,7 @@ our @VEP_PARAMS = ( 'nearest=s', # get nearest transcript, gene or symbol (for gene) 'distance=s', # set up/downstream distance 'clin_sig_allele=i', # use allele specific clinical significance data where it exists + 'somatic_classification', # report the somatic classification of a variant as reported by ClinVar 'overlaps', # report length and percent of a transcript or regulatory feature overlaped with a SV 'max_sv_size=i', # modify the size of structural variant to be handled (limited by default to reduce memory requirements) 'remove_hgvsp_version', # removes translation version from hgvs_protein output diff --git a/modules/Bio/EnsEMBL/VEP/Constants.pm b/modules/Bio/EnsEMBL/VEP/Constants.pm index 369d09cbd..c39e8990f 100755 --- a/modules/Bio/EnsEMBL/VEP/Constants.pm +++ b/modules/Bio/EnsEMBL/VEP/Constants.pm @@ -79,6 +79,7 @@ our @FLAG_FIELDS = ( { flag => 'minimal', fields => ['MINIMISED']}, { flag => 'spdi', fields => ['SPDI']}, { flag => 'ga4gh_vrs', fields => ['GA4GH_VRS']}, + { flag => 'somatic_classification', fields => ['SOMATIC_CLASSIFICATION']}, # gene-related { flag => 'symbol', fields => ['SYMBOL','SYMBOL_SOURCE','HGNC_ID'] }, @@ -236,6 +237,7 @@ our %FIELD_DESCRIPTIONS = ( 'MAX_AF_POPS' => 'Populations in which maximum allele frequency was observed', 'DISTANCE' => 'Shortest distance from variant to transcript', 'CLIN_SIG' => 'ClinVar clinical significance of the dbSNP variant', + 'SOMATIC_CLASSIFICATION' => 'ClinVar somatic classification of the dbSNP variant', 'BIOTYPE' => 'Biotype of transcript or regulatory feature', 'PUBMED' => 'Pubmed ID(s) of publications that cite existing variant', 'ALLELE_NUM' => 'Allele number from input; 0 is reference, 1 is first alternate etc', @@ -257,7 +259,7 @@ our %FIELD_DESCRIPTIONS = ( 'AMBIGUITY' => 'Allele ambiguity code', 'OverlapBP' => 'Number of base pairs overlapping with the corresponding structural variation feature', 'OverlapPC' => 'Percentage of corresponding structural variation feature overlapped by the given input', - 'CHECK_REF' => 'Reports variants where the input reference does not match the expected reference', + 'CHECK_REF' => 'Reports variants where the input reference does not match the expected reference', 'UPLOADED_ALLELE' => 'The variant allele uploaded', 'SHIFT_LENGTH' => 'Reports the number of bases the insertion or deletion has been shifted relative to the underlying transcript due to right alignment before consequence calculation', 'GENCODE_PRIMARY' => 'Reports if transcript is GENCODE primary' diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm index 70cd5456e..ea286b9ea 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm @@ -84,6 +84,8 @@ use Bio::EnsEMBL::VEP::OutputFactory::VEP_output; use Bio::EnsEMBL::VEP::OutputFactory::VCF; use Bio::EnsEMBL::VEP::OutputFactory::Tab; +use Data::Dumper; + our $CAN_USE_JSON; BEGIN { @@ -174,6 +176,7 @@ sub new { max_af pubmed clin_sig_allele + somatic_classification numbers domains @@ -1039,11 +1042,29 @@ sub add_colocated_variant_info { # ID push @{$hash->{Existing_variation}}, $ex->{variation_name} if $ex->{variation_name}; + # print "\nvariation name: ", $ex->{variation_name}, "\n"; + # Variation Synonyms # VEP fetches the variation synonyms from the cache push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr}; # Find allele specific clin_sig data if it exists + if(defined($ex->{clinical_impact}) && $self->{somatic_classification}) { + my @final_clin_impact; + + for my $ci (@{$ex->{clinical_impact}}) { + if(defined $ci->{somatic_clin_sig}) { + my $somatic_clin_sig = $ci->{somatic_clin_sig}; + if(defined $ci->{oncogenic_clin_sig}) { + $somatic_clin_sig = $somatic_clin_sig . " (Oncogenicity:" . $ci->{oncogenic_clin_sig} . ")"; + } + push @final_clin_impact, $somatic_clin_sig; + } + } + + $hash->{SOMATIC_CLASSIFICATION} = join(",", @final_clin_impact); + } + if(defined($ex->{clin_sig_allele}) && $self->{clin_sig_allele} ) { diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm index 49a92af21..7c9cf8bd0 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm @@ -90,6 +90,8 @@ use Bio::EnsEMBL::VEP::Utils qw(numberify); use JSON; use Scalar::Util qw(looks_like_number); +use Data::Dumper; + my %SKIP_KEYS = ( 'Uploaded_variation' => 1, 'Location' => 1, @@ -108,6 +110,7 @@ my %RENAME_KEYS = ( 'chr' => 'seq_region_name', 'variation_name' => 'id', 'sv' => 'colocated_structural_variants', + 'clinical_impact' => 'somatic_classification' ); my %NUMBERIFY_EXEMPT = ( @@ -123,6 +126,7 @@ my @LIST_FIELDS = qw( clin_sig pubmed var_synonyms + clinical_impact ); my @FREQ_FIELDS = qw( @@ -407,7 +411,7 @@ sub add_colocated_variant_info_JSON { my $hash = shift; my $frequency_hashes = shift; my $ex_orig = shift; - + # work on a copy as we're going to modify/delete things my $ex; %$ex = %$ex_orig; @@ -456,6 +460,13 @@ sub add_colocated_variant_info_JSON { } $ex->{$field} = \%output_hash; } + elsif($field eq 'clinical_impact') { + my @final_clinical_impact; + for my $clinical_impact (@{$ex->{$field}}){ + push(@final_clinical_impact, $clinical_impact->{somatic_clin_sig}) if(defined $clinical_impact->{somatic_clin_sig}); + } + $ex->{$field} = join(',', @final_clinical_impact); + } else{ $ex->{$field} = [split(',', $ex->{$field})]; } @@ -463,6 +474,8 @@ sub add_colocated_variant_info_JSON { push @{$hash->{colocated_variants}}, $ex; + print "-> ", Dumper($hash->{colocated_variants}); + return $hash; } From 37e4706cd0e9e90b1364660ed7aae07c17d1de02 Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 24 Oct 2024 09:07:43 +0100 Subject: [PATCH 2/5] Test new cache --- .../AnnotationSource/Database/Variation.pm | 38 +++++++++++++++++-- modules/Bio/EnsEMBL/VEP/OutputFactory.pm | 38 +++++++++++++------ .../VEP/Pipeline/DumpVEP/Dumper/Variation.pm | 35 +++++++++++++---- 3 files changed, 89 insertions(+), 22 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm index 0899b51f3..42c4fda18 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm @@ -164,6 +164,8 @@ sub get_features_by_regions_uncached { $attribs_clinical_impact = $adaptor->get_somatic_clin_impact_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id); } + print "\n->attribs_clinical_impact: ", Dumper($attribs_clinical_impact); + my $phenotype_attrib_id = $self->phenotype_attrib_id || 0; my $sth = $self->var_dbc->prepare(qq{ @@ -216,8 +218,8 @@ sub get_features_by_regions_uncached { my $v_clin_impact = $attribs_clinical_impact->{$key}; if($v_clin_impact){ - # TODO: edit the clinical impact format $v_copy{clinical_impact} = _format_clinical_impact($v_clin_impact); + print "---v_clin_impact specific key after format: ", Dumper($v_clin_impact); } $v_copy{variation_id} = $var_id; @@ -244,34 +246,64 @@ sub _format_clinical_impact { my @somatic_clin_sig_list; my @impact_assertion_list; my @impact_clin_sig_list; + my @oncogenic_clin_sig_list; + + # print "\n(_format_clinical_impact) ", Dumper($v_clin_impact); for my $pheno_feat (@{$v_clin_impact}) { my @tmp; my $classification; + # If somatic clinical significance already has '(' then the prognostic/diagnostic + # is already attached to it + if($pheno_feat->{final_somatic_clin_sig}) { + return $v_clin_impact; + } + # Get the somatic clinical impact + # Example: Tier IV - Benign/Likely benign if($pheno_feat->{somatic_clin_sig}) { @somatic_clin_sig_list = split(",", $pheno_feat->{somatic_clin_sig}); } + + # The impact assertion is optional + # Example: diagnostic,prognostic if($pheno_feat->{impact_assertion}) { @impact_assertion_list = split(",", $pheno_feat->{impact_assertion}); } + + # The impact_clin_sig is optional + # Example: supports diagnosis,better outcome if($pheno_feat->{impact_clin_sig}) { @impact_clin_sig_list = split(",", $pheno_feat->{impact_clin_sig}); } + if($pheno_feat->{oncogenic_clin_sig}) { + @oncogenic_clin_sig_list = split(",", $pheno_feat->{oncogenic_clin_sig}); + } + for (my $i=0; $i<(scalar @somatic_clin_sig_list); $i++) { my $somatic_clin_sig = $somatic_clin_sig_list[$i]; if(scalar @impact_assertion_list && scalar @impact_clin_sig_list) { $somatic_clin_sig .= " (" . $impact_assertion_list[$i] . ":" . $impact_clin_sig_list[$i] . ")"; + push @tmp, $somatic_clin_sig; } - push @tmp, $somatic_clin_sig; } if(scalar @tmp) { $classification = join(",", @tmp); - $pheno_feat->{somatic_clin_sig} = $classification; + # $pheno_feat->{somatic_clin_sig} = $classification; + $pheno_feat->{final_somatic_clin_sig} = $pheno_feat->{phenotype}." ".$classification; + } + elsif($pheno_feat->{somatic_clin_sig} && $pheno_feat->{oncogenic_clin_sig}) { + $pheno_feat->{final_somatic_clin_sig} = $pheno_feat->{phenotype}." ".$pheno_feat->{somatic_clin_sig}."(oncogenicity:".$pheno_feat->{oncogenic_clin_sig}.")"; } + elsif($pheno_feat->{somatic_clin_sig}) { + $pheno_feat->{final_somatic_clin_sig} = $pheno_feat->{phenotype}." ".$pheno_feat->{somatic_clin_sig}; + } + + # print "somatic_clin_sig: ", Dumper($pheno_feat->{somatic_clin_sig}); + } return $v_clin_impact; diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm index ea286b9ea..b030b2edd 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm @@ -1023,6 +1023,8 @@ sub add_colocated_variant_info { my %clin_sigs; + # print "***vf_existing: ", Dumper($vf->{existing}); + foreach my $ex( sort { ($a->{somatic} || 0) <=> ($b->{somatic} || 0) || @@ -1048,21 +1050,35 @@ sub add_colocated_variant_info { # VEP fetches the variation synonyms from the cache push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr}; + # print Dumper($ex); + # Find allele specific clin_sig data if it exists if(defined($ex->{clinical_impact}) && $self->{somatic_classification}) { - my @final_clin_impact; - for my $ci (@{$ex->{clinical_impact}}) { - if(defined $ci->{somatic_clin_sig}) { - my $somatic_clin_sig = $ci->{somatic_clin_sig}; - if(defined $ci->{oncogenic_clin_sig}) { - $somatic_clin_sig = $somatic_clin_sig . " (Oncogenicity:" . $ci->{oncogenic_clin_sig} . ")"; - } - push @final_clin_impact, $somatic_clin_sig; - } - } + print "\n---Output factory: ", Dumper($ex->{clinical_impact}); - $hash->{SOMATIC_CLASSIFICATION} = join(",", @final_clin_impact); + if(ref($ex->{clinical_impact}) eq "ARRAY") { + my @final_clin_impact; + foreach my $pf (@{$ex->{clinical_impact}}) { + push @final_clin_impact, $pf->{final_somatic_clin_sig} if $pf->{final_somatic_clin_sig}; + } + $hash->{SOMATIC_CLASSIFICATION} = join(";", @final_clin_impact); + } + else { + $hash->{SOMATIC_CLASSIFICATION} = $ex->{clinical_impact}; + } + + print "Somatic classification: ", $hash->{SOMATIC_CLASSIFICATION}, "\n"; + + # for my $ci (@{$ex->{clinical_impact}}) { + # if(defined $ci->{somatic_clin_sig}) { + # my $somatic_clin_sig = $ci->{somatic_clin_sig}; + # if(defined $ci->{oncogenic_clin_sig}) { + # $somatic_clin_sig = $somatic_clin_sig . " (Oncogenicity:" . $ci->{oncogenic_clin_sig} . ")"; + # } + # push @final_clin_impact, $somatic_clin_sig; + # } + # } } if(defined($ex->{clin_sig_allele}) && $self->{clin_sig_allele} ) diff --git a/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm b/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm index eec199fbd..749ca1007 100644 --- a/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm +++ b/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm @@ -35,6 +35,8 @@ use warnings; use FileHandle; use File::Path qw(mkpath); +use Data::Dumper; + use Bio::EnsEMBL::VEP::Config; use Bio::EnsEMBL::VEP::AnnotationSource::Database::Variation; use Bio::EnsEMBL::VEP::AnnotationSource::Cache::Variation; @@ -67,19 +69,20 @@ sub run { my $region_size = $self->param('region_size'); my $hive_dbc = $self->dbc; - $hive_dbc->disconnect_if_idle() if defined $hive_dbc; - + # die "(2) ", Dumper($hive_dbc); + # $hive_dbc->disconnect_if_idle() if defined $hive_dbc; + # die "(3)"; my $as = Bio::EnsEMBL::VEP::AnnotationSource::Database::Variation->new({ config => $config, cache_region_size => $region_size, }); - + # die "(4)"; my $cache = Bio::EnsEMBL::VEP::AnnotationSource::Cache::Variation->new({ config => $config, cache_region_size => $region_size, dir => $self->get_cache_dir($vep_params) }); - + # die "(5)"; # create prefixed names here to use later if($vep_params->{freq_vcf} && !$self->{freq_vcf}) { foreach my $vcf_conf(@{$vep_params->{freq_vcf}}) { @@ -92,7 +95,7 @@ sub run { } $self->{freq_vcf} = $vep_params->{freq_vcf}; } - + # die "(6)"; $self->dump_chrs($as, $cache); # bgzip and tabix-index all_vars files @@ -181,6 +184,7 @@ sub _generic_dump_info { my @cols = ( @{$as->get_cache_columns()}, 'clin_sig_allele', + 'clinical_impact', 'pubmed', 'var_synonyms', ); @@ -222,11 +226,25 @@ sub dump_obj { } # get freqs from VCFs? - $self->freqs_from_vcf($obj, $chr) if $self->{freq_vcf}; + # $self->freqs_from_vcf($obj, $chr) if $self->{freq_vcf}; + open(FH, '>>', "/hps/nobackup/flicek/ensembl/variation/dlemos/vep/test_code/clin_sig_clinvar_somatic/dump_cache/log.txt") or die $!; + my $pubmed = $self->pubmed; my $var_synonyms = $self->var_synonyms; foreach my $v(@$obj) { + + print FH "->", Dumper($v); + + my $tmp_clinical_impact = ''; + my @tmp_ci; + if($v->{clinical_impact}) { + foreach my $pf (@{$v->{clinical_impact}}) { + push @tmp_ci, $pf->{final_somatic_clin_sig}; + } + $tmp_clinical_impact = join(";", @tmp_ci); + } + my @tmp = ( $v->{variation_name}, $v->{failed} == 0 ? '' : $v->{failed}, @@ -238,8 +256,9 @@ sub dump_obj { $v->{clin_sig} || '', $v->{phenotype_or_disease} == 0 ? '' : $v->{phenotype_or_disease}, $v->{clin_sig_allele} || '', + $tmp_clinical_impact|| '', ); - + push @tmp, $pubmed->{$v->{variation_name}} || ''; push @tmp, $var_synonyms->{$v->{variation_id}} || ''; @@ -257,7 +276,7 @@ sub dump_obj { print $all_vars_fh "\n"; } } - + close(FH); close DUMP; } From 9af14f2dcadee3d77e48d2f433bf9b6793d7324d Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 31 Oct 2024 12:54:53 +0000 Subject: [PATCH 3/5] Update clinvar somatic format --- .../AnnotationSource/Database/Variation.pm | 76 +++++-------------- modules/Bio/EnsEMBL/VEP/OutputFactory.pm | 30 +------- 2 files changed, 22 insertions(+), 84 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm index 42c4fda18..099b05aef 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm @@ -164,8 +164,6 @@ sub get_features_by_regions_uncached { $attribs_clinical_impact = $adaptor->get_somatic_clin_impact_by_location($chr_is_seq_region ? $chr : $sr_cache->{$chr}, $s, $e, $source_id); } - print "\n->attribs_clinical_impact: ", Dumper($attribs_clinical_impact); - my $phenotype_attrib_id = $self->phenotype_attrib_id || 0; my $sth = $self->var_dbc->prepare(qq{ @@ -219,7 +217,6 @@ sub get_features_by_regions_uncached { if($v_clin_impact){ $v_copy{clinical_impact} = _format_clinical_impact($v_clin_impact); - print "---v_clin_impact specific key after format: ", Dumper($v_clin_impact); } $v_copy{variation_id} = $var_id; @@ -240,73 +237,40 @@ sub get_features_by_regions_uncached { return \@return; } -sub _format_clinical_impact { - my $v_clin_impact = shift; +=head2 _format_clinical_impact - my @somatic_clin_sig_list; - my @impact_assertion_list; - my @impact_clin_sig_list; - my @oncogenic_clin_sig_list; + Example : $clinical_impact = _format_clinical_impact($v_clin_impact) + Description: Internal method to format the clinvar somatic classification + Returntype : string + Exceptions : none + Caller : get_features_by_regions_uncached() + Status : Stable - # print "\n(_format_clinical_impact) ", Dumper($v_clin_impact); +=cut - for my $pheno_feat (@{$v_clin_impact}) { - my @tmp; - my $classification; +sub _format_clinical_impact { + my $v_clin_impact = shift; - # If somatic clinical significance already has '(' then the prognostic/diagnostic - # is already attached to it - if($pheno_feat->{final_somatic_clin_sig}) { - return $v_clin_impact; - } + my @somatic_clin_sig_list = (); + for my $pheno_feat (@{$v_clin_impact}) { # Get the somatic clinical impact # Example: Tier IV - Benign/Likely benign if($pheno_feat->{somatic_clin_sig}) { - @somatic_clin_sig_list = split(",", $pheno_feat->{somatic_clin_sig}); - } - - # The impact assertion is optional - # Example: diagnostic,prognostic - if($pheno_feat->{impact_assertion}) { - @impact_assertion_list = split(",", $pheno_feat->{impact_assertion}); - } - - # The impact_clin_sig is optional - # Example: supports diagnosis,better outcome - if($pheno_feat->{impact_clin_sig}) { - @impact_clin_sig_list = split(",", $pheno_feat->{impact_clin_sig}); + $pheno_feat->{somatic_clin_sig} =~ s/,/;/g; + my $tmp = $pheno_feat->{phenotype} . "(" . $pheno_feat->{somatic_clin_sig} . ")"; + push @somatic_clin_sig_list, $tmp; } if($pheno_feat->{oncogenic_clin_sig}) { - @oncogenic_clin_sig_list = split(",", $pheno_feat->{oncogenic_clin_sig}); + my $tmp = $pheno_feat->{phenotype} . "(oncogenicity:" . $pheno_feat->{oncogenic_clin_sig} . ")"; + push @somatic_clin_sig_list, $tmp; } - - for (my $i=0; $i<(scalar @somatic_clin_sig_list); $i++) { - my $somatic_clin_sig = $somatic_clin_sig_list[$i]; - if(scalar @impact_assertion_list && scalar @impact_clin_sig_list) { - $somatic_clin_sig .= " (" . $impact_assertion_list[$i] . ":" . $impact_clin_sig_list[$i] . ")"; - push @tmp, $somatic_clin_sig; - } - } - - if(scalar @tmp) { - $classification = join(",", @tmp); - # $pheno_feat->{somatic_clin_sig} = $classification; - $pheno_feat->{final_somatic_clin_sig} = $pheno_feat->{phenotype}." ".$classification; - } - elsif($pheno_feat->{somatic_clin_sig} && $pheno_feat->{oncogenic_clin_sig}) { - $pheno_feat->{final_somatic_clin_sig} = $pheno_feat->{phenotype}." ".$pheno_feat->{somatic_clin_sig}."(oncogenicity:".$pheno_feat->{oncogenic_clin_sig}.")"; - } - elsif($pheno_feat->{somatic_clin_sig}) { - $pheno_feat->{final_somatic_clin_sig} = $pheno_feat->{phenotype}." ".$pheno_feat->{somatic_clin_sig}; - } - - # print "somatic_clin_sig: ", Dumper($pheno_feat->{somatic_clin_sig}); - } - return $v_clin_impact; + my $classification = join(";", @somatic_clin_sig_list); + + return $classification; } =head2 seq_region_cache diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm index b030b2edd..c700a3ab0 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm @@ -1050,35 +1050,9 @@ sub add_colocated_variant_info { # VEP fetches the variation synonyms from the cache push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr}; - # print Dumper($ex); - - # Find allele specific clin_sig data if it exists + # ClinVar somatic classification if(defined($ex->{clinical_impact}) && $self->{somatic_classification}) { - - print "\n---Output factory: ", Dumper($ex->{clinical_impact}); - - if(ref($ex->{clinical_impact}) eq "ARRAY") { - my @final_clin_impact; - foreach my $pf (@{$ex->{clinical_impact}}) { - push @final_clin_impact, $pf->{final_somatic_clin_sig} if $pf->{final_somatic_clin_sig}; - } - $hash->{SOMATIC_CLASSIFICATION} = join(";", @final_clin_impact); - } - else { - $hash->{SOMATIC_CLASSIFICATION} = $ex->{clinical_impact}; - } - - print "Somatic classification: ", $hash->{SOMATIC_CLASSIFICATION}, "\n"; - - # for my $ci (@{$ex->{clinical_impact}}) { - # if(defined $ci->{somatic_clin_sig}) { - # my $somatic_clin_sig = $ci->{somatic_clin_sig}; - # if(defined $ci->{oncogenic_clin_sig}) { - # $somatic_clin_sig = $somatic_clin_sig . " (Oncogenicity:" . $ci->{oncogenic_clin_sig} . ")"; - # } - # push @final_clin_impact, $somatic_clin_sig; - # } - # } + $hash->{SOMATIC_CLASSIFICATION} = $ex->{clinical_impact}; } if(defined($ex->{clin_sig_allele}) && $self->{clin_sig_allele} ) From 760d043c46f987ebadc329edf720ec700fd0cfd0 Mon Sep 17 00:00:00 2001 From: dglemos Date: Mon, 18 Nov 2024 14:41:53 +0000 Subject: [PATCH 4/5] Remove prints --- .../AnnotationSource/Database/Variation.pm | 2 -- modules/Bio/EnsEMBL/VEP/OutputFactory.pm | 4 --- modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm | 4 --- .../VEP/Pipeline/DumpVEP/Dumper/Variation.pm | 27 +++++-------------- 4 files changed, 7 insertions(+), 30 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm index 099b05aef..b320de367 100644 --- a/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm +++ b/modules/Bio/EnsEMBL/VEP/AnnotationSource/Database/Variation.pm @@ -62,8 +62,6 @@ use Digest::MD5 qw(md5_hex); use Bio::EnsEMBL::Utils::Exception qw(throw warning); -use Data::Dumper; - use base qw( Bio::EnsEMBL::VEP::AnnotationSource::Database Bio::EnsEMBL::VEP::AnnotationType::Variation diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm index c700a3ab0..5d5b3f5b6 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm @@ -1022,8 +1022,6 @@ sub add_colocated_variant_info { ); my %clin_sigs; - - # print "***vf_existing: ", Dumper($vf->{existing}); foreach my $ex( sort { @@ -1044,8 +1042,6 @@ sub add_colocated_variant_info { # ID push @{$hash->{Existing_variation}}, $ex->{variation_name} if $ex->{variation_name}; - # print "\nvariation name: ", $ex->{variation_name}, "\n"; - # Variation Synonyms # VEP fetches the variation synonyms from the cache push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr}; diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm index 7c9cf8bd0..793babc44 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm @@ -90,8 +90,6 @@ use Bio::EnsEMBL::VEP::Utils qw(numberify); use JSON; use Scalar::Util qw(looks_like_number); -use Data::Dumper; - my %SKIP_KEYS = ( 'Uploaded_variation' => 1, 'Location' => 1, @@ -474,8 +472,6 @@ sub add_colocated_variant_info_JSON { push @{$hash->{colocated_variants}}, $ex; - print "-> ", Dumper($hash->{colocated_variants}); - return $hash; } diff --git a/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm b/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm index 749ca1007..f26b4c8db 100644 --- a/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm +++ b/modules/Bio/EnsEMBL/VEP/Pipeline/DumpVEP/Dumper/Variation.pm @@ -35,8 +35,6 @@ use warnings; use FileHandle; use File::Path qw(mkpath); -use Data::Dumper; - use Bio::EnsEMBL::VEP::Config; use Bio::EnsEMBL::VEP::AnnotationSource::Database::Variation; use Bio::EnsEMBL::VEP::AnnotationSource::Cache::Variation; @@ -69,20 +67,19 @@ sub run { my $region_size = $self->param('region_size'); my $hive_dbc = $self->dbc; - # die "(2) ", Dumper($hive_dbc); - # $hive_dbc->disconnect_if_idle() if defined $hive_dbc; - # die "(3)"; + $hive_dbc->disconnect_if_idle() if defined $hive_dbc; + my $as = Bio::EnsEMBL::VEP::AnnotationSource::Database::Variation->new({ config => $config, cache_region_size => $region_size, }); - # die "(4)"; + my $cache = Bio::EnsEMBL::VEP::AnnotationSource::Cache::Variation->new({ config => $config, cache_region_size => $region_size, dir => $self->get_cache_dir($vep_params) }); - # die "(5)"; + # create prefixed names here to use later if($vep_params->{freq_vcf} && !$self->{freq_vcf}) { foreach my $vcf_conf(@{$vep_params->{freq_vcf}}) { @@ -95,7 +92,7 @@ sub run { } $self->{freq_vcf} = $vep_params->{freq_vcf}; } - # die "(6)"; + $self->dump_chrs($as, $cache); # bgzip and tabix-index all_vars files @@ -226,23 +223,14 @@ sub dump_obj { } # get freqs from VCFs? - # $self->freqs_from_vcf($obj, $chr) if $self->{freq_vcf}; - - open(FH, '>>', "/hps/nobackup/flicek/ensembl/variation/dlemos/vep/test_code/clin_sig_clinvar_somatic/dump_cache/log.txt") or die $!; + $self->freqs_from_vcf($obj, $chr) if $self->{freq_vcf}; my $pubmed = $self->pubmed; my $var_synonyms = $self->var_synonyms; foreach my $v(@$obj) { - - print FH "->", Dumper($v); - my $tmp_clinical_impact = ''; - my @tmp_ci; if($v->{clinical_impact}) { - foreach my $pf (@{$v->{clinical_impact}}) { - push @tmp_ci, $pf->{final_somatic_clin_sig}; - } - $tmp_clinical_impact = join(";", @tmp_ci); + $tmp_clinical_impact = $v->{clinical_impact}; } my @tmp = ( @@ -276,7 +264,6 @@ sub dump_obj { print $all_vars_fh "\n"; } } - close(FH); close DUMP; } From 780862b2a842258370041e54c0d8fde4189f4449 Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 21 Nov 2024 11:52:38 +0000 Subject: [PATCH 5/5] Rename vep option and patch dbs --- modules/Bio/EnsEMBL/VEP/Config.pm | 8 ++++---- modules/Bio/EnsEMBL/VEP/Constants.pm | 4 ++-- modules/Bio/EnsEMBL/VEP/OutputFactory.pm | 8 +++----- modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm | 2 +- t/Runner.t | 1 + t/test-genome-DBs/homo_vepiens/variation/meta.txt | 10 +++------- t/test-genome-DBs/homo_vepiens/variation/table.sql | 3 ++- 7 files changed, 16 insertions(+), 20 deletions(-) diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm index a5d894cbe..8855afbcd 100755 --- a/modules/Bio/EnsEMBL/VEP/Config.pm +++ b/modules/Bio/EnsEMBL/VEP/Config.pm @@ -149,10 +149,10 @@ our @VEP_PARAMS = ( 'nearest=s', # get nearest transcript, gene or symbol (for gene) 'distance=s', # set up/downstream distance 'clin_sig_allele=i', # use allele specific clinical significance data where it exists - 'somatic_classification', # report the somatic classification of a variant as reported by ClinVar - 'overlaps', # report length and percent of a transcript or regulatory feature overlaped with a SV - 'max_sv_size=i', # modify the size of structural variant to be handled (limited by default to reduce memory requirements) - 'remove_hgvsp_version', # removes translation version from hgvs_protein output + 'clinvar_somatic_classification', # report the somatic classification of a variant as reported by ClinVar + 'overlaps', # report length and percent of a transcript or regulatory feature overlaped with a SV + 'max_sv_size=i', # modify the size of structural variant to be handled (limited by default to reduce memory requirements) + 'remove_hgvsp_version', # removes translation version from hgvs_protein output # verbosity options diff --git a/modules/Bio/EnsEMBL/VEP/Constants.pm b/modules/Bio/EnsEMBL/VEP/Constants.pm index c39e8990f..d09fa9a8e 100755 --- a/modules/Bio/EnsEMBL/VEP/Constants.pm +++ b/modules/Bio/EnsEMBL/VEP/Constants.pm @@ -79,7 +79,7 @@ our @FLAG_FIELDS = ( { flag => 'minimal', fields => ['MINIMISED']}, { flag => 'spdi', fields => ['SPDI']}, { flag => 'ga4gh_vrs', fields => ['GA4GH_VRS']}, - { flag => 'somatic_classification', fields => ['SOMATIC_CLASSIFICATION']}, + { flag => 'clinvar_somatic_classification', fields => ['CLINVAR_SOMATIC_CLASSIFICATION']}, # gene-related { flag => 'symbol', fields => ['SYMBOL','SYMBOL_SOURCE','HGNC_ID'] }, @@ -237,7 +237,7 @@ our %FIELD_DESCRIPTIONS = ( 'MAX_AF_POPS' => 'Populations in which maximum allele frequency was observed', 'DISTANCE' => 'Shortest distance from variant to transcript', 'CLIN_SIG' => 'ClinVar clinical significance of the dbSNP variant', - 'SOMATIC_CLASSIFICATION' => 'ClinVar somatic classification of the dbSNP variant', + 'CLINVAR_SOMATIC_CLASSIFICATION' => 'ClinVar somatic classification of the dbSNP variant', 'BIOTYPE' => 'Biotype of transcript or regulatory feature', 'PUBMED' => 'Pubmed ID(s) of publications that cite existing variant', 'ALLELE_NUM' => 'Allele number from input; 0 is reference, 1 is first alternate etc', diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm index 5d5b3f5b6..44ec885a4 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory.pm @@ -84,8 +84,6 @@ use Bio::EnsEMBL::VEP::OutputFactory::VEP_output; use Bio::EnsEMBL::VEP::OutputFactory::VCF; use Bio::EnsEMBL::VEP::OutputFactory::Tab; -use Data::Dumper; - our $CAN_USE_JSON; BEGIN { @@ -176,7 +174,7 @@ sub new { max_af pubmed clin_sig_allele - somatic_classification + clinvar_somatic_classification numbers domains @@ -1047,8 +1045,8 @@ sub add_colocated_variant_info { push @{$hash->{VAR_SYNONYMS}}, $ex->{var_synonyms} if $self->{var_synonyms} && $ex->{var_synonyms} && !$self->{_config}->{_params}->{is_vr}; # ClinVar somatic classification - if(defined($ex->{clinical_impact}) && $self->{somatic_classification}) { - $hash->{SOMATIC_CLASSIFICATION} = $ex->{clinical_impact}; + if(defined($ex->{clinical_impact}) && $self->{clinvar_somatic_classification}) { + $hash->{CLINVAR_SOMATIC_CLASSIFICATION} = $ex->{clinical_impact}; } if(defined($ex->{clin_sig_allele}) && $self->{clin_sig_allele} ) diff --git a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm index 793babc44..0d4fb610f 100755 --- a/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm +++ b/modules/Bio/EnsEMBL/VEP/OutputFactory/JSON.pm @@ -108,7 +108,7 @@ my %RENAME_KEYS = ( 'chr' => 'seq_region_name', 'variation_name' => 'id', 'sv' => 'colocated_structural_variants', - 'clinical_impact' => 'somatic_classification' + 'clinical_impact' => 'clinvar_somatic_classification' ); my %NUMBERIFY_EXEMPT = ( diff --git a/t/Runner.t b/t/Runner.t index 02adc0aff..025964021 100755 --- a/t/Runner.t +++ b/t/Runner.t @@ -235,6 +235,7 @@ is_deeply($runner->get_OutputFactory, bless( { 'clin_sig_allele' => 1, 'var_synonyms' => undef, 'ga4gh_vrs' => undef, + 'clinvar_somatic_classification' => undef, }, 'Bio::EnsEMBL::VEP::OutputFactory::VEP_output' ), 'get_OutputFactory'); diff --git a/t/test-genome-DBs/homo_vepiens/variation/meta.txt b/t/test-genome-DBs/homo_vepiens/variation/meta.txt index fedcad1f2..2500e96d8 100644 --- a/t/test-genome-DBs/homo_vepiens/variation/meta.txt +++ b/t/test-genome-DBs/homo_vepiens/variation/meta.txt @@ -1,5 +1,5 @@ 1 \N schema_type variation -2 \N schema_version 114 +2 \N schema_version 115 3 \N patch patch_73_74_a.sql|schema version 4 \N patch patch_73_74_b.sql|Add doi and UCSC id to publication table 5 \N patch patch_73_74_c.sql|Add clinical_significance to variation_feature table @@ -133,13 +133,9 @@ 164 \N patch patch_107_108_a.sql|schema version 165 \N patch patch_107_108_b.sql|fix SAS population description 166 \N patch patch_108_109_a.sql|schema version -167 \N patch patch_109_110_a.sql|schema version -168 \N patch patch_109_110_b.sql|Add DDG2P data_source_attrib to variation_citation -169 \N patch patch_109_110_c.sql|Add new clinical_significance values to variation, variation_feature and structural_variation -170 \N patch patch_110_111_a.sql|schema version -171 \N patch patch_110_111_b.sql|Update transcript_variation primary key -172 \N patch patch_111_112_a.sql|schema version 173 \N patch patch_111_112_a.sql|schema version 174 \N patch patch_112_113_a.sql|schema version 175 \N patch patch_112_113_b.sql|Update meta_key length 176 \N patch patch_113_114_a.sql|schema version +177 \N patch patch_114_115_a.sql|schema version +178 \N patch patch_114_115_b.sql|Add column DNA_type to phenotype_feature diff --git a/t/test-genome-DBs/homo_vepiens/variation/table.sql b/t/test-genome-DBs/homo_vepiens/variation/table.sql index fb67a4931..f40ad2ba0 100644 --- a/t/test-genome-DBs/homo_vepiens/variation/table.sql +++ b/t/test-genome-DBs/homo_vepiens/variation/table.sql @@ -186,7 +186,7 @@ CREATE TABLE `meta` ( PRIMARY KEY (`meta_id`), UNIQUE KEY `species_key_value_idx` (`species_id`,`meta_key`,`meta_value`), KEY `species_value_idx` (`species_id`,`meta_value`) -) ENGINE=MyISAM AUTO_INCREMENT=177 DEFAULT CHARSET=latin1; +) ENGINE=MyISAM AUTO_INCREMENT=179 DEFAULT CHARSET=latin1; CREATE TABLE `meta_coord` ( `table_name` varchar(40) NOT NULL, @@ -239,6 +239,7 @@ CREATE TABLE `phenotype_feature` ( `seq_region_start` int(11) unsigned DEFAULT NULL, `seq_region_end` int(11) unsigned DEFAULT NULL, `seq_region_strand` tinyint(4) DEFAULT NULL, + `DNA_type` enum('Germline','Somatic') DEFAULT NULL, PRIMARY KEY (`phenotype_feature_id`), KEY `phenotype_idx` (`phenotype_id`), KEY `object_idx` (`object_id`,`type`),