From 9e011a1ea1a473ce88975e04389d562cf5138509 Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 6 Apr 2023 12:14:47 +0100 Subject: [PATCH 01/12] Update SpliceAI gpu --- .../EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm | 5 ++++- .../EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm index be5c5b2be..c6dd79f5d 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm @@ -68,7 +68,10 @@ sub run_spliceai { die("Directory ($output_vcf_files_dir) doesn't exist"); } - my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation"; + my $tmp_dir = $main_dir . "/tmp"; + $self->run_system_command("mkdir $tmp_dir"); + + my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation -B 4096 -T 256 -t $tmp_dir"; # Add option to calculate masked scores if($masked_scores) { diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm index 5f5cc0f48..7ea0594c2 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm @@ -60,7 +60,7 @@ sub default_options { output_dir => $self->o('main_dir') . '/output', # final output files already merged by chromosome fasta_file => $self->o('fasta_file'), gene_annotation => $self->o('gene_annotation'), - step_size => 4000, # number of variants used to split the main vcf files + step_size => 200000, # number of variants used to split the main vcf files check_transcripts => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts transcripts_from_file => undef, time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db @@ -68,7 +68,7 @@ sub default_options { registry => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1 output_file_name => 'spliceai_final_scores_', - pipeline_wide_analysis_capacity => 500, + pipeline_wide_analysis_capacity => 80, pipeline_db => { -host => $self->o('hive_db_host'), @@ -85,7 +85,7 @@ sub resource_classes { my ($self) = @_; return { %{$self->SUPER::resource_classes}, - '8Gb_8c_job' => {'LSF' => '-n 8 -q production -R"select[mem>8000] rusage[mem=8000]" -M8000' }, + 'gpu' => {'LSF' => '-q gpu-a100 -gpu "num=1:gmem=80000"' }, # the queue gpu-a100 can access /hps/nobackup '4Gb_job' => {'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000'}, }; } @@ -151,7 +151,7 @@ sub pipeline_analyses { -hive_capacity => $self->o('pipeline_wide_analysis_capacity'), -analysis_capacity => $self->o('pipeline_wide_analysis_capacity'), -input_ids => [], - -rc_name => '8Gb_8c_job', + -rc_name => 'gpu', -parameters => { 'main_dir' => $self->o('main_dir'), 'split_vcf_input_dir' => $self->o('split_vcf_input_dir'), From 0f4a804ab60e658e69c344a46d6435cab5296f8f Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 6 Apr 2023 12:27:19 +0100 Subject: [PATCH 02/12] Remove unsorted files --- modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm | 3 +++ .../Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm index 054cb7662..58ea624a3 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm @@ -84,6 +84,9 @@ sub merge_vcf_files { # Sort final file $self->run_system_command("bcftools sort -o $final_file_sorted $final_file"); + + # Remove unsorted file + $self->run_system_command("rm $final_file"); } 1; diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm index 7ea0594c2..aad0fd8f4 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm @@ -66,7 +66,7 @@ sub default_options { time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db masked_scores => 1, # calculate masked scores registry => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1 - output_file_name => 'spliceai_final_scores_', + output_file_name => 'spliceai_final_scores_' . $self->o('ensembl_release') . '_', pipeline_wide_analysis_capacity => 80, From e3fca0361c6bbdf68e7fbda6129c6d3731f0d174 Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 6 Apr 2023 12:31:18 +0100 Subject: [PATCH 03/12] Compress final files --- modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm | 2 ++ 1 file changed, 2 insertions(+) diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm index 58ea624a3..52cbdd620 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm @@ -84,6 +84,8 @@ sub merge_vcf_files { # Sort final file $self->run_system_command("bcftools sort -o $final_file_sorted $final_file"); + $self->run_system_command("bgzip $final_file_sorted"); + $self->run_system_command("tabix -p vcf $final_file_sorted.gz"); # Remove unsorted file $self->run_system_command("rm $final_file"); From 3625319e491418bb8a474c76e88209f289c25641 Mon Sep 17 00:00:00 2001 From: dglemos Date: Thu, 27 Apr 2023 11:59:02 +0100 Subject: [PATCH 04/12] Support compressed files as input --- .../Pipeline/SpliceAI/SpliceAI_conf.pm | 24 +++++++---- .../Variation/Pipeline/SpliceAI/SplitFiles.pm | 42 ++++++++++++------- 2 files changed, 44 insertions(+), 22 deletions(-) diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm index aad0fd8f4..728a85903 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm @@ -54,7 +54,7 @@ sub default_options { pipeline_name => 'spliceai_scores', main_dir => $self->o('main_dir'), # main directory where all files and directories are going to be stored input_directory => $self->o('main_dir') . '/input_vcf_files', # input files - split_vcf_no_header_dir => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers) + split_vcf_no_header_dir => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers) split_vcf_input_dir => $self->o('main_dir') . '/split_vcf_input', # contains the splitted input vcf files with headers, these are the files used to run SpliceAI split_vcf_output_dir => $self->o('main_dir') . '/split_vcf_output', # temporary output files, still splitted output_dir => $self->o('main_dir') . '/output', # final output files already merged by chromosome @@ -63,8 +63,8 @@ sub default_options { step_size => 200000, # number of variants used to split the main vcf files check_transcripts => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts transcripts_from_file => undef, - time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db - masked_scores => 1, # calculate masked scores + time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db + masked_scores => 1, # calculate masked scores registry => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1 output_file_name => 'spliceai_final_scores_' . $self->o('ensembl_release') . '_', @@ -85,8 +85,18 @@ sub resource_classes { my ($self) = @_; return { %{$self->SUPER::resource_classes}, - 'gpu' => {'LSF' => '-q gpu-a100 -gpu "num=1:gmem=80000"' }, # the queue gpu-a100 can access /hps/nobackup - '4Gb_job' => {'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000'}, + 'gpu' => { + 'LSF' => '-q gpu-a100 -gpu "num=1:gmem=80000"', # the queue gpu-a100 can access /hps/nobackup + 'SLURM' => '--time=1:30:00 --gres=gpu:1' # GPU memory doesn’t need to be specified + }, + '4Gb_job' => { + 'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000', + 'SLURM' => "--partition=standard --time=4:00:00 --mem=4G" + }, + 'default' => { + 'LSF' => '-R"select[mem>1000] rusage[mem=1000]" -M1000', + 'SLURM' => "--partition=standard --time=1:00:00 --mem=1G" + } }; } @@ -99,11 +109,11 @@ sub pipeline_analyses { -input_ids => [{}], -parameters => { 'input_directory' => $self->o('input_directory'), - 'inputcmd' => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf" -printf "%f\n"', + 'inputcmd' => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf.gz" -printf "%f\n"', }, -flow_into => { '2->A' => {'split_files' => {'vcf_file' => '#_0#'}}, - 'A->1' => ['get_chr_dir'], + 'A->1' => ['get_chr_dir'] }, }, { -logic_name => 'split_files', diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm index 1a89d0be1..e7ea461b8 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm @@ -64,7 +64,7 @@ sub run { sub set_chr_from_filename { my $self = shift; my $vcf_file = $self->param_required('vcf_file'); - $vcf_file =~ /.*_chr(.*)\.vcf$/; + $vcf_file =~ /.*_chr(.*)\.vcf.gz$/; my $chr = $1; if (!$chr) { die("Could not get chromosome name from file name ($vcf_file)."); @@ -82,8 +82,19 @@ sub split_vcf_file { my $split_vcf_output_dir = $self->param_required('split_vcf_output_dir'); my $step_size = $self->param_required('step_size'); + # If we are checking the transcripts then the input file is a VCF + # Remove .gz + if($check) { + $vcf_file =~ s/vcf.gz/vcf/; + } + my $vcf_file_path = $input_dir . '/' . $vcf_file; + if(!$check && $vcf_file =~ /vcf.gz/) { + $self->run_system_command("gunzip $vcf_file_path"); + $vcf_file_path =~ s/vcf.gz/vcf/; + } + if (! -d $input_dir) { die("Directory ($input_dir) doesn't exist"); } @@ -173,8 +184,13 @@ sub check_split_vcf_file { } # prepare input vcf file for tabix - $self->run_system_command("bgzip $vcf_file_path"); - $self->run_system_command("tabix -p vcf $vcf_file_path.gz"); + if($vcf_file !~ /vcf.gz/) { + $self->run_system_command("bgzip $vcf_file_path"); + $self->run_system_command("tabix -p vcf $vcf_file_path.gz"); + } + else { + $self->run_system_command("tabix -p vcf $vcf_file_path"); + } # Create directory to store the input file only with the variants of interest (overlapping new transcripts) my $vcf_file_path_subset = $main_dir . '/input_vcf_files_subset'; @@ -184,7 +200,10 @@ sub check_split_vcf_file { $self->param('input_dir_subset', $vcf_file_path_subset); - open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file) or die $!; + my $vcf_file_sub = $vcf_file; + $vcf_file_sub =~ s/.gz//; + + open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file_sub) or die $!; my $positions_of_interest = $transcripts->{$chr}; foreach my $position (@$positions_of_interest) { @@ -194,8 +213,8 @@ sub check_split_vcf_file { my $pos_string = sprintf("%s:%i-%i", $chr, $transcript_start, $transcript_end); - (open my $read, "tabix " . $input_dir . "/" . $vcf_file . ".gz $pos_string |") - || die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . ".gz \n" ; + (open my $read, "tabix " . $input_dir . "/" . $vcf_file . " $pos_string |") + or die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . " \n" ; while (my $row = <$read>) { chomp $row; @@ -212,8 +231,8 @@ sub check_split_vcf_file { close($write); # Sort new vcf file - my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file; - my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file; + my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file_sub; + my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file_sub; $self->run_system_command("sort -o $vcf_file_subset_sorted -k1,1 -k2,2n $vcf_file_subset"); $self->run_system_command("mv $vcf_file_subset_sorted $vcf_file_subset"); } @@ -291,11 +310,4 @@ sub get_new_transcripts_file { $self->param('transcripts', \%new_transcripts); } -sub count_lines { - my $self = shift; - my $filename = shift; - - -} - 1; From 42595529ba67e93f0c9735fa94e2eca226f6134c Mon Sep 17 00:00:00 2001 From: dglemos Date: Fri, 10 May 2024 14:28:00 +0100 Subject: [PATCH 05/12] Update spliceai --- .../Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm | 4 +++- .../EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm | 6 +++--- .../Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm | 3 +++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm index c6dd79f5d..fd0aa12b6 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm @@ -69,7 +69,9 @@ sub run_spliceai { } my $tmp_dir = $main_dir . "/tmp"; - $self->run_system_command("mkdir $tmp_dir"); + + # activate conda env + $self->run_system_command("source activate spliceai"); my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation -B 4096 -T 256 -t $tmp_dir"; diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm index 728a85903..7efd09777 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm @@ -60,7 +60,7 @@ sub default_options { output_dir => $self->o('main_dir') . '/output', # final output files already merged by chromosome fasta_file => $self->o('fasta_file'), gene_annotation => $self->o('gene_annotation'), - step_size => 200000, # number of variants used to split the main vcf files + step_size => 500000, # number of variants used to split the main vcf files check_transcripts => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts transcripts_from_file => undef, time_interval => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db @@ -87,7 +87,7 @@ sub resource_classes { %{$self->SUPER::resource_classes}, 'gpu' => { 'LSF' => '-q gpu-a100 -gpu "num=1:gmem=80000"', # the queue gpu-a100 can access /hps/nobackup - 'SLURM' => '--time=1:30:00 --gres=gpu:1' # GPU memory doesn’t need to be specified + 'SLURM' => '--time=5:00:00 --gres=gpu:a100:1 --mem=24G' }, '4Gb_job' => { 'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000', @@ -95,7 +95,7 @@ sub resource_classes { }, 'default' => { 'LSF' => '-R"select[mem>1000] rusage[mem=1000]" -M1000', - 'SLURM' => "--partition=standard --time=1:00:00 --mem=1G" + 'SLURM' => "--partition=standard --time=1:00:00 --mem=2G" } }; } diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm index e7ea461b8..a67be4b5f 100644 --- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm +++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm @@ -59,6 +59,9 @@ sub run { $self->check_split_vcf_file(); } $self->split_vcf_file(); + + # create tmp dir where spliceai tmp files will be stored + $self->create_dir($self->param_required('main_dir') . '/tmp'); } sub set_chr_from_filename { From 02b579c023ddf8e130fba9132ee008e7fea84013 Mon Sep 17 00:00:00 2001 From: dglemos Date: Fri, 23 Aug 2024 15:46:06 +0100 Subject: [PATCH 06/12] Add script to generate gene annotation file for spliceai --- scripts/python/spliceai_annotation_file.py | 176 +++++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 scripts/python/spliceai_annotation_file.py diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py new file mode 100644 index 000000000..616e5f708 --- /dev/null +++ b/scripts/python/spliceai_annotation_file.py @@ -0,0 +1,176 @@ +#!/usr/bin/env python3 + +""" + Script to generate the gene annotation file for SpliceAI. + This file contains data about the transcript that is being used for each gene. + SpliceAI only annotates variants overlapping these transcripts. + + Template provided by SpliceAI: https://github.com/Illumina/SpliceAI/blob/master/spliceai/annotations/grch38.txt + + Gene annotation file format: + #NAME CHROM STRAND TX_START TX_END EXON_START EXON_END + KRTAP27-1 21 - 30337013 30337694 30337013, 30337694, + + Options: + --output_file gene annotation output file (Optional. Default: gene_annotation.txt) + --species species name (Optional. Default: homo_sapiens) + --assembly assembly version (Optional. Default: 38) + --host core database host (Mandatory) + --port host port (Mandatory) + --user host user (Mandatory) + --release core database version (Mandatory) +""" + +import argparse +import mysql.connector +from mysql.connector import Error + + +def fetch_transcripts(species, assembly, release, host, port, user): + gene_annotation = {} + database = f"{species}_core_{release}_{assembly}" + + sql_select = """ + SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end, + e.seq_region_start,e.seq_region_end FROM transcript t + JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id + JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id + JOIN seq_region s ON t.seq_region_id = s.seq_region_id + JOIN gene g ON g.gene_id = t.gene_id + JOIN gene_attrib ga ON g.gene_id = ga.gene_id + JOIN exon_transcript et ON t.transcript_id = et.transcript_id + JOIN exon e ON e.exon_id = et.exon_id + WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding' and ga.attrib_type_id = 4 + """ + # For human select 'MANE_Select' transcripts + if species == "homo_sapiens": + sql_select = f"{sql_select} and atr.code = 'MANE_Select'" + else: + sql_select = f"{sql_select} and atr.code = 'is_canonical'" + + # Sort results + sql_select = f"{sql_select} order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end" + + connection = mysql.connector.connect(host=host, + database=database, + user=user, + password='', + port=port) + + try: + if connection.is_connected(): + cursor = connection.cursor() + cursor.execute(sql_select) + data = cursor.fetchall() + for row in data: + strand = row[2] + if strand == 1: + strand = "+" + else: + strand = "-" + + if row[0] not in gene_annotation: + exons_start = [] + exons_end = [] + exons_start.append(str(row[5])) + exons_end.append(str(row[6])) + + gene_annotation[row[0]] = { + "chr": row[1], + "strand": strand, + "start": row[3], + "end": row[4], + "exons_start": exons_start, + "exons_end": exons_end + } + else: + gene_annotation[row[0]]["exons_start"].append(str(row[5])) + gene_annotation[row[0]]["exons_end"].append(str(row[6])) + + except Error as e: + print(f"Error while connecting to MySQL {database}", e) + finally: + if connection.is_connected(): + cursor.close() + connection.close() + + return gene_annotation + +def sanity_checks(transcripts_list): + fail_list = [] + + for gene, data in transcripts_list.items(): + check = 1 + + # transcript start > end + if data["start"] >= data["end"]: + check = 0 + + # the exon start has to be bigger than the last exon end + i = 0 + for exon_start in data["exons_start"]: + if i > 0: + last_exon_end = data["exons_end"][i - 1] + if(int(exon_start) < int(last_exon_end)): + check = 0 + + i += 1 + + if check == 0: + fail_list.append(gene) + + return fail_list + +def write_output(transcripts_list, output_file): + # Write to output file + with open(output_file, "w") as f: + f.write("#NAME\tCHROM\tSTRAND\tTX_START\tTX_END\tEXON_START\tEXON_END\n") + + for gene, data in transcripts_list.items(): + chr = data["chr"] + strand = data["strand"] + start = data["start"] + end = data["end"] + exons_start = (",").join(data["exons_start"]) + exons_end = (",").join(data["exons_end"]) + + f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start}\t{exons_end}\n") + + +def main(): + parser = argparse.ArgumentParser(description="Generate the gene annotation file for SpliceAI") + parser.add_argument("-o", "--output_file", + default="gene_annotation.txt", + help="output file (default: gene_annotation.txt)") + parser.add_argument("-sp", "--species", + default="homo_sapiens", + help="species (default: homo_sapiens)") + parser.add_argument("-a", "--assembly", + default="38", + help="species assembly (default: 38)") + parser.add_argument("-r", "--release", required=True) + parser.add_argument("--host", required=True) + parser.add_argument("--port", required=True) + parser.add_argument("--user", required=True) + args = parser.parse_args() + + output_file = args.output_file + species = args.species + assembly = args.assembly + release = args.release + host = args.host + port = args.port + user = args.user + + transcripts_list = fetch_transcripts(species, assembly, release, host, port, user) + + check = sanity_checks(transcripts_list) + + if not check: + write_output(transcripts_list, output_file) + else: + print("Sanity checks failed for the following genes: ", (", ").join(check)) + + +if __name__ == '__main__': + main() \ No newline at end of file From 5901d680f0619ebcd7710494e97d46346ae88b1b Mon Sep 17 00:00:00 2001 From: dglemos Date: Fri, 23 Aug 2024 15:49:10 +0100 Subject: [PATCH 07/12] Add comma --- scripts/python/spliceai_annotation_file.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py index 616e5f708..72ad49313 100644 --- a/scripts/python/spliceai_annotation_file.py +++ b/scripts/python/spliceai_annotation_file.py @@ -134,7 +134,7 @@ def write_output(transcripts_list, output_file): exons_start = (",").join(data["exons_start"]) exons_end = (",").join(data["exons_end"]) - f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start}\t{exons_end}\n") + f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start},\t{exons_end},\n") def main(): From 88dbd76baa2688dd649ba2c86d638f7673dd759a Mon Sep 17 00:00:00 2001 From: dglemos Date: Fri, 23 Aug 2024 16:20:43 +0100 Subject: [PATCH 08/12] Update sql query for non-human --- scripts/python/spliceai_annotation_file.py | 47 +++++++++++++--------- 1 file changed, 29 insertions(+), 18 deletions(-) diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py index 72ad49313..7112e8ddc 100644 --- a/scripts/python/spliceai_annotation_file.py +++ b/scripts/python/spliceai_annotation_file.py @@ -30,26 +30,37 @@ def fetch_transcripts(species, assembly, release, host, port, user): gene_annotation = {} database = f"{species}_core_{release}_{assembly}" - sql_select = """ - SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end, - e.seq_region_start,e.seq_region_end FROM transcript t - JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id - JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id - JOIN seq_region s ON t.seq_region_id = s.seq_region_id - JOIN gene g ON g.gene_id = t.gene_id - JOIN gene_attrib ga ON g.gene_id = ga.gene_id - JOIN exon_transcript et ON t.transcript_id = et.transcript_id - JOIN exon e ON e.exon_id = et.exon_id - WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding' and ga.attrib_type_id = 4 - """ - # For human select 'MANE_Select' transcripts + # For human we select 'MANE_Select' transcripts and the gene name is a gene attrib if species == "homo_sapiens": - sql_select = f"{sql_select} and atr.code = 'MANE_Select'" + sql_select = """ + SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end, + e.seq_region_start,e.seq_region_end FROM transcript t + JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id + JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id + JOIN seq_region s ON t.seq_region_id = s.seq_region_id + JOIN gene g ON g.gene_id = t.gene_id + JOIN gene_attrib ga ON g.gene_id = ga.gene_id + JOIN exon_transcript et ON t.transcript_id = et.transcript_id + JOIN exon e ON e.exon_id = et.exon_id + WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding' + and ga.attrib_type_id = 4 and atr.code = 'MANE_Select' + order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end + """ else: - sql_select = f"{sql_select} and atr.code = 'is_canonical'" - - # Sort results - sql_select = f"{sql_select} order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end" + # For other species we select the canonical transcripts and the gene name is in xref + sql_select = """ + SELECT xr.display_label,t.gene_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end, + e.seq_region_start,e.seq_region_end FROM transcript t + JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id + JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id + JOIN seq_region s ON t.seq_region_id = s.seq_region_id + JOIN gene g ON g.gene_id = t.gene_id + JOIN exon_transcript et ON t.transcript_id = et.transcript_id + JOIN exon e ON e.exon_id = et.exon_id + JOIN xref xr ON g.display_xref_id = xr.xref_id + WHERE t.stable_id like 'ENS%' and t.biotype = 'protein_coding' and atr.code = 'is_canonical' + order by xr.display_label,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end + """ connection = mysql.connector.connect(host=host, database=database, From 1b71d2743a4278b9d97b79486d0b33a59dcd82cb Mon Sep 17 00:00:00 2001 From: dglemos Date: Tue, 27 Aug 2024 15:25:07 +0100 Subject: [PATCH 09/12] Fix sql query --- scripts/python/spliceai_annotation_file.py | 2 +- tools/variant_simulator/simulate_variation | 45 ++++++++-------------- 2 files changed, 18 insertions(+), 29 deletions(-) diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py index 7112e8ddc..29afcb4e0 100644 --- a/scripts/python/spliceai_annotation_file.py +++ b/scripts/python/spliceai_annotation_file.py @@ -49,7 +49,7 @@ def fetch_transcripts(species, assembly, release, host, port, user): else: # For other species we select the canonical transcripts and the gene name is in xref sql_select = """ - SELECT xr.display_label,t.gene_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end, + SELECT DISTINCT g.stable_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end, e.seq_region_start,e.seq_region_end FROM transcript t JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation index 582fb76d9..421c99f50 100755 --- a/tools/variant_simulator/simulate_variation +++ b/tools/variant_simulator/simulate_variation @@ -60,7 +60,8 @@ usage() if ($help); die "Illegal arguments -exonsOnly -intronsOnly can't be used together !" if $exonOnly && $intronOnly; die "Illegal arguments -intronsOnly -codingsOnly can't be used together !" if $codingOnly && $intronOnly; -die "Illegal arguments -onlyMane can only be used alone !" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly); +die "Illegal arguments -onlyMane can only be used alone!" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly); +die "Missing arguments -spliceai has to be used with -onlyMane or -chromOnly!" if $spliceai && !$onlyMane && !$chromOnly; my $registry = 'Bio::EnsEMBL::Registry'; if (defined $registryFile){ @@ -92,7 +93,7 @@ my $biotype = 'protein_coding'; my $FHO; open ($FHO, ">$outFile") or die "Can't open file to write: $outFile, $!\n"; -print_header($FHO, $assembly, $spliceai); +print_header($FHO, $assembly, $spliceai, $species, $chromOnly); my @geneIDs; if ($chromOnly || $gene){ #if chrom or gene specified then use the most stringent @@ -126,36 +127,24 @@ sub print_header { my $FHO = shift; my $assembly = shift; my $spliceai = shift; + my $species = shift; + my $chromOnly = shift; print $FHO "##fileformat=VCFv4.2", "\n"; + if(defined $spliceai) { - my $reference = $assembly eq "grch38" ? "GRCh38/hg38" : "GRCh37/hg19"; + my $reference = $assembly; + + if($species eq "homo_sapiens" || $species eq "human") { + $reference = $assembly eq "grch38" ? "GRCh38/hg38" : "GRCh37/hg19"; + } print $FHO "##reference=$reference", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; - print $FHO "##contig=", "\n"; + + # Get chromosome size + my $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', $chromOnly); + my $length = $chr_slice->seq_region_length(); + + print $FHO "##contig=", "\n"; } else { print $FHO '##INFO=',"\n"; From 96b0d0c02ff63adc45d4baf6b99bd8e9b8a818f7 Mon Sep 17 00:00:00 2001 From: dglemos Date: Fri, 30 Aug 2024 09:25:06 +0100 Subject: [PATCH 10/12] Add option to simulate_variation --- tools/variant_simulator/simulate_variation | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation index 421c99f50..0d5f32568 100755 --- a/tools/variant_simulator/simulate_variation +++ b/tools/variant_simulator/simulate_variation @@ -34,7 +34,7 @@ use Bio::EnsEMBL::Registry; use Getopt::Long qw(GetOptions); my ($help, $chromOnly, $gene, $registryFile, -$exonOnly, $intronOnly, $codingOnly,$onlyMane,$refseq_genes,$spliceai +$exonOnly, $intronOnly, $codingOnly, $canonicalOnly, $onlyMane, $refseq_genes, $spliceai ); my ($species, $assembly, $edges, $outFile) = ("homo_sapiens", "grch38", 0, "simulated.vcf"); @@ -47,6 +47,7 @@ GetOptions( "species=s" => \$species, "output|o=s" => \$outFile, "exonsOnly" => \$exonOnly, "codingOnly" => \$codingOnly, + "canonicalOnly" => \$canonicalOnly, "onlyMane" => \$onlyMane, "intronsOnly" => \$intronOnly, "assembly=s" => \$assembly, @@ -61,7 +62,6 @@ usage() if ($help); die "Illegal arguments -exonsOnly -intronsOnly can't be used together !" if $exonOnly && $intronOnly; die "Illegal arguments -intronsOnly -codingsOnly can't be used together !" if $codingOnly && $intronOnly; die "Illegal arguments -onlyMane can only be used alone!" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly); -die "Missing arguments -spliceai has to be used with -onlyMane or -chromOnly!" if $spliceai && !$onlyMane && !$chromOnly; my $registry = 'Bio::EnsEMBL::Registry'; if (defined $registryFile){ @@ -183,6 +183,13 @@ sub processGenes{ @parts = ($transcript); } } + } elsif ($canonicalOnly) { + my @transc = @{ $geneID->get_all_Transcripts() }; + while ( my $transcript = shift @transc ) { + if($transcript->is_canonical()) { + @parts = ($transcript); + } + } } else { @parts = ($geneID); @@ -254,6 +261,7 @@ sub usage { -exonsOnly Generate SNPs only for all exons of the genes -intronsOnly Generate SNPs only for all introns of the genes -onlyMane Generate SNPs only for MANE transcripts + -canonicalOnly Generate SNPs only for canonical transcripts -edge Length of flanking region in which to generate SNPs (Default: 0) From cc8dafc5914975094c42b2c0b7173844c0d9e9ca Mon Sep 17 00:00:00 2001 From: dglemos Date: Fri, 30 Aug 2024 10:16:30 +0100 Subject: [PATCH 11/12] simulate_variation to write unique variants --- tools/variant_simulator/simulate_variation | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation index 0d5f32568..2e0606397 100755 --- a/tools/variant_simulator/simulate_variation +++ b/tools/variant_simulator/simulate_variation @@ -206,7 +206,9 @@ sub generateSNPs { my $fho = shift; my $spliceai = shift; + my %list_variants; my @bases = qw/A C T G/; + while( my $part = shift (@$elements)){ my $start = $part->seq_region_start - $edges; my $pos = $start; @@ -216,8 +218,10 @@ sub generateSNPs { $partID = $part->display_id if $intronOnly; $partID = $part->display_id; my $seq = $slice_adaptor->fetch_by_region("chromosome", $chrom, $start ,$part->seq_region_end + $edges)->seq; + my @seqAr = split("", $seq); my ($id, $qual, $filter, $info) = (".", ".", ".", "."); + for ( my $i = 0; $i < @seqAr; $i++ ) { my $ref_seq = $seqAr[$i]; my @tmp_alts = grep {$_ ne $ref_seq} @bases; @@ -226,11 +230,18 @@ sub generateSNPs { $info = 'GENE='.$gene_name.';'.'FEATURE='.$partID; } while(my $tmp_alt = shift @tmp_alts) { + my $key = $chrom."-".$pos."-".$ref_seq."-".$tmp_alt; + if(!$spliceai) { - $id = $chrom."-".$pos."-".$ref_seq."-".$tmp_alt; + $id = $key; + } + + # write to file only if variant is unique + if(!$list_variants{$key}) { + $list_variants{$key} = 1; + printf $fho "%s\t%s\t%s\t%s\t%s\t", $chrom, $pos, $id, $ref_seq, $tmp_alt; + printf $fho "%s\t%s\t%s\n", $qual, $filter, $info; } - printf $fho "%s\t%s\t%s\t%s\t%s\t", $chrom, $pos, $id, $ref_seq, $tmp_alt; - printf $fho "%s\t%s\t%s\n", $qual, $filter, $info; } } } From bc55b803976247edd63ea266d06dd52510ff9dd5 Mon Sep 17 00:00:00 2001 From: dglemos Date: Fri, 30 Aug 2024 10:38:10 +0100 Subject: [PATCH 12/12] simulate_variation fix variable --- tools/variant_simulator/simulate_variation | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation index 2e0606397..203c8bfe9 100755 --- a/tools/variant_simulator/simulate_variation +++ b/tools/variant_simulator/simulate_variation @@ -89,6 +89,7 @@ my $slice_adaptor = $registry->get_adaptor( $species, $dbtype, 'slice' ); my $gene_adaptor = $registry->get_adaptor( $species, $dbtype, "gene" ); my $biotype = 'protein_coding'; +my %list_variants; my $FHO; open ($FHO, ">$outFile") or die "Can't open file to write: $outFile, $!\n"; @@ -206,7 +207,6 @@ sub generateSNPs { my $fho = shift; my $spliceai = shift; - my %list_variants; my @bases = qw/A C T G/; while( my $part = shift (@$elements)){