From 9e011a1ea1a473ce88975e04389d562cf5138509 Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Thu, 6 Apr 2023 12:14:47 +0100
Subject: [PATCH 01/12] Update SpliceAI gpu

---
 .../EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm    | 5 ++++-
 .../EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm  | 8 ++++----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm
index be5c5b2be..c6dd79f5d 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm
@@ -68,7 +68,10 @@ sub run_spliceai {
     die("Directory ($output_vcf_files_dir) doesn't exist");
   }
 
-  my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation";
+  my $tmp_dir = $main_dir . "/tmp";
+  $self->run_system_command("mkdir $tmp_dir");
+
+  my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation -B 4096 -T 256 -t $tmp_dir";
 
   # Add option to calculate masked scores
   if($masked_scores) {
diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
index 5f5cc0f48..7ea0594c2 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
@@ -60,7 +60,7 @@ sub default_options {
         output_dir                 => $self->o('main_dir') . '/output', # final output files already merged by chromosome
         fasta_file                 => $self->o('fasta_file'),
         gene_annotation            => $self->o('gene_annotation'),
-        step_size                  => 4000, # number of variants used to split the main vcf files
+        step_size                  => 200000, # number of variants used to split the main vcf files
         check_transcripts          => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts
         transcripts_from_file      => undef,
 	time_interval              => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
@@ -68,7 +68,7 @@ sub default_options {
         registry                   => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1
         output_file_name           => 'spliceai_final_scores_',
 
-        pipeline_wide_analysis_capacity => 500,
+        pipeline_wide_analysis_capacity => 80,
 
         pipeline_db => {
             -host   => $self->o('hive_db_host'),
@@ -85,7 +85,7 @@ sub resource_classes {
     my ($self) = @_;
     return {
         %{$self->SUPER::resource_classes},
-        '8Gb_8c_job'  => {'LSF' => '-n 8 -q production -R"select[mem>8000]  rusage[mem=8000]" -M8000' },
+        'gpu'  => {'LSF' => '-q gpu-a100 -gpu "num=1:gmem=80000"' }, # the queue gpu-a100 can access /hps/nobackup
         '4Gb_job'     => {'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000'},
     };
 }
@@ -151,7 +151,7 @@ sub pipeline_analyses {
         -hive_capacity => $self->o('pipeline_wide_analysis_capacity'),
         -analysis_capacity => $self->o('pipeline_wide_analysis_capacity'),
         -input_ids  => [],
-        -rc_name => '8Gb_8c_job',
+        -rc_name => 'gpu',
         -parameters => {
           'main_dir'             => $self->o('main_dir'),
           'split_vcf_input_dir'  => $self->o('split_vcf_input_dir'),

From 0f4a804ab60e658e69c344a46d6435cab5296f8f Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Thu, 6 Apr 2023 12:27:19 +0100
Subject: [PATCH 02/12] Remove unsorted files

---
 modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm  | 3 +++
 .../Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm   | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm
index 054cb7662..58ea624a3 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm
@@ -84,6 +84,9 @@ sub merge_vcf_files {
 
   # Sort final file
   $self->run_system_command("bcftools sort -o $final_file_sorted $final_file");
+
+  # Remove unsorted file
+  $self->run_system_command("rm $final_file");
 }
 
 1;
diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
index 7ea0594c2..aad0fd8f4 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
@@ -66,7 +66,7 @@ sub default_options {
 	time_interval              => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
 	masked_scores              => 1, # calculate masked scores
         registry                   => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1
-        output_file_name           => 'spliceai_final_scores_',
+        output_file_name           => 'spliceai_final_scores_' . $self->o('ensembl_release') . '_',
 
         pipeline_wide_analysis_capacity => 80,
 

From e3fca0361c6bbdf68e7fbda6129c6d3731f0d174 Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Thu, 6 Apr 2023 12:31:18 +0100
Subject: [PATCH 03/12] Compress final files

---
 modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm
index 58ea624a3..52cbdd620 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/MergeFiles.pm
@@ -84,6 +84,8 @@ sub merge_vcf_files {
 
   # Sort final file
   $self->run_system_command("bcftools sort -o $final_file_sorted $final_file");
+  $self->run_system_command("bgzip $final_file_sorted");
+  $self->run_system_command("tabix -p vcf $final_file_sorted.gz");
 
   # Remove unsorted file
   $self->run_system_command("rm $final_file");

From 3625319e491418bb8a474c76e88209f289c25641 Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Thu, 27 Apr 2023 11:59:02 +0100
Subject: [PATCH 04/12] Support compressed files as input

---
 .../Pipeline/SpliceAI/SpliceAI_conf.pm        | 24 +++++++----
 .../Variation/Pipeline/SpliceAI/SplitFiles.pm | 42 ++++++++++++-------
 2 files changed, 44 insertions(+), 22 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
index aad0fd8f4..728a85903 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
@@ -54,7 +54,7 @@ sub default_options {
         pipeline_name              => 'spliceai_scores',
         main_dir                   => $self->o('main_dir'), # main directory where all files and directories are going to be stored
         input_directory            => $self->o('main_dir') . '/input_vcf_files', # input files
-	split_vcf_no_header_dir    => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers)
+        split_vcf_no_header_dir    => $self->o('main_dir') . '/split_vcf_no_header', # contains the input files after being splitted (files without headers)
         split_vcf_input_dir        => $self->o('main_dir') . '/split_vcf_input', # contains the splitted input vcf files with headers, these are the files used to run SpliceAI
         split_vcf_output_dir       => $self->o('main_dir') . '/split_vcf_output', # temporary output files, still splitted
         output_dir                 => $self->o('main_dir') . '/output', # final output files already merged by chromosome
@@ -63,8 +63,8 @@ sub default_options {
         step_size                  => 200000, # number of variants used to split the main vcf files
         check_transcripts          => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts
         transcripts_from_file      => undef,
-	time_interval              => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
-	masked_scores              => 1, # calculate masked scores
+        time_interval              => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
+        masked_scores              => 1, # calculate masked scores
         registry                   => undef, # database where new MANE transcripts are going to be checked; only used if check_transcripts = 1
         output_file_name           => 'spliceai_final_scores_' . $self->o('ensembl_release') . '_',
 
@@ -85,8 +85,18 @@ sub resource_classes {
     my ($self) = @_;
     return {
         %{$self->SUPER::resource_classes},
-        'gpu'  => {'LSF' => '-q gpu-a100 -gpu "num=1:gmem=80000"' }, # the queue gpu-a100 can access /hps/nobackup
-        '4Gb_job'     => {'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000'},
+        'gpu'      => {
+                        'LSF'   => '-q gpu-a100 -gpu "num=1:gmem=80000"', # the queue gpu-a100 can access /hps/nobackup
+                        'SLURM' => '--time=1:30:00 --gres=gpu:1' # GPU memory doesn’t need to be specified
+                      },
+        '4Gb_job'  => {
+                        'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000',
+                        'SLURM' => "--partition=standard --time=4:00:00 --mem=4G"
+                      },
+         'default' => {
+                        'LSF' => '-R"select[mem>1000] rusage[mem=1000]" -M1000',
+                        'SLURM' => "--partition=standard --time=1:00:00 --mem=1G"
+                      }
     };
 }
 
@@ -99,11 +109,11 @@ sub pipeline_analyses {
           -input_ids  => [{}],
           -parameters => {
             'input_directory' => $self->o('input_directory'),
-            'inputcmd'        => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf" -printf "%f\n"',
+            'inputcmd'        => 'find #input_directory# -type f -name "all_snps_ensembl_*.vcf.gz" -printf "%f\n"',
           },
           -flow_into  => {
             '2->A' => {'split_files' => {'vcf_file' => '#_0#'}},
-            'A->1' => ['get_chr_dir'],
+            'A->1' => ['get_chr_dir']
           },
       },
       { -logic_name => 'split_files',
diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm
index 1a89d0be1..e7ea461b8 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm
@@ -64,7 +64,7 @@ sub run {
 sub set_chr_from_filename {
   my $self = shift;
   my $vcf_file = $self->param_required('vcf_file');
-  $vcf_file =~ /.*_chr(.*)\.vcf$/;
+  $vcf_file =~ /.*_chr(.*)\.vcf.gz$/;
   my $chr = $1;
   if (!$chr) {
     die("Could not get chromosome name from file name ($vcf_file).");
@@ -82,8 +82,19 @@ sub split_vcf_file {
   my $split_vcf_output_dir = $self->param_required('split_vcf_output_dir');
   my $step_size = $self->param_required('step_size');
 
+  # If we are checking the transcripts then the input file is a VCF
+  # Remove .gz
+  if($check) {
+    $vcf_file =~ s/vcf.gz/vcf/;
+  }
+
   my $vcf_file_path = $input_dir . '/' . $vcf_file;
 
+  if(!$check && $vcf_file =~ /vcf.gz/) {
+    $self->run_system_command("gunzip $vcf_file_path");
+    $vcf_file_path =~ s/vcf.gz/vcf/;
+  }
+
   if (! -d $input_dir) {
     die("Directory ($input_dir) doesn't exist");
   }
@@ -173,8 +184,13 @@ sub check_split_vcf_file {
   }
 
   # prepare input vcf file for tabix
-  $self->run_system_command("bgzip $vcf_file_path");
-  $self->run_system_command("tabix -p vcf $vcf_file_path.gz");
+  if($vcf_file !~ /vcf.gz/) {
+    $self->run_system_command("bgzip $vcf_file_path");
+    $self->run_system_command("tabix -p vcf $vcf_file_path.gz");
+  }
+  else {
+    $self->run_system_command("tabix -p vcf $vcf_file_path");
+  }
 
   # Create directory to store the input file only with the variants of interest (overlapping new transcripts)
   my $vcf_file_path_subset = $main_dir . '/input_vcf_files_subset';
@@ -184,7 +200,10 @@ sub check_split_vcf_file {
 
   $self->param('input_dir_subset', $vcf_file_path_subset);
 
-  open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file) or die $!;
+  my $vcf_file_sub = $vcf_file;
+  $vcf_file_sub =~ s/.gz//;
+
+  open(my $write, '>', $vcf_file_path_subset . '/' . $vcf_file_sub) or die $!;
 
   my $positions_of_interest = $transcripts->{$chr};
   foreach my $position (@$positions_of_interest) {
@@ -194,8 +213,8 @@ sub check_split_vcf_file {
 
     my $pos_string = sprintf("%s:%i-%i", $chr, $transcript_start, $transcript_end);
 
-    (open my $read, "tabix  " . $input_dir . "/" . $vcf_file . ".gz $pos_string |")
-      || die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . ".gz \n" ;
+    (open my $read, "tabix  " . $input_dir . "/" . $vcf_file . " $pos_string |")
+      or die "Failed to read from input vcf file " . $input_dir . "/" . $vcf_file . " \n" ;
 
     while (my $row = <$read>) {
       chomp $row;
@@ -212,8 +231,8 @@ sub check_split_vcf_file {
   close($write);
 
   # Sort new vcf file
-  my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file;
-  my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file;
+  my $vcf_file_subset = $vcf_file_path_subset . '/' . $vcf_file_sub;
+  my $vcf_file_subset_sorted = $vcf_file_path_subset . '/sorted_' . $vcf_file_sub;
   $self->run_system_command("sort -o $vcf_file_subset_sorted -k1,1 -k2,2n $vcf_file_subset");
   $self->run_system_command("mv $vcf_file_subset_sorted $vcf_file_subset");
 }
@@ -291,11 +310,4 @@ sub get_new_transcripts_file {
   $self->param('transcripts', \%new_transcripts);
 }
 
-sub count_lines {
-  my $self = shift;
-  my $filename = shift;
-
-  
-}
-
 1;

From 42595529ba67e93f0c9735fa94e2eca226f6134c Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Fri, 10 May 2024 14:28:00 +0100
Subject: [PATCH 05/12] Update spliceai

---
 .../Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm  | 4 +++-
 .../EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm    | 6 +++---
 .../Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm   | 3 +++
 3 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm
index c6dd79f5d..fd0aa12b6 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/RunSpliceAI.pm
@@ -69,7 +69,9 @@ sub run_spliceai {
   }
 
   my $tmp_dir = $main_dir . "/tmp";
-  $self->run_system_command("mkdir $tmp_dir");
+
+  # activate conda env
+  $self->run_system_command("source activate spliceai");
 
   my $cmd = "spliceai -I $vcf_input_dir_chr/$vcf_file -O $output_vcf_files_dir/$vcf_file -R $fasta_file -A $gene_annotation -B 4096 -T 256 -t $tmp_dir";
 
diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
index 728a85903..7efd09777 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SpliceAI_conf.pm
@@ -60,7 +60,7 @@ sub default_options {
         output_dir                 => $self->o('main_dir') . '/output', # final output files already merged by chromosome
         fasta_file                 => $self->o('fasta_file'),
         gene_annotation            => $self->o('gene_annotation'),
-        step_size                  => 200000, # number of variants used to split the main vcf files
+        step_size                  => 500000, # number of variants used to split the main vcf files
         check_transcripts          => 0, # if set to 1 checks which are the new MANE Select transcripts for the last months and only calculates SpliceAI scores for these variants overlapping these transcripts
         transcripts_from_file      => undef,
         time_interval              => 4, # checks which transcripts were updated/created in the last 4 months; only used if check_transcripts = 1 and we want to check the new transcripts in the core db
@@ -87,7 +87,7 @@ sub resource_classes {
         %{$self->SUPER::resource_classes},
         'gpu'      => {
                         'LSF'   => '-q gpu-a100 -gpu "num=1:gmem=80000"', # the queue gpu-a100 can access /hps/nobackup
-                        'SLURM' => '--time=1:30:00 --gres=gpu:1' # GPU memory doesn’t need to be specified
+                        'SLURM' => '--time=5:00:00 --gres=gpu:a100:1 --mem=24G'
                       },
         '4Gb_job'  => {
                         'LSF' => '-q production -R"select[mem>4000] rusage[mem=4000]" -M4000',
@@ -95,7 +95,7 @@ sub resource_classes {
                       },
          'default' => {
                         'LSF' => '-R"select[mem>1000] rusage[mem=1000]" -M1000',
-                        'SLURM' => "--partition=standard --time=1:00:00 --mem=1G"
+                        'SLURM' => "--partition=standard --time=1:00:00 --mem=2G"
                       }
     };
 }
diff --git a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm
index e7ea461b8..a67be4b5f 100644
--- a/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm
+++ b/modules/Bio/EnsEMBL/Variation/Pipeline/SpliceAI/SplitFiles.pm
@@ -59,6 +59,9 @@ sub run {
     $self->check_split_vcf_file();
   }
   $self->split_vcf_file();
+
+  # create tmp dir where spliceai tmp files will be stored
+  $self->create_dir($self->param_required('main_dir') . '/tmp');
 }
 
 sub set_chr_from_filename {

From 02b579c023ddf8e130fba9132ee008e7fea84013 Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Fri, 23 Aug 2024 15:46:06 +0100
Subject: [PATCH 06/12] Add script to generate gene annotation file for
 spliceai

---
 scripts/python/spliceai_annotation_file.py | 176 +++++++++++++++++++++
 1 file changed, 176 insertions(+)
 create mode 100644 scripts/python/spliceai_annotation_file.py

diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py
new file mode 100644
index 000000000..616e5f708
--- /dev/null
+++ b/scripts/python/spliceai_annotation_file.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+
+"""
+    Script to generate the gene annotation file for SpliceAI.
+    This file contains data about the transcript that is being used for each gene.
+    SpliceAI only annotates variants overlapping these transcripts.
+
+    Template provided by SpliceAI: https://github.com/Illumina/SpliceAI/blob/master/spliceai/annotations/grch38.txt
+
+    Gene annotation file format:
+        #NAME   CHROM   STRAND  TX_START    TX_END  EXON_START  EXON_END
+        KRTAP27-1   21  -   30337013    30337694    30337013,   30337694,
+
+    Options:
+            --output_file   gene annotation output file (Optional. Default: gene_annotation.txt)
+            --species       species name                (Optional. Default: homo_sapiens)
+            --assembly      assembly version            (Optional. Default: 38)
+            --host          core database host          (Mandatory)
+            --port          host port                   (Mandatory)
+            --user          host user                   (Mandatory)
+            --release       core database version       (Mandatory)
+"""
+
+import argparse
+import mysql.connector
+from mysql.connector import Error
+
+
+def fetch_transcripts(species, assembly, release, host, port, user):
+    gene_annotation = {}
+    database = f"{species}_core_{release}_{assembly}"
+
+    sql_select = """
+                    SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
+                    e.seq_region_start,e.seq_region_end FROM transcript t
+                    JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
+                    JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
+                    JOIN seq_region s ON t.seq_region_id = s.seq_region_id
+                    JOIN gene g ON g.gene_id = t.gene_id
+                    JOIN gene_attrib ga ON g.gene_id = ga.gene_id
+                    JOIN exon_transcript et ON t.transcript_id = et.transcript_id
+                    JOIN exon e ON e.exon_id = et.exon_id
+                    WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding' and ga.attrib_type_id = 4
+                """
+    # For human select 'MANE_Select' transcripts
+    if species == "homo_sapiens":
+        sql_select = f"{sql_select} and atr.code = 'MANE_Select'"
+    else:
+        sql_select = f"{sql_select} and atr.code = 'is_canonical'"
+
+    # Sort results
+    sql_select = f"{sql_select} order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end"
+
+    connection = mysql.connector.connect(host=host,
+                                         database=database,
+                                         user=user,
+                                         password='',
+                                         port=port)
+
+    try:
+        if connection.is_connected():
+            cursor = connection.cursor()
+            cursor.execute(sql_select)
+            data = cursor.fetchall()
+            for row in data:
+                strand = row[2]
+                if strand == 1:
+                    strand = "+"
+                else:
+                    strand = "-"
+
+                if row[0] not in gene_annotation:
+                    exons_start = []
+                    exons_end = []
+                    exons_start.append(str(row[5]))
+                    exons_end.append(str(row[6]))
+
+                    gene_annotation[row[0]] = {
+                        "chr": row[1],
+                        "strand": strand,
+                        "start": row[3],
+                        "end": row[4],
+                        "exons_start": exons_start,
+                        "exons_end": exons_end
+                    }
+                else:
+                    gene_annotation[row[0]]["exons_start"].append(str(row[5]))
+                    gene_annotation[row[0]]["exons_end"].append(str(row[6]))
+
+    except Error as e:
+        print(f"Error while connecting to MySQL {database}", e)
+    finally:
+        if connection.is_connected():
+            cursor.close()
+            connection.close()
+
+    return gene_annotation
+
+def sanity_checks(transcripts_list):
+    fail_list = []
+
+    for gene, data in transcripts_list.items():
+        check = 1
+
+        # transcript start > end
+        if data["start"] >= data["end"]:
+            check = 0
+        
+        # the exon start has to be bigger than the last exon end
+        i = 0
+        for exon_start in data["exons_start"]:
+            if i > 0:
+                last_exon_end = data["exons_end"][i - 1]
+                if(int(exon_start) < int(last_exon_end)):
+                    check = 0
+
+            i += 1
+
+        if check == 0:
+            fail_list.append(gene)
+
+    return fail_list
+
+def write_output(transcripts_list, output_file):
+    # Write to output file
+    with open(output_file, "w") as f:
+        f.write("#NAME\tCHROM\tSTRAND\tTX_START\tTX_END\tEXON_START\tEXON_END\n")
+
+        for gene, data in transcripts_list.items():
+            chr = data["chr"]
+            strand = data["strand"]
+            start = data["start"]
+            end = data["end"]
+            exons_start = (",").join(data["exons_start"])
+            exons_end = (",").join(data["exons_end"])
+
+            f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start}\t{exons_end}\n")
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Generate the gene annotation file for SpliceAI")
+    parser.add_argument("-o", "--output_file",
+                        default="gene_annotation.txt",
+                        help="output file (default: gene_annotation.txt)")
+    parser.add_argument("-sp", "--species",
+                        default="homo_sapiens",
+                        help="species (default: homo_sapiens)")
+    parser.add_argument("-a", "--assembly",
+                        default="38",
+                        help="species assembly (default: 38)")
+    parser.add_argument("-r", "--release", required=True)
+    parser.add_argument("--host", required=True)
+    parser.add_argument("--port", required=True)
+    parser.add_argument("--user", required=True)
+    args = parser.parse_args()
+
+    output_file = args.output_file
+    species = args.species
+    assembly = args.assembly
+    release = args.release
+    host = args.host
+    port = args.port
+    user = args.user
+
+    transcripts_list = fetch_transcripts(species, assembly, release, host, port, user)
+
+    check = sanity_checks(transcripts_list)
+
+    if not check:
+        write_output(transcripts_list, output_file)
+    else:
+        print("Sanity checks failed for the following genes: ", (", ").join(check))
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file

From 5901d680f0619ebcd7710494e97d46346ae88b1b Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Fri, 23 Aug 2024 15:49:10 +0100
Subject: [PATCH 07/12] Add comma

---
 scripts/python/spliceai_annotation_file.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py
index 616e5f708..72ad49313 100644
--- a/scripts/python/spliceai_annotation_file.py
+++ b/scripts/python/spliceai_annotation_file.py
@@ -134,7 +134,7 @@ def write_output(transcripts_list, output_file):
             exons_start = (",").join(data["exons_start"])
             exons_end = (",").join(data["exons_end"])
 
-            f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start}\t{exons_end}\n")
+            f.write(f"{gene}\t{chr}\t{strand}\t{start}\t{end}\t{exons_start},\t{exons_end},\n")
 
 
 def main():

From 88dbd76baa2688dd649ba2c86d638f7673dd759a Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Fri, 23 Aug 2024 16:20:43 +0100
Subject: [PATCH 08/12] Update sql query for non-human

---
 scripts/python/spliceai_annotation_file.py | 47 +++++++++++++---------
 1 file changed, 29 insertions(+), 18 deletions(-)

diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py
index 72ad49313..7112e8ddc 100644
--- a/scripts/python/spliceai_annotation_file.py
+++ b/scripts/python/spliceai_annotation_file.py
@@ -30,26 +30,37 @@ def fetch_transcripts(species, assembly, release, host, port, user):
     gene_annotation = {}
     database = f"{species}_core_{release}_{assembly}"
 
-    sql_select = """
-                    SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
-                    e.seq_region_start,e.seq_region_end FROM transcript t
-                    JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
-                    JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
-                    JOIN seq_region s ON t.seq_region_id = s.seq_region_id
-                    JOIN gene g ON g.gene_id = t.gene_id
-                    JOIN gene_attrib ga ON g.gene_id = ga.gene_id
-                    JOIN exon_transcript et ON t.transcript_id = et.transcript_id
-                    JOIN exon e ON e.exon_id = et.exon_id
-                    WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding' and ga.attrib_type_id = 4
-                """
-    # For human select 'MANE_Select' transcripts
+    # For human we select 'MANE_Select' transcripts and the gene name is a gene attrib
     if species == "homo_sapiens":
-        sql_select = f"{sql_select} and atr.code = 'MANE_Select'"
+        sql_select = """
+                        SELECT ga.value,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
+                        e.seq_region_start,e.seq_region_end FROM transcript t
+                        JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
+                        JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
+                        JOIN seq_region s ON t.seq_region_id = s.seq_region_id
+                        JOIN gene g ON g.gene_id = t.gene_id
+                        JOIN gene_attrib ga ON g.gene_id = ga.gene_id
+                        JOIN exon_transcript et ON t.transcript_id = et.transcript_id
+                        JOIN exon e ON e.exon_id = et.exon_id
+                        WHERE t.stable_id like 'ENST%' and t.biotype = 'protein_coding'
+                        and ga.attrib_type_id = 4 and atr.code = 'MANE_Select'
+                        order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end
+                """
     else:
-        sql_select = f"{sql_select} and atr.code = 'is_canonical'"
-
-    # Sort results
-    sql_select = f"{sql_select} order by ga.value,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end"
+        # For other species we select the canonical transcripts and the gene name is in xref
+        sql_select = """
+                        SELECT xr.display_label,t.gene_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
+                        e.seq_region_start,e.seq_region_end FROM transcript t
+                        JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
+                        JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
+                        JOIN seq_region s ON t.seq_region_id = s.seq_region_id
+                        JOIN gene g ON g.gene_id = t.gene_id
+                        JOIN exon_transcript et ON t.transcript_id = et.transcript_id
+                        JOIN exon e ON e.exon_id = et.exon_id
+                        JOIN xref xr ON g.display_xref_id = xr.xref_id
+                        WHERE t.stable_id like 'ENS%' and t.biotype = 'protein_coding' and atr.code = 'is_canonical'
+                        order by xr.display_label,s.name,t.seq_region_start,t.seq_region_end,e.seq_region_start,e.seq_region_end
+                     """
 
     connection = mysql.connector.connect(host=host,
                                          database=database,

From 1b71d2743a4278b9d97b79486d0b33a59dcd82cb Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Tue, 27 Aug 2024 15:25:07 +0100
Subject: [PATCH 09/12] Fix sql query

---
 scripts/python/spliceai_annotation_file.py |  2 +-
 tools/variant_simulator/simulate_variation | 45 ++++++++--------------
 2 files changed, 18 insertions(+), 29 deletions(-)

diff --git a/scripts/python/spliceai_annotation_file.py b/scripts/python/spliceai_annotation_file.py
index 7112e8ddc..29afcb4e0 100644
--- a/scripts/python/spliceai_annotation_file.py
+++ b/scripts/python/spliceai_annotation_file.py
@@ -49,7 +49,7 @@ def fetch_transcripts(species, assembly, release, host, port, user):
     else:
         # For other species we select the canonical transcripts and the gene name is in xref
         sql_select = """
-                        SELECT xr.display_label,t.gene_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
+                        SELECT DISTINCT g.stable_id,s.name,t.seq_region_strand,t.seq_region_start,t.seq_region_end,
                         e.seq_region_start,e.seq_region_end FROM transcript t
                         JOIN transcript_attrib ta ON t.transcript_id = ta.transcript_id
                         JOIN attrib_type atr ON ta.attrib_type_id = atr.attrib_type_id
diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation
index 582fb76d9..421c99f50 100755
--- a/tools/variant_simulator/simulate_variation
+++ b/tools/variant_simulator/simulate_variation
@@ -60,7 +60,8 @@ usage() if ($help);
 
 die "Illegal arguments -exonsOnly -intronsOnly can't be used together !" if $exonOnly && $intronOnly;
 die "Illegal arguments -intronsOnly -codingsOnly can't be used together !" if $codingOnly && $intronOnly;
-die "Illegal arguments -onlyMane can only be used alone !" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly);
+die "Illegal arguments -onlyMane can only be used alone!" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly);
+die "Missing arguments -spliceai has to be used with -onlyMane or -chromOnly!" if $spliceai && !$onlyMane && !$chromOnly;
 
 my $registry = 'Bio::EnsEMBL::Registry';
 if (defined $registryFile){
@@ -92,7 +93,7 @@ my $biotype = 'protein_coding';
 my $FHO;
 open ($FHO, ">$outFile") or die "Can't open file to write: $outFile, $!\n";
 
-print_header($FHO, $assembly, $spliceai);
+print_header($FHO, $assembly, $spliceai, $species, $chromOnly);
 
 my @geneIDs;
 if ($chromOnly || $gene){ #if chrom or gene specified then use the most stringent
@@ -126,36 +127,24 @@ sub print_header {
   my $FHO = shift;
   my $assembly = shift;
   my $spliceai = shift;
+  my $species = shift;
+  my $chromOnly = shift;
 
   print $FHO "##fileformat=VCFv4.2", "\n";
+
   if(defined $spliceai) {
-    my $reference = $assembly eq "grch38" ? "GRCh38/hg38" : "GRCh37/hg19";
+    my $reference = $assembly;
+
+    if($species eq "homo_sapiens" || $species eq "human") {
+      $reference = $assembly eq "grch38" ? "GRCh38/hg38" : "GRCh37/hg19";
+    }
     print $FHO "##reference=$reference", "\n";
-    print $FHO "##contig=<ID=1,length=248956422>", "\n";
-    print $FHO "##contig=<ID=2,length=242193529>", "\n";
-    print $FHO "##contig=<ID=3,length=198295559>", "\n";
-    print $FHO "##contig=<ID=4,length=190214555>", "\n";
-    print $FHO "##contig=<ID=5,length=181538259>", "\n";
-    print $FHO "##contig=<ID=6,length=170805979>", "\n";
-    print $FHO "##contig=<ID=7,length=159345973>", "\n";
-    print $FHO "##contig=<ID=8,length=145138636>", "\n";
-    print $FHO "##contig=<ID=9,length=138394717>", "\n";
-    print $FHO "##contig=<ID=10,length=133797422>", "\n";
-    print $FHO "##contig=<ID=11,length=135086622>", "\n";
-    print $FHO "##contig=<ID=12,length=133275309>", "\n";
-    print $FHO "##contig=<ID=13,length=114364328>", "\n";
-    print $FHO "##contig=<ID=14,length=107043718>", "\n";
-    print $FHO "##contig=<ID=15,length=101991189>", "\n";
-    print $FHO "##contig=<ID=16,length=90338345>", "\n";
-    print $FHO "##contig=<ID=17,length=83257441>", "\n";
-    print $FHO "##contig=<ID=18,length=80373285>", "\n";
-    print $FHO "##contig=<ID=19,length=58617616>", "\n";
-    print $FHO "##contig=<ID=20,length=64444167>", "\n";
-    print $FHO "##contig=<ID=21,length=46709983>", "\n";
-    print $FHO "##contig=<ID=22,length=50818468>", "\n";
-    print $FHO "##contig=<ID=X,length=156040895>", "\n";
-    print $FHO "##contig=<ID=Y,length=57227415>", "\n";
-    print $FHO "##contig=<ID=MT,length=16569>", "\n";
+
+    # Get chromosome size
+    my $chr_slice = $slice_adaptor->fetch_by_region( 'chromosome', $chromOnly);
+    my $length = $chr_slice->seq_region_length();
+
+    print $FHO "##contig=<ID=$chromOnly,length=$length>", "\n";
   }
   else {
     print $FHO '##INFO=<ID=GENE,Number=1,Type=String,Description="Gene symbol or Gene stable id">',"\n";

From 96b0d0c02ff63adc45d4baf6b99bd8e9b8a818f7 Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Fri, 30 Aug 2024 09:25:06 +0100
Subject: [PATCH 10/12] Add option to simulate_variation

---
 tools/variant_simulator/simulate_variation | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation
index 421c99f50..0d5f32568 100755
--- a/tools/variant_simulator/simulate_variation
+++ b/tools/variant_simulator/simulate_variation
@@ -34,7 +34,7 @@ use Bio::EnsEMBL::Registry;
 use Getopt::Long qw(GetOptions);
 
 my ($help, $chromOnly, $gene, $registryFile,
-$exonOnly, $intronOnly, $codingOnly,$onlyMane,$refseq_genes,$spliceai
+$exonOnly, $intronOnly, $codingOnly, $canonicalOnly, $onlyMane, $refseq_genes, $spliceai
 );
 my ($species, $assembly, $edges, $outFile) = ("homo_sapiens", "grch38", 0, "simulated.vcf");
 
@@ -47,6 +47,7 @@ GetOptions( "species=s" => \$species,
     "output|o=s"  => \$outFile,
     "exonsOnly"   => \$exonOnly,
     "codingOnly"  => \$codingOnly,
+    "canonicalOnly" => \$canonicalOnly,
     "onlyMane"    => \$onlyMane,
     "intronsOnly" => \$intronOnly,
     "assembly=s"  => \$assembly,
@@ -61,7 +62,6 @@ usage() if ($help);
 die "Illegal arguments -exonsOnly -intronsOnly can't be used together !" if $exonOnly && $intronOnly;
 die "Illegal arguments -intronsOnly -codingsOnly can't be used together !" if $codingOnly && $intronOnly;
 die "Illegal arguments -onlyMane can only be used alone!" if $onlyMane && ($exonOnly || $codingOnly || $intronOnly);
-die "Missing arguments -spliceai has to be used with -onlyMane or -chromOnly!" if $spliceai && !$onlyMane && !$chromOnly;
 
 my $registry = 'Bio::EnsEMBL::Registry';
 if (defined $registryFile){
@@ -183,6 +183,13 @@ sub processGenes{
           @parts = ($transcript);
         }
       }
+    } elsif ($canonicalOnly) {
+      my @transc = @{ $geneID->get_all_Transcripts() };
+      while ( my $transcript = shift @transc ) {
+        if($transcript->is_canonical()) {
+          @parts = ($transcript);
+        }
+      }
     }
       else {
       @parts = ($geneID);
@@ -254,6 +261,7 @@ sub usage {
       -exonsOnly         Generate SNPs only for all exons of the genes
       -intronsOnly       Generate SNPs only for all introns of the genes
       -onlyMane          Generate SNPs only for MANE transcripts
+      -canonicalOnly     Generate SNPs only for canonical transcripts
 
       -edge              Length of flanking region in which to generate SNPs (Default: 0)
 

From cc8dafc5914975094c42b2c0b7173844c0d9e9ca Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Fri, 30 Aug 2024 10:16:30 +0100
Subject: [PATCH 11/12] simulate_variation to write unique variants

---
 tools/variant_simulator/simulate_variation | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation
index 0d5f32568..2e0606397 100755
--- a/tools/variant_simulator/simulate_variation
+++ b/tools/variant_simulator/simulate_variation
@@ -206,7 +206,9 @@ sub generateSNPs {
   my $fho = shift;
   my $spliceai = shift;
 
+  my %list_variants;
   my @bases = qw/A C T G/;
+
   while( my $part = shift (@$elements)){
     my $start = $part->seq_region_start - $edges;
     my $pos = $start;
@@ -216,8 +218,10 @@ sub generateSNPs {
     $partID = $part->display_id if $intronOnly;
     $partID = $part->display_id;
     my $seq = $slice_adaptor->fetch_by_region("chromosome", $chrom, $start ,$part->seq_region_end + $edges)->seq;
+
     my @seqAr = split("", $seq);
     my ($id, $qual, $filter, $info) = (".", ".", ".", ".");
+
     for ( my $i = 0; $i < @seqAr; $i++ ) {
          my $ref_seq = $seqAr[$i];
          my @tmp_alts = grep {$_ ne $ref_seq} @bases;
@@ -226,11 +230,18 @@ sub generateSNPs {
            $info = 'GENE='.$gene_name.';'.'FEATURE='.$partID;
          }
          while(my $tmp_alt = shift @tmp_alts) {
+           my $key = $chrom."-".$pos."-".$ref_seq."-".$tmp_alt;
+
            if(!$spliceai) {
-             $id = $chrom."-".$pos."-".$ref_seq."-".$tmp_alt;
+             $id = $key;
+           }
+
+           # write to file only if variant is unique
+           if(!$list_variants{$key}) {
+            $list_variants{$key} = 1;
+            printf $fho "%s\t%s\t%s\t%s\t%s\t", $chrom, $pos, $id, $ref_seq, $tmp_alt;
+            printf $fho "%s\t%s\t%s\n", $qual, $filter, $info;
            }
-           printf $fho "%s\t%s\t%s\t%s\t%s\t", $chrom, $pos, $id, $ref_seq, $tmp_alt;
-           printf $fho "%s\t%s\t%s\n", $qual, $filter, $info;
         }
     }
   }

From bc55b803976247edd63ea266d06dd52510ff9dd5 Mon Sep 17 00:00:00 2001
From: dglemos <dlemos@ebi.ac.uk>
Date: Fri, 30 Aug 2024 10:38:10 +0100
Subject: [PATCH 12/12] simulate_variation fix variable

---
 tools/variant_simulator/simulate_variation | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/variant_simulator/simulate_variation b/tools/variant_simulator/simulate_variation
index 2e0606397..203c8bfe9 100755
--- a/tools/variant_simulator/simulate_variation
+++ b/tools/variant_simulator/simulate_variation
@@ -89,6 +89,7 @@ my $slice_adaptor = $registry->get_adaptor( $species, $dbtype, 'slice' );
 my $gene_adaptor = $registry->get_adaptor( $species, $dbtype, "gene" );
 
 my $biotype = 'protein_coding';
+my %list_variants;
 
 my $FHO;
 open ($FHO, ">$outFile") or die "Can't open file to write: $outFile, $!\n";
@@ -206,7 +207,6 @@ sub generateSNPs {
   my $fho = shift;
   my $spliceai = shift;
 
-  my %list_variants;
   my @bases = qw/A C T G/;
 
   while( my $part = shift (@$elements)){