Ensembl · nuno-agostinho · Mar 4, 2024 · Mar 14, 2024 · Sep 5, 2024 · Sep 5, 2024
diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
@@ -459,10 +459,15 @@ sub _create_transcript {
   my $tr_record = shift;
   my $gene_record = shift;
 
-  return unless $tr_record->{_children};
+  return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS';
+
+  if (!$self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS') {
+    $self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (useful for NCBI microbe GFF/GTF annotations)");
+    return;
+  }
 
   my $id = $tr_record->{attributes}->{transcript_id} || $self->_record_get_id($tr_record);
-  $id =~ s/^(gene|transcript)://i;
+  $id =~ s/^(gene|transcript|cds)[:-]//i;
 
   my $slice = $self->get_slice($tr_record->{chr});
 
@@ -518,8 +523,18 @@ sub _create_transcript {
 
   # check for exons for protein_coding biotype
   if ($biotype eq 'protein_coding' && scalar @exons == 0){
-    $self->warning_msg("WARNING: No exons found for protein_coding transcript $id");
-    return;
+    if ($self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS'){
+      # create single-exon transcript based on a CDS without transcript parent
+      # example: NCBI microbe GFF/GTF annotations
+      push @exons, {
+        start  => $tr_record->{start},
+        end    => $tr_record->{end},
+        strand => $tr_record->{strand},
+      };
+    } else {
+      $self->warning_msg("WARNING: No exons found for protein-coding transcript $id");
+      return;
+    }
   }
 
   # sort exons

diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm
@@ -227,11 +227,11 @@ sub _record_get_biotype {
   my ($self, $record, $gene_record) = @_;
 
   if(!exists($record->{_biotype})) {
-
     # Ensembl-y GFFs have biotype as an attribute
     my $biotype = $record->{attributes}->{biotype} ||
                   $record->{attributes}->{transcript_type} ||
-                  $record->{attributes}->{transcript_biotype};
+                  $record->{attributes}->{transcript_biotype} ||
+                  ($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef);
 
     # others we need to (guess) work it out
     if(!$biotype) {

diff --git a/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm b/modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
@@ -134,7 +134,12 @@ sub _record_get_parent_ids {
   my ($self, $record) = @_;
   if(!exists($record->{_parent_id})) {
     my $type = lc($record->{type});
-    my @parent_ids = ($PARENTS{$type}) ? split(',',$record->{attributes}->{$PARENTS{$type}.'_id'}) : ();
+    my $parent_type = $PARENTS{$type};
+
+    # Avoid unassigned transcripts (fix for NCBI microbe GTFs)
+    $parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/;
+
+    my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : ();
     $record->{_parent_id} = \@parent_ids;
   }
   return $record->{_parent_id};
@@ -162,7 +167,8 @@ sub _record_get_id {
 =head2 _record_get_biotype
 
   Arg 1      : hashref $transcript_record_hash
-  Example    : $biotype = $as->_record_get_biotype($tr_record);
+  Arg 2      : hashref $gene_record_hash
+  Example    : $biotype = $as->_record_get_biotype($tr_record, $gene_record);
   Description: Get sequence ontology (SO) biotype of this record. Attempts to
                find it in the "biotype", "transcript_type" or "transcript_biotype"
                attribute fields, and if that fails default to the source field.
@@ -174,11 +180,13 @@ sub _record_get_id {
 =cut
 
 sub _record_get_biotype {
-  return
-    $_[1]->{attributes}->{transcript_biotype} ||
-    $_[1]->{attributes}->{transcript_type} ||
-    $_[1]->{attributes}->{biotype} ||
-    $_[1]->{source};
+  my ($self, $record, $gene_record) = @_;
+  my $biotype = $record->{attributes}->{transcript_biotype} ||
+                $record->{attributes}->{transcript_type} ||
+                $record->{attributes}->{biotype} ||
+                ($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef) ||
+                $record->{source};
+  return $biotype;
 }
 
 1;
diff --git a/modules/Bio/EnsEMBL/VEP/Config.pm b/modules/Bio/EnsEMBL/VEP/Config.pm
@@ -256,6 +256,7 @@ our @VEP_PARAMS = (
   'ucsc_assembly=s',         # required for phyloP, phastCons, e.g. use hg19 for GRCh37, hg38 for GRCh38
   'ucsc_data_root=s',        # replace if you have the data locally, defaults to http://hgdownload.cse.ucsc.edu/goldenpath/
   'custom_multi_allelic',    # prevents filtering of custom annotation data when comma separated lists are assumed to be allele specific
+  'cds_as_transcript_gxf',   # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI microbe annotation)
 
   # plugins
   'plugin=s@',               # specify a method in a module in the plugins directory

diff --git a/t/AnnotationSource_File_GFF.t b/t/AnnotationSource_File_GFF.t
@@ -284,7 +284,7 @@ SKIP: {
      'type' => 'mRNA',
   );
   my $trans = $as->lazy_load_transcript(\%feature_record, $feature_record{_gene_record});
-  ok($tmp =~ /No exons found for protein_coding transcript/, 'no exons warning message');
+  ok($tmp =~ /No exons found for protein-coding transcript/, 'no exons warning message');
 
   # restore STDERR
   open(STDERR, ">&SAVE") or die "Can't restore STDERR\n";