Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support NCBI microbe GTF/GFF with no transcripts (CDS only) #1627

Open
wants to merge 7 commits into
base: postreleasefix/113
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions modules/Bio/EnsEMBL/VEP/AnnotationSource/File/BaseGXF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -459,10 +459,15 @@ sub _create_transcript {
my $tr_record = shift;
my $gene_record = shift;

return unless $tr_record->{_children};
return unless $tr_record->{_children} or $tr_record->{type} eq 'CDS';

if (!$self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS') {
$self->warning_msg("WARNING: $self->{file} contains CDS features whose parents are genes (instead of transcripts); use --cds_as_transcript_gxf to parse protein-coding CDS as single-exon transcripts (useful for NCBI microbe GFF/GTF annotations)");
return;
}

my $id = $tr_record->{attributes}->{transcript_id} || $self->_record_get_id($tr_record);
$id =~ s/^(gene|transcript)://i;
$id =~ s/^(gene|transcript|cds)[:-]//i;

my $slice = $self->get_slice($tr_record->{chr});

Expand Down Expand Up @@ -518,8 +523,18 @@ sub _create_transcript {

# check for exons for protein_coding biotype
if ($biotype eq 'protein_coding' && scalar @exons == 0){
$self->warning_msg("WARNING: No exons found for protein_coding transcript $id");
return;
if ($self->param('cds_as_transcript_gxf') && $tr_record->{type} eq 'CDS'){
# create single-exon transcript based on a CDS without transcript parent
# example: NCBI microbe GFF/GTF annotations
push @exons, {
start => $tr_record->{start},
end => $tr_record->{end},
strand => $tr_record->{strand},
};
} else {
$self->warning_msg("WARNING: No exons found for protein-coding transcript $id");
return;
}
}

# sort exons
Expand Down
4 changes: 2 additions & 2 deletions modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GFF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -227,11 +227,11 @@ sub _record_get_biotype {
my ($self, $record, $gene_record) = @_;

if(!exists($record->{_biotype})) {

# Ensembl-y GFFs have biotype as an attribute
my $biotype = $record->{attributes}->{biotype} ||
$record->{attributes}->{transcript_type} ||
$record->{attributes}->{transcript_biotype};
$record->{attributes}->{transcript_biotype} ||
($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef);

# others we need to (guess) work it out
if(!$biotype) {
Expand Down
22 changes: 15 additions & 7 deletions modules/Bio/EnsEMBL/VEP/AnnotationSource/File/GTF.pm
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,12 @@ sub _record_get_parent_ids {
my ($self, $record) = @_;
if(!exists($record->{_parent_id})) {
my $type = lc($record->{type});
my @parent_ids = ($PARENTS{$type}) ? split(',',$record->{attributes}->{$PARENTS{$type}.'_id'}) : ();
my $parent_type = $PARENTS{$type};

# Avoid unassigned transcripts (fix for NCBI microbe GTFs)
$parent_type = 'gene' if $parent_type && $record->{attributes}->{$parent_type.'_id'} =~ /^unassigned/;

my @parent_ids = ($parent_type) ? split(',',$record->{attributes}->{$parent_type.'_id'}) : ();
$record->{_parent_id} = \@parent_ids;
}
return $record->{_parent_id};
Expand Down Expand Up @@ -162,7 +167,8 @@ sub _record_get_id {
=head2 _record_get_biotype

Arg 1 : hashref $transcript_record_hash
Example : $biotype = $as->_record_get_biotype($tr_record);
Arg 2 : hashref $gene_record_hash
Example : $biotype = $as->_record_get_biotype($tr_record, $gene_record);
Description: Get sequence ontology (SO) biotype of this record. Attempts to
find it in the "biotype", "transcript_type" or "transcript_biotype"
attribute fields, and if that fails default to the source field.
Expand All @@ -174,11 +180,13 @@ sub _record_get_id {
=cut

sub _record_get_biotype {
return
$_[1]->{attributes}->{transcript_biotype} ||
$_[1]->{attributes}->{transcript_type} ||
$_[1]->{attributes}->{biotype} ||
$_[1]->{source};
my ($self, $record, $gene_record) = @_;
my $biotype = $record->{attributes}->{transcript_biotype} ||
$record->{attributes}->{transcript_type} ||
$record->{attributes}->{biotype} ||
($self->param('cds_as_transcript_gxf') ? $gene_record->{attributes}->{gene_biotype} : undef) ||
$record->{source};
return $biotype;
}

1;
1 change: 1 addition & 0 deletions modules/Bio/EnsEMBL/VEP/Config.pm
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,7 @@ our @VEP_PARAMS = (
'ucsc_assembly=s', # required for phyloP, phastCons, e.g. use hg19 for GRCh37, hg38 for GRCh38
'ucsc_data_root=s', # replace if you have the data locally, defaults to http://hgdownload.cse.ucsc.edu/goldenpath/
'custom_multi_allelic', # prevents filtering of custom annotation data when comma separated lists are assumed to be allele specific
'cds_as_transcript_gxf', # parse CDS with gene parents as single-exon transcripts on GTF/GFF3 files (useful for NCBI microbe annotation)

# plugins
'plugin=s@', # specify a method in a module in the plugins directory
Expand Down
2 changes: 1 addition & 1 deletion t/AnnotationSource_File_GFF.t
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ SKIP: {
'type' => 'mRNA',
);
my $trans = $as->lazy_load_transcript(\%feature_record, $feature_record{_gene_record});
ok($tmp =~ /No exons found for protein_coding transcript/, 'no exons warning message');
ok($tmp =~ /No exons found for protein-coding transcript/, 'no exons warning message');

# restore STDERR
open(STDERR, ">&SAVE") or die "Can't restore STDERR\n";
Expand Down