-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve_sequences_with_no_blast_hits.pl
148 lines (122 loc) · 4.19 KB
/
retrieve_sequences_with_no_blast_hits.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env perl
# Retrieves sequences that do not have blast results.
# Usage:
# perl extract_sequences_with_no_blast_hits.pl [blast output file]
# [fasta file that was input to blast]
# [minimum percent identity for a blast hit to be counted]
# [minimum query coverage for a blast hit to be counted]
# Prints to console. To print to file, use
# perl extract_sequences_with_no_blast_hits.pl [blast output file]
# [fasta file that was input to blast]
# [minimum percent identity for a blast hit to be counted]
# [minimum query coverage for a blast hit to be counted] > [output fasta file path]
use strict;
use warnings;
my $blast_output = $ARGV[0]; # blast output format: qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue
my $fasta_file = $ARGV[1]; # contains all sequences included in broad-database blast output--should be the fasta file used as input to broad-database blast
my $minimum_pident = $ARGV[2]; # minimum percent identity to count a blast hit
my $minimum_qcovs = $ARGV[3]; # minimum query coverage to count a blast hit
my $NEWLINE = "\n";
my $DELIMITER = "\t";
# blast file
my $SEQUENCE_NAME_COLUMN = 0; # qseqid
my $MATCHED_TAXONID_COLUMN = 3; # staxids (Subject Taxonomy ID(s), separated by a ';')
my $SUPERKINGDOM_COLUMN = 5; # sskingdoms (Subject Super Kingdom(s), separated by a ';' (in alphabetical order))
my $PERCENT_ID_COLUMN = 9; # pident
my $QUERY_COVERAGE_COLUMN = 10; # qcovs
my $EVALUE_COLUMN = 11; # evalue
# verifies that input files exist and are not empty
if(!$fasta_file or !-e $fasta_file or -z $fasta_file)
{
print STDERR "Error: fasta file not provided, does not exist, or empty:\n\t"
.$fasta_file."\nExiting.\n";
die;
}
if(!$blast_output or !-e $blast_output or -z $blast_output)
{
print STDERR "Error: blast output file not provided, does not exist, or empty:\n\t"
.$blast_output."\nExiting.\n";
die;
}
# sets thresholds to default values if not provided
if(!$minimum_pident)
{
$minimum_pident = 0;
}
if(!$minimum_qcovs)
{
$minimum_qcovs = 0;
}
# retrieves list of all sequences that were input to blast (all sequences to examine here)
my %sequence_has_no_or_poor_blast_hits = (); # key: sequence name -> value: 1 if sequence has no blast hits or only blast hits that do not pass thresholds
open FASTA, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n";
while(<FASTA>) # for each row in the file
{
chomp;
if($_ =~ /^>(.*)$/) # sequence name
{
my $sequence_name = $1;
$sequence_has_no_or_poor_blast_hits{$sequence_name} = 1;
}
}
close FASTA;
# reads in blast output: removes sequences with any matches
open BLAST_OUTPUT, "<$blast_output" || die "Could not open $blast_output to read\n";
while(<BLAST_OUTPUT>)
{
chomp;
if($_ =~ /\S/)
{
my @items = split($DELIMITER, $_);
my $sequence_name = $items[$SEQUENCE_NAME_COLUMN];
my $percent_id = $items[$PERCENT_ID_COLUMN];
my $query_coverage = $items[$QUERY_COVERAGE_COLUMN];
my $evalue = $items[$EVALUE_COLUMN];
my $matched_taxon_id = $items[$MATCHED_TAXONID_COLUMN];
# marks sequence to not be output
if($percent_id >= $minimum_pident and $query_coverage >= $minimum_qcovs)
{
$sequence_has_no_or_poor_blast_hits{$sequence_name} = 0;
}
}
}
close BLAST_OUTPUT;
# reads in fasa file; prints sequences with no hits
open FASTA, "<$fasta_file" || die "Could not open $fasta_file to read; terminating =(\n";
my $current_sequence_included = 0;
my %sequence_printed = ();
while(<FASTA>) # for each row in the file
{
chomp;
if($_ =~ /^>(.*)$/) # sequence name
{
my $sequence_name = $1;
if($sequence_has_no_or_poor_blast_hits{$sequence_name})
{
$current_sequence_included = 1;
$sequence_printed{$sequence_name} = 1;
}
else
{
$current_sequence_included = 0;
}
}
if($current_sequence_included)
{
print $_;
print $NEWLINE;
}
}
close FASTA;
# verifies that all expected sequences from blast output have been found in the fasta file
foreach my $sequence_name(keys %sequence_has_no_or_poor_blast_hits)
{
if($sequence_has_no_or_poor_blast_hits{$sequence_name}
and !$sequence_printed{$sequence_name})
{
print STDERR "Error: sequence ".$sequence_name." seen in blast output but "
."not in fasta.\n";
}
}
# September 1, 2020
# January 20, 2022