-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve_top_blast_hit_for_each_sequence.pl
68 lines (49 loc) · 1.69 KB
/
retrieve_top_blast_hit_for_each_sequence.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env perl
# Retrieves top hit for each sequence (assumes they are in order in blast output).
# Usage:
# perl retrieve_top_blast_hit_for_each_sequence.pl [blast output]
# Prints to console. To print to file, use
# perl retrieve_top_blast_hit_for_each_sequence.pl [blast output]
# > [output subset of blast output table]
use strict;
use warnings;
my $blast_output = $ARGV[0]; # format: qseqid sacc stitle staxids sscinames sskingdoms qlen slen length pident qcovs evalue
my $NO_DATA = "NA";
my $NEWLINE = "\n";
my $DELIMITER = "\t";
my $TAXONID_SEPARATOR = ";"; # in blast file
# blast file
my $SEQUENCE_NAME_COLUMN = 0; # qseqid
my $MATCHED_TAXONID_COLUMN = 3; # staxids (Subject Taxonomy ID(s), separated by a ';')
my $SUPERKINGDOM_COLUMN = 5; # sskingdoms (Subject Super Kingdom(s), separated by a ';' (in alphabetical order))
my $PERCENT_ID_COLUMN = 9; # pident
my $QUERY_COVERAGE_COLUMN = 10; # qcovs
my $EVALUE_COLUMN = 11; # evalue
# verifies that input file exists and is not empty
if(!$blast_output or !-e $blast_output or -z $blast_output)
{
print STDERR "Error: blast output not provided, does not exist, or empty:\n\t"
.$blast_output."\nExiting.\n";
die;
}
# reads in blast output and extracts top blast hit for each sequence
open BLAST_OUTPUT, "<$blast_output" || die "Could not open $blast_output to read\n";
my $previous_sequence_name = "";
while(<BLAST_OUTPUT>)
{
chomp;
if($_ =~ /\S/)
{
my @items = split($DELIMITER, $_);
my $sequence_name = $items[$SEQUENCE_NAME_COLUMN];
if($sequence_name ne $previous_sequence_name)
{
print $_;
print $NEWLINE;
$previous_sequence_name = $sequence_name;
}
}
}
close BLAST_OUTPUT;
# August 30, 2020
# January 18, 2022