-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve_taxonids_from_accession_numbers.pl
88 lines (68 loc) · 2.57 KB
/
retrieve_taxonids_from_accession_numbers.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
#!/usr/bin/env perl
# Given a list of accession numbers, one per line, retrieves corresponding taxon ids.
# Outputs tab-separating map with sequence name and taxon id.
# Install Entrez before running. Use either of these two commands:
# sh -c "$(curl -fsSL ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)"
# sh -c "$(wget -q ftp://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh -O -)"
# More info: https://www.ncbi.nlm.nih.gov/books/NBK179288/
# Usage:
# perl retrieve_taxonids_from_accession_numbers.pl
# [path of file with list of accession numbers, one per line] [database (nuccore by default)]
# Prints to console. To print to file, use
# perl retrieve_taxonids_from_accession_numbers.pl
# [path of file with list of accession numbers, one per line]
# [database (nuccore by default)] > [output mapping table path]
use strict;
use warnings;
my $accession_numbers_file = $ARGV[0]; # list of accession numbers, one per line
my $database = $ARGV[1]; # nuccore (default) or protein or other
if(!$database)
{
$database = "nuccore";
}
my $NO_DATA = "NA";
my $NEWLINE = "\n";
my $DELIMITER = "\t";
# generates accession number to taxon id mapping
# (D90600.1 becomes D90600; rows printed out of order)
my $sacc_to_taxon_id_string = `cat $accession_numbers_file | epost -db $database | esummary | xtract -pattern DocumentSummary -element Caption,TaxId`;
# reads in accession number to taxon id mapping
my %accession_number_without_version_to_taxon_id = (); # key: accession number -> value: taxon id
foreach my $line(split($NEWLINE, $sacc_to_taxon_id_string))
{
if($line =~ /\S/)
{
my @items = split($DELIMITER, $line);
my $accession_number = $items[0];
my $taxonid = $items[1];
$accession_number_without_version_to_taxon_id{$accession_number} = $taxonid;
}
}
# retrieves original accession numbers in their original order
open ACCESSION_NUMBERS, "<$accession_numbers_file" || die "Could not open $accession_numbers_file to read\n";
while(<ACCESSION_NUMBERS>)
{
chomp;
if($_ =~ /\S/)
{
# retrieves accession number without version
my $accession_number = $_;
my $accession_number_without_version = $accession_number;
if($accession_number_without_version =~ /^(.*)[.]\d+/)
{
$accession_number_without_version = $1;
}
# retrieves taxon id
my $taxonid = $accession_number_without_version_to_taxon_id{$accession_number_without_version};
# prints accession number and txaon id
print $accession_number.$DELIMITER;
if(defined $taxonid)
{
print $taxonid;
}
print $NEWLINE;
}
}
close ACCESSION_NUMBERS;
# December 1, 2022
# December 27, 2022