Allow https download and GITHUB_TOKEN in VEP Install #1805

Open
wants to merge 5 commits into main
230 changes: 206 additions & 24 deletions INSTALL.pl
@@ -45,6 +45,7 @@ =head1 NAME
use Net::FTP;
use Cwd;
use Scalar::Util qw(looks_like_number);
use List::Util qw(first);
use Bio::EnsEMBL::VEP::Utils qw(get_version_data get_version_string);

our (
@@ -79,6 +80,8 @@ =head1 NAME
$REALPATH_DEST_DIR,
$NO_TEST,
$NO_BIOPERL,
$USE_HTTPS_PROTO,
$GITHUB_TOKEN,
$ua,

$CAN_USE_CURL,
@@ -90,6 +93,7 @@ =head1 NAME
$CAN_USE_TAR,
$CAN_USE_DBI,
$CAN_USE_DBD_MYSQL,
$CAN_USE_HTML_EXTRACT
);


@@ -120,6 +124,10 @@ BEGIN
$CAN_USE_DBD_MYSQL = 1;
}

if(eval q{ use HTML::TableExtract; 1 }) {
$CAN_USE_HTML_EXTRACT = 1;
}

$CAN_USE_CURL = 1 if `which curl` =~ /\/curl/;
$CAN_USE_HTTP_TINY = 1 if eval q{ use HTTP::Tiny; 1 };
$CAN_USE_ARCHIVE = 1 if eval q{ use Archive::Extract; 1 };
@@ -146,7 +154,7 @@ BEGIN
my $git_api_root = 'https://api.github.com/repos/Ensembl/';
my $VEP_MODULE_NAME = 'ensembl-vep';

our (@store_species, @indexes, @files, $ftp, $dirname);
our (@store_species, @indexes, @files, %file_sizes, $ftp, $dirname);

my $config = {};
GetOptions(
@@ -172,7 +180,9 @@ BEGIN
'TEST',
'NO_HTSLIB|l',
'NO_TEST',
'NO_BIOPERL'
'NO_BIOPERL',
'USE_HTTPS_PROTO',
'GITHUBTOKEN|GITHUB_TOKEN=s',
) or die("ERROR: Failed to parse arguments");

# Read configuration from environment variables starting with VEP_
Expand All @@ -181,8 +191,8 @@ sub read_config_from_environment {
my $config = shift;

for my $key (keys %ENV) {
# Look for environment variables that start with VEP_
next unless $key =~ "^VEP_";
# Look for environment variables that start with VEP_, or the GITHUB_TOKEN variable
next unless ($key =~ "^VEP_" or $key eq "GITHUB_TOKEN");

# Avoid setting empty strings
my $value = $ENV{$key};
@@ -219,6 +229,8 @@ sub read_config_from_environment {
$NO_HTSLIB ||= $config->{NO_HTSLIB};
$NO_TEST ||= $config->{NO_TEST};
$NO_BIOPERL ||= $config->{NO_BIOPERL};
$USE_HTTPS_PROTO ||= $config->{USE_HTTPS_PROTO};
$GITHUB_TOKEN ||= $config->{GITHUBTOKEN};

# load version data
our $CURRENT_VERSION_DATA = get_version_data($RealBin.'/.version');
@@ -1078,6 +1090,22 @@ ()
# CACHE FILES
#############

sub convert_file_size {
# convert a human-readable file size string (e.g. "1.2G") to bytes

my $size = shift;
my @units = ( 'K', 'M', 'G', 'T' );

$size =~ m/\s*(\d+(?:\.\d+)?)([TGMK]?)/;

# no unit suffix: size is already in bytes
return $1 unless $2;

# convert to bytes
my $scale = first { $units[$_] eq $2 } 0..$#units;
return defined $scale ? $1 * (1024**($scale + 1)) : 0;
}
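convert_file_size() turns the human-readable sizes reported in the HTTPS directory listing into bytes so that download sizes can still be shown when no FTP connection is made. A rough illustration of the intended behaviour (a sketch, not part of the patch; the sample size strings are assumptions about how the listing formats them):

  # sizes without a suffix pass through unchanged; K/M/G/T scale by powers of 1024
  printf "%s => %.0f bytes\n", $_, convert_file_size($_) for qw(512 14M 1.2G);
  # 512  => 512 bytes
  # 14M  => 14680064 bytes
  # 1.2G => 1288490189 bytes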

sub format_file_size {
# Format $size (in bytes) based on most adequate unit, e.g.:
# 0 => 0 bytes
@@ -1101,6 +1129,11 @@ sub format_file_size {

sub cache() {

if ($USE_HTTPS_PROTO and !$CAN_USE_HTML_EXTRACT) {
print "Cannot use the HTTPS protocol to download the cache without the HTML::TableExtract Perl library installed\nSkipping cache installation\n" unless $QUIET;
return;
}

my $ok;

if($AUTO) {
@@ -1152,16 +1185,38 @@ ()
my $URL_TO_USE = (-e $tabix) ? $CACHE_URL_INDEXED : $CACHE_URL;

if(is_url($URL_TO_USE)) {
$URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();
if($USE_HTTPS_PROTO) {
# get cache file list in HTML format
my $ftp_html = get_html($URL_TO_USE);
unless (defined $ftp_html) {
print "curl failed to retrieve file list from FTP - $URL_TO_USE\n" unless $QUIET;
return;
}

# parse HTML cache file list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
foreach my $rows ($te->first_table_found->rows){
my ($name, $size) = ($rows->[1], $rows->[3]);

next unless $name =~ /tar.gz/;
push @files, $name;
$file_sizes{$name} = convert_file_size($size);
}
}
else {
$URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
}

push @files, grep {$_ =~ /tar.gz/} $ftp->ls;
push @files, grep {$_ =~ /tar.gz/} $ftp->ls;
}
}
else {
opendir DIR, $URL_TO_USE;
@@ -1193,7 +1248,16 @@ ()
my $size;
my $total_size = 0;
foreach my $file(@files) {
$size = defined $ftp ? $ftp->size($file) : 0;
if (defined $ftp) {
$size = $ftp->size($file);
}
elsif (%file_sizes) {
$size = $file_sizes{$file};
}
else {
$size = 0;
}

$total_size += $size;
$size = $size ? " (" . format_file_size($size) . ")" : "";
$species_list .= ++$num . " : $file$size\n";
@@ -1313,11 +1377,21 @@ ()
if(is_url($URL_TO_USE)) {
print " - downloading $URL_TO_USE/$file_path\n" unless $QUIET;
if(!$TEST) {
$ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file);
if ($USE_HTTPS_PROTO) {
download_to_file("$URL_TO_USE/$file_path", $target_file);
}
else {
$ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file);
}

my $checksums = "CHECKSUMS";
my $checksums_target_file = "$CACHE_DIR/tmp/$checksums";
$ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
if ($USE_HTTPS_PROTO) {
download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
}
else {
$ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file);
}
if (-e $checksums_target_file) {
my $sum_download = `sum $target_file`;
$sum_download =~ m/([0-9]+)(\s+)([0-9]+)/;
@@ -1368,6 +1442,11 @@ ()
#############
sub fasta() {

if ($USE_HTTPS_PROTO and !$CAN_USE_HTML_EXTRACT) {
print "Cannot use the HTTPS protocol to download FASTA files without the HTML::TableExtract Perl library installed\nSkipping FASTA installation\n" unless $QUIET;
return;
}

### SPECIAL CASE GRCh37
if((grep {$files[$_ - 1] =~ /GRCh37/} @indexes) || (defined($ASSEMBLY) && $ASSEMBLY eq 'GRCh37')) {

@@ -1402,17 +1481,39 @@ ()
}

my @dirs = ();

if(is_url($FASTA_URL)) {
$FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();
if($USE_HTTPS_PROTO) {
# get species list in HTML format
my $ftp_html = get_html($FASTA_URL);
unless (defined $ftp_html) {
print "curl failed to retrieve species list from FTP - $FASTA_URL\n" unless $QUIET;
return;
}

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
# parse HTML species list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $rows ($te->first_table_found->rows){
my $name = $rows->[1];

next unless $name =~ /\/$/;
next if $name =~ /ancestral_alleles/;

push @dirs, substr($name, 0, -1);
}
}
else {
$FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/;
$ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
$ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
$ftp->binary();

foreach my $sub(split /\//, $3) {
$ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
}
push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls;
}
push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls;
}
else {
opendir DIR, $FASTA_URL;
@@ -1461,6 +1562,25 @@ ()
@files = $ftp->ls;
$dna_path = $ftp->pwd =~ /dna_index/ ? 'dna_index' : 'dna';
}
elsif($USE_HTTPS_PROTO) {
# get FASTA file list in HTML format
my $ftp_html = get_html("$FASTA_URL/$species/dna");
unless (defined $ftp_html) {
print "curl failed to retrieve file list from FTP - $FASTA_URL/$species/dna\n" unless $QUIET;
return;
}

# parse HTML FASTA file list
my $te = HTML::TableExtract->new();
$te->parse($ftp_html);

foreach my $rows ($te->first_table_found->rows){
my $name = $rows->[1];
next unless $name =~ /fa.gz/;

push @files, $name;
}
}
else {
if(!opendir DIR, "$FASTA_URL/$species/dna") {
warn "WARNING: Could not read from directory $FASTA_URL/$species/dna\n$@\n";
@@ -1519,6 +1639,10 @@ ()
$ftp->get($file, $ex) or download_to_file("$FASTA_URL/$species/$dna_path/$file", $ex);
}
}
elsif($USE_HTTPS_PROTO) {
print " - downloading $file\n" unless $QUIET;
download_to_file("$FASTA_URL/$species/dna/$file", $ex);
}
else {
print " - copying $file\n" unless $QUIET;
copy("$FASTA_URL/$species/dna/$file", $ex) unless $TEST;
@@ -1552,7 +1676,12 @@ ()
if(!$TEST) {
$index_file =~ /$file(\..+)/;
print " - downloading $index_file\n" unless $QUIET;
$ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
if($USE_HTTPS_PROTO) {
download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
}
else {
$ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
}
$got_indexes++;
}
}
@@ -1804,7 +1933,8 @@ sub download_to_file {
my ($url, $file) = @_;

if($CAN_USE_CURL) {
my $response = `curl -s -o $file -w '%{http_code}' --location "$url" `;
my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : "";
my $response = `curl -s -o $file -w '%{http_code}' $header --location "$url" `;
if ( $response != 200 && $response != 226) {
print "curl failed ($response), trying to fetch using LWP::Simple\n" unless $QUIET;
$CAN_USE_CURL = 0;
@@ -1858,6 +1988,52 @@ sub download_to_file {
}
}

sub get_html {
my $url = shift;

if($CAN_USE_CURL) {
my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : "";
my $response = `curl -s -w '%{http_code}' $header --location "$url" `;
my @lines = split(/\n/, $response);
my $status_code = pop @lines;
if ( $status_code != 200 && $status_code != 226) {
print "curl failed ($status_code), trying to fetch using LWP::Simple\n" unless $QUIET;
$CAN_USE_CURL = 0;
return get_html($url);
}

return join("\n", @lines);
}

elsif($CAN_USE_LWP) {
my $req = HTTP::Request->new(GET => $url);
my $response = $ua->request($req);

unless($response->is_success) {
print "LWP::Simple failed (" . $response->status_line . "), trying to fetch using HTTP::Tiny\n" unless $QUIET;
$CAN_USE_LWP = 0;
return get_html($url);
}

return $response->content;
}
elsif($CAN_USE_HTTP_TINY) {
my $response = HTTP::Tiny->new()->get($url);

unless($response->{success}) {
print("Failed last resort of using HTTP::Tiny to download $url\n");

return undef;
}

return $response->{content};
}
else {
print("Failed to get HTML content without curl, LWP or HTTP::Tiny installed\n");
return undef;
}
}
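get_html() returns the raw HTML of a directory index page (or undef on failure), falling back from curl to LWP and finally HTTP::Tiny. cache() and fasta() then hand that HTML to HTML::TableExtract and read file names and sizes out of the first table on the page. A minimal sketch of that consumer pattern, assuming an Apache-style fancy index whose column 1 holds the file name and column 3 the size (the URL here is only an example):

  use HTML::TableExtract;

  my $listing = get_html('https://ftp.ensembl.org/pub/current_variation/indexed_vep_cache/');
  if (defined $listing) {
    my $te = HTML::TableExtract->new();
    $te->parse($listing);

    # skip pages without a parsable table
    if (my $table = $te->first_table_found) {
      foreach my $row ($table->rows) {
        my ($name, $size) = ($row->[1], $row->[3]);
        next unless defined $name && $name =~ /tar\.gz$/;
        printf "%s\t%.0f bytes\n", $name, convert_file_size($size);
      }
    }
  }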

# unpack a tarball
sub unpack_arch {
my ($arch_file, $dir) = @_;
@@ -1938,6 +2114,12 @@ sub usage {
a remote (e.g. FTP) address. The FASTA URL/directory must have
gzipped FASTA files under the following structure:
[species]/[dna]/
--USE_HTTPS_PROTO Download the cache and FASTA files using the HTTPS protocol instead of FTP.
Useful on networks where the FTP port is blocked by a firewall.

--GITHUBTOKEN Set the token to use for authentication when querying the GitHub API. Authenticated
users have an increased rate limit. PLEASE NOTE: use a token with read-only access.
END

print $usage;
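For context, a typical invocation of the installer with the two new options might look like the following (a sketch only; the --AUTO cf selection and the placeholder token are assumptions, not part of the patch):

  perl INSTALL.pl --AUTO cf --USE_HTTPS_PROTO --GITHUBTOKEN <read-only-token>

The token can also be picked up from the GITHUB_TOKEN environment variable, which the patch lets through the installer's environment-variable reader (see read_config_from_environment above).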