diff --git a/INSTALL.pl b/INSTALL.pl
index a92f5066b..f68138465 100755
--- a/INSTALL.pl
+++ b/INSTALL.pl
@@ -45,6 +45,7 @@ =head1 NAME
 use Net::FTP;
 use Cwd;
 use Scalar::Util qw(looks_like_number);
+use List::Util qw(first);
 use Bio::EnsEMBL::VEP::Utils qw(get_version_data get_version_string);
 
 our (
@@ -79,6 +80,8 @@ =head1 NAME
   $REALPATH_DEST_DIR,
   $NO_TEST,
   $NO_BIOPERL,
+  $USE_HTTPS_PROTO,
+  $GITHUB_TOKEN,
 
   $ua,
   $CAN_USE_CURL,
@@ -90,6 +93,7 @@ =head1 NAME
   $CAN_USE_TAR,
   $CAN_USE_DBI,
   $CAN_USE_DBD_MYSQL,
+  $CAN_USE_HTML_EXTRACT
 );
 
@@ -120,6 +124,10 @@ BEGIN
     $CAN_USE_DBD_MYSQL = 1;
   }
 
+  if(eval q{ use HTML::TableExtract; 1 }) {
+    $CAN_USE_HTML_EXTRACT = 1;
+  }
+
   $CAN_USE_CURL = 1 if `which curl` =~ /\/curl/;
   $CAN_USE_HTTP_TINY = 1 if eval q{ use HTTP::Tiny; 1 };
   $CAN_USE_ARCHIVE = 1 if eval q{ use Archive::Extract; 1 };
@@ -146,7 +154,7 @@ BEGIN
 my $git_api_root = 'https://api.github.com/repos/Ensembl/';
 my $VEP_MODULE_NAME = 'ensembl-vep';
 
-our (@store_species, @indexes, @files, $ftp, $dirname);
+our (@store_species, @indexes, @files, %file_sizes, $ftp, $dirname);
 
 my $config = {};
 
 GetOptions(
@@ -172,7 +180,9 @@ BEGIN
   'TEST',
   'NO_HTSLIB|l',
   'NO_TEST',
-  'NO_BIOPERL'
+  'NO_BIOPERL',
+  'USE_HTTPS_PROTO',
+  'GITHUBTOKEN|GITHUB_TOKEN=s',
 ) or die("ERROR: Failed to parse arguments");
 
 # Read configuration from environment variables starting with VEP_
@@ -181,8 +191,8 @@ sub read_config_from_environment {
   my $config = shift;
 
   for my $key (keys %ENV) {
-    # Look for environment variables that start with VEP_
-    next unless $key =~ "^VEP_";
+    # Look for environment variables that start with VEP_, plus GITHUB_TOKEN itself
+    next unless ($key =~ "^VEP_" or $key eq "GITHUB_TOKEN");
 
     # Avoid setting empty strings
     my $value = $ENV{$key};
@@ -219,6 +229,8 @@ sub read_config_from_environment {
 $NO_HTSLIB ||= $config->{NO_HTSLIB};
 $NO_TEST ||= $config->{NO_TEST};
 $NO_BIOPERL ||= $config->{NO_BIOPERL};
+$USE_HTTPS_PROTO ||= $config->{USE_HTTPS_PROTO};
+$GITHUB_TOKEN ||= $config->{GITHUBTOKEN};
 
 # load version data
 our $CURRENT_VERSION_DATA = get_version_data($RealBin.'/.version');
@@ -1078,6 +1090,22 @@ ()
 # CACHE FILES
 #############
 
+sub convert_file_size {
+  # convert a human-readable file size string to bytes
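+  # e.g. "4.0K" => 4096 and "1.5M" => 1572864, while a plain byte count
+  # such as "512" is returned unchanged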
+
+  my $size = shift;
+  my @units = ( 'K', 'M', 'G', 'T' );
+
+  $size =~ m/\s*(\d+(?:\.\d+)?)([TGMK]?)/;
+
+  # no unit suffix: size is in bytes already
+  return $1 unless $2;
+
+  # convert to bytes
+  my $scale = first { $units[$_] eq $2 } 0..$#units;
+  return defined $scale ? $1 * (1024 ** ($scale + 1)) : 0;
+}
+
 sub format_file_size {
   # Format $size (in bytes) based on most adequate unit, e.g.:
   # 0 => 0 bytes
@@ -1101,6 +1129,11 @@ sub format_file_size
 
 sub cache() {
 
+  if ($USE_HTTPS_PROTO and !$CAN_USE_HTML_EXTRACT) {
+    print "Cannot use HTTPS protocol for downloading cache without the HTML::TableExtract Perl library installed\nSkipping cache installation\n" unless $QUIET;
+    return;
+  }
+
   my $ok;
 
   if($AUTO) {
@@ -1152,16 +1185,38 @@ ()
   my $URL_TO_USE = (-e $tabix) ? $CACHE_URL_INDEXED : $CACHE_URL;
 
   if(is_url($URL_TO_USE)) {
-    $URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/;
-    $ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
-    $ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
-    $ftp->binary();
+    if($USE_HTTPS_PROTO) {
+      # get cache file list in HTML format
+      my $ftp_html = get_html($URL_TO_USE);
+      unless (defined $ftp_html) {
+        print "Failed to retrieve the cache file list from $URL_TO_USE\n" unless $QUIET;
+        return;
+      }
+
+      # parse HTML cache file list
+      my $te = HTML::TableExtract->new();
+      $te->parse($ftp_html);
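+
+      # each row of the index table is expected to be
+      # [icon, name, last-modified, size, description] (layout assumed from
+      # the Ensembl HTTPS directory listing), so $row->[1] and $row->[3]
+      # hold the file name and its human-readable size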
" : $file$size\n"; @@ -1313,11 +1377,21 @@ () if(is_url($URL_TO_USE)) { print " - downloading $URL_TO_USE/$file_path\n" unless $QUIET; if(!$TEST) { - $ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file); + if ($USE_HTTPS_PROTO) { + download_to_file("$URL_TO_USE/$file_path", $target_file); + } + else { + $ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file); + } my $checksums = "CHECKSUMS"; my $checksums_target_file = "$CACHE_DIR/tmp/$checksums"; - $ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file); + if ($USE_HTTPS_PROTO) { + download_to_file("$URL_TO_USE/$checksums", $checksums_target_file); + } + else { + $ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file); + } if (-e $checksums_target_file) { my $sum_download = `sum $target_file`; $sum_download =~ m/([0-9]+)(\s+)([0-9]+)/; @@ -1368,6 +1442,11 @@ () ############# sub fasta() { + if ($USE_HTTPS_PROTO and !$CAN_USE_HTTML_EXTRACT) { + print "Cannot use HTTPS protocol for downloading FASTA without HTML::TableExtract Perl library installed\nSkipping FASTA installation\n" unless $QUIET; + return; + } + ### SPECIAL CASE GRCh37 if((grep {$files[$_ - 1] =~ /GRCh37/} @indexes) || (defined($ASSEMBLY) && $ASSEMBLY eq 'GRCh37')) { @@ -1402,17 +1481,39 @@ () } my @dirs = (); - if(is_url($FASTA_URL)) { - $FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/; - $ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n"; - $ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n"; - $ftp->binary(); + if($USE_HTTPS_PROTO) { + # get species list in HTML format + my $ftp_html = get_html($FASTA_URL); + unless (defined $ftp_html) { + print "curl failed to retrieve species list from FTP - $FASTA_URL\n" unless $QUIET; + return; + } - foreach my $sub(split /\//, $3) { - $ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n"; + # parse HTML species list + my $te = HTML::TableExtract->new(); + $te->parse($ftp_html); + + foreach my $rows ($te->first_table_found->rows){ + my $name = $rows->[1]; + + next unless $name =~ /\/$/; + next if $name =~ /ancestral_alleles/; + + push @dirs, substr($name, 0, -1); + } + } + else { + $FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/; + $ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n"; + $ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n"; + $ftp->binary(); + + foreach my $sub(split /\//, $3) { + $ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n"; + } + push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls; } - push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls; } else { opendir DIR, $FASTA_URL; @@ -1461,6 +1562,25 @@ () @files = $ftp->ls; $dna_path = $ftp->pwd =~ /dna_index/ ? 
+      foreach my $row ($te->first_table_found->rows){
+        my $name = $row->[1];
+
+        next unless $name =~ /\/$/;
+        next if $name =~ /ancestral_alleles/;
+
+        push @dirs, substr($name, 0, -1);
+      }
+    }
+    else {
+      $FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/;
+      $ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n";
+      $ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n";
+      $ftp->binary();
+
+      foreach my $sub(split /\//, $3) {
+        $ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
+      }
 
-    foreach my $sub(split /\//, $3) {
-      $ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n";
-    }
-
-    push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls;
+      push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls;
+    }
   }
   else {
     opendir DIR, $FASTA_URL;
@@ -1461,6 +1562,25 @@ ()
       @files = $ftp->ls;
      $dna_path = $ftp->pwd =~ /dna_index/ ? 'dna_index' : 'dna';
     }
+    elsif($USE_HTTPS_PROTO) {
+      # get FASTA file list in HTML format
+      my $ftp_html = get_html("$FASTA_URL/$species/dna");
+      unless (defined $ftp_html) {
+        print "Failed to retrieve the FASTA file list from $FASTA_URL/$species/dna\n" unless $QUIET;
+        return;
+      }
+
+      # parse HTML FASTA file list
+      my $te = HTML::TableExtract->new();
+      $te->parse($ftp_html);
+
+      foreach my $row ($te->first_table_found->rows){
+        my $name = $row->[1];
+        next unless $name =~ /fa.gz/;
+
+        push @files, $name;
+      }
+    }
     else {
       if(!opendir DIR, "$FASTA_URL/$species/dna") {
         warn "WARNING: Could not read from directory $FASTA_URL/$species/dna\n$@\n";
@@ -1519,6 +1639,10 @@ ()
         $ftp->get($file, $ex) or download_to_file("$FASTA_URL/$species/$dna_path/$file", $ex);
       }
     }
+    elsif($USE_HTTPS_PROTO) {
+      print " - downloading $file\n" unless $QUIET;
+      download_to_file("$FASTA_URL/$species/dna/$file", $ex);
+    }
     else {
       print " - copying $file\n" unless $QUIET;
       copy("$FASTA_URL/$species/dna/$file", $ex) unless $TEST;
@@ -1552,7 +1676,12 @@ ()
       if(!$TEST) {
         $index_file =~ /$file(\..+)/;
         print " - downloading $index_file\n" unless $QUIET;
-        $ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
+        if($USE_HTTPS_PROTO) {
+          download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
+        }
+        else {
+          $ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1);
+        }
         $got_indexes++;
       }
     }
@@ -1804,7 +1933,8 @@ sub download_to_file {
   my ($url, $file) = @_;
 
   if($CAN_USE_CURL) {
-    my $response = `curl -s -o $file -w '%{http_code}' --location "$url" `;
+    my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : "";
+    my $response = `curl -s -o $file -w '%{http_code}' $header --location "$url" `;
     if ( $response != 200 && $response != 226) {
       print "curl failed ($response), trying to fetch using LWP::Simple\n" unless $QUIET;
       $CAN_USE_CURL = 0;
@@ -1858,6 +1988,52 @@ sub download_to_file {
   }
 }
 
+sub get_html {
+  my $url = shift;
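+
+  # fetch a page body over HTTPS, preferring curl, then LWP, then
+  # HTTP::Tiny; when a client fails it is disabled and the call retries
+  # itself so the next available client is used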
+
+  if($CAN_USE_CURL) {
+    my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : "";
+    my $response = `curl -s -w '%{http_code}' $header --location "$url" `;
+    # curl appends the status code (-w) after the body, so split it back off
+    my @lines = split(/\n/, $response);
+    my $status_code = pop @lines;
+    if ( $status_code != 200 && $status_code != 226) {
+      print "curl failed ($status_code), trying to fetch using LWP::Simple\n" unless $QUIET;
+      $CAN_USE_CURL = 0;
+      return get_html($url);
+    }
+
+    return join("\n", @lines);
+  }
+  elsif($CAN_USE_LWP) {
+    my $req = HTTP::Request->new(GET => $url);
+    my $response = $ua->request($req);
+
+    unless($response->is_success) {
+      print "LWP::Simple failed (" . $response->status_line . "), trying to fetch using HTTP::Tiny\n" unless $QUIET;
+      $CAN_USE_LWP = 0;
+      return get_html($url);
+    }
+
+    return $response->content;
+  }
+  elsif($CAN_USE_HTTP_TINY) {
+    my $response = HTTP::Tiny->new()->get($url);
+
+    unless($response->{success}) {
+      print("Failed last resort of using HTTP::Tiny to download $url\n");
+      return undef;
+    }
+
+    return $response->{content};
+  }
+  else {
+    print("Failed to get HTML content without curl, LWP or HTTP::Tiny installed\n");
+    return undef;
+  }
+}
+
 # unpack a tarball
 sub unpack_arch {
   my ($arch_file, $dir) = @_;
@@ -1938,6 +2114,12 @@ sub usage {
                    a remote (e.g. FTP) address. The FASTA URL/directory must have
                    gzipped FASTA files under the following structure: [species]/[dna]/
 
+--USE_HTTPS_PROTO  Download cache and FASTA files using the HTTPS protocol instead
+                   of FTP. Useful for networks where the FTP port is blocked by a
+                   firewall.
+
+--GITHUBTOKEN      Set a token to use for authentication when querying the GitHub
+                   API. Authenticated users have an increased rate limit. PLEASE
+                   NOTE: use a token with read-only access.
 END
 
 print $usage;