From d8377c61f56bf6d4b39306751b7f148bb2d49185 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Wed, 27 Nov 2024 18:37:40 +0000 Subject: [PATCH 1/8] Allow using https to list and download file --- INSTALL.pl | 212 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 191 insertions(+), 21 deletions(-) diff --git a/INSTALL.pl b/INSTALL.pl index a92f5066b..2e3f97558 100755 --- a/INSTALL.pl +++ b/INSTALL.pl @@ -45,6 +45,7 @@ =head1 NAME use Net::FTP; use Cwd; use Scalar::Util qw(looks_like_number); +use List::Util qw(first); use Bio::EnsEMBL::VEP::Utils qw(get_version_data get_version_string); our ( @@ -79,6 +80,7 @@ =head1 NAME $REALPATH_DEST_DIR, $NO_TEST, $NO_BIOPERL, + $USE_HTTPS_PROTO, $ua, $CAN_USE_CURL, @@ -90,6 +92,7 @@ =head1 NAME $CAN_USE_TAR, $CAN_USE_DBI, $CAN_USE_DBD_MYSQL, + $CAN_USE_HTTML_EXTRACT ); @@ -120,6 +123,10 @@ BEGIN $CAN_USE_DBD_MYSQL = 1; } + if(eval q{ use HTML::TableExtract; 1 }) { + $CAN_USE_HTTML_EXTRACT = 1; + } + $CAN_USE_CURL = 1 if `which curl` =~ /\/curl/; $CAN_USE_HTTP_TINY = 1 if eval q{ use HTTP::Tiny; 1 }; $CAN_USE_ARCHIVE = 1 if eval q{ use Archive::Extract; 1 }; @@ -146,7 +153,7 @@ BEGIN my $git_api_root = 'https://api.github.com/repos/Ensembl/'; my $VEP_MODULE_NAME = 'ensembl-vep'; -our (@store_species, @indexes, @files, $ftp, $dirname); +our (@store_species, @indexes, @files, %file_sizes, $ftp, $dirname); my $config = {}; GetOptions( @@ -172,7 +179,8 @@ BEGIN 'TEST', 'NO_HTSLIB|l', 'NO_TEST', - 'NO_BIOPERL' + 'NO_BIOPERL', + 'USE_HTTPS_PROTO' ) or die("ERROR: Failed to parse arguments"); # Read configuration from environment variables starting with VEP_ @@ -219,6 +227,7 @@ sub read_config_from_environment { $NO_HTSLIB ||= $config->{NO_HTSLIB}; $NO_TEST ||= $config->{NO_TEST}; $NO_BIOPERL ||= $config->{NO_BIOPERL}; +$USE_HTTPS_PROTO ||= $config->{USE_HTTPS_PROTO}; # load version data our $CURRENT_VERSION_DATA = get_version_data($RealBin.'/.version'); @@ -1078,6 +1087,22 @@ () # CACHE FILES ############# +sub convert_file_size { + # convert file size string to byte + + my $size = shift; + my @units = ( 'K', 'M', 'G', 'T' ); + + $size =~ m/\s?(\d+\.?\d+)([T,G,M,K]?)/; + + # in bytes already + return $1 unless defined $2; + + # convert to bytes + my $scale = first {$units[$_] eq $2} 0..$#units; + return $scale ? $1 * (1024**($scale + 1)) : 0; +} + sub format_file_size { # Format $size (in bytes) based on most adequate unit, e.g.: # 0 => 0 bytes @@ -1101,6 +1126,11 @@ sub format_file_size { sub cache() { + if ($USE_HTTPS_PROTO and !$CAN_USE_HTTML_EXTRACT) { + print "Cannot use HTTPS protocol for downloading cache without HTML::TableExtract Perl library installed\nSkipping cache installation\n" unless $QUIET; + return; + } + my $ok; if($AUTO) { @@ -1152,16 +1182,38 @@ () my $URL_TO_USE = (-e $tabix) ? $CACHE_URL_INDEXED : $CACHE_URL; if(is_url($URL_TO_USE)) { - $URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/; - $ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n"; - $ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n"; - $ftp->binary(); + if($USE_HTTPS_PROTO) { + # get cache file list in HTML format + my $ftp_html = get_html($URL_TO_USE); + unless (defined $ftp_html) { + print "curl failed to retrieve file list from FTP - $URL_TO_USE\n" unless $QUIET; + return; + } + + # parse HTML cache file list + my $te = HTML::TableExtract->new(); + $te->parse($ftp_html); + + foreach my $rows ($te->first_table_found->rows){ + my ($name, $size) = ($rows->[1], $rows->[3]); - foreach my $sub(split /\//, $3) { - $ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n"; + next unless $name =~ /tar.gz/; + push @files, $name; + $file_sizes{$name} = convert_file_size($size); + } } + else { + $URL_TO_USE =~ m/(.*:\/\/)?(.+?)\/(.+)/; + $ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n"; + $ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n"; + $ftp->binary(); - push @files, grep {$_ =~ /tar.gz/} $ftp->ls; + foreach my $sub(split /\//, $3) { + $ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n"; + } + + push @files, grep {$_ =~ /tar.gz/} $ftp->ls; + } } else { opendir DIR, $URL_TO_USE; @@ -1193,7 +1245,16 @@ () my $size; my $total_size = 0; foreach my $file(@files) { - $size = defined $ftp ? $ftp->size($file) : 0; + if (defined $ftp) { + $size = $ftp->size($file); + } + elsif (%file_sizes) { + $size = $file_sizes{$file}; + } + else { + $size = 0; + } + $total_size += $size; $size = $size ? " (" . format_file_size($size) . ")" : ""; $species_list .= ++$num . " : $file$size\n"; @@ -1313,11 +1374,21 @@ () if(is_url($URL_TO_USE)) { print " - downloading $URL_TO_USE/$file_path\n" unless $QUIET; if(!$TEST) { - $ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file); + if ($USE_HTTPS_PROTO) { + download_to_file("$URL_TO_USE/$file_path", $target_file); + } + else { + $ftp->get($file_name, $target_file) or download_to_file("$URL_TO_USE/$file_path", $target_file); + } my $checksums = "CHECKSUMS"; my $checksums_target_file = "$CACHE_DIR/tmp/$checksums"; - $ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file); + if ($USE_HTTPS_PROTO) { + download_to_file("$URL_TO_USE/$checksums", $checksums_target_file); + } + else { + $ftp->get($checksums, $checksums_target_file) or download_to_file("$URL_TO_USE/$checksums", $checksums_target_file); + } if (-e $checksums_target_file) { my $sum_download = `sum $target_file`; $sum_download =~ m/([0-9]+)(\s+)([0-9]+)/; @@ -1368,6 +1439,11 @@ () ############# sub fasta() { + if ($USE_HTTPS_PROTO and !$CAN_USE_HTTML_EXTRACT) { + print "Cannot use HTTPS protocol for downloading FASTA without HTML::TableExtract Perl library installed\nSkipping FASTA installation\n" unless $QUIET; + return; + } + ### SPECIAL CASE GRCh37 if((grep {$files[$_ - 1] =~ /GRCh37/} @indexes) || (defined($ASSEMBLY) && $ASSEMBLY eq 'GRCh37')) { @@ -1402,17 +1478,39 @@ () } my @dirs = (); - if(is_url($FASTA_URL)) { - $FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/; - $ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n"; - $ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n"; - $ftp->binary(); + if($USE_HTTPS_PROTO) { + # get species list in HTML format + my $ftp_html = get_html($FASTA_URL); + unless (defined $ftp_html) { + print "curl failed to retrieve species list from FTP - $FASTA_URL\n" unless $QUIET; + return; + } + + # parse HTML cache file list + my $te = HTML::TableExtract->new(); + $te->parse($ftp_html); + + foreach my $rows ($te->first_table_found->rows){ + my $name = $rows->[1]; + + next unless $name =~ /\/$/; + next if $name =~ /ancestral_alleles/; - foreach my $sub(split /\//, $3) { - $ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n"; + push @dirs, substr($name, 0, -1); + } + } + else { + $FASTA_URL =~ m/(.*:\/\/)?(.+?)\/(.+)/; + $ftp = Net::FTP->new($2, Passive => 1) or die "ERROR: Could not connect to FTP host $2\n$@\n"; + $ftp->login($FTP_USER) or die "ERROR: Could not login as $FTP_USER\n$@\n"; + $ftp->binary(); + + foreach my $sub(split /\//, $3) { + $ftp->cwd($sub) or die "ERROR: Could not change directory to $sub\n$@\n"; + } + push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls; } - push @dirs, grep { !/ancestral_alleles/i } sort $ftp->ls; } else { opendir DIR, $FASTA_URL; @@ -1461,6 +1559,25 @@ () @files = $ftp->ls; $dna_path = $ftp->pwd =~ /dna_index/ ? 'dna_index' : 'dna'; } + elsif($USE_HTTPS_PROTO) { + # get fasta file list in HTML format + my $ftp_html = get_html("$FASTA_URL/$species/dna"); + unless (defined $ftp_html) { + print "curl failed to retrieve file list from FTP - $FASTA_URL/$species/dna\n" unless $QUIET; + return; + } + + # parse HTML cache file list + my $te = HTML::TableExtract->new(); + $te->parse($ftp_html); + + foreach my $rows ($te->first_table_found->rows){ + my $name = $rows->[1]; + next unless $name =~ /fa.gz/; + + push @files, $name; + } + } else { if(!opendir DIR, "$FASTA_URL/$species/dna") { warn "WARNING: Could not read from directory $FASTA_URL/$species/dna\n$@\n"; @@ -1519,6 +1636,9 @@ () $ftp->get($file, $ex) or download_to_file("$FASTA_URL/$species/$dna_path/$file", $ex); } } + elsif($USE_HTTPS_PROTO) { + download_to_file("$FASTA_URL/$species/dna/$file", $ex); + } else { print " - copying $file\n" unless $QUIET; copy("$FASTA_URL/$species/dna/$file", $ex) unless $TEST; @@ -1552,7 +1672,12 @@ () if(!$TEST) { $index_file =~ /$file(\..+)/; print " - downloading $index_file\n" unless $QUIET; - $ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1); + if($USE_HTTPS_PROTO) { + download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1); + } + else { + $ftp->get($index_file, $ex.$1) or download_to_file("$FASTA_URL/$species/$dna_path/$index_file", $ex.$1); + } $got_indexes++; } } @@ -1858,6 +1983,51 @@ sub download_to_file { } } +sub get_html { + my $url = shift; + + if($CAN_USE_CURL) { + my $response = `curl -s -w '%{http_code}' --location "$url" `; + my @lines = split(/\n/, $response); + my $status_code = pop @lines; + if ( $status_code != 200 && $status_code != 226) { + print "curl failed ($response), trying to fetch using LWP::Simple\n" unless $QUIET; + $CAN_USE_CURL = 0; + get_html($url); + } + + return join("\n", @lines); + } + + elsif($CAN_USE_LWP) { + my $req = HTTP::Request->new(GET => $url); + my $response = $ua->request($req); + + unless($response->is_success) { + print "LWP::Simple failed ($response), trying to fetch using HTTP::Tiny\n" unless $QUIET; + $CAN_USE_LWP = 0; + get_html($url); + } + + return $response->content; + } + elsif($CAN_USE_HTTP_TINY) { + my $response = HTTP::Tiny->new()->get($url); + + unless($response->{success}) { + print("Failed last resort of using HTTP::Tiny to download $url\n"); + + return undef; + } + + return $response->{content}; + } + else { + print("Failed to get HTML content without curl, LWP or HTTP::Tiny installed\n"); + return undef; + } +} + # unpack a tarball sub unpack_arch { my ($arch_file, $dir) = @_; From 4d2ba24c8856157a14a29a0ed07c766e5d95f693 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Wed, 27 Nov 2024 18:39:27 +0000 Subject: [PATCH 2/8] Add log --- INSTALL.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/INSTALL.pl b/INSTALL.pl index 2e3f97558..dd6f5d1d8 100755 --- a/INSTALL.pl +++ b/INSTALL.pl @@ -1637,6 +1637,7 @@ () } } elsif($USE_HTTPS_PROTO) { + print " - downloading $file\n" unless $QUIET; download_to_file("$FASTA_URL/$species/dna/$file", $ex); } else { From b5a259d713ed9d6cf2f01d51a6103f7d12c318e0 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Wed, 27 Nov 2024 18:51:34 +0000 Subject: [PATCH 3/8] Allow github token --- INSTALL.pl | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/INSTALL.pl b/INSTALL.pl index dd6f5d1d8..62a790fec 100755 --- a/INSTALL.pl +++ b/INSTALL.pl @@ -81,6 +81,7 @@ =head1 NAME $NO_TEST, $NO_BIOPERL, $USE_HTTPS_PROTO, + $GITHUB_TOKEN, $ua, $CAN_USE_CURL, @@ -180,7 +181,8 @@ BEGIN 'NO_HTSLIB|l', 'NO_TEST', 'NO_BIOPERL', - 'USE_HTTPS_PROTO' + 'USE_HTTPS_PROTO', + 'GITHUBTOKEN|GITHUB_TOKEN=s', ) or die("ERROR: Failed to parse arguments"); # Read configuration from environment variables starting with VEP_ @@ -189,8 +191,8 @@ sub read_config_from_environment { my $config = shift; for my $key (keys %ENV) { - # Look for environment variables that start with VEP_ - next unless $key =~ "^VEP_"; + # Look for environment variables that start with VEP_ unless it is GITHUB_TOKEN + next unless ($key =~ "^VEP_" or $key eq "GITHUB_TOKEN"); # Avoid setting empty strings my $value = $ENV{$key}; @@ -228,6 +230,7 @@ sub read_config_from_environment { $NO_TEST ||= $config->{NO_TEST}; $NO_BIOPERL ||= $config->{NO_BIOPERL}; $USE_HTTPS_PROTO ||= $config->{USE_HTTPS_PROTO}; +$GITHUB_TOKEN ||= $config->{GITHUBTOKEN}; # load version data our $CURRENT_VERSION_DATA = get_version_data($RealBin.'/.version'); @@ -1930,7 +1933,8 @@ sub download_to_file { my ($url, $file) = @_; if($CAN_USE_CURL) { - my $response = `curl -s -o $file -w '%{http_code}' --location "$url" `; + my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : ""; + my $response = `curl -s -o $file -w '%{http_code}' $header --location "$url" `; if ( $response != 200 && $response != 226) { print "curl failed ($response), trying to fetch using LWP::Simple\n" unless $QUIET; $CAN_USE_CURL = 0; @@ -1988,7 +1992,8 @@ sub get_html { my $url = shift; if($CAN_USE_CURL) { - my $response = `curl -s -w '%{http_code}' --location "$url" `; + my $header = defined $GITHUB_TOKEN ? "--header \"Authorization: Bearer $GITHUB_TOKEN\"" : ""; + my $response = `curl -s -w '%{http_code}' $header --location "$url" `; my @lines = split(/\n/, $response); my $status_code = pop @lines; if ( $status_code != 200 && $status_code != 226) { From 1e1cd979531de1daf8e062bdfbdd04044aab5ce8 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Thu, 28 Nov 2024 09:44:19 +0000 Subject: [PATCH 4/8] Fix comment --- INSTALL.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/INSTALL.pl b/INSTALL.pl index 62a790fec..ef692727a 100755 --- a/INSTALL.pl +++ b/INSTALL.pl @@ -1490,7 +1490,7 @@ () return; } - # parse HTML cache file list + # parse HTML species list my $te = HTML::TableExtract->new(); $te->parse($ftp_html); @@ -1563,14 +1563,14 @@ () $dna_path = $ftp->pwd =~ /dna_index/ ? 'dna_index' : 'dna'; } elsif($USE_HTTPS_PROTO) { - # get fasta file list in HTML format + # get FASTA file list in HTML format my $ftp_html = get_html("$FASTA_URL/$species/dna"); unless (defined $ftp_html) { print "curl failed to retrieve file list from FTP - $FASTA_URL/$species/dna\n" unless $QUIET; return; } - # parse HTML cache file list + # parse HTML FASTA file list my $te = HTML::TableExtract->new(); $te->parse($ftp_html); From 6dd26d42050c1a186f6290483984b0b70f1b6410 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Thu, 12 Dec 2024 11:04:30 +0000 Subject: [PATCH 5/8] Update usage --- INSTALL.pl | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/INSTALL.pl b/INSTALL.pl index ef692727a..f68138465 100755 --- a/INSTALL.pl +++ b/INSTALL.pl @@ -2114,6 +2114,12 @@ sub usage { a remote (e.g. FTP) address. The FASTA URL/directory must have gzipped FASTA files under the following structure: [species]/[dna]/ +--USE_HTTPS_PROTO Download cache and FASTA file using HTTPs protocol instead of FTP. + Useful for networks where FTP port is blocked by firewall. + + +--GITHUBTOKEN Set token to use for authentication when querying GitHub API. Authenticated + user have increased rate-limit. PLEASE NOTE: use token with read-only access. END print $usage; From 143ea74b0268c83a30087ea1880499e13f5f5b96 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Tue, 7 Jan 2025 09:32:52 +0000 Subject: [PATCH 6/8] Update usage text --- INSTALL.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.pl b/INSTALL.pl index f68138465..c301904b4 100755 --- a/INSTALL.pl +++ b/INSTALL.pl @@ -2119,7 +2119,7 @@ sub usage { --GITHUBTOKEN Set token to use for authentication when querying GitHub API. Authenticated - user have increased rate-limit. PLEASE NOTE: use token with read-only access. + user have increased rate-limit. NOTE: use token with read-only access. END print $usage; From 3131a002856c3b02920f1f4d204bdd1f8e0d3c4c Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Tue, 7 Jan 2025 09:35:56 +0000 Subject: [PATCH 7/8] Fix the scale calculation --- INSTALL.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.pl b/INSTALL.pl index c301904b4..60e779a71 100755 --- a/INSTALL.pl +++ b/INSTALL.pl @@ -1103,7 +1103,7 @@ sub convert_file_size { # convert to bytes my $scale = first {$units[$_] eq $2} 0..$#units; - return $scale ? $1 * (1024**($scale + 1)) : 0; + return defined $scale ? $1 * (1024**($scale + 1)) : 0; } sub format_file_size { From 97655316f2369ddef6152d8725878ed2d746f9d0 Mon Sep 17 00:00:00 2001 From: Syed Nakib Hossain Date: Tue, 7 Jan 2025 09:39:53 +0000 Subject: [PATCH 8/8] Remove redundant comma in regex --- INSTALL.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/INSTALL.pl b/INSTALL.pl index 60e779a71..c059324af 100755 --- a/INSTALL.pl +++ b/INSTALL.pl @@ -1096,7 +1096,7 @@ sub convert_file_size { my $size = shift; my @units = ( 'K', 'M', 'G', 'T' ); - $size =~ m/\s?(\d+\.?\d+)([T,G,M,K]?)/; + $size =~ m/\s?(\d+\.?\d+)([TGMK]?)/; # in bytes already return $1 unless defined $2;