Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve portability of reproducible tarballs by replacing external tar command with tarfile module #4660

Open
wants to merge 10 commits into
base: 5.0.x
Choose a base branch
from
Open
112 changes: 81 additions & 31 deletions easybuild/tools/filetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,15 @@
import inspect
import itertools
import os
import pathlib
import platform
import re
import shutil
import signal
import stat
import ssl
import sys
import tarfile
import tempfile
import time
import zlib
Expand Down Expand Up @@ -1402,18 +1404,20 @@ def get_local_dirs_purged():
return new_dir


def find_extension(filename):
def find_extension(filename, required=True):
"""Find best match for filename extension."""
# sort by length, so longest file extensions get preference
suffixes = sorted(EXTRACT_CMDS.keys(), key=len, reverse=True)
pat = r'(?P<ext>%s)$' % '|'.join([s.replace('.', '\\.') for s in suffixes])
res = re.search(pat, filename, flags=re.IGNORECASE)

if res:
ext = res.group('ext')
else:
return res.group('ext')

if required:
raise EasyBuildError("%s has unknown file extension", filename)

return ext
return None


def extract_cmd(filepath, overwrite=False):
Expand Down Expand Up @@ -2644,7 +2648,7 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
"""
Downloads a git repository, at a specific tag or commit, recursively or not, and make an archive with it

:param filename: name of the archive to save the code to (must be .tar.gz)
:param filename: name of the archive to save the code to (must be extensionless)
:param target_dir: target directory where to save the archive to
:param git_config: dictionary containing url, repo_name, recursive, and one of tag or commit
"""
Expand Down Expand Up @@ -2680,8 +2684,10 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
if not url:
raise EasyBuildError("url not specified in git_config parameter")

if not filename.endswith('.tar.gz'):
raise EasyBuildError("git_config currently only supports filename ending in .tar.gz")
file_ext = find_extension(filename, required=False)
if file_ext:
print_warning(f"Ignoring extension of filename '{filename}' set in git_config parameter")
filename = filename[:-len(file_ext)]

# prepare target directory and clone repository
mkdir(target_dir, parents=True)
Expand Down Expand Up @@ -2768,37 +2774,81 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
run_shell_cmd(cmd, work_dir=work_dir, hidden=True, verbose_dry_run=True)

# Create archive
archive_path = os.path.join(target_dir, filename)

if keep_git_dir:
# create archive of git repo including .git directory
tar_cmd = ['tar', 'cfvz', archive_path, repo_name]
else:
# create reproducible archive
# see https://reproducible-builds.org/docs/archives/
tar_cmd = [
# print names of all files and folders excluding .git directory
'find', repo_name, '-name ".git"', '-prune', '-o', '-print0',
# reset access and modification timestamps to epoch 0 (equivalent to --mtime in GNU tar)
'-exec', 'touch', '--date=@0', '{}', r'\;',
# reset file permissions of cloned repo (equivalent to --mode in GNU tar)
'-exec', 'chmod', '"go+u,go-w"', '{}', r'\;', '|',
# sort file list (equivalent to --sort in GNU tar)
'LC_ALL=C', 'sort', '--zero-terminated', '|',
# create tarball in GNU format with ownership and permissions reset
'tar', '--create', '--no-recursion', '--owner=0', '--group=0', '--numeric-owner',
'--format=gnu', '--null', '--files-from', '-', '|',
# compress tarball with gzip without original file name and timestamp
'gzip', '--no-name', '>', archive_path
]
run_shell_cmd(' '.join(tar_cmd), work_dir=tmpdir, hidden=True, verbose_dry_run=True)
repo_path = os.path.join(tmpdir, repo_name)
archive_path = make_archive(repo_path, archive_name=filename, archive_dir=target_dir, reproducible=not keep_git_dir)

# cleanup (repo_name dir does not exist in dry run mode)
remove(tmpdir)

return archive_path


def make_archive(dir_path, archive_name=None, archive_dir=None, reproducible=False):
"""
Create a compressed tar archive in XZ format.

:dir_path: string with path to directory to be archived
:archive_name: string with extensionless filename of archive
:archive_dir: string with path to directory to place the archive
:reproducuble: make a tarball that is reproducible accross systems
see https://reproducible-builds.org/docs/archives/

Archive is compressed with LZMA into a .xz because that is compatible with
a reproducible archive. Other formats like .gz are not reproducible due to
arbitrary strings and timestamps getting added into their metadata.
"""
def reproducible_filter(tarinfo):
"Filter out system-dependent data from tarball"
# contents of '.git' subdir are inherently system dependent
if "/.git/" in tarinfo.name or tarinfo.name.endswith("/.git"):
return None
# set timestamp to epoch 0
tarinfo.mtime = 0
# reset file permissions by applying go+u,go-w
user_mode = tarinfo.mode & stat.S_IRWXU
group_mode = (user_mode >> 3) & ~stat.S_IWGRP # user mode without write
other_mode = group_mode >> 3 # user mode without write
tarinfo.mode = (tarinfo.mode & ~0o77) | group_mode | other_mode
# reset ownership numeric UID/GID 0
tarinfo.uid = tarinfo.gid = 0
tarinfo.uname = tarinfo.gname = ""
return tarinfo

if archive_name is None:
archive_name = os.path.basename(dir_path)

archive_ext = ".tar.xz"
archive_filename = archive_name + archive_ext
archive_path = archive_filename if archive_dir is None else os.path.join(archive_dir, archive_filename)

if build_option('extended_dry_run'):
# early return in dry run mode
dry_run_msg("Archiving '%s' into '%s'...", dir_path, archive_path)
return archive_path

# TODO: replace with TarFile.add(recursive=True) when support for Python 3.6 drops
# since Python v3.7 tarfile automatically orders the list of files added to the archive
dir_files = [dir_path]
# pathlib's glob includes hidden files
dir_files.extend([str(filepath) for filepath in pathlib.Path(dir_path).glob("**/*")])
dir_files.sort() # independent of locale

dir_path_prefix = os.path.dirname(dir_path)
archive_filter = reproducible_filter if reproducible else None

_log.info("Archiving '%s' into '%s'...", dir_path, archive_path)
with tarfile.open(archive_path, "w:xz", format=tarfile.GNU_FORMAT, encoding="utf-8", preset=6) as archive:
for filepath in dir_files:
# archive with target directory in its top level, remove any prefix in path
file_name = os.path.relpath(filepath, start=dir_path_prefix)
archive.add(filepath, arcname=file_name, recursive=False, filter=archive_filter)
_log.debug("File/folder added to archive '%s': %s", archive_filename, filepath)

_log.info("Archive '%s' created successfully", archive_filename)

return archive_path


def move_file(path, target_path, force_in_dry_run=False):
"""
Move a file from path to target_path
Expand Down
Loading
Loading