Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve portability of reproducible tarballs by replacing external tar command with tarfile module from Python standard library #4660

Merged
merged 31 commits into from
Dec 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
ec1ede9
use more portable --date argument for touch command used in reproduci…
lexming Sep 27, 2024
e7f3bbd
stop reproducible tarball generation command on any failure in the pipe
lexming Sep 27, 2024
ca09f4e
move command to make reproducible archives to its own generator metho…
lexming Sep 27, 2024
d0a55ba
replace harcoded pattern for reproducible archives command for call t…
lexming Sep 27, 2024
d7195c7
use tarfile module instead of executing external shell commands to cr…
lexming Oct 7, 2024
87b733a
add required flag to filetools.find_extension() method
lexming Oct 8, 2024
dd28095
add support for extended_dry_run mode to filetools.make_archive()
lexming Oct 8, 2024
a37af5a
add unit test for filetools.make_archive()
lexming Oct 8, 2024
f2296de
make test for github_get_source_tarball_from_git compatible with make…
lexming Oct 8, 2024
980f618
improve reliability of bit-wise operations setting file mode in repro…
lexming Oct 8, 2024
ddb9cae
Merge branch '5.0.x' into reprod-tarballs-mac
lexming Nov 5, 2024
a26c71e
set reproducible flag of make_archives from a specific variable
lexming Nov 5, 2024
3936a6e
simplify logic in EasyBlock.get_checksum_for and improve its logging
lexming Nov 5, 2024
6188cc7
ignore checksums of sources from git repos prior to Python 3.9
lexming Nov 5, 2024
ee772f7
only run checksum assertions in test_make_archive on Python 3.9+
lexming Nov 5, 2024
27ea1d0
add test_fetch_sources_git to easyblock suite
lexming Nov 5, 2024
ad47cac
push deprecation of cheksum check of git repo with Python < 3.9 to Ea…
lexming Nov 10, 2024
1c44ad7
respect file extensions in archive filenames of filetools.make_archive()
lexming Dec 3, 2024
fbbb632
add fallback value to filename retrieval in get_checksum_for
lexming Dec 3, 2024
4f30b16
expand text in deprecation warning about checksum verification of git…
lexming Dec 3, 2024
8658497
add link to issue about undeterministic behaviour of gzip in cpython
lexming Dec 3, 2024
ed26075
clarify comment about permission changes in reproducible tarballs
lexming Dec 3, 2024
85c96a1
explain behaviour behind numeric permission reset in reproducible tar…
lexming Dec 3, 2024
fb75353
replace issue tag with full URL
lexming Dec 3, 2024
d9e7682
add link TarFile documentation explaining sorting of files
lexming Dec 3, 2024
e178bd0
improve deprecation warning when skipping checksum verification for g…
boegel Dec 18, 2024
c66ce9c
minor tweaks to make_archive
boegel Dec 18, 2024
848de66
cleanup in test_fetch_sources_git
boegel Dec 18, 2024
8918005
fix long line in test_make_archive
boegel Dec 18, 2024
671d3dd
update test_github_get_source_tarball_from_git to properly test suppo…
lexming Dec 18, 2024
db0704a
re-enable all asserts in test_github_get_source_tarball_from_git
lexming Dec 18, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 34 additions & 18 deletions easybuild/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
import os
import re
import stat
import sys
import tempfile
import time
import traceback
Expand Down Expand Up @@ -358,34 +359,49 @@ def get_checksum_for(self, checksums, filename=None, index=None):
:param filename: name of the file to obtain checksum for
:param index: index of file in list
"""
checksum = None

# sometimes, filename are specified as a dict
chksum_input = filename
chksum_input_git = None
# if filename is provided as dict, take 'filename' key
if isinstance(filename, dict):
filename = filename['filename']
chksum_input = filename.get('filename', None)
chksum_input_git = filename.get('git_config', None)
# early return if no filename given
if chksum_input is None:
self.log.debug("Cannot get checksum without a file name")
return None

if sys.version_info[0] >= 3 and sys.version_info[1] < 9:
# ignore any checksum for given filename due to changes in https://github.com/python/cpython/issues/90021
# tarballs made for git repos are not reproducible when created with Python < 3.9
if chksum_input_git is not None:
self.log.deprecated(
"Reproducible tarballs of Git repos are only possible when using Python 3.9+ to run EasyBuild. "
f"Skipping checksum verification of {chksum_input} since Python < 3.9 is used.",
'6.0'
)
return None

checksum = None
# if checksums are provided as a dict, lookup by source filename as key
if isinstance(checksums, dict):
if filename is not None and filename in checksums:
checksum = checksums[filename]
else:
checksum = None
elif isinstance(checksums, (list, tuple)):
if index is not None and index < len(checksums) and (index >= 0 or abs(index) <= len(checksums)):
try:
checksum = checksums[chksum_input]
except KeyError:
self.log.debug("Checksum not found for file: %s", chksum_input)
elif isinstance(checksums, (list, tuple)) and index is not None:
try:
checksum = checksums[index]
else:
checksum = None
elif checksums is None:
checksum = None
else:
except IndexError:
self.log.debug("Checksum not found for index list: %s", index)
elif checksums is not None:
raise EasyBuildError("Invalid type for checksums (%s), should be dict, list, tuple or None.",
type(checksums))

if checksum is None or build_option("checksum_priority") == CHECKSUM_PRIORITY_JSON:
json_checksums = self.get_checksums_from_json()
return json_checksums.get(filename, None)
else:
return checksum
return json_checksums.get(chksum_input, None)

return checksum

def get_checksums_from_json(self, always_read=False):
"""
Expand Down
161 changes: 130 additions & 31 deletions easybuild/tools/filetools.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,13 +48,15 @@
import inspect
import itertools
import os
import pathlib
import platform
import re
import shutil
import signal
import stat
import ssl
import sys
import tarfile
import tempfile
import time
import zlib
Expand Down Expand Up @@ -1408,13 +1410,12 @@ def find_extension(filename):
suffixes = sorted(EXTRACT_CMDS.keys(), key=len, reverse=True)
pat = r'(?P<ext>%s)$' % '|'.join([s.replace('.', '\\.') for s in suffixes])
res = re.search(pat, filename, flags=re.IGNORECASE)

if res:
ext = res.group('ext')
return res.group('ext')
else:
raise EasyBuildError("%s has unknown file extension", filename)

return ext


def extract_cmd(filepath, overwrite=False):
"""
Expand Down Expand Up @@ -2644,7 +2645,7 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
"""
Downloads a git repository, at a specific tag or commit, recursively or not, and make an archive with it

:param filename: name of the archive to save the code to (must be .tar.gz)
:param filename: name of the archive file to save the code to (including extension)
:param target_dir: target directory where to save the archive to
:param git_config: dictionary containing url, repo_name, recursive, and one of tag or commit
"""
Expand Down Expand Up @@ -2680,9 +2681,6 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
if not url:
raise EasyBuildError("url not specified in git_config parameter")

if not filename.endswith('.tar.gz'):
raise EasyBuildError("git_config currently only supports filename ending in .tar.gz")

# prepare target directory and clone repository
mkdir(target_dir, parents=True)

Expand Down Expand Up @@ -2768,37 +2766,138 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
run_shell_cmd(cmd, work_dir=work_dir, hidden=True, verbose_dry_run=True)

# Create archive
archive_path = os.path.join(target_dir, filename)

if keep_git_dir:
# create archive of git repo including .git directory
tar_cmd = ['tar', 'cfvz', archive_path, repo_name]
else:
# create reproducible archive
# see https://reproducible-builds.org/docs/archives/
tar_cmd = [
# print names of all files and folders excluding .git directory
'find', repo_name, '-name ".git"', '-prune', '-o', '-print0',
# reset access and modification timestamps to epoch 0 (equivalent to --mtime in GNU tar)
'-exec', 'touch', '--date=@0', '{}', r'\;',
# reset file permissions of cloned repo (equivalent to --mode in GNU tar)
'-exec', 'chmod', '"go+u,go-w"', '{}', r'\;', '|',
# sort file list (equivalent to --sort in GNU tar)
'LC_ALL=C', 'sort', '--zero-terminated', '|',
# create tarball in GNU format with ownership and permissions reset
'tar', '--create', '--no-recursion', '--owner=0', '--group=0', '--numeric-owner',
'--format=gnu', '--null', '--files-from', '-', '|',
# compress tarball with gzip without original file name and timestamp
'gzip', '--no-name', '>', archive_path
]
run_shell_cmd(' '.join(tar_cmd), work_dir=tmpdir, hidden=True, verbose_dry_run=True)
repo_path = os.path.join(tmpdir, repo_name)
reproducible = not keep_git_dir # presence of .git directory renders repo unreproducible
archive_path = make_archive(repo_path, archive_file=filename, archive_dir=target_dir, reproducible=reproducible)

# cleanup (repo_name dir does not exist in dry run mode)
remove(tmpdir)

return archive_path


def make_archive(source_dir, archive_file=None, archive_dir=None, reproducible=True):
    """
    Create an archive file of the given directory
    The format of the tarball is defined by the extension of the archive file name

    :param source_dir: string with path to directory to be archived (trailing path separators are ignored)
    :param archive_file: string with filename of archive;
                         defaults to the name of the source directory plus '.tar.xz',
                         '.tar.xz' is also appended if the given filename has no extension at all
    :param archive_dir: string with path to directory to place the archive
    :param reproducible: make a tarball that is reproducible across systems
        - see https://reproducible-builds.org/docs/archives/
        - requires uncompressed or LZMA compressed archive images
        - gzip is currently not supported due to non-deterministic data injected in its headers
          see https://github.com/python/cpython/issues/112346

    Default behaviour: reproducible tarball in .tar.xz

    :return: path to the archive; in extended dry run mode the path is returned without creating the archive
    :raise EasyBuildError: if the extension of the archive filename is not a supported tarball extension
    """
    def reproducible_filter(tarinfo):
        """Filter out system-dependent data from tarball"""
        # contents of '.git' subdir are inherently system dependent
        if "/.git/" in tarinfo.name or tarinfo.name.endswith("/.git"):
            return None
        # set timestamp to epoch 0
        tarinfo.mtime = 0
        # reset file permissions by applying go+u,go-w
        user_mode = tarinfo.mode & stat.S_IRWXU
        group_mode = (user_mode >> 3) & ~stat.S_IWGRP  # user mode without write
        other_mode = group_mode >> 3  # same as group mode
        tarinfo.mode = (tarinfo.mode & ~0o77) | group_mode | other_mode
        # reset ownership to numeric UID/GID 0
        # equivalent in GNU tar to 'tar --owner=0 --group=0 --numeric-owner'
        tarinfo.uid = tarinfo.gid = 0
        tarinfo.uname = tarinfo.gname = ""
        return tarinfo

    ext_compression_map = {
        # taken from EXTRACT_CMDS
        '.gtgz': 'gz',
        '.tar.gz': 'gz',
        '.tgz': 'gz',
        '.tar.bz2': 'bz2',
        '.tb2': 'bz2',
        '.tbz': 'bz2',
        '.tbz2': 'bz2',
        '.tar.xz': 'xz',
        '.txz': 'xz',
        '.tar': '',
    }
    reproducible_compression = ['', 'xz']
    default_ext = '.tar.xz'

    # drop trailing path separators: for a path like 'repo/', os.path.basename() returns '' and
    # os.path.dirname() returns the directory itself, which would result in a default archive name
    # without stem, an archive without top-level directory, and member names that are no longer
    # matched by the '.git' check in reproducible_filter
    source_dir = source_dir.rstrip(os.sep) or source_dir

    if archive_file is None:
        archive_file = os.path.basename(source_dir) + default_ext

    try:
        archive_ext = find_extension(archive_file)
    except EasyBuildError:
        if '.' in archive_file:
            # archive filename has unknown extension (set for raise)
            archive_ext = ''
        else:
            # archive filename has no extension, use default one
            archive_ext = default_ext
            archive_file += archive_ext

    if archive_ext not in ext_compression_map:
        # archive filename has unsupported extension
        supported_exts = ', '.join(ext_compression_map)
        raise EasyBuildError(
            f"Unsupported archive format: {archive_file}. Supported tarball extensions: {supported_exts}"
        )
    compression = ext_compression_map[archive_ext]
    _log.debug(f"Archive extension and compression: {archive_ext} in {compression}")

    archive_path = archive_file if archive_dir is None else os.path.join(archive_dir, archive_file)

    archive_specs = {
        'name': archive_path,
        'mode': f"w:{compression}",
        'format': tarfile.GNU_FORMAT,
        'encoding': "utf-8",
    }

    if reproducible:
        if compression == 'xz':
            # ensure a consistent compression level in reproducible tarballs with XZ
            archive_specs['preset'] = 6
        elif compression not in reproducible_compression:
            # requested archive compression cannot be made reproducible
            print_warning(
                f"Can not create reproducible archive due to unsupported file compression ({compression}). "
                "Please use XZ instead."
            )
            reproducible = False

    archive_filter = reproducible_filter if reproducible else None

    if build_option('extended_dry_run'):
        # early return in dry run mode
        dry_run_msg("Archiving '%s' into '%s'...", source_dir, archive_path)
        return archive_path
    _log.info("Archiving '%s' into '%s'...", source_dir, archive_path)

    # TODO: replace with TarFile.add(recursive=True) when support for Python 3.6 drops
    # since Python v3.7 tarfile automatically orders the list of files added to the archive
    # see Tarfile.add documentation: https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.add
    source_files = [source_dir]
    # pathlib's glob includes hidden files
    source_files.extend([str(filepath) for filepath in pathlib.Path(source_dir).glob("**/*")])
    source_files.sort()  # independent of locale

    with tarfile.open(**archive_specs) as tar_archive:
        for filepath in source_files:
            # archive with target directory in its top level, remove any prefix in path
            file_name = os.path.relpath(filepath, start=os.path.dirname(source_dir))
            tar_archive.add(filepath, arcname=file_name, recursive=False, filter=archive_filter)
            _log.debug("File/folder added to archive '%s': %s", archive_file, filepath)

    _log.info("Archive '%s' created successfully", archive_file)

    return archive_path


def move_file(path, target_path, force_in_dry_run=False):
"""
Move a file from path to target_path
Expand Down
39 changes: 39 additions & 0 deletions test/framework/easyblock.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import sys
import tempfile
from inspect import cleandoc
from test.framework.github import requires_github_access
from test.framework.utilities import EnhancedTestCase, TestLoaderFiltered, init_config
from unittest import TextTestRunner

Expand Down Expand Up @@ -1618,6 +1619,44 @@ def test_fetch_sources(self):
error_pattern = "Found one or more unexpected keys in 'sources' specification: {'nosuchkey': 'foobar'}"
self.assertErrorRegex(EasyBuildError, error_pattern, eb.fetch_sources, sources, checksums=[])

@requires_github_access()
def test_fetch_sources_git(self):
    """Test fetch_sources method from git repo."""

    testdir = os.path.abspath(os.path.dirname(__file__))
    ec = process_easyconfig(os.path.join(testdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb'))[0]
    eb = get_easyblock_instance(ec)
    eb.src = []
    sources = [
        {
            'filename': 'testrepository.tar.xz',
            'git_config': {
                'repo_name': 'testrepository',
                'url': 'https://github.com/easybuilders',
                'tag': 'branch_tag_for_test',
            }
        }
    ]
    # dummy checksum: this test only checks which checksum gets registered for the source
    checksums = ["00000000"]
    with self.mocked_stdout_stderr():
        eb.fetch_sources(sources, checksums=checksums)

    self.assertEqual(len(eb.src), 1)
    self.assertEqual(eb.src[0]['name'], "testrepository.tar.xz")
    self.assertExists(eb.src[0]['path'])
    self.assertEqual(eb.src[0]['cmd'], None)

    reference_checksum = "00000000"
    # compare the version as a tuple: checking major and minor components separately
    # would also match any later major version with a minor version below 9
    if sys.version_info < (3, 9):
        # checksums of tarballs made by EB cannot be reliably checked prior to Python 3.9
        # due to changes introduced in python/cpython#90021
        reference_checksum = None

    self.assertEqual(eb.src[0]['checksum'], reference_checksum)

    # cleanup
    remove_file(eb.src[0]['path'])

def test_download_instructions(self):
"""Test use of download_instructions easyconfig parameter."""

Expand Down
Loading
Loading