Skip to content

Commit 9fcee6b

Browse files
authored Dec 18, 2024
Merge pull request #4660 from lexming/reprod-tarballs-mac
improve portability of reproducible tarballs by replacing external `tar` command with `tarfile` module
2 parents d21467f + db0704a commit 9fcee6b

File tree

4 files changed

+365
-113
lines changed

4 files changed

+365
-113
lines changed
 

‎easybuild/framework/easyblock.py

+34-18
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
import os
5151
import re
5252
import stat
53+
import sys
5354
import tempfile
5455
import time
5556
import traceback
@@ -365,34 +366,49 @@ def get_checksum_for(self, checksums, filename=None, index=None):
365366
:param filename: name of the file to obtain checksum for
366367
:param index: index of file in list
367368
"""
368-
checksum = None
369-
370-
# sometimes, filename are specified as a dict
369+
chksum_input = filename
370+
chksum_input_git = None
371+
# if filename is provided as dict, take 'filename' key
371372
if isinstance(filename, dict):
372-
filename = filename['filename']
373+
chksum_input = filename.get('filename', None)
374+
chksum_input_git = filename.get('git_config', None)
375+
# early return if no filename given
376+
if chksum_input is None:
377+
self.log.debug("Cannot get checksum without a file name")
378+
return None
379+
380+
if sys.version_info[0] >= 3 and sys.version_info[1] < 9:
381+
# ignore any checksum for given filename due to changes in https://github.com/python/cpython/issues/90021
382+
# tarballs made for git repos are not reproducible when created with Python < 3.9
383+
if chksum_input_git is not None:
384+
self.log.deprecated(
385+
"Reproducible tarballs of Git repos are only possible when using Python 3.9+ to run EasyBuild. "
386+
f"Skipping checksum verification of {chksum_input} since Python < 3.9 is used.",
387+
'6.0'
388+
)
389+
return None
373390

391+
checksum = None
374392
# if checksums are provided as a dict, lookup by source filename as key
375393
if isinstance(checksums, dict):
376-
if filename is not None and filename in checksums:
377-
checksum = checksums[filename]
378-
else:
379-
checksum = None
380-
elif isinstance(checksums, (list, tuple)):
381-
if index is not None and index < len(checksums) and (index >= 0 or abs(index) <= len(checksums)):
394+
try:
395+
checksum = checksums[chksum_input]
396+
except KeyError:
397+
self.log.debug("Checksum not found for file: %s", chksum_input)
398+
elif isinstance(checksums, (list, tuple)) and index is not None:
399+
try:
382400
checksum = checksums[index]
383-
else:
384-
checksum = None
385-
elif checksums is None:
386-
checksum = None
387-
else:
401+
except IndexError:
402+
self.log.debug("Checksum not found for index list: %s", index)
403+
elif checksums is not None:
388404
raise EasyBuildError("Invalid type for checksums (%s), should be dict, list, tuple or None.",
389405
type(checksums))
390406

391407
if checksum is None or build_option("checksum_priority") == CHECKSUM_PRIORITY_JSON:
392408
json_checksums = self.get_checksums_from_json()
393-
return json_checksums.get(filename, None)
394-
else:
395-
return checksum
409+
return json_checksums.get(chksum_input, None)
410+
411+
return checksum
396412

397413
def get_checksums_from_json(self, always_read=False):
398414
"""

‎easybuild/tools/filetools.py

+130-31
Original file line numberDiff line numberDiff line change
@@ -48,13 +48,15 @@
4848
import inspect
4949
import itertools
5050
import os
51+
import pathlib
5152
import platform
5253
import re
5354
import shutil
5455
import signal
5556
import stat
5657
import ssl
5758
import sys
59+
import tarfile
5860
import tempfile
5961
import time
6062
import zlib
@@ -1408,13 +1410,12 @@ def find_extension(filename):
14081410
suffixes = sorted(EXTRACT_CMDS.keys(), key=len, reverse=True)
14091411
pat = r'(?P<ext>%s)$' % '|'.join([s.replace('.', '\\.') for s in suffixes])
14101412
res = re.search(pat, filename, flags=re.IGNORECASE)
1413+
14111414
if res:
1412-
ext = res.group('ext')
1415+
return res.group('ext')
14131416
else:
14141417
raise EasyBuildError("%s has unknown file extension", filename)
14151418

1416-
return ext
1417-
14181419

14191420
def extract_cmd(filepath, overwrite=False):
14201421
"""
@@ -2644,7 +2645,7 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
26442645
"""
26452646
Downloads a git repository, at a specific tag or commit, recursively or not, and make an archive with it
26462647
2647-
:param filename: name of the archive to save the code to (must be .tar.gz)
2648+
:param filename: name of the archive file to save the code to (including extension)
26482649
:param target_dir: target directory where to save the archive to
26492650
:param git_config: dictionary containing url, repo_name, recursive, and one of tag or commit
26502651
"""
@@ -2680,9 +2681,6 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
26802681
if not url:
26812682
raise EasyBuildError("url not specified in git_config parameter")
26822683

2683-
if not filename.endswith('.tar.gz'):
2684-
raise EasyBuildError("git_config currently only supports filename ending in .tar.gz")
2685-
26862684
# prepare target directory and clone repository
26872685
mkdir(target_dir, parents=True)
26882686

@@ -2768,37 +2766,138 @@ def get_source_tarball_from_git(filename, target_dir, git_config):
27682766
run_shell_cmd(cmd, work_dir=work_dir, hidden=True, verbose_dry_run=True)
27692767

27702768
# Create archive
2771-
archive_path = os.path.join(target_dir, filename)
2772-
2773-
if keep_git_dir:
2774-
# create archive of git repo including .git directory
2775-
tar_cmd = ['tar', 'cfvz', archive_path, repo_name]
2776-
else:
2777-
# create reproducible archive
2778-
# see https://reproducible-builds.org/docs/archives/
2779-
tar_cmd = [
2780-
# print names of all files and folders excluding .git directory
2781-
'find', repo_name, '-name ".git"', '-prune', '-o', '-print0',
2782-
# reset access and modification timestamps to epoch 0 (equivalent to --mtime in GNU tar)
2783-
'-exec', 'touch', '--date=@0', '{}', r'\;',
2784-
# reset file permissions of cloned repo (equivalent to --mode in GNU tar)
2785-
'-exec', 'chmod', '"go+u,go-w"', '{}', r'\;', '|',
2786-
# sort file list (equivalent to --sort in GNU tar)
2787-
'LC_ALL=C', 'sort', '--zero-terminated', '|',
2788-
# create tarball in GNU format with ownership and permissions reset
2789-
'tar', '--create', '--no-recursion', '--owner=0', '--group=0', '--numeric-owner',
2790-
'--format=gnu', '--null', '--files-from', '-', '|',
2791-
# compress tarball with gzip without original file name and timestamp
2792-
'gzip', '--no-name', '>', archive_path
2793-
]
2794-
run_shell_cmd(' '.join(tar_cmd), work_dir=tmpdir, hidden=True, verbose_dry_run=True)
2769+
repo_path = os.path.join(tmpdir, repo_name)
2770+
reproducible = not keep_git_dir # presence of .git directory renders repo unreproducible
2771+
archive_path = make_archive(repo_path, archive_file=filename, archive_dir=target_dir, reproducible=reproducible)
27952772

27962773
# cleanup (repo_name dir does not exist in dry run mode)
27972774
remove(tmpdir)
27982775

27992776
return archive_path
28002777

28012778

2779+
def make_archive(source_dir, archive_file=None, archive_dir=None, reproducible=True):
    """
    Create an archive file of the given directory
    The format of the tarball is defined by the extension of the archive file name

    :param source_dir: string with path to directory to be archived
    :param archive_file: string with filename of archive
    :param archive_dir: string with path to directory to place the archive
    :param reproducible: make a tarball that is reproducible across systems
        - see https://reproducible-builds.org/docs/archives/
        - requires uncompressed or LZMA compressed archive images
        - gzip is currently not supported due to non-deterministic data injected in its headers
          see https://github.com/python/cpython/issues/112346
    :return: path to the archive file (in extended dry run mode the path is returned without creating the file)
    :raises EasyBuildError: if the archive file name has an unsupported extension

    Default behaviour: reproducible tarball in .tar.xz
    """
    def reproducible_filter(tarinfo):
        "Filter out system-dependent data from tarball"
        # contents of '.git' subdir are inherently system dependent
        if "/.git/" in tarinfo.name or tarinfo.name.endswith("/.git"):
            return None
        # set timestamp to epoch 0
        tarinfo.mtime = 0
        # reset file permissions by applying go+u,go-w
        user_mode = tarinfo.mode & stat.S_IRWXU
        group_mode = (user_mode >> 3) & ~stat.S_IWGRP  # user mode without write
        other_mode = group_mode >> 3  # same as group mode
        tarinfo.mode = (tarinfo.mode & ~0o77) | group_mode | other_mode
        # reset ownership to numeric UID/GID 0
        # equivalent in GNU tar to 'tar --owner=0 --group=0 --numeric-owner'
        tarinfo.uid = tarinfo.gid = 0
        tarinfo.uname = tarinfo.gname = ""
        return tarinfo

    ext_compression_map = {
        # taken from EXTRACT_CMDS
        '.gtgz': 'gz',
        '.tar.gz': 'gz',
        '.tgz': 'gz',
        '.tar.bz2': 'bz2',
        '.tb2': 'bz2',
        '.tbz': 'bz2',
        '.tbz2': 'bz2',
        '.tar.xz': 'xz',
        '.txz': 'xz',
        '.tar': '',
    }
    reproducible_compression = ['', 'xz']
    default_ext = '.tar.xz'

    # drop any trailing path separator: otherwise os.path.basename() yields an empty name
    # and os.path.dirname() yields the directory itself, which would strip the top-level
    # directory from the names of the archive members
    source_dir = str(pathlib.Path(source_dir))

    if archive_file is None:
        archive_file = os.path.basename(source_dir) + default_ext

    try:
        archive_ext = find_extension(archive_file)
    except EasyBuildError:
        if '.' in archive_file:
            # archive filename has unknown extension (set for raise)
            archive_ext = ''
        else:
            # archive filename has no extension, use default one
            archive_ext = default_ext
            archive_file += archive_ext

    # find_extension matches case-insensitively and returns the extension as written,
    # so normalise it before looking it up in the (lowercase) compression map
    compression_key = archive_ext.lower()
    if compression_key not in ext_compression_map:
        # archive filename has unsupported extension
        supported_exts = ', '.join(ext_compression_map)
        raise EasyBuildError(
            f"Unsupported archive format: {archive_file}. Supported tarball extensions: {supported_exts}"
        )
    compression = ext_compression_map[compression_key]
    _log.debug("Archive extension and compression: %s in %s", archive_ext, compression)

    archive_path = archive_file if archive_dir is None else os.path.join(archive_dir, archive_file)

    archive_specs = {
        'name': archive_path,
        'mode': f"w:{compression}",
        'format': tarfile.GNU_FORMAT,
        'encoding': "utf-8",
    }

    if reproducible:
        if compression == 'xz':
            # ensure a consistent compression level in reproducible tarballs with XZ
            archive_specs['preset'] = 6
        elif compression not in reproducible_compression:
            # requested archive compression cannot be made reproducible
            print_warning(
                f"Can not create reproducible archive due to unsupported file compression ({compression}). "
                "Please use XZ instead."
            )
            reproducible = False

    archive_filter = reproducible_filter if reproducible else None

    if build_option('extended_dry_run'):
        # early return in dry run mode
        dry_run_msg("Archiving '%s' into '%s'...", source_dir, archive_path)
        return archive_path
    _log.info("Archiving '%s' into '%s'...", source_dir, archive_path)

    # TODO: replace with TarFile.add(recursive=True) when support for Python 3.6 drops
    # since Python v3.7 tarfile automatically orders the list of files added to the archive
    # see Tarfile.add documentation: https://docs.python.org/3/library/tarfile.html#tarfile.TarFile.add
    source_files = [source_dir]
    # pathlib's glob includes hidden files
    source_files.extend([str(filepath) for filepath in pathlib.Path(source_dir).glob("**/*")])
    source_files.sort()  # independent of locale

    # members are named relative to the parent of source_dir, so that the archive
    # holds the target directory in its top level without any leading path prefix
    parent_dir = os.path.dirname(source_dir)

    with tarfile.open(**archive_specs) as tar_archive:
        for filepath in source_files:
            file_name = os.path.relpath(filepath, start=parent_dir)
            tar_archive.add(filepath, arcname=file_name, recursive=False, filter=archive_filter)
            _log.debug("File/folder added to archive '%s': %s", archive_file, filepath)

    _log.info("Archive '%s' created successfully", archive_file)

    return archive_path
2899+
2900+
28022901
def move_file(path, target_path, force_in_dry_run=False):
28032902
"""
28042903
Move a file from path to target_path

‎test/framework/easyblock.py

+39
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
import sys
3737
import tempfile
3838
from inspect import cleandoc
39+
from test.framework.github import requires_github_access
3940
from test.framework.utilities import EnhancedTestCase, TestLoaderFiltered, init_config
4041
from unittest import TextTestRunner
4142

@@ -1639,6 +1640,44 @@ def test_fetch_sources(self):
16391640
error_pattern = "Found one or more unexpected keys in 'sources' specification: {'nosuchkey': 'foobar'}"
16401641
self.assertErrorRegex(EasyBuildError, error_pattern, eb.fetch_sources, sources, checksums=[])
16411642

1643+
@requires_github_access()
def test_fetch_sources_git(self):
    """Check that fetch_sources can build a source tarball from a git repository."""

    # set up an easyblock instance for the toy test easyconfig, with no sources registered yet
    test_root = os.path.abspath(os.path.dirname(__file__))
    toy_ec_path = os.path.join(test_root, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb')
    ec = process_easyconfig(toy_ec_path)[0]
    eb = get_easyblock_instance(ec)
    eb.src = []

    git_config = {
        'repo_name': 'testrepository',
        'url': 'https://github.com/easybuilders',
        'tag': 'branch_tag_for_test',
    }
    sources = [{'filename': 'testrepository.tar.xz', 'git_config': git_config}]
    checksums = ["00000000"]

    with self.mocked_stdout_stderr():
        eb.fetch_sources(sources, checksums=checksums)

    # exactly one source should have been registered
    self.assertEqual(len(eb.src), 1)
    fetched = eb.src[0]
    self.assertEqual(fetched['name'], "testrepository.tar.xz")
    self.assertExists(fetched['path'])
    self.assertEqual(fetched['cmd'], None)

    # checksums of tarballs made by EB cannot be reliably checked prior to Python 3.9
    # due to changes introduced in python/cpython#90021, in which case no checksum is expected
    py_major, py_minor = sys.version_info[:2]
    old_python = py_major >= 3 and py_minor < 9
    reference_checksum = None if old_python else "00000000"

    self.assertEqual(fetched['checksum'], reference_checksum)

    # cleanup
    remove_file(fetched['path'])
1680+
16421681
def test_download_instructions(self):
16431682
"""Test use of download_instructions easyconfig parameter."""
16441683

0 commit comments

Comments
 (0)