Skip to content

Commit 04bedfa

Browse files
committed
Issue python#27199: TarFile expose copyfileobj bufsize to improve throughput
Patch by Jason Fried.
1 parent f578195 commit 04bedfa

File tree

2 files changed: 21 additions and 15 deletions.

Lib/tarfile.py

Lines changed: 18 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -228,21 +228,21 @@ def calc_chksums(buf):
228228
signed_chksum = 256 + sum(struct.unpack_from("148b8x356b", buf))
229229
return unsigned_chksum, signed_chksum
230230

231-
def copyfileobj(src, dst, length=None, exception=OSError):
231+
def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
232232
"""Copy length bytes from fileobj src to fileobj dst.
233233
If length is None, copy the entire content.
234234
"""
235+
bufsize = bufsize or 16 * 1024
235236
if length == 0:
236237
return
237238
if length is None:
238-
shutil.copyfileobj(src, dst)
239+
shutil.copyfileobj(src, dst, bufsize)
239240
return
240241

241-
BUFSIZE = 16 * 1024
242-
blocks, remainder = divmod(length, BUFSIZE)
242+
blocks, remainder = divmod(length, bufsize)
243243
for b in range(blocks):
244-
buf = src.read(BUFSIZE)
245-
if len(buf) < BUFSIZE:
244+
buf = src.read(bufsize)
245+
if len(buf) < bufsize:
246246
raise exception("unexpected end of data")
247247
dst.write(buf)
248248

@@ -1403,7 +1403,8 @@ class TarFile(object):
14031403

14041404
def __init__(self, name=None, mode="r", fileobj=None, format=None,
14051405
tarinfo=None, dereference=None, ignore_zeros=None, encoding=None,
1406-
errors="surrogateescape", pax_headers=None, debug=None, errorlevel=None):
1406+
errors="surrogateescape", pax_headers=None, debug=None,
1407+
errorlevel=None, copybufsize=None):
14071408
"""Open an (uncompressed) tar archive `name'. `mode' is either 'r' to
14081409
read from an existing archive, 'a' to append data to an existing
14091410
file or 'w' to create a new file overwriting an existing one. `mode'
@@ -1459,6 +1460,7 @@ def __init__(self, name=None, mode="r", fileobj=None, format=None,
14591460
self.errorlevel = errorlevel
14601461

14611462
# Init datastructures.
1463+
self.copybufsize = copybufsize
14621464
self.closed = False
14631465
self.members = [] # list of members as TarInfo objects
14641466
self._loaded = False # flag if all members have been read
@@ -1558,7 +1560,7 @@ def open(cls, name=None, mode="r", fileobj=None, bufsize=RECORDSIZE, **kwargs):
15581560
saved_pos = fileobj.tell()
15591561
try:
15601562
return func(name, "r", fileobj, **kwargs)
1561-
except (ReadError, CompressionError) as e:
1563+
except (ReadError, CompressionError):
15621564
if fileobj is not None:
15631565
fileobj.seek(saved_pos)
15641566
continue
@@ -1963,10 +1965,10 @@ def addfile(self, tarinfo, fileobj=None):
19631965
buf = tarinfo.tobuf(self.format, self.encoding, self.errors)
19641966
self.fileobj.write(buf)
19651967
self.offset += len(buf)
1966-
1968+
bufsize=self.copybufsize
19671969
# If there's data to follow, append it.
19681970
if fileobj is not None:
1969-
copyfileobj(fileobj, self.fileobj, tarinfo.size)
1971+
copyfileobj(fileobj, self.fileobj, tarinfo.size, bufsize=bufsize)
19701972
blocks, remainder = divmod(tarinfo.size, BLOCKSIZE)
19711973
if remainder > 0:
19721974
self.fileobj.write(NUL * (BLOCKSIZE - remainder))
@@ -2148,15 +2150,16 @@ def makefile(self, tarinfo, targetpath):
21482150
"""
21492151
source = self.fileobj
21502152
source.seek(tarinfo.offset_data)
2153+
bufsize = self.copybufsize
21512154
with bltn_open(targetpath, "wb") as target:
21522155
if tarinfo.sparse is not None:
21532156
for offset, size in tarinfo.sparse:
21542157
target.seek(offset)
2155-
copyfileobj(source, target, size, ReadError)
2158+
copyfileobj(source, target, size, ReadError, bufsize)
21562159
target.seek(tarinfo.size)
21572160
target.truncate()
21582161
else:
2159-
copyfileobj(source, target, tarinfo.size, ReadError)
2162+
copyfileobj(source, target, tarinfo.size, ReadError, bufsize)
21602163

21612164
def makeunknown(self, tarinfo, targetpath):
21622165
"""Make a file from a TarInfo object with an unknown type
@@ -2235,7 +2238,7 @@ def chown(self, tarinfo, targetpath, numeric_owner):
22352238
os.lchown(targetpath, u, g)
22362239
else:
22372240
os.chown(targetpath, u, g)
2238-
except OSError as e:
2241+
except OSError:
22392242
raise ExtractError("could not change owner")
22402243

22412244
def chmod(self, tarinfo, targetpath):
@@ -2244,7 +2247,7 @@ def chmod(self, tarinfo, targetpath):
22442247
if hasattr(os, 'chmod'):
22452248
try:
22462249
os.chmod(targetpath, tarinfo.mode)
2247-
except OSError as e:
2250+
except OSError:
22482251
raise ExtractError("could not change mode")
22492252

22502253
def utime(self, tarinfo, targetpath):
@@ -2254,7 +2257,7 @@ def utime(self, tarinfo, targetpath):
22542257
return
22552258
try:
22562259
os.utime(targetpath, (tarinfo.mtime, tarinfo.mtime))
2257-
except OSError as e:
2260+
except OSError:
22582261
raise ExtractError("could not change modification time")
22592262

22602263
#--------------------------------------------------------------------------

Misc/NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@ What's New in Python 3.6.0 beta 1
1010
Core and Builtins
1111
-----------------
1212

13+
- Issue #27199: In tarfile, expose copyfileobj bufsize to improve throughput.
14+
Patch by Jason Fried.
15+
1316
- Issue #27948: In f-strings, only allow backslashes inside the braces
1417
(where the expressions are). This is a breaking change from the 3.6
1518
alpha releases, where backslashes are allowed anywhere in an

0 commit comments

Comments
 (0)