Skip to content

Commit 2cf26d0

Browse files
[3.14] gh-151497: Avoid huge pre-allocation for oversized tarfile extended headers (GH-151498) (GH-151979)
tarfile reads a member's extended header (a GNU long name/link or a pax header) with a single read sized by the header's size field:

    buf = tarfile.fileobj.read(self._block(self.size))

The size is taken from the archive and is not validated, so a ~512-byte crafted file can claim several gigabytes (or, via base-256 encoding, far more) and make read() pre-allocate that much memory -- on open/iterate, before any extraction filter runs.

Read the extended-header data in bounded chunks instead, so an oversized or truncated header can no longer force a huge allocation. The bytes returned for valid archives are unchanged.

(cherry picked from commit da99711)

Co-authored-by: Shardul Deshpande <iamsharduld@users.noreply.github.com>
1 parent 59ff73a commit 2cf26d0

3 files changed

Lines changed: 79 additions & 2 deletions

File tree

Lib/tarfile.py

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,32 @@ def copyfileobj(src, dst, length=None, exception=OSError, bufsize=None):
258258
dst.write(buf)
259259
return
260260

261+
# Maximum number of bytes read in a single call when reading a member's
# extended header (a GNU long name/link or a pax header).  The size of such
# a header is taken from the archive and is not trustworthy, so it is read in
# bounded chunks to avoid a huge up-front allocation when a crafted or
# truncated archive claims far more data than the file actually contains
# (gh-151497).
_EXTHEADER_READ_CHUNK = 1024 * 1024  # 1 MiB


def _safe_read(fileobj, size, chunk_size=_EXTHEADER_READ_CHUNK):
    """Read up to *size* bytes from *fileobj* in bounded chunks.

    Returns the same bytes as ``fileobj.read(size)`` would (including a
    short result at end of file), but never asks *fileobj* for more than
    *chunk_size* bytes in a single call, so an oversized size field in a
    crafted header cannot force a huge allocation.

    *fileobj* only needs a ``read(n)`` method.  *chunk_size* is the upper
    bound for a single ``read()`` request and defaults to
    ``_EXTHEADER_READ_CHUNK``; it must be at least 1.

    Raises ValueError if *chunk_size* is smaller than 1.
    """
    if chunk_size < 1:
        # A non-positive bound would either read nothing at all or turn
        # into an unbounded read(-1), defeating the purpose of the helper.
        raise ValueError("chunk_size must be a positive integer")

    # Fast path: the request already fits into one bounded read, so the
    # result is exactly what a plain read() returns.
    if size <= chunk_size:
        return fileobj.read(size)

    chunks = []
    remaining = size
    while remaining > 0:
        chunk = fileobj.read(min(remaining, chunk_size))
        if not chunk:
            # End of file (or no data available): return what was read so
            # far, just like a short read() would.
            break
        chunks.append(chunk)
        remaining -= len(chunk)
    return b"".join(chunks)
286+
261287
def _safe_print(s):
262288
encoding = getattr(sys.stdout, 'encoding', None)
263289
if encoding is not None:
@@ -1431,7 +1457,7 @@ def _proc_gnulong(self, tarfile):
14311457
"""Process the blocks that hold a GNU longname
14321458
or longlink member.
14331459
"""
1434-
buf = tarfile.fileobj.read(self._block(self.size))
1460+
buf = _safe_read(tarfile.fileobj, self._block(self.size))
14351461

14361462
# Fetch the next header and process it.
14371463
try:
@@ -1487,7 +1513,7 @@ def _proc_pax(self, tarfile):
14871513
POSIX.1-2008.
14881514
"""
14891515
# Read the header information.
1490-
buf = tarfile.fileobj.read(self._block(self.size))
1516+
buf = _safe_read(tarfile.fileobj, self._block(self.size))
14911517

14921518
# A pax header stores supplemental information for either
14931519
# the following file (extended) or all following files

Lib/test/test_tarfile.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,53 @@ def test_extractfile_attrs(self):
549549
self.assertIs(fobj.seekable(), True)
550550

551551

552+
class ReadSizeRecorder(io.BytesIO):
    """In-memory stream that remembers the largest size passed to read().

    A test can inspect ``max_read_size`` afterwards to check that tarfile
    never requested far more data than the archive holds (which on a real
    file would pre-allocate it).  Calls with ``None`` or a negative size
    are passed through without being recorded.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.max_read_size = 0

    def read(self, size=-1):
        is_bounded = size is not None and size >= 0
        if is_bounded and size > self.max_read_size:
            self.max_read_size = size
        return super().read(size)
564+
565+
566+
@support.cpython_only
class ExtendedHeaderMemoryTest(unittest.TestCase):
    # gh-151497: tarfile takes the size of a GNU long name/link or a pax
    # extended header straight from the archive, so it is untrusted.  A
    # crafted header may claim far more data than the file contains; opening
    # such an archive must not request (and so pre-allocate) the claimed
    # size with a single read() call.

    def crafted_archive(self, hdrtype):
        # Build one GNU-format header of type *hdrtype* whose size field
        # claims far more data than actually follows it.
        header = tarfile.TarInfo("A")
        header.size = 0xFFFFFFFF  # ~4 GiB claimed in a 512-byte header
        header.type = hdrtype
        return header.tobuf(format=tarfile.GNU_FORMAT)

    def check(self, hdrtype):
        recorder = ReadSizeRecorder(self.crafted_archive(hdrtype))
        try:
            with tarfile.open(fileobj=recorder, mode="r:") as archive:
                archive.getmembers()
        except tarfile.ReadError:
            # A truncated header is fine; only the allocation is checked.
            pass
        # No single read() call may have been asked for the bogus ~4 GiB.
        limit = tarfile._EXTHEADER_READ_CHUNK
        self.assertLessEqual(recorder.max_read_size, limit)

    def test_gnu_longname_oversized_size(self):
        self.check(tarfile.GNUTYPE_LONGNAME)

    def test_gnu_longlink_oversized_size(self):
        self.check(tarfile.GNUTYPE_LONGLINK)

    def test_pax_header_oversized_size(self):
        self.check(tarfile.XHDTYPE)
597+
598+
552599
class MiscReadTestBase(CommonReadTest):
553600
is_stream = False
554601

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
Opening a :mod:`tarfile` archive no longer attempts to pre-allocate a huge
2+
buffer when a crafted or truncated member claims an oversized extended header
3+
(a GNU long name/link or a pax header). The extended header is now read in
4+
bounded chunks, so its size field can no longer trigger memory exhaustion.

0 commit comments

Comments
 (0)