Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions changes/3580.bugfix.md
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Fix `ZipStore` leaving duplicate entries in the zip central directory when array metadata is written more than once (e.g. after `resize()`). The central directory is now deduplicated on `close()`, keeping only the most recent entry for each filename.
36 changes: 35 additions & 1 deletion src/zarr/storage/_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import shutil
import threading
import time
import warnings
import zipfile
from pathlib import Path
from typing import TYPE_CHECKING, Any, Literal
Expand Down Expand Up @@ -122,8 +123,36 @@ def close(self) -> None:
# docstring inherited
super().close()
with self._lock:
self._dedup_central_directory()
self._zf.close()

def _dedup_central_directory(self) -> None:
"""Remove duplicate entries from the zip central directory.

When an array is resized or metadata is updated multiple times, ZipStore
writes a new entry for the same key each time (ZIP files are append-only).
This leaves duplicate filenames in the central directory, which confuses
many zip readers and wastes space. Before closing, we keep only the
*last* (most recent) entry for every filename so that the on-disk central
directory is clean.

``NameToInfo`` is already a ``{filename: latest_ZipInfo}`` mapping because
each ``writestr``/``write`` call does ``NameToInfo[name] = zinfo``, so
later writes overwrite earlier ones. We therefore only need to rebuild
``filelist`` from the values of ``NameToInfo``; ``NameToInfo`` itself
requires no update.

The new list is computed before being assigned so that the ZipFile object
is never left in an inconsistent state if an exception is raised.
"""
if self._zf.mode not in ("w", "a", "x"):
return
# namelist() and getinfo() are public API.
# dict.fromkeys preserves first-seen order while deduplicating names;
# getinfo() returns the latest ZipInfo for each name (NameToInfo[name]).
unique_names = dict.fromkeys(self._zf.namelist())
self._zf.filelist = [self._zf.getinfo(name) for name in unique_names]

async def clear(self) -> None:
# docstring inherited
with self._lock:
Expand Down Expand Up @@ -205,7 +234,12 @@ def _set(self, key: str, value: Buffer) -> None:
keyinfo.external_attr |= 0x10 # MS-DOS directory flag
else:
keyinfo.external_attr = 0o644 << 16 # ?rw-r--r--
self._zf.writestr(keyinfo, value.to_bytes())
# ZIP files are append-only; writing an existing key creates a second entry.
# We intentionally allow this and remove duplicates in _dedup_central_directory()
# when the store is closed.
with warnings.catch_warnings():
warnings.simplefilter("ignore", UserWarning)
self._zf.writestr(keyinfo, value.to_bytes())

async def set(self, key: str, value: Buffer) -> None:
# docstring inherited
Expand Down
25 changes: 25 additions & 0 deletions tests/test_store/test_zip.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,3 +152,28 @@ async def test_move(self, tmp_path: Path) -> None:
assert destination.exists()
assert not origin.exists()
assert np.array_equal(array[...], np.arange(10))

def test_no_duplicate_entries_after_resize(self, tmp_path: Path) -> None:
    """Regression test for https://github.com/zarr-developers/zarr-python/issues/3580.

    Resizing an array rewrites ``zarr.json``; this used to leave duplicate
    filenames in the zip central directory.
    """
    archive_path = tmp_path / "data.zip"
    write_store = ZipStore(archive_path, mode="w")
    created = zarr.create_array(write_store, shape=(5,), chunks=(5,), dtype="i4")
    created[:] = np.arange(5)

    # The resize rewrites the metadata document, appending a second
    # zarr.json entry to the (append-only) zip archive.
    created.resize((10,))
    created[5:] = np.arange(5, 10)
    write_store.close()

    # The closed archive's central directory must list each filename once.
    with zipfile.ZipFile(archive_path, "r") as zf:
        names = [info.filename for info in zf.infolist()]
    assert len(names) == len(set(names)), f"Duplicate entries found: {names}"

    # Reopening must still yield the resized, fully-written data.
    read_store = ZipStore(archive_path, mode="r")
    reopened = zarr.open_array(read_store)
    assert reopened.shape == (10,)
    assert np.array_equal(reopened[:], np.arange(10))
Loading