diff --git a/doc/whats-new.rst b/doc/whats-new.rst index 0425452de8d..33021c64234 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -75,6 +75,14 @@ New Features or a fixed ``(width, height)`` tuple instead of computing figure size from ``size`` and ``aspect`` (:issue:`11103`). By `Kristian Kollsga <https://github.com/kkollsga>`_. +- Added ``group_filter`` keyword to :py:func:`open_datatree` and + :py:func:`open_groups`, accepting an ``fnmatch``-style glob pattern + (e.g. ``"*/leaf_0"``) to selectively open matching groups. Mutually + exclusive with ``group``, which keeps its exact-path semantics. + Groups whose names literally contain ``*`` or ``?`` are reachable via + character-class escapes (``[*]``, ``[?]``) + (:issue:`11196`, :pull:`11302`). + By `Alfonso Ladino <https://github.com/aladinor>`_. Breaking Changes ~~~~~~~~~~~~~~~~ diff --git a/xarray/backends/api.py b/xarray/backends/api.py index fd992f3e5d8..31ef5454726 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -1021,8 +1021,21 @@ def open_datatree( Additional keyword arguments passed on to the engine open function. For example: - - 'group': path to the group in the given file to open as the root group as - a str. + - 'group': path to the group in the given file to open as the root + group as a str. Mutually exclusive with ``'group_filter'``. + - 'group_filter': non-empty glob pattern matched against every + group path in the file. Only groups whose paths match the + pattern are loaded, along with their ancestors so the resulting + tree stays connected. Matching follows + :py:meth:`pathlib.PurePath.match` semantics: the pattern is + anchored on the right, so ``group_filter="*/leaf_0"`` matches + any group whose path ends in ``/leaf_0`` at any depth. + Group names that contain literal glob metacharacters can be + targeted with character-class escapes: ``[*]`` matches a literal + ``*``, ``[?]`` a literal ``?``, and ``[[]`` a literal ``[``. For + example, ``group_filter="group_[*]_01"`` matches a group + literally named ``group_*_01``. 
Mutually exclusive with + ``'group'``. - 'lock': resource lock to use when reading data from disk. Only relevant when using dask or another form of parallelism. By default, appropriate locks are chosen to safely read and write files with the @@ -1265,8 +1278,21 @@ def open_groups( Additional keyword arguments passed on to the engine open function. For example: - - 'group': path to the group in the given file to open as the root group as - a str. + - 'group': path to the group in the given file to open as the root + group as a str. Mutually exclusive with ``'group_filter'``. + - 'group_filter': non-empty glob pattern matched against every + group path in the file. Only groups whose paths match the + pattern are loaded, along with their ancestors so the resulting + tree stays connected. Matching follows + :py:meth:`pathlib.PurePath.match` semantics: the pattern is + anchored on the right, so ``group_filter="*/leaf_0"`` matches + any group whose path ends in ``/leaf_0`` at any depth. + Group names that contain literal glob metacharacters can be + targeted with character-class escapes: ``[*]`` matches a literal + ``*``, ``[?]`` a literal ``?``, and ``[[]`` a literal ``[``. For + example, ``group_filter="group_[*]_01"`` matches a group + literally named ``group_*_01``. Mutually exclusive with + ``'group'``. - 'lock': resource lock to use when reading data from disk. Only relevant when using dask or another form of parallelism. 
By default, appropriate locks are chosen to safely read and write files with the diff --git a/xarray/backends/common.py b/xarray/backends/common.py index f2580ea2a43..8c9664b8262 100644 --- a/xarray/backends/common.py +++ b/xarray/backends/common.py @@ -249,6 +249,41 @@ def _iter_nc_groups(root, parent="/"): yield from _iter_nc_groups(group, parent=gpath) +def _check_group_filter_mutex(group: str | None, group_filter: str | None) -> None: + """Validate ``group`` / ``group_filter`` are not both set, and ``group_filter`` + is non-empty when provided. + """ + if group is not None and group_filter is not None: + raise ValueError( + "group and group_filter are mutually exclusive: " + "group selects an exact group path while group_filter " + "is a glob pattern over all group paths." + ) + if group_filter == "": + raise ValueError("group_filter must be a non-empty glob pattern") + + +def _filter_group_paths(group_paths: Sequence[str], pattern: str) -> list[str]: + """Return the subset of ``group_paths`` whose paths match ``pattern``, + plus every ancestor of a match (so the resulting tree stays + connected). The root path ``"/"`` is always included. + + ``pattern`` is matched with :py:meth:`pathlib.PurePath.match` semantics, + so it is anchored on the right: ``"*/leaf_0"`` matches a group whose + path ends in any single segment followed by ``leaf_0`` at any depth. 
+ """ + from xarray.core.treenode import NodePath + + matched: set[str] = {"/"} + for path in group_paths: + np_ = NodePath(path) + if np_.match(pattern): + matched.add(path) + matched.update(str(p) for p in np_.parents) + + return [p for p in group_paths if p in matched] + + def find_root_and_group(ds): """Find the root and group name of a netCDF4/h5netcdf dataset.""" hierarchy = () diff --git a/xarray/backends/h5netcdf_.py b/xarray/backends/h5netcdf_.py index 9b828c8e236..4c51b7c1d93 100644 --- a/xarray/backends/h5netcdf_.py +++ b/xarray/backends/h5netcdf_.py @@ -602,6 +602,7 @@ def open_datatree( decode_timedelta=None, format="NETCDF4", group: str | None = None, + group_filter: str | None = None, lock=None, invalid_netcdf=None, phony_dims=None, @@ -621,6 +622,7 @@ def open_datatree( decode_timedelta=decode_timedelta, format=format, group=group, + group_filter=group_filter, lock=lock, invalid_netcdf=invalid_netcdf, phony_dims=phony_dims, @@ -645,6 +647,7 @@ def open_groups_as_dict( decode_timedelta=None, format="NETCDF4", group: str | None = None, + group_filter: str | None = None, lock=None, invalid_netcdf=None, phony_dims=None, @@ -655,15 +658,22 @@ def open_groups_as_dict( open_kwargs: dict[str, Any] | None = None, **kwargs, ) -> dict[str, Dataset]: - from xarray.backends.common import _iter_nc_groups + from xarray.backends.common import ( + _check_group_filter_mutex, + _filter_group_paths, + _iter_nc_groups, + ) from xarray.core.treenode import NodePath from xarray.core.utils import close_on_error + _check_group_filter_mutex(group, group_filter) + # Keep this message for some versions # remove and set phony_dims="access" above emit_phony_dims_warning, phony_dims = _check_phony_dims(phony_dims) filename_or_obj = _normalize_filename_or_obj(filename_or_obj) + store = H5NetCDFStore.open( filename_or_obj, format=format, @@ -678,15 +688,18 @@ def open_groups_as_dict( open_kwargs=open_kwargs, ) - # Check for a group and make it a parent if it exists - if group: + if 
group is not None: parent = NodePath("/") / NodePath(group) else: parent = NodePath("/") manager = store._manager + group_paths = list(_iter_nc_groups(store.ds, parent=parent)) + if group_filter is not None: + group_paths = _filter_group_paths(group_paths, group_filter) + groups_dict = {} - for path_group in _iter_nc_groups(store.ds, parent=parent): + for path_group in group_paths: group_store = H5NetCDFStore(manager, group=path_group, **kwargs) store_entrypoint = StoreBackendEntrypoint() with close_on_error(group_store): @@ -701,7 +714,7 @@ def open_groups_as_dict( decode_timedelta=decode_timedelta, ) - if group: + if group is not None: group_name = str(NodePath(path_group).relative_to(parent)) else: group_name = str(NodePath(path_group)) diff --git a/xarray/backends/netCDF4_.py b/xarray/backends/netCDF4_.py index 39dedd139c0..6eab3814058 100644 --- a/xarray/backends/netCDF4_.py +++ b/xarray/backends/netCDF4_.py @@ -807,6 +807,7 @@ def open_datatree( use_cftime=None, decode_timedelta=None, group: str | None = None, + group_filter: str | None = None, format="NETCDF4", clobber=True, diskless=False, @@ -826,6 +827,7 @@ def open_datatree( use_cftime=use_cftime, decode_timedelta=decode_timedelta, group=group, + group_filter=group_filter, format=format, clobber=clobber, diskless=diskless, @@ -850,6 +852,7 @@ def open_groups_as_dict( use_cftime=None, decode_timedelta=None, group: str | None = None, + group_filter: str | None = None, format="NETCDF4", clobber=True, diskless=False, @@ -859,10 +862,17 @@ def open_groups_as_dict( autoclose=False, **kwargs, ) -> dict[str, Dataset]: - from xarray.backends.common import _iter_nc_groups + from xarray.backends.common import ( + _check_group_filter_mutex, + _filter_group_paths, + _iter_nc_groups, + ) from xarray.core.treenode import NodePath + _check_group_filter_mutex(group, group_filter) + filename_or_obj = _normalize_path(filename_or_obj) + store = NetCDF4DataStore.open( filename_or_obj, group=group, @@ -875,15 +885,18 @@ def 
open_groups_as_dict( autoclose=autoclose, ) - # Check for a group and make it a parent if it exists - if group: + if group is not None: parent = NodePath("/") / NodePath(group) else: parent = NodePath("/") manager = store._manager + group_paths = list(_iter_nc_groups(store.ds, parent=parent)) + if group_filter is not None: + group_paths = _filter_group_paths(group_paths, group_filter) + groups_dict = {} - for path_group in _iter_nc_groups(store.ds, parent=parent): + for path_group in group_paths: group_store = NetCDF4DataStore(manager, group=path_group, **kwargs) store_entrypoint = StoreBackendEntrypoint() with close_on_error(group_store): @@ -897,7 +910,7 @@ def open_groups_as_dict( use_cftime=use_cftime, decode_timedelta=decode_timedelta, ) - if group: + if group is not None: group_name = str(NodePath(path_group).relative_to(parent)) else: group_name = str(NodePath(path_group)) diff --git a/xarray/backends/zarr.py b/xarray/backends/zarr.py index 6f3e1ad4eb4..d4d7ef2aa5c 100644 --- a/xarray/backends/zarr.py +++ b/xarray/backends/zarr.py @@ -680,6 +680,7 @@ def open_store( mode: ZarrWriteModes = "r", synchronizer=None, group=None, + group_filter: str | None = None, consolidated=False, consolidate_on_close=False, chunk_store=None, @@ -715,8 +716,15 @@ def open_store( from zarr import Group + from xarray.backends.common import _filter_group_paths + group_members: dict[str, Group] = {} group_paths = list(_iter_zarr_groups(zarr_group, parent=group)) + # Filter before materializing child Group objects: each + # ``zarr_group[rel_path]`` lookup triggers metadata I/O, so + # pruning paths up-front skips the cost for groups we'd discard. 
+ if group_filter is not None: + group_paths = _filter_group_paths(group_paths, group_filter) for path in group_paths: if path == group: group_members[path] = zarr_group @@ -1779,6 +1787,7 @@ def open_datatree( use_cftime=None, decode_timedelta=None, group: str | None = None, + group_filter: str | None = None, mode="r", synchronizer=None, consolidated=None, @@ -1798,6 +1807,7 @@ def open_datatree( use_cftime=use_cftime, decode_timedelta=decode_timedelta, group=group, + group_filter=group_filter, mode=mode, synchronizer=synchronizer, consolidated=consolidated, @@ -1821,6 +1831,7 @@ def open_groups_as_dict( use_cftime=None, decode_timedelta=None, group: str | None = None, + group_filter: str | None = None, mode="r", synchronizer=None, consolidated=None, @@ -1829,10 +1840,13 @@ def open_groups_as_dict( zarr_version=None, zarr_format=None, ) -> dict[str, Dataset]: + from xarray.backends.common import _check_group_filter_mutex + + _check_group_filter_mutex(group, group_filter) + filename_or_obj = _normalize_path(filename_or_obj) - # Check for a group and make it a parent if it exists - if group: + if group is not None: parent = str(NodePath("/") / NodePath(group)) else: parent = str(NodePath("/")) @@ -1840,6 +1854,7 @@ def open_groups_as_dict( stores = ZarrStore.open_store( filename_or_obj, group=parent, + group_filter=group_filter, mode=mode, synchronizer=synchronizer, consolidated=consolidated, @@ -1850,8 +1865,11 @@ def open_groups_as_dict( zarr_format=zarr_format, ) + group_paths = list(stores.keys()) + groups_dict = {} - for path_group, store in stores.items(): + for path_group in group_paths: + store = stores[path_group] store_entrypoint = StoreBackendEntrypoint() with close_on_error(store): @@ -1865,7 +1883,7 @@ def open_groups_as_dict( use_cftime=use_cftime, decode_timedelta=decode_timedelta, ) - if group: + if group is not None: group_name = str(NodePath(path_group).relative_to(parent)) else: group_name = str(NodePath(path_group)) diff --git 
a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 32f224e89a6..d773d956bbf 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -377,6 +377,191 @@ def test_open_datatree_specific_group(self, tmpdir, simple_datatree) -> None: assert subgroup_tree.root.parent is None assert_equal(subgroup_tree, expected_subtree) + def test_open_datatree_group_filter(self, tmpdir) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/leaf_0": xr.Dataset({"data": ("x", [1, 2])}), + "/A/leaf_1": xr.Dataset({"data": ("x", [3, 4])}), + "/B": xr.Dataset({"b_var": 3}), + "/B/leaf_0": xr.Dataset({"data": ("x", [5, 6])}), + } + ) + filepath = tmpdir / "filter.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree( + filepath, group_filter="*/leaf_0", engine=self.engine + ) as tree: + paths = {node.path for node in tree.subtree} + assert paths == {"/", "/A", "/A/leaf_0", "/B", "/B/leaf_0"} + + def test_open_datatree_group_filter_no_match(self, tmpdir) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + } + ) + filepath = tmpdir / "filter_nomatch.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree( + filepath, group_filter="*/nonexistent", engine=self.engine + ) as tree: + paths = {node.path for node in tree.subtree} + assert paths == {"/"} + + def test_open_datatree_group_filter_preserves_data(self, tmpdir) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/leaf_0": xr.Dataset({"data": ("x", [1, 2])}), + } + ) + filepath = tmpdir / "filter_data.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree( + filepath, group_filter="*/leaf_0", engine=self.engine + ) as tree: + assert tree["/A"].dataset["a_var"].item() == 2 + 
np.testing.assert_array_equal( + tree["/A/leaf_0"].dataset["data"].values, [1, 2] + ) + + def test_open_groups_group_filter(self, tmpdir) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/leaf_0": xr.Dataset({"data": ("x", [1, 2])}), + "/A/leaf_1": xr.Dataset({"data": ("x", [3, 4])}), + } + ) + filepath = tmpdir / "filter_groups.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + groups = open_groups(filepath, group_filter="*/leaf_0", engine=self.engine) + try: + assert set(groups.keys()) == {"/", "/A", "/A/leaf_0"} + finally: + for ds in groups.values(): + ds.close() + + def test_open_datatree_group_filter_char_class_escape(self, tmpdir) -> None: + # Group names that literally contain glob metacharacters (*, ?, [) + # are reachable via character-class escaping ([*], [?], [[]), + # mirroring fnmatch / PurePath.match semantics. + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/group_*_01": xr.Dataset({"data": ("x", [1, 2])}), + "/group_*_02": xr.Dataset({"data": ("x", [3, 4])}), + "/group_?_01": xr.Dataset({"data": ("x", [5, 6])}), + "/plain_01": xr.Dataset({"data": ("x", [7, 8])}), + } + ) + filepath = tmpdir / "filter_escape.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree( + filepath, group_filter="group_[*]_01", engine=self.engine + ) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_*_01" in paths + assert "/group_*_02" not in paths + assert "/group_?_01" not in paths + + with open_datatree( + filepath, group_filter="group_[*]_*", engine=self.engine + ) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_*_01" in paths + assert "/group_*_02" in paths + assert "/group_?_01" not in paths + assert "/plain_01" not in paths + + with open_datatree( + filepath, group_filter="group_[?]_01", engine=self.engine + ) as tree: + paths = {node.path for node in tree.subtree} + assert 
"/group_?_01" in paths + assert "/group_*_01" not in paths + + def test_open_datatree_group_with_literal_metachar(self, tmpdir) -> None: + # With explicit ``group=``, the value is an exact path — glob + # metacharacters in the name are taken literally, not as a + # pattern. Confirms the API ambiguity Stephan flagged is gone. + # + # Sibling groups ``/weird_X_name`` and ``/weird_Y_name`` would + # both match a glob ``weird_*_name``, so the data assertion on + # the ``[1, 2]`` payload distinguishes literal-path lookup from + # accidental glob behavior. + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/weird_*_name": xr.Dataset({"data": ("x", [1, 2])}), + "/weird_X_name": xr.Dataset({"data": ("x", [3, 4])}), + "/weird_Y_name": xr.Dataset({"data": ("x", [5, 6])}), + } + ) + filepath = tmpdir / "literal.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree(filepath, group="weird_*_name", engine=self.engine) as tree: + paths = {node.path for node in tree.subtree} + assert paths == {"/"} + np.testing.assert_array_equal(tree.dataset["data"].values, [1, 2]) + + def test_open_datatree_group_and_group_filter_mutually_exclusive( + self, tmpdir + ) -> None: + original_dt = DataTree.from_dict( + {"/": xr.Dataset({"root_var": 1}), "/A": xr.Dataset({"a_var": 2})} + ) + filepath = tmpdir / "mutex.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with pytest.raises(ValueError, match="mutually exclusive"): + open_datatree(filepath, group="A", group_filter="*/A", engine=self.engine) + with pytest.raises(ValueError, match="mutually exclusive"): + open_groups(filepath, group="A", group_filter="*/A", engine=self.engine) + + def test_open_datatree_group_filter_match_is_right_anchored(self, tmpdir) -> None: + # ``NodePath.match`` is anchored on the right, so ``*/leaf_0`` + # matches a ``leaf_0`` at any depth as long as the parent + # segment matches ``*``. This pins the documented semantics. 
+ original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/x": xr.Dataset({"v": 0}), + "/x/leaf_0": xr.Dataset({"data": ("x", [1, 2])}), + "/x/y": xr.Dataset({"v": 0}), + "/x/y/leaf_0": xr.Dataset({"data": ("x", [3, 4])}), + "/x/y/z": xr.Dataset({"v": 0}), + "/x/y/z/leaf_0": xr.Dataset({"data": ("x", [5, 6])}), + } + ) + filepath = tmpdir / "anchor.nc" + original_dt.to_netcdf(filepath, engine=self.engine) + + with open_datatree( + filepath, group_filter="*/leaf_0", engine=self.engine + ) as tree: + paths = {node.path for node in tree.subtree} + assert paths == { + "/", + "/x", + "/x/leaf_0", + "/x/y", + "/x/y/leaf_0", + "/x/y/z", + "/x/y/z/leaf_0", + } + @requires_h5netcdf_or_netCDF4 class TestGenericNetCDFIO(NetCDFIOBase): @@ -1025,6 +1210,170 @@ def test_open_datatree_specific_group( assert subgroup_tree.root.parent is None assert_equal(subgroup_tree, expected_subtree) + def test_open_datatree_group_filter(self, tmpdir, zarr_format) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/leaf_0": xr.Dataset({"data": ("x", [1, 2])}), + "/A/leaf_1": xr.Dataset({"data": ("x", [3, 4])}), + "/B": xr.Dataset({"b_var": 3}), + "/B/leaf_0": xr.Dataset({"data": ("x", [5, 6])}), + } + ) + filepath = str(tmpdir / "filter.zarr") + original_dt.to_zarr(filepath, zarr_format=zarr_format) + + with open_datatree( + filepath, group_filter="*/leaf_0", engine=self.engine + ) as tree: + paths = {node.path for node in tree.subtree} + assert paths == {"/", "/A", "/A/leaf_0", "/B", "/B/leaf_0"} + + def test_open_datatree_group_filter_no_match(self, tmpdir, zarr_format) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + } + ) + filepath = str(tmpdir / "filter_nomatch.zarr") + original_dt.to_zarr(filepath, zarr_format=zarr_format) + + with open_datatree( + filepath, group_filter="*/nonexistent", engine=self.engine + ) as tree: + 
paths = {node.path for node in tree.subtree} + assert paths == {"/"} + + def test_open_groups_group_filter(self, tmpdir, zarr_format) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/leaf_0": xr.Dataset({"data": ("x", [1, 2])}), + "/A/leaf_1": xr.Dataset({"data": ("x", [3, 4])}), + } + ) + filepath = str(tmpdir / "filter_groups.zarr") + original_dt.to_zarr(filepath, zarr_format=zarr_format) + + groups = open_groups(filepath, group_filter="*/leaf_0", engine=self.engine) + try: + assert set(groups.keys()) == {"/", "/A", "/A/leaf_0"} + finally: + for ds in groups.values(): + ds.close() + + def test_open_datatree_group_filter_char_class_escape(self, zarr_format) -> None: + # In-memory store: Windows disallows "*" and "?" in directory names. + from zarr.storage import MemoryStore + + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/group_*_01": xr.Dataset({"data": ("x", [1, 2])}), + "/group_*_02": xr.Dataset({"data": ("x", [3, 4])}), + "/group_?_01": xr.Dataset({"data": ("x", [5, 6])}), + "/plain_01": xr.Dataset({"data": ("x", [7, 8])}), + } + ) + store = MemoryStore() + original_dt.to_zarr(store, zarr_format=zarr_format) + + with open_datatree( + store, # type: ignore[arg-type] + group_filter="group_[*]_01", + engine=self.engine, + zarr_format=zarr_format, + ) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_*_01" in paths + assert "/group_*_02" not in paths + assert "/group_?_01" not in paths + + with open_datatree( + store, # type: ignore[arg-type] + group_filter="group_[*]_*", + engine=self.engine, + zarr_format=zarr_format, + ) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_*_01" in paths + assert "/group_*_02" in paths + assert "/group_?_01" not in paths + assert "/plain_01" not in paths + + with open_datatree( + store, # type: ignore[arg-type] + group_filter="group_[?]_01", + engine=self.engine, + 
zarr_format=zarr_format, + ) as tree: + paths = {node.path for node in tree.subtree} + assert "/group_?_01" in paths + assert "/group_*_01" not in paths + + def test_open_datatree_group_filter_preserves_data( + self, tmpdir, zarr_format + ) -> None: + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/A": xr.Dataset({"a_var": 2}), + "/A/leaf_0": xr.Dataset({"data": ("x", [1, 2])}), + } + ) + filepath = str(tmpdir / "filter_data.zarr") + original_dt.to_zarr(filepath, zarr_format=zarr_format) + + with open_datatree( + filepath, group_filter="*/leaf_0", engine=self.engine + ) as tree: + assert tree["/A"].dataset["a_var"].item() == 2 + np.testing.assert_array_equal( + tree["/A/leaf_0"].dataset["data"].values, [1, 2] + ) + + def test_open_datatree_group_with_literal_metachar(self, zarr_format) -> None: + # MemoryStore: Windows disallows ``*``/``?`` in directory names. + from zarr.storage import MemoryStore + + original_dt = DataTree.from_dict( + { + "/": xr.Dataset({"root_var": 1}), + "/weird_*_name": xr.Dataset({"data": ("x", [1, 2])}), + "/weird_X_name": xr.Dataset({"data": ("x", [3, 4])}), + "/weird_Y_name": xr.Dataset({"data": ("x", [5, 6])}), + } + ) + store = MemoryStore() + original_dt.to_zarr(store, zarr_format=zarr_format) + + with open_datatree( + store, # type: ignore[arg-type] + group="weird_*_name", + engine=self.engine, + zarr_format=zarr_format, + ) as tree: + paths = {node.path for node in tree.subtree} + assert paths == {"/"} + np.testing.assert_array_equal(tree.dataset["data"].values, [1, 2]) + + def test_open_datatree_group_and_group_filter_mutually_exclusive( + self, tmpdir, zarr_format + ) -> None: + original_dt = DataTree.from_dict( + {"/": xr.Dataset({"root_var": 1}), "/A": xr.Dataset({"a_var": 2})} + ) + filepath = str(tmpdir / "mutex.zarr") + original_dt.to_zarr(filepath, zarr_format=zarr_format) + + with pytest.raises(ValueError, match="mutually exclusive"): + open_datatree(filepath, group="A", group_filter="*/A", 
engine=self.engine) + with pytest.raises(ValueError, match="mutually exclusive"): + open_groups(filepath, group="A", group_filter="*/A", engine=self.engine) + @requires_dask def test_open_groups_chunks(self, tmpdir, zarr_format) -> None: """Test `open_groups` with chunks on a zarr store.""" @@ -1142,3 +1491,134 @@ def test_zarr_engine_recognised(self, tmpdir, zarr_format) -> None: with open_datatree(filepath) as roundtrip_dt: assert_identical(original_dt, roundtrip_dt) + + +class TestGroupFilterHelpers: + """Unit tests for the helpers in ``xarray/backends/common.py``.""" + + def test_filter_group_paths(self) -> None: + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/A/leaf_0", "/A/leaf_1", "/B", "/B/leaf_0"] + result = _filter_group_paths(paths, "*/leaf_0") + assert result == ["/", "/A", "/A/leaf_0", "/B", "/B/leaf_0"] + + def test_filter_group_paths_no_match(self) -> None: + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/B"] + result = _filter_group_paths(paths, "*/nonexistent") + assert result == ["/"] + + def test_filter_group_paths_question_mark(self) -> None: + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/B", "/AB"] + result = _filter_group_paths(paths, "?") + assert result == ["/", "/A", "/B"] + + def test_filter_group_paths_bracket(self) -> None: + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/A/leaf_0", "/A/leaf_1", "/A/leaf_2"] + result = _filter_group_paths(paths, "*/leaf_[01]") + assert result == ["/", "/A", "/A/leaf_0", "/A/leaf_1"] + + def test_filter_group_paths_literal_metachar_via_char_class(self) -> None: + from xarray.backends.common import _filter_group_paths + + # Group names that literally contain glob metacharacters are + # reachable via character-class escaping (inherited from + # fnmatch / PurePath.match semantics). 
+ paths = ["/", "/group_*_01", "/group_*_02", "/group_?_01", "/plain_01"] + + assert _filter_group_paths(paths, "group_[*]_01") == [ + "/", + "/group_*_01", + ] + assert _filter_group_paths(paths, "group_[*]_*") == [ + "/", + "/group_*_01", + "/group_*_02", + ] + assert _filter_group_paths(paths, "group_[?]_01") == [ + "/", + "/group_?_01", + ] + + @pytest.mark.parametrize( + "group, group_filter", + [ + (None, None), + ("A", None), + (None, "*/leaf_0"), + # Empty ``group`` is not None — pins the ``is not None`` check + # for ``group`` against a refactor to plain truthiness. + ("", None), + ], + ) + def test_check_group_filter_mutex_passes(self, group, group_filter) -> None: + from xarray.backends.common import _check_group_filter_mutex + + _check_group_filter_mutex(group, group_filter) # should not raise + + @pytest.mark.parametrize( + "group, group_filter", + [ + ("A", "*/leaf_0"), + # Empty strings still count as "set" — exclusivity is by + # ``is not None``, not truthiness. + ("", ""), + ("A", ""), + ("", "*/leaf_0"), + ], + ) + def test_check_group_filter_mutex_raises_when_both_set( + self, group, group_filter + ) -> None: + from xarray.backends.common import _check_group_filter_mutex + + with pytest.raises(ValueError, match="mutually exclusive"): + _check_group_filter_mutex(group, group_filter) + + def test_check_group_filter_mutex_rejects_empty_pattern(self) -> None: + from xarray.backends.common import _check_group_filter_mutex + + with pytest.raises(ValueError, match="non-empty"): + _check_group_filter_mutex(None, "") + + def test_filter_group_paths_match_is_right_anchored(self) -> None: + # ``NodePath.match`` is anchored on the right, so ``*/leaf_0`` + # matches a ``leaf_0`` at any depth as long as the parent + # segment matches ``*``. (The root-level ``/leaf_0`` is omitted + # because :py:meth:`pathlib.PurePath.match` semantics for that + # corner case differ between Python 3.11 and 3.13.) 
+ from xarray.backends.common import _filter_group_paths + + paths = ["/", "/a", "/a/leaf_0", "/a/b", "/a/b/leaf_0"] + assert _filter_group_paths(paths, "*/leaf_0") == [ + "/", + "/a", + "/a/leaf_0", + "/a/b", + "/a/b/leaf_0", + ] + + def test_filter_group_paths_leading_slash_pattern(self) -> None: + # Patterns starting with ``/`` are fully anchored — they match + # the exact absolute path and nothing else. + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/A/leaf_0", "/B", "/B/leaf_0"] + assert _filter_group_paths(paths, "/A/leaf_0") == [ + "/", + "/A", + "/A/leaf_0", + ] + + def test_filter_group_paths_recursive_glob(self) -> None: + # ``**`` selects every path (pinning ``NodePath.match`` behavior). + from xarray.backends.common import _filter_group_paths + + paths = ["/", "/A", "/A/leaf_0", "/B/leaf_1"] + assert _filter_group_paths(paths, "**") == paths