From 6f078ddf017beb12863ebbe7dec6c08f6776b89f Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 1 Apr 2026 21:08:24 +0200 Subject: [PATCH 01/68] Initial index implementation --- bench/ndarray/index_query_bench.py | 93 +++++ src/blosc2/indexing.py | 580 +++++++++++++++++++++++++++++ src/blosc2/lazyexpr.py | 21 ++ src/blosc2/ndarray.py | 65 +++- tests/ndarray/test_indexing.py | 82 ++++ 5 files changed, 838 insertions(+), 3 deletions(-) create mode 100644 bench/ndarray/index_query_bench.py create mode 100644 src/blosc2/indexing.py create mode 100644 tests/ndarray/test_indexing.py diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py new file mode 100644 index 00000000..9f559862 --- /dev/null +++ b/bench/ndarray/index_query_bench.py @@ -0,0 +1,93 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import statistics +import time + +import numpy as np + +import blosc2 + + +SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) +CHUNK_LEN = 100_000 +BLOCK_LEN = 20_000 +REPEATS = 5 + + +def build_array(size: int) -> blosc2.NDArray: + dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.empty(size, dtype=dtype) + data["id"] = np.arange(size, dtype=np.int64) + data["payload"] = (np.arange(size, dtype=np.float32) % 1024) / 1024 + return blosc2.asarray(data, chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) + + +def benchmark_once(expr, *, use_index: bool) -> tuple[float, int]: + start = time.perf_counter() + result = expr.compute(_use_index=use_index)[:] + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def benchmark_size(size: int) -> dict: + arr = build_array(size) + lo = size // 2 + width = max(10_000, size // 1_000) + hi = min(size, lo + 
width) + + build_start = time.perf_counter() + arr.create_index(field="id") + build_time = time.perf_counter() - build_start + + expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields).where(arr) + explanation = expr.explain() + + warm_scan, scan_len = benchmark_once(expr, use_index=False) + warm_index, index_len = benchmark_once(expr, use_index=True) + assert scan_len == index_len + del warm_scan, warm_index + + scan_runs = [benchmark_once(expr, use_index=False)[0] for _ in range(REPEATS)] + index_runs = [benchmark_once(expr, use_index=True)[0] for _ in range(REPEATS)] + + return { + "size": size, + "query_rows": index_len, + "build_s": build_time, + "scan_ms": statistics.median(scan_runs) * 1_000, + "index_ms": statistics.median(index_runs) * 1_000, + "speedup": statistics.median(scan_runs) / statistics.median(index_runs), + "candidate_chunks": explanation["candidate_chunks"], + "total_chunks": explanation["total_chunks"], + } + + +def main() -> None: + print("Structured range-query benchmark with chunk zone-map indexes") + print(f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={REPEATS}") + print( + "size,query_rows,build_s,scan_ms,index_ms,speedup,candidate_chunks,total_chunks" + ) + for size in SIZES: + result = benchmark_size(size) + print( + f"{result['size']}," + f"{result['query_rows']}," + f"{result['build_s']:.4f}," + f"{result['scan_ms']:.3f}," + f"{result['index_ms']:.3f}," + f"{result['speedup']:.2f}," + f"{result['candidate_chunks']}," + f"{result['total_chunks']}" + ) + + +if __name__ == "__main__": + main() diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py new file mode 100644 index 00000000..a7075313 --- /dev/null +++ b/src/blosc2/indexing.py @@ -0,0 +1,580 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import ast +import math +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + +import blosc2 + +INDEXES_VLMETA_KEY = "blosc2_indexes" +INDEX_FORMAT_VERSION = 1 +INDEX_KIND_ZONE_MAP = "zone-map" + +FLAG_ALL_NAN = np.uint8(1 << 0) +FLAG_HAS_NAN = np.uint8(1 << 1) + +_IN_MEMORY_INDEXES: dict[int, dict] = {} +_SUMMARY_CACHE: dict[tuple[int, str | None], np.ndarray] = {} + + +@dataclass(slots=True) +class IndexPlan: + usable: bool + reason: str + candidate_chunks: np.ndarray | None = None + descriptor: dict | None = None + field: str | None = None + total_chunks: int = 0 + selected_chunks: int = 0 + + +@dataclass(slots=True) +class PredicatePlan: + base: blosc2.NDArray + candidate_chunks: np.ndarray + descriptor: dict + field: str | None + + +def _default_index_store() -> dict: + return {"version": INDEX_FORMAT_VERSION, "indexes": {}} + + +def _array_key(array: blosc2.NDArray) -> int: + return id(array) + + +def _field_token(field: str | None) -> str: + return "__self__" if field is None else field + + +def _copy_descriptor(descriptor: dict) -> dict: + summary = descriptor.get("summary") + descriptor = descriptor.copy() + if summary is not None: + descriptor["summary"] = summary.copy() + return descriptor + + +def _is_persistent_array(array: blosc2.NDArray) -> bool: + return array.urlpath is not None + + +def _load_store(array: blosc2.NDArray) -> dict: + if _is_persistent_array(array): + try: + store = array.schunk.vlmeta[INDEXES_VLMETA_KEY] + except KeyError: + return _default_index_store() + if not isinstance(store, dict): + return _default_index_store() + store.setdefault("version", INDEX_FORMAT_VERSION) + store.setdefault("indexes", {}) + return store + return _IN_MEMORY_INDEXES.get(_array_key(array), _default_index_store()) + + +def _save_store(array: blosc2.NDArray, store: dict) 
-> None: + store.setdefault("version", INDEX_FORMAT_VERSION) + store.setdefault("indexes", {}) + if _is_persistent_array(array): + array.schunk.vlmeta[INDEXES_VLMETA_KEY] = store + else: + _IN_MEMORY_INDEXES[_array_key(array)] = store + + +def _supported_index_dtype(dtype: np.dtype) -> bool: + dtype = np.dtype(dtype) + return dtype.kind in {"b", "i", "u", "f", "m", "M"} + + +def _field_dtype(array: blosc2.NDArray, field: str | None) -> np.dtype: + if field is None: + return np.dtype(array.dtype) + if array.dtype.fields is None: + raise TypeError("field indexes require a structured dtype") + if field not in array.dtype.fields: + raise ValueError(f"field {field!r} is not present in the dtype") + return np.dtype(array.dtype.fields[field][0]) + + +def _validate_index_target(array: blosc2.NDArray, field: str | None) -> np.dtype: + if not isinstance(array, blosc2.NDArray): + raise TypeError("indexes are only supported on NDArray") + if array.ndim != 1: + raise ValueError("indexes are only supported on 1-D NDArray objects") + dtype = _field_dtype(array, field) + if not _supported_index_dtype(dtype): + raise TypeError(f"dtype {dtype} is not supported by the current index engine") + return dtype + + +def _sanitize_sidecar_root(urlpath: str | Path) -> tuple[Path, str]: + path = Path(urlpath) + suffix = "".join(path.suffixes) + if suffix: + root = path.name[: -len(suffix)] + else: + root = path.name + return path, root + + +def _summary_sidecar_path(array: blosc2.NDArray, field: str | None, kind: str) -> str: + path, root = _sanitize_sidecar_root(array.urlpath) + token = _field_token(field) + return str(path.with_name(f"{root}.__index__.{token}.{kind}.b2nd")) + + +def _summary_cache_key(array: blosc2.NDArray, field: str | None) -> tuple[int, str | None]: + return (_array_key(array), field) + + +def _compute_chunk_summaries(array: blosc2.NDArray, field: str | None, dtype: np.dtype) -> np.ndarray: + chunk_len = array.chunks[0] + nchunks = math.ceil(array.shape[0] / chunk_len) + 
summary_dtype = np.dtype([("min", dtype), ("max", dtype), ("flags", np.uint8)]) + summaries = np.empty(nchunks, dtype=summary_dtype) + + for nchunk in range(nchunks): + start = nchunk * chunk_len + stop = min(start + chunk_len, array.shape[0]) + chunk = array[start:stop] + if field is not None: + chunk = chunk[field] + flags = np.uint8(0) + if dtype.kind == "f": + valid = ~np.isnan(chunk) + if not np.all(valid): + flags |= FLAG_HAS_NAN + if not np.any(valid): + flags |= FLAG_ALL_NAN + value = np.zeros((), dtype=dtype)[()] + summaries[nchunk] = (value, value, flags) + continue + chunk = chunk[valid] + summaries[nchunk] = (chunk.min(), chunk.max(), flags) + return summaries + + +def _store_summary_data( + array: blosc2.NDArray, + field: str | None, + summaries: np.ndarray, + persistent: bool, + kind: str, +) -> dict: + if persistent: + summary_path = _summary_sidecar_path(array, field, kind) + blosc2.remove_urlpath(summary_path) + blosc2.asarray(summaries, urlpath=summary_path, mode="w") + _SUMMARY_CACHE[_summary_cache_key(array, field)] = summaries + return {"path": summary_path, "dtype": summaries.dtype.descr} + _SUMMARY_CACHE[_summary_cache_key(array, field)] = summaries + return {"path": None, "dtype": summaries.dtype.descr} + + +def _clear_summary_cache(array: blosc2.NDArray, field: str | None) -> None: + _SUMMARY_CACHE.pop(_summary_cache_key(array, field), None) + + +def _get_summary_data(array: blosc2.NDArray, descriptor: dict) -> np.ndarray: + field = descriptor["field"] + cache_key = _summary_cache_key(array, field) + cached = _SUMMARY_CACHE.get(cache_key) + if cached is not None: + return cached + summary_path = descriptor["summary"]["path"] + if summary_path is None: + raise RuntimeError("in-memory index metadata is missing from the current process") + summaries = blosc2.open(summary_path)[:] + _SUMMARY_CACHE[cache_key] = summaries + return summaries + + +def _build_descriptor( + array: blosc2.NDArray, + field: str | None, + dtype: np.dtype, + kind: str, + 
optlevel: int, + granularity: str, + persistent: bool, + name: str | None, + summary: dict, +) -> dict: + return { + "name": name or _field_token(field), + "field": field, + "kind": INDEX_KIND_ZONE_MAP, + "requested_kind": kind, + "version": INDEX_FORMAT_VERSION, + "optlevel": optlevel, + "granularity": granularity, + "persistent": persistent, + "stale": False, + "dtype": np.dtype(dtype).str, + "shape": tuple(array.shape), + "chunks": tuple(array.chunks), + "nchunks": math.ceil(array.shape[0] / array.chunks[0]), + "summary": summary, + } + + +def create_index( + array: blosc2.NDArray, + field: str | None = None, + kind: str = "light", + optlevel: int = 3, + granularity: str = "chunk", + persistent: bool | None = None, + name: str | None = None, + **kwargs, +) -> dict: + del kwargs + dtype = _validate_index_target(array, field) + if kind not in {"ultralight", "light", "medium"}: + raise NotImplementedError("only zone-map style indexes are implemented for now") + if granularity != "chunk": + raise NotImplementedError("only chunk-granularity indexes are implemented for now") + if persistent is None: + persistent = _is_persistent_array(array) + + summaries = _compute_chunk_summaries(array, field, dtype) + summary = _store_summary_data(array, field, summaries, persistent, kind) + descriptor = _build_descriptor( + array, field, dtype, kind, optlevel, granularity, persistent, name, summary + ) + + store = _load_store(array) + store["indexes"][_field_token(field)] = descriptor + _save_store(array, store) + return _copy_descriptor(descriptor) + + +def create_csindex(array: blosc2.NDArray, field: str | None = None, **kwargs) -> dict: + del array, field, kwargs + raise NotImplementedError("full permutation indexes are not implemented yet") + + +def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> None: + store = _load_store(array) + token = _field_token(field) if field is not None or name is None else None + if token is None: + for key, 
descriptor in store["indexes"].items(): + if descriptor.get("name") == name: + token = key + break + if token is None or token not in store["indexes"]: + raise KeyError("index not found") + + descriptor = store["indexes"].pop(token) + _save_store(array, store) + _clear_summary_cache(array, descriptor["field"]) + summary_path = descriptor["summary"]["path"] + if summary_path: + blosc2.remove_urlpath(summary_path) + + +def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> dict: + store = _load_store(array) + token = _field_token(field) if field is not None or name is None else None + if token is None: + for key, descriptor in store["indexes"].items(): + if descriptor.get("name") == name: + token = key + field = descriptor["field"] + break + if token is None or token not in store["indexes"]: + raise KeyError("index not found") + descriptor = store["indexes"][token] + drop_index(array, field=descriptor["field"], name=descriptor["name"]) + return create_index( + array, + field=descriptor["field"], + kind=descriptor["requested_kind"], + optlevel=descriptor["optlevel"], + granularity=descriptor["granularity"], + persistent=descriptor["persistent"], + name=descriptor["name"], + ) + + +def get_indexes(array: blosc2.NDArray) -> list[dict]: + store = _load_store(array) + return [_copy_descriptor(store["indexes"][key]) for key in sorted(store["indexes"])] + + +def mark_indexes_stale(array: blosc2.NDArray) -> None: + store = _load_store(array) + if not store["indexes"]: + return + changed = False + for descriptor in store["indexes"].values(): + if not descriptor.get("stale", False): + descriptor["stale"] = True + changed = True + if changed: + _save_store(array, store) + + +def _descriptor_for(array: blosc2.NDArray, field: str | None) -> dict | None: + store = _load_store(array) + descriptor = store["indexes"].get(_field_token(field)) + if descriptor is None: + return None + if descriptor.get("stale", False): + return None + if 
tuple(descriptor.get("shape", ())) != tuple(array.shape): + return None + if tuple(descriptor.get("chunks", ())) != tuple(array.chunks): + return None + return descriptor + + +def _normalize_scalar(value, dtype: np.dtype): + if isinstance(value, np.generic): + return value.item() + if dtype.kind == "f" and isinstance(value, float) and np.isnan(value): + raise ValueError("NaN comparisons are not indexable") + arr = np.asarray(value, dtype=dtype) + return arr[()] + + +def _candidate_chunks_from_summary(summaries: np.ndarray, op: str, value, dtype: np.dtype) -> np.ndarray: + mins = summaries["min"] + maxs = summaries["max"] + flags = summaries["flags"] + valid = (flags & FLAG_ALL_NAN) == 0 + value = _normalize_scalar(value, dtype) + if op == "==": + return valid & (mins <= value) & (value <= maxs) + if op == "<": + return valid & (mins < value) + if op == "<=": + return valid & (mins <= value) + if op == ">": + return valid & (maxs > value) + if op == ">=": + return valid & (maxs >= value) + raise ValueError(f"unsupported comparison operator {op!r}") + + +def _operand_target(operand) -> tuple[blosc2.NDArray, str | None] | None: + if isinstance(operand, blosc2.NDField): + return operand.ndarr, operand.field + if isinstance(operand, blosc2.NDArray): + return operand, None + return None + + +def _literal_value(node: ast.AST): + if isinstance(node, ast.Constant): + return node.value + if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub): + value = _literal_value(node.operand) + if isinstance(value, bool): + raise ValueError("boolean negation is not a scalar literal here") + return -value + if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.UAdd): + return _literal_value(node.operand) + raise ValueError("node is not a supported scalar literal") + + +def _flip_operator(op: str) -> str: + return {"<": ">", "<=": ">=", ">": "<", ">=": "<=", "==": "=="}[op] + + +def _compare_operator(node: ast.AST) -> str | None: + if isinstance(node, ast.Eq): + return 
"==" + if isinstance(node, ast.Lt): + return "<" + if isinstance(node, ast.LtE): + return "<=" + if isinstance(node, ast.Gt): + return ">" + if isinstance(node, ast.GtE): + return ">=" + return None + + +def _plan_compare(node: ast.Compare, operands: dict) -> PredicatePlan | None: + if len(node.ops) != 1 or len(node.comparators) != 1: + return None + op = _compare_operator(node.ops[0]) + if op is None: + return None + + left_target = operands.get(node.left.id) if isinstance(node.left, ast.Name) else None + right_target = ( + operands.get(node.comparators[0].id) if isinstance(node.comparators[0], ast.Name) else None + ) + + try: + if left_target is not None: + value = _literal_value(node.comparators[0]) + target = _operand_target(left_target) + elif right_target is not None: + value = _literal_value(node.left) + target = _operand_target(right_target) + op = _flip_operator(op) + else: + return None + except ValueError: + return None + + if target is None: + return None + base, field = target + if base.ndim != 1: + return None + descriptor = _descriptor_for(base, field) + if descriptor is None: + return None + dtype = np.dtype(descriptor["dtype"]) + try: + summaries = _get_summary_data(base, descriptor) + mask = _candidate_chunks_from_summary(summaries, op, value, dtype) + except (RuntimeError, ValueError, TypeError): + return None + return PredicatePlan(base=base, candidate_chunks=mask, descriptor=descriptor, field=field) + + +def _same_target(left: PredicatePlan, right: PredicatePlan) -> bool: + return left.base is right.base and left.base.chunks == right.base.chunks + + +def _merge_plans(left: PredicatePlan, right: PredicatePlan, op: str) -> PredicatePlan | None: + if not _same_target(left, right): + return None + if op == "and": + candidate_chunks = left.candidate_chunks & right.candidate_chunks + else: + candidate_chunks = left.candidate_chunks | right.candidate_chunks + return PredicatePlan( + base=left.base, + candidate_chunks=candidate_chunks, + 
descriptor=left.descriptor, + field=left.field, + ) + + +def _plan_boolop(node: ast.BoolOp, operands: dict) -> PredicatePlan | None: + op = "and" if isinstance(node.op, ast.And) else "or" if isinstance(node.op, ast.Or) else None + if op is None: + return None + + plans = [_plan_node(value, operands) for value in node.values] + if op == "and": + plans = [plan for plan in plans if plan is not None] + if not plans: + return None + elif any(plan is None for plan in plans): + return None + + plan = plans[0] + for other in plans[1:]: + merged = _merge_plans(plan, other, op) + if merged is None: + return None + plan = merged + return plan + + +def _plan_bitop(node: ast.BinOp, operands: dict) -> PredicatePlan | None: + if isinstance(node.op, ast.BitAnd): + op = "and" + elif isinstance(node.op, ast.BitOr): + op = "or" + else: + return None + + left = _plan_node(node.left, operands) + right = _plan_node(node.right, operands) + if left is None: + return right if op == "and" else None + if right is None: + return left if op == "and" else None + return _merge_plans(left, right, op) + + +def _plan_node(node: ast.AST, operands: dict) -> PredicatePlan | None: + if isinstance(node, ast.Compare): + return _plan_compare(node, operands) + if isinstance(node, ast.BoolOp): + return _plan_boolop(node, operands) + if isinstance(node, ast.BinOp): + return _plan_bitop(node, operands) + return None + + +def plan_query( + expression: str, + operands: dict, + where: dict | None, + *, + use_index: bool = True, +) -> IndexPlan: + if not use_index: + return IndexPlan(False, "index usage disabled for this query") + if where is None or len(where) != 1: + return IndexPlan(False, "indexing is only available for where(x) style filtering") + + try: + tree = ast.parse(expression, mode="eval") + except SyntaxError: + return IndexPlan(False, "expression is not valid Python syntax for planning") + + plan = _plan_node(tree.body, operands) + if plan is None: + return IndexPlan(False, "no usable index was 
found for this predicate") + + total_chunks = len(plan.candidate_chunks) + selected_chunks = int(np.count_nonzero(plan.candidate_chunks)) + if selected_chunks == total_chunks: + return IndexPlan( + False, + "available index does not prune any chunks for this predicate", + candidate_chunks=plan.candidate_chunks, + descriptor=_copy_descriptor(plan.descriptor), + field=plan.field, + total_chunks=total_chunks, + selected_chunks=selected_chunks, + ) + return IndexPlan( + True, + "zone-map index selected", + candidate_chunks=plan.candidate_chunks, + descriptor=_copy_descriptor(plan.descriptor), + field=plan.field, + total_chunks=total_chunks, + selected_chunks=selected_chunks, + ) + + +def will_use_index(expr) -> bool: + where = getattr(expr, "_where_args", None) + return plan_query(expr.expression, expr.operands, where).usable + + +def explain_query(expr) -> dict: + where = getattr(expr, "_where_args", None) + plan = plan_query(expr.expression, expr.operands, where) + return { + "will_use_index": plan.usable, + "reason": plan.reason, + "field": plan.field, + "candidate_chunks": plan.selected_chunks, + "total_chunks": plan.total_chunks, + "descriptor": plan.descriptor, + } diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 63d656bc..45d6f889 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1758,6 +1758,7 @@ def slices_eval( # noqa: C901 ne_args = {} chunks = kwargs.get("chunks") where: dict | None = kwargs.pop("_where_args", None) + use_index = kwargs.pop("_use_index", True) _indices = kwargs.pop("_indices", False) if _indices and (not where or len(where) != 1): raise NotImplementedError("Indices can only be used with one where condition") @@ -1836,6 +1837,11 @@ def slices_eval( # noqa: C901 if 0 not in chunks else np.asarray(shape) ) + index_plan = None + if where is not None and len(where) == 1 and use_index: + from . 
import indexing + + index_plan = indexing.plan_query(expression, operands, where, use_index=use_index) for chunk_slice in intersecting_chunks: # Check whether current cslice intersects with _slice @@ -1851,6 +1857,10 @@ def slices_eval( # noqa: C901 offset = tuple(s.start for s in cslice) # offset for the udf cslice_shape = tuple(s.stop - s.start for s in cslice) len_chunk = math.prod(cslice_shape) + if index_plan is not None and index_plan.usable and not index_plan.candidate_chunks[nchunk]: + if _indices or _order: + leninputs += len_chunk + continue # get local index of part of out that is to be updated cslice_subidx = ( ndindex.ndindex(cslice).as_subindex(_slice).raw @@ -3687,6 +3697,16 @@ def sort(self, order: str | list[str] | None = None) -> blosc2.LazyArray: lazy_expr._order = order return lazy_expr + def will_use_index(self) -> bool: + from . import indexing + + return indexing.will_use_index(self) + + def explain(self) -> dict: + from . import indexing + + return indexing.explain_query(self) + def compute( self, item=(), @@ -3735,6 +3755,7 @@ def compute( "_indices", "_order", "_ne_args", + "_use_index", "dtype", "shape", "fp_accuracy", diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index c972e03a..257bb22d 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4409,14 +4409,22 @@ def __setitem__( _slice = ndindex.ndindex(()).expand(self.shape) # just get whole array else: # do nothing return self - return self._get_set_findex_default(_slice, value=value) + result = self._get_set_findex_default(_slice, value=value) + from . 
import indexing + + indexing.mark_indexes_stale(self) + return result start, stop, step, none_mask = get_ndarray_start_stop(self.ndim, key_, self.shape) if step != (1,) * self.ndim: # handle non-unit or negative steps if np.any(none_mask): raise ValueError("Cannot mix non-unit steps and None indexing for __setitem__.") - return self._get_set_nonunit_steps((start, stop, step, mask), value=value) + result = self._get_set_nonunit_steps((start, stop, step, mask), value=value) + from . import indexing + + indexing.mark_indexes_stale(self) + return result shape = [sp - st for sp, st in zip(stop, start, strict=False)] if isinstance(value, blosc2.Operand): # handles SimpleProxy, NDArray, LazyExpr etc. @@ -4431,7 +4439,11 @@ def __setitem__( # when using complex functions (e.g. conj) with real arrays value = value.real.astype(self.dtype) - return super().set_slice((start, stop), value) + result = super().set_slice((start, stop), value) + from . import indexing + + indexing.mark_indexes_stale(self) + return result def __iter__(self): """Iterate over the (outer) elements of the array. @@ -4705,6 +4717,50 @@ def save(self, urlpath: str, contiguous=True, **kwargs: Any) -> None: super().copy(self.dtype, cparams=asdict(self.cparams), **kwargs) + def create_index( + self, + field: str | None = None, + kind: str = "light", + optlevel: int = 3, + granularity: str = "chunk", + persistent: bool | None = None, + name: str | None = None, + **kwargs: Any, + ) -> dict: + from . import indexing + + return indexing.create_index( + self, + field=field, + kind=kind, + optlevel=optlevel, + granularity=granularity, + persistent=persistent, + name=name, + **kwargs, + ) + + def create_csindex(self, field: str | None = None, **kwargs: Any) -> dict: + from . import indexing + + return indexing.create_csindex(self, field=field, **kwargs) + + def drop_index(self, field: str | None = None, name: str | None = None) -> None: + from . 
import indexing + + indexing.drop_index(self, field=field, name=name) + + def rebuild_index(self, field: str | None = None, name: str | None = None) -> dict: + from . import indexing + + return indexing.rebuild_index(self, field=field, name=name) + + @property + def indexes(self) -> list[dict]: + from . import indexing + + return indexing.get_indexes(self) + def resize(self, newshape: tuple | list) -> None: """Change the shape of the array by growing or shrinking one or more dimensions. @@ -4745,6 +4801,9 @@ def resize(self, newshape: tuple | list) -> None: ) blosc2_ext.check_access_mode(self.schunk.urlpath, self.schunk.mode) super().resize(newshape) + from . import indexing + + indexing.mark_indexes_stale(self) def slice(self, key: int | slice | Sequence[slice], **kwargs: Any) -> NDArray: """Get a (multidimensional) slice as a new :ref:`NDArray`. diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py new file mode 100644 index 00000000..0c114fd1 --- /dev/null +++ b/tests/ndarray/test_indexing.py @@ -0,0 +1,82 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import numpy as np + +import blosc2 + + +def test_scalar_zone_map_index_matches_scan(): + data = np.arange(200_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(10_000,), blocks=(2_000,)) + descriptor = arr.create_index() + + assert descriptor["kind"] == "zone-map" + assert descriptor["field"] is None + assert len(arr.indexes) == 1 + + expr = ((arr >= 120_000) & (arr < 125_000)).where(arr) + assert expr.will_use_index() is True + explanation = expr.explain() + assert explanation["candidate_chunks"] < explanation["total_chunks"] + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data >= 120_000) & (data < 125_000)]) + + +def test_structured_field_index_matches_scan(): + dtype = np.dtype([("id", np.int64), ("payload", np.float64)]) + data = np.empty(120_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + data["payload"] = np.linspace(0, 1, data.shape[0], dtype=np.float64) + + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) + arr.create_index(field="id") + + expr = blosc2.lazyexpr("(id >= 48_000) & (id < 51_000)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data["id"] >= 48_000) & (data["id"] < 51_000)]) + + +def test_persistent_index_survives_reopen(tmp_path): + path = tmp_path / "indexed_array.b2nd" + data = np.arange(80_000, dtype=np.int64) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(8_000,), blocks=(2_000,)) + descriptor = arr.create_index() + + assert descriptor["summary"]["path"] is not None + + reopened = blosc2.open(path, mode="a") + assert len(reopened.indexes) == 1 + 
assert reopened.indexes[0]["summary"]["path"] == descriptor["summary"]["path"] + + expr = (reopened >= 72_000).where(reopened) + assert expr.will_use_index() is True + np.testing.assert_array_equal(expr.compute()[:], data[data >= 72_000]) + + +def test_mutation_marks_index_stale_and_rebuild_restores_it(): + data = np.arange(50_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(5_000,), blocks=(1_000,)) + arr.create_index() + + arr[:25] = -1 + assert arr.indexes[0]["stale"] is True + + expr = (arr < 0).where(arr) + assert expr.will_use_index() is False + np.testing.assert_array_equal(expr.compute()[:], np.full(25, -1, dtype=np.int64)) + + rebuilt = arr.rebuild_index() + assert rebuilt["stale"] is False + assert expr.will_use_index() is True From 37ec18c09ea5ec52ab525ed0ee4fdd55c7b8e9a4 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 1 Apr 2026 22:13:16 +0200 Subject: [PATCH 02/68] More bench, and moderate index improvements --- bench/ndarray/index_query_bench.py | 274 +++++++++++-- src/blosc2/indexing.py | 632 ++++++++++++++++++++++------- src/blosc2/lazyexpr.py | 14 +- tests/ndarray/test_indexing.py | 23 +- 4 files changed, 745 insertions(+), 198 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 9f559862..45b8c6a5 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -7,28 +7,67 @@ from __future__ import annotations +import argparse +import os import statistics +import tempfile import time +from pathlib import Path import numpy as np import blosc2 - SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) CHUNK_LEN = 100_000 BLOCK_LEN = 20_000 REPEATS = 5 +KINDS = ("ultralight", "light", "medium", "full") +DISTS = ("sorted", "block-shuffled", "random") +RNG_SEED = 0 + + +def fill_ids(ids: np.ndarray, dist: str, rng: np.random.Generator) -> None: + size = ids.shape[0] + if dist == "sorted": + ids[:] = np.arange(size, dtype=np.int64) + return + + if dist == 
"block-shuffled": + nblocks = (size + BLOCK_LEN - 1) // BLOCK_LEN + order = rng.permutation(nblocks) + dest = 0 + for src_block in order: + src_start = int(src_block) * BLOCK_LEN + src_stop = min(src_start + BLOCK_LEN, size) + block_size = src_stop - src_start + ids[dest : dest + block_size] = np.arange(src_start, src_stop, dtype=np.int64) + dest += block_size + return + if dist == "random": + ids[:] = np.arange(size, dtype=np.int64) + rng.shuffle(ids) + return -def build_array(size: int) -> blosc2.NDArray: + raise ValueError(f"unsupported distribution {dist!r}") + + +def make_source_data(size: int, dist: str) -> np.ndarray: dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) - data = np.empty(size, dtype=dtype) - data["id"] = np.arange(size, dtype=np.int64) - data["payload"] = (np.arange(size, dtype=np.float32) % 1024) / 1024 + data = np.zeros(size, dtype=dtype) + fill_ids(data["id"], dist, np.random.default_rng(RNG_SEED)) + return data + + +def build_array(data: np.ndarray) -> blosc2.NDArray: return blosc2.asarray(data, chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) +def build_persistent_array(data: np.ndarray, path: Path) -> blosc2.NDArray: + return blosc2.asarray(data, urlpath=path, mode="w", chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) + + def benchmark_once(expr, *, use_index: bool) -> tuple[float, int]: start = time.perf_counter() result = expr.compute(_use_index=use_index)[:] @@ -36,57 +75,204 @@ def benchmark_once(expr, *, use_index: bool) -> tuple[float, int]: return elapsed, len(result) -def benchmark_size(size: int) -> dict: - arr = build_array(size) +def index_sizes(descriptor: dict) -> tuple[int, int]: + logical = 0 + disk = 0 + for level_info in descriptor["levels"].values(): + dtype = np.dtype(level_info["dtype"]) + logical += dtype.itemsize * level_info["nsegments"] + if level_info["path"]: + disk += os.path.getsize(level_info["path"]) + + full = descriptor.get("full") + if full is not None: + values = blosc2.open(full["values_path"]) + positions 
= blosc2.open(full["positions_path"]) + logical += values.shape[0] * values.dtype.itemsize + logical += positions.shape[0] * positions.dtype.itemsize + disk += os.path.getsize(full["values_path"]) + disk += os.path.getsize(full["positions_path"]) + return logical, disk + + +def benchmark_size(size: int, size_dir: Path, dist: str) -> list[dict]: + data = make_source_data(size, dist) + arr = build_persistent_array(data, size_dir / f"size_{size}_{dist}.b2nd") + del data lo = size // 2 - width = max(10_000, size // 1_000) + width = 2_500 hi = min(size, lo + width) + expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields).where(arr) + base_bytes = size * arr.dtype.itemsize + compressed_base_bytes = os.path.getsize(arr.urlpath) - build_start = time.perf_counter() - arr.create_index(field="id") - build_time = time.perf_counter() - build_start + scan_ms = benchmark_once(expr, use_index=False)[0] * 1_000 - expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields).where(arr) - explanation = expr.explain() + rows = [] + for kind in KINDS: + if arr.indexes: + arr.drop_index(field="id") + build_start = time.perf_counter() + arr.create_index(field="id", kind=kind) + build_time = time.perf_counter() - build_start + explanation = expr.explain() + logical_index_bytes, disk_index_bytes = index_sizes(arr.indexes[0]) - warm_scan, scan_len = benchmark_once(expr, use_index=False) - warm_index, index_len = benchmark_once(expr, use_index=True) - assert scan_len == index_len - del warm_scan, warm_index + warm_index, index_len = benchmark_once(expr, use_index=True) + del warm_index + index_runs = [benchmark_once(expr, use_index=True)[0] for _ in range(REPEATS)] + index_ms = statistics.median(index_runs) * 1_000 - scan_runs = [benchmark_once(expr, use_index=False)[0] for _ in range(REPEATS)] - index_runs = [benchmark_once(expr, use_index=True)[0] for _ in range(REPEATS)] + rows.append( + { + "size": size, + "dist": dist, + "kind": kind, + "level": explanation["level"], + 
"query_rows": index_len, + "build_s": build_time, + "create_idx_ms": build_time * 1_000, + "scan_ms": scan_ms, + "index_ms": index_ms, + "speedup": scan_ms / index_ms, + "candidate_units": explanation["candidate_units"], + "total_units": explanation["total_units"], + "logical_index_bytes": logical_index_bytes, + "disk_index_bytes": disk_index_bytes, + "index_pct": logical_index_bytes / base_bytes * 100, + "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, + } + ) + return rows - return { - "size": size, - "query_rows": index_len, - "build_s": build_time, - "scan_ms": statistics.median(scan_runs) * 1_000, - "index_ms": statistics.median(index_runs) * 1_000, - "speedup": statistics.median(scan_runs) / statistics.median(index_runs), - "candidate_chunks": explanation["candidate_chunks"], - "total_chunks": explanation["total_chunks"], - } +def parse_human_size(value: str) -> int: + value = value.strip() + if not value: + raise argparse.ArgumentTypeError("size must not be empty") -def main() -> None: - print("Structured range-query benchmark with chunk zone-map indexes") - print(f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={REPEATS}") - print( - "size,query_rows,build_s,scan_ms,index_ms,speedup,candidate_chunks,total_chunks" + suffixes = {"k": 1_000, "m": 1_000_000, "g": 1_000_000_000} + suffix = value[-1].lower() + if suffix in suffixes: + number = value[:-1] + if not number: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") + try: + parsed = int(number) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + size = parsed * suffixes[suffix] + else: + try: + size = int(value) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + + if size <= 0: + raise argparse.ArgumentTypeError("size must be a positive integer") + return size + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Benchmark python-blosc2 
index kinds.") + parser.add_argument( + "--size", + type=parse_human_size, + help="Benchmark a single array size. Supports suffixes like 1k, 1K, 1M, 1G.", + ) + parser.add_argument( + "--dist", + choices=(*DISTS, "all"), + default="sorted", + help="Distribution for the indexed field. Use 'all' to benchmark every distribution.", ) - for size in SIZES: - result = benchmark_size(size) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + sizes = (args.size,) if args.size is not None else SIZES + dists = DISTS if args.dist == "all" else (args.dist,) + + with tempfile.TemporaryDirectory() as tmpdir: + size_dir = Path(tmpdir) + all_results = [] + print("Structured range-query benchmark across index kinds") + print(f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={REPEATS}, dist={args.dist}") print( - f"{result['size']}," - f"{result['query_rows']}," - f"{result['build_s']:.4f}," - f"{result['scan_ms']:.3f}," - f"{result['index_ms']:.3f}," - f"{result['speedup']:.2f}," - f"{result['candidate_chunks']}," - f"{result['total_chunks']}" + "size,dist,kind,level,query_rows,build_s,create_idx_ms,scan_ms,index_ms,speedup," + "candidate_units,total_units,logical_index_bytes,disk_index_bytes,index_pct,index_pct_disk" ) + for dist in dists: + for size in sizes: + size_results = benchmark_size(size, size_dir, dist) + all_results.extend(size_results) + for result in size_results: + print( + f"{result['size']}," + f"{result['dist']}," + f"{result['kind']}," + f"{result['level']}," + f"{result['query_rows']}," + f"{result['build_s']:.4f}," + f"{result['create_idx_ms']:.3f}," + f"{result['scan_ms']:.3f}," + f"{result['index_ms']:.3f}," + f"{result['speedup']:.2f}," + f"{result['candidate_units']}," + f"{result['total_units']}," + f"{result['logical_index_bytes']}," + f"{result['disk_index_bytes']}," + f"{result['index_pct']:.4f}," + f"{result['index_pct_disk']:.4f}" + ) + + print() + print("Table") + headers = [ + "rows", + "dist", + "kind", + "level", + 
"create_idx_ms", + "scan_ms", + "index_ms", + "speedup", + "logical_bytes", + "disk_bytes", + "index_pct", + "index_pct_disk", + ] + table_rows = [] + for result in all_results: + table_rows.append( + [ + f"{result['size']:,}", + result["dist"], + result["kind"], + result["level"], + f"{result['create_idx_ms']:.3f}", + f"{result['scan_ms']:.3f}", + f"{result['index_ms']:.3f}", + f"{result['speedup']:.2f}x", + f"{result['logical_index_bytes']:,}", + f"{result['disk_index_bytes']:,}", + f"{result['index_pct']:.4f}%", + f"{result['index_pct_disk']:.4f}%", + ] + ) + + widths = [len(header) for header in headers] + for row in table_rows: + widths = [max(width, len(cell)) for width, cell in zip(widths, row, strict=True)] + + def format_row(row: list[str]) -> str: + return " ".join(cell.ljust(width) for cell, width in zip(row, widths, strict=True)) + + print(format_row(headers)) + print(format_row(["-" * width for width in widths])) + for row in table_rows: + print(format_row(row)) if __name__ == "__main__": diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index a7075313..d31416cc 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -18,32 +18,52 @@ INDEXES_VLMETA_KEY = "blosc2_indexes" INDEX_FORMAT_VERSION = 1 -INDEX_KIND_ZONE_MAP = "zone-map" FLAG_ALL_NAN = np.uint8(1 << 0) FLAG_HAS_NAN = np.uint8(1 << 1) +SEGMENT_LEVELS_BY_KIND = { + "ultralight": ("chunk",), + "light": ("chunk", "block"), + "medium": ("chunk", "block", "subblock"), + "full": ("chunk", "block", "subblock"), +} + _IN_MEMORY_INDEXES: dict[int, dict] = {} -_SUMMARY_CACHE: dict[tuple[int, str | None], np.ndarray] = {} +_DATA_CACHE: dict[tuple[int, str | None, str, str], np.ndarray] = {} @dataclass(slots=True) class IndexPlan: usable: bool reason: str - candidate_chunks: np.ndarray | None = None descriptor: dict | None = None + base: blosc2.NDArray | None = None field: str | None = None - total_chunks: int = 0 - selected_chunks: int = 0 + level: str | None = None + segment_len: 
int | None = None + candidate_units: np.ndarray | None = None + total_units: int = 0 + selected_units: int = 0 + exact_positions: np.ndarray | None = None @dataclass(slots=True) -class PredicatePlan: +class SegmentPredicatePlan: base: blosc2.NDArray - candidate_chunks: np.ndarray + candidate_units: np.ndarray descriptor: dict field: str | None + level: str + segment_len: int + + +@dataclass(slots=True) +class ExactPredicatePlan: + base: blosc2.NDArray + descriptor: dict + field: str | None + intervals: list[tuple[int, int]] def _default_index_store() -> dict: @@ -58,12 +78,22 @@ def _field_token(field: str | None) -> str: return "__self__" if field is None else field +def _copy_nested_dict(value: dict | None) -> dict | None: + if value is None: + return None + copied = value.copy() + for key, item in list(copied.items()): + if isinstance(item, dict): + copied[key] = item.copy() + return copied + + def _copy_descriptor(descriptor: dict) -> dict: - summary = descriptor.get("summary") - descriptor = descriptor.copy() - if summary is not None: - descriptor["summary"] = summary.copy() - return descriptor + copied = descriptor.copy() + copied["levels"] = _copy_nested_dict(descriptor.get("levels")) + if descriptor.get("full") is not None: + copied["full"] = descriptor["full"].copy() + return copied def _is_persistent_array(array: blosc2.NDArray) -> bool: @@ -94,8 +124,7 @@ def _save_store(array: blosc2.NDArray, store: dict) -> None: def _supported_index_dtype(dtype: np.dtype) -> bool: - dtype = np.dtype(dtype) - return dtype.kind in {"b", "i", "u", "f", "m", "M"} + return np.dtype(dtype).kind in {"b", "i", "u", "f", "m", "M"} def _field_dtype(array: blosc2.NDArray, field: str | None) -> np.dtype: @@ -122,101 +151,158 @@ def _validate_index_target(array: blosc2.NDArray, field: str | None) -> np.dtype def _sanitize_sidecar_root(urlpath: str | Path) -> tuple[Path, str]: path = Path(urlpath) suffix = "".join(path.suffixes) - if suffix: - root = path.name[: -len(suffix)] - 
else: - root = path.name + root = path.name[: -len(suffix)] if suffix else path.name return path, root -def _summary_sidecar_path(array: blosc2.NDArray, field: str | None, kind: str) -> str: +def _sidecar_path(array: blosc2.NDArray, field: str | None, kind: str, name: str) -> str: path, root = _sanitize_sidecar_root(array.urlpath) token = _field_token(field) - return str(path.with_name(f"{root}.__index__.{token}.{kind}.b2nd")) + return str(path.with_name(f"{root}.__index__.{token}.{kind}.{name}.b2nd")) + + +def _segment_len(array: blosc2.NDArray, level: str) -> int: + if level == "chunk": + return int(array.chunks[0]) + if level == "block": + return int(array.blocks[0]) + if level == "subblock": + return max(1, int(array.blocks[0]) // 8) + raise ValueError(f"unknown level {level!r}") + + +def _data_cache_key( + array: blosc2.NDArray, field: str | None, category: str, name: str +) -> tuple[int, str | None, str, str]: + return (_array_key(array), field, category, name) + +def _clear_cached_data(array: blosc2.NDArray, field: str | None) -> None: + prefix = (_array_key(array), field) + keys = [key for key in _DATA_CACHE if key[:2] == prefix] + for key in keys: + _DATA_CACHE.pop(key, None) -def _summary_cache_key(array: blosc2.NDArray, field: str | None) -> tuple[int, str | None]: - return (_array_key(array), field) +def _values_for_index(array: blosc2.NDArray, field: str | None) -> np.ndarray: + values = array[:] + return values if field is None else values[field] -def _compute_chunk_summaries(array: blosc2.NDArray, field: str | None, dtype: np.dtype) -> np.ndarray: - chunk_len = array.chunks[0] - nchunks = math.ceil(array.shape[0] / chunk_len) + +def _compute_segment_summaries(values: np.ndarray, dtype: np.dtype, segment_len: int) -> np.ndarray: + nsegments = math.ceil(values.shape[0] / segment_len) summary_dtype = np.dtype([("min", dtype), ("max", dtype), ("flags", np.uint8)]) - summaries = np.empty(nchunks, dtype=summary_dtype) - - for nchunk in range(nchunks): - 
start = nchunk * chunk_len - stop = min(start + chunk_len, array.shape[0]) - chunk = array[start:stop] - if field is not None: - chunk = chunk[field] + summaries = np.empty(nsegments, dtype=summary_dtype) + + for idx in range(nsegments): + start = idx * segment_len + stop = min(start + segment_len, values.shape[0]) + segment = values[start:stop] flags = np.uint8(0) if dtype.kind == "f": - valid = ~np.isnan(chunk) + valid = ~np.isnan(segment) if not np.all(valid): flags |= FLAG_HAS_NAN if not np.any(valid): flags |= FLAG_ALL_NAN - value = np.zeros((), dtype=dtype)[()] - summaries[nchunk] = (value, value, flags) + zero = np.zeros((), dtype=dtype)[()] + summaries[idx] = (zero, zero, flags) continue - chunk = chunk[valid] - summaries[nchunk] = (chunk.min(), chunk.max(), flags) + segment = segment[valid] + summaries[idx] = (segment.min(), segment.max(), flags) return summaries -def _store_summary_data( +def _store_array_sidecar( array: blosc2.NDArray, field: str | None, - summaries: np.ndarray, - persistent: bool, kind: str, + category: str, + name: str, + data: np.ndarray, + persistent: bool, ) -> dict: + cache_key = _data_cache_key(array, field, category, name) + _DATA_CACHE[cache_key] = data if persistent: - summary_path = _summary_sidecar_path(array, field, kind) - blosc2.remove_urlpath(summary_path) - blosc2.asarray(summaries, urlpath=summary_path, mode="w") - _SUMMARY_CACHE[_summary_cache_key(array, field)] = summaries - return {"path": summary_path, "dtype": summaries.dtype.descr} - _SUMMARY_CACHE[_summary_cache_key(array, field)] = summaries - return {"path": None, "dtype": summaries.dtype.descr} - - -def _clear_summary_cache(array: blosc2.NDArray, field: str | None) -> None: - _SUMMARY_CACHE.pop(_summary_cache_key(array, field), None) + path = _sidecar_path(array, field, kind, f"{category}.{name}") + blosc2.remove_urlpath(path) + blosc2.asarray(data, urlpath=path, mode="w") + else: + path = None + return {"path": path, "dtype": data.dtype.descr if 
data.dtype.fields else data.dtype.str} -def _get_summary_data(array: blosc2.NDArray, descriptor: dict) -> np.ndarray: - field = descriptor["field"] - cache_key = _summary_cache_key(array, field) - cached = _SUMMARY_CACHE.get(cache_key) +def _load_array_sidecar( + array: blosc2.NDArray, field: str | None, category: str, name: str, path: str | None +) -> np.ndarray: + cache_key = _data_cache_key(array, field, category, name) + cached = _DATA_CACHE.get(cache_key) if cached is not None: return cached - summary_path = descriptor["summary"]["path"] - if summary_path is None: + if path is None: raise RuntimeError("in-memory index metadata is missing from the current process") - summaries = blosc2.open(summary_path)[:] - _SUMMARY_CACHE[cache_key] = summaries - return summaries + data = blosc2.open(path)[:] + _DATA_CACHE[cache_key] = data + return data -def _build_descriptor( +def _build_levels_descriptor( array: blosc2.NDArray, field: str | None, + kind: str, dtype: np.dtype, + values: np.ndarray, + persistent: bool, +) -> dict: + levels = {} + for level in SEGMENT_LEVELS_BY_KIND[kind]: + segment_len = _segment_len(array, level) + summaries = _compute_segment_summaries(values, dtype, segment_len) + sidecar = _store_array_sidecar(array, field, kind, "summary", level, summaries, persistent) + levels[level] = { + "segment_len": segment_len, + "nsegments": len(summaries), + "path": sidecar["path"], + "dtype": sidecar["dtype"], + } + return levels + + +def _build_full_descriptor( + array: blosc2.NDArray, + field: str | None, + kind: str, + values: np.ndarray, + persistent: bool, +) -> dict: + order = np.argsort(values, kind="stable") + positions = order.astype(np.int64, copy=False) + sorted_values = values[order] + values_sidecar = _store_array_sidecar(array, field, kind, "full", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar(array, field, kind, "full", "positions", positions, persistent) + return { + "values_path": values_sidecar["path"], + 
"positions_path": positions_sidecar["path"], + } + + +def _build_descriptor( + array: blosc2.NDArray, + field: str | None, kind: str, optlevel: int, granularity: str, persistent: bool, name: str | None, - summary: dict, + dtype: np.dtype, + levels: dict, + full: dict | None, ) -> dict: return { "name": name or _field_token(field), "field": field, - "kind": INDEX_KIND_ZONE_MAP, - "requested_kind": kind, + "kind": kind, "version": INDEX_FORMAT_VERSION, "optlevel": optlevel, "granularity": granularity, @@ -225,8 +311,9 @@ def _build_descriptor( "dtype": np.dtype(dtype).str, "shape": tuple(array.shape), "chunks": tuple(array.chunks), - "nchunks": math.ceil(array.shape[0] / array.chunks[0]), - "summary": summary, + "blocks": tuple(array.blocks), + "levels": levels, + "full": full, } @@ -242,17 +329,18 @@ def create_index( ) -> dict: del kwargs dtype = _validate_index_target(array, field) - if kind not in {"ultralight", "light", "medium"}: - raise NotImplementedError("only zone-map style indexes are implemented for now") + if kind not in SEGMENT_LEVELS_BY_KIND: + raise NotImplementedError(f"unsupported index kind {kind!r}") if granularity != "chunk": - raise NotImplementedError("only chunk-granularity indexes are implemented for now") + raise NotImplementedError("only chunk-based array indexes are implemented for now") if persistent is None: persistent = _is_persistent_array(array) - summaries = _compute_chunk_summaries(array, field, dtype) - summary = _store_summary_data(array, field, summaries, persistent, kind) + values = _values_for_index(array, field) + levels = _build_levels_descriptor(array, field, kind, dtype, values, persistent) + full = _build_full_descriptor(array, field, kind, values, persistent) if kind == "full" else None descriptor = _build_descriptor( - array, field, dtype, kind, optlevel, granularity, persistent, name, summary + array, field, kind, optlevel, granularity, persistent, name, dtype, levels, full ) store = _load_store(array) @@ -262,8 +350,7 
@@ def create_index( def create_csindex(array: blosc2.NDArray, field: str | None = None, **kwargs) -> dict: - del array, field, kwargs - raise NotImplementedError("full permutation indexes are not implemented yet") + return create_index(array, field=field, kind="full", **kwargs) def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> None: @@ -279,10 +366,15 @@ def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None descriptor = store["indexes"].pop(token) _save_store(array, store) - _clear_summary_cache(array, descriptor["field"]) - summary_path = descriptor["summary"]["path"] - if summary_path: - blosc2.remove_urlpath(summary_path) + _clear_cached_data(array, descriptor["field"]) + for level_info in descriptor["levels"].values(): + if level_info["path"]: + blosc2.remove_urlpath(level_info["path"]) + if descriptor.get("full") is not None: + if descriptor["full"]["values_path"]: + blosc2.remove_urlpath(descriptor["full"]["values_path"]) + if descriptor["full"]["positions_path"]: + blosc2.remove_urlpath(descriptor["full"]["positions_path"]) def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> dict: @@ -301,7 +393,7 @@ def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | N return create_index( array, field=descriptor["field"], - kind=descriptor["requested_kind"], + kind=descriptor["kind"], optlevel=descriptor["optlevel"], granularity=descriptor["granularity"], persistent=descriptor["persistent"], @@ -328,11 +420,8 @@ def mark_indexes_stale(array: blosc2.NDArray) -> None: def _descriptor_for(array: blosc2.NDArray, field: str | None) -> dict | None: - store = _load_store(array) - descriptor = store["indexes"].get(_field_token(field)) - if descriptor is None: - return None - if descriptor.get("stale", False): + descriptor = _load_store(array)["indexes"].get(_field_token(field)) + if descriptor is None or descriptor.get("stale", False): return 
None if tuple(descriptor.get("shape", ())) != tuple(array.shape): return None @@ -341,16 +430,29 @@ def _descriptor_for(array: blosc2.NDArray, field: str | None) -> dict | None: return descriptor +def _load_level_summaries(array: blosc2.NDArray, descriptor: dict, level: str) -> np.ndarray: + level_info = descriptor["levels"][level] + return _load_array_sidecar(array, descriptor["field"], "summary", level, level_info["path"]) + + +def _load_full_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: + full = descriptor.get("full") + if full is None: + raise RuntimeError("full index metadata is not available") + values = _load_array_sidecar(array, descriptor["field"], "full", "values", full["values_path"]) + positions = _load_array_sidecar(array, descriptor["field"], "full", "positions", full["positions_path"]) + return values, positions + + def _normalize_scalar(value, dtype: np.dtype): if isinstance(value, np.generic): return value.item() if dtype.kind == "f" and isinstance(value, float) and np.isnan(value): raise ValueError("NaN comparisons are not indexable") - arr = np.asarray(value, dtype=dtype) - return arr[()] + return np.asarray(value, dtype=dtype)[()] -def _candidate_chunks_from_summary(summaries: np.ndarray, op: str, value, dtype: np.dtype) -> np.ndarray: +def _candidate_units_from_summary(summaries: np.ndarray, op: str, value, dtype: np.dtype) -> np.ndarray: mins = summaries["min"] maxs = summaries["max"] flags = summaries["flags"] @@ -369,6 +471,28 @@ def _candidate_chunks_from_summary(summaries: np.ndarray, op: str, value, dtype: raise ValueError(f"unsupported comparison operator {op!r}") +def _intervals_from_sorted(values: np.ndarray, op: str, value, dtype: np.dtype) -> list[tuple[int, int]]: + value = _normalize_scalar(value, dtype) + if op == "==": + lo = np.searchsorted(values, value, side="left") + hi = np.searchsorted(values, value, side="right") + elif op == "<": + lo = 0 + hi = np.searchsorted(values, value, 
side="left") + elif op == "<=": + lo = 0 + hi = np.searchsorted(values, value, side="right") + elif op == ">": + lo = np.searchsorted(values, value, side="right") + hi = len(values) + elif op == ">=": + lo = np.searchsorted(values, value, side="left") + hi = len(values) + else: + raise ValueError(f"unsupported comparison operator {op!r}") + return [] if lo >= hi else [(int(lo), int(hi))] + + def _operand_target(operand) -> tuple[blosc2.NDArray, str | None] | None: if isinstance(operand, blosc2.NDField): return operand.ndarr, operand.field @@ -408,7 +532,9 @@ def _compare_operator(node: ast.AST) -> str | None: return None -def _plan_compare(node: ast.Compare, operands: dict) -> PredicatePlan | None: +def _target_from_compare( + node: ast.Compare, operands: dict +) -> tuple[blosc2.NDArray, str | None, str, object] | None: if len(node.ops) != 1 or len(node.comparators) != 1: return None op = _compare_operator(node.ops[0]) @@ -438,43 +564,73 @@ def _plan_compare(node: ast.Compare, operands: dict) -> PredicatePlan | None: base, field = target if base.ndim != 1: return None + return base, field, op, value + + +def _finest_level(descriptor: dict) -> str: + level_names = tuple(descriptor["levels"]) + return level_names[-1] + + +def _plan_segment_compare(node: ast.Compare, operands: dict) -> SegmentPredicatePlan | None: + target = _target_from_compare(node, operands) + if target is None: + return None + base, field, op, value = target descriptor = _descriptor_for(base, field) if descriptor is None: return None + level = _finest_level(descriptor) + level_info = descriptor["levels"][level] dtype = np.dtype(descriptor["dtype"]) try: - summaries = _get_summary_data(base, descriptor) - mask = _candidate_chunks_from_summary(summaries, op, value, dtype) + summaries = _load_level_summaries(base, descriptor, level) + candidate_units = _candidate_units_from_summary(summaries, op, value, dtype) except (RuntimeError, ValueError, TypeError): return None - return PredicatePlan(base=base, 
candidate_chunks=mask, descriptor=descriptor, field=field) + return SegmentPredicatePlan( + base=base, + candidate_units=candidate_units, + descriptor=descriptor, + field=field, + level=level, + segment_len=level_info["segment_len"], + ) -def _same_target(left: PredicatePlan, right: PredicatePlan) -> bool: - return left.base is right.base and left.base.chunks == right.base.chunks +def _same_segment_space(left: SegmentPredicatePlan, right: SegmentPredicatePlan) -> bool: + return ( + left.base is right.base + and left.level == right.level + and left.segment_len == right.segment_len + and left.candidate_units.shape == right.candidate_units.shape + ) -def _merge_plans(left: PredicatePlan, right: PredicatePlan, op: str) -> PredicatePlan | None: - if not _same_target(left, right): +def _merge_segment_plans( + left: SegmentPredicatePlan, right: SegmentPredicatePlan, op: str +) -> SegmentPredicatePlan | None: + if not _same_segment_space(left, right): return None if op == "and": - candidate_chunks = left.candidate_chunks & right.candidate_chunks + candidate_units = left.candidate_units & right.candidate_units else: - candidate_chunks = left.candidate_chunks | right.candidate_chunks - return PredicatePlan( + candidate_units = left.candidate_units | right.candidate_units + return SegmentPredicatePlan( base=left.base, - candidate_chunks=candidate_chunks, + candidate_units=candidate_units, descriptor=left.descriptor, field=left.field, + level=left.level, + segment_len=left.segment_len, ) -def _plan_boolop(node: ast.BoolOp, operands: dict) -> PredicatePlan | None: +def _plan_segment_boolop(node: ast.BoolOp, operands: dict) -> SegmentPredicatePlan | None: op = "and" if isinstance(node.op, ast.And) else "or" if isinstance(node.op, ast.Or) else None if op is None: return None - - plans = [_plan_node(value, operands) for value in node.values] + plans = [_plan_segment_node(value, operands) for value in node.values] if op == "and": plans = [plan for plan in plans if plan is not None] 
if not plans: @@ -484,14 +640,14 @@ def _plan_boolop(node: ast.BoolOp, operands: dict) -> PredicatePlan | None: plan = plans[0] for other in plans[1:]: - merged = _merge_plans(plan, other, op) + merged = _merge_segment_plans(plan, other, op) if merged is None: return None plan = merged return plan -def _plan_bitop(node: ast.BinOp, operands: dict) -> PredicatePlan | None: +def _plan_segment_bitop(node: ast.BinOp, operands: dict) -> SegmentPredicatePlan | None: if isinstance(node.op, ast.BitAnd): op = "and" elif isinstance(node.op, ast.BitOr): @@ -499,32 +655,147 @@ def _plan_bitop(node: ast.BinOp, operands: dict) -> PredicatePlan | None: else: return None - left = _plan_node(node.left, operands) - right = _plan_node(node.right, operands) - if left is None: - return right if op == "and" else None - if right is None: - return left if op == "and" else None - return _merge_plans(left, right, op) + left = _plan_segment_node(node.left, operands) + right = _plan_segment_node(node.right, operands) + if op == "and": + if left is None: + return right + if right is None: + return left + return _merge_segment_plans(left, right, op) + if left is None or right is None: + return None + return _merge_segment_plans(left, right, op) + + +def _plan_segment_node(node: ast.AST, operands: dict) -> SegmentPredicatePlan | None: + if isinstance(node, ast.Compare): + return _plan_segment_compare(node, operands) + if isinstance(node, ast.BoolOp): + return _plan_segment_boolop(node, operands) + if isinstance(node, ast.BinOp): + return _plan_segment_bitop(node, operands) + return None + + +def _plan_exact_compare(node: ast.Compare, operands: dict) -> ExactPredicatePlan | None: + target = _target_from_compare(node, operands) + if target is None: + return None + base, field, op, value = target + descriptor = _descriptor_for(base, field) + if descriptor is None or descriptor.get("kind") != "full": + return None + dtype = np.dtype(descriptor["dtype"]) + try: + sorted_values, _ = 
_load_full_arrays(base, descriptor) + intervals = _intervals_from_sorted(sorted_values, op, value, dtype) + except (RuntimeError, ValueError, TypeError): + return None + return ExactPredicatePlan(base=base, descriptor=descriptor, field=field, intervals=intervals) + + +def _same_base(left: ExactPredicatePlan, right: ExactPredicatePlan) -> bool: + return left.base is right.base and left.field == right.field + + +def _normalize_intervals(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]: + if not intervals: + return [] + intervals = sorted(intervals) + normalized = [intervals[0]] + for lo, hi in intervals[1:]: + prev_lo, prev_hi = normalized[-1] + if lo <= prev_hi: + normalized[-1] = (prev_lo, max(prev_hi, hi)) + else: + normalized.append((lo, hi)) + return normalized + + +def _intersect_intervals( + left_intervals: list[tuple[int, int]], right_intervals: list[tuple[int, int]] +) -> list[tuple[int, int]]: + intersections = [] + left = _normalize_intervals(left_intervals) + right = _normalize_intervals(right_intervals) + i = j = 0 + while i < len(left) and j < len(right): + lo = max(left[i][0], right[j][0]) + hi = min(left[i][1], right[j][1]) + if lo < hi: + intersections.append((lo, hi)) + if left[i][1] <= right[j][1]: + i += 1 + else: + j += 1 + return intersections + + +def _merge_exact_plans( + left: ExactPredicatePlan, right: ExactPredicatePlan, op: str +) -> ExactPredicatePlan | None: + if not _same_base(left, right): + return None + if op == "and": + intervals = _intersect_intervals(left.intervals, right.intervals) + else: + intervals = _normalize_intervals(left.intervals + right.intervals) + return ExactPredicatePlan( + base=left.base, descriptor=left.descriptor, field=left.field, intervals=intervals + ) + + +def _plan_exact_boolop(node: ast.BoolOp, operands: dict) -> ExactPredicatePlan | None: + op = "and" if isinstance(node.op, ast.And) else "or" if isinstance(node.op, ast.Or) else None + if op is None: + return None + plans = 
[_plan_exact_node(value, operands) for value in node.values] + if any(plan is None for plan in plans): + return None + plan = plans[0] + for other in plans[1:]: + merged = _merge_exact_plans(plan, other, op) + if merged is None: + return None + plan = merged + return plan + + +def _plan_exact_bitop(node: ast.BinOp, operands: dict) -> ExactPredicatePlan | None: + if isinstance(node.op, ast.BitAnd): + op = "and" + elif isinstance(node.op, ast.BitOr): + op = "or" + else: + return None + left = _plan_exact_node(node.left, operands) + right = _plan_exact_node(node.right, operands) + if left is None or right is None: + return None + return _merge_exact_plans(left, right, op) -def _plan_node(node: ast.AST, operands: dict) -> PredicatePlan | None: +def _plan_exact_node(node: ast.AST, operands: dict) -> ExactPredicatePlan | None: if isinstance(node, ast.Compare): - return _plan_compare(node, operands) + return _plan_exact_compare(node, operands) if isinstance(node, ast.BoolOp): - return _plan_boolop(node, operands) + return _plan_exact_boolop(node, operands) if isinstance(node, ast.BinOp): - return _plan_bitop(node, operands) + return _plan_exact_bitop(node, operands) return None -def plan_query( - expression: str, - operands: dict, - where: dict | None, - *, - use_index: bool = True, -) -> IndexPlan: +def _positions_from_intervals(plan: ExactPredicatePlan) -> np.ndarray: + _, positions = _load_full_arrays(plan.base, plan.descriptor) + if not plan.intervals: + return np.empty(0, dtype=np.int64) + selected = [positions[lo:hi] for lo, hi in plan.intervals] + merged = np.concatenate(selected) if len(selected) > 1 else selected[0] + return np.sort(merged, kind="stable") + + +def plan_query(expression: str, operands: dict, where: dict | None, *, use_index: bool = True) -> IndexPlan: if not use_index: return IndexPlan(False, "index usage disabled for this query") if where is None or len(where) != 1: @@ -535,33 +806,105 @@ def plan_query( except SyntaxError: return 
IndexPlan(False, "expression is not valid Python syntax for planning") - plan = _plan_node(tree.body, operands) - if plan is None: + exact_plan = _plan_exact_node(tree.body, operands) + if exact_plan is not None: + exact_positions = _positions_from_intervals(exact_plan) + return IndexPlan( + True, + "full index selected", + descriptor=_copy_descriptor(exact_plan.descriptor), + base=exact_plan.base, + field=exact_plan.field, + level="full", + total_units=exact_plan.base.shape[0], + selected_units=len(exact_positions), + exact_positions=exact_positions, + ) + + segment_plan = _plan_segment_node(tree.body, operands) + if segment_plan is None: return IndexPlan(False, "no usable index was found for this predicate") - total_chunks = len(plan.candidate_chunks) - selected_chunks = int(np.count_nonzero(plan.candidate_chunks)) - if selected_chunks == total_chunks: + total_units = len(segment_plan.candidate_units) + selected_units = int(np.count_nonzero(segment_plan.candidate_units)) + if selected_units == total_units: return IndexPlan( False, - "available index does not prune any chunks for this predicate", - candidate_chunks=plan.candidate_chunks, - descriptor=_copy_descriptor(plan.descriptor), - field=plan.field, - total_chunks=total_chunks, - selected_chunks=selected_chunks, + "available index does not prune any units for this predicate", + descriptor=_copy_descriptor(segment_plan.descriptor), + base=segment_plan.base, + field=segment_plan.field, + level=segment_plan.level, + segment_len=segment_plan.segment_len, + candidate_units=segment_plan.candidate_units, + total_units=total_units, + selected_units=selected_units, ) + return IndexPlan( True, - "zone-map index selected", - candidate_chunks=plan.candidate_chunks, - descriptor=_copy_descriptor(plan.descriptor), - field=plan.field, - total_chunks=total_chunks, - selected_chunks=selected_chunks, + f"{segment_plan.level} summaries selected", + descriptor=_copy_descriptor(segment_plan.descriptor), + base=segment_plan.base, 
+ field=segment_plan.field, + level=segment_plan.level, + segment_len=segment_plan.segment_len, + candidate_units=segment_plan.candidate_units, + total_units=total_units, + selected_units=selected_units, ) +def _where_output_dtype(where_x) -> np.dtype: + return where_x.dtype if hasattr(where_x, "dtype") else np.asarray(where_x).dtype + + +def evaluate_segment_query( + expression: str, operands: dict, ne_args: dict, where: dict, plan: IndexPlan +) -> np.ndarray: + from .lazyexpr import _get_result + from .utils import get_chunk_operands + + if plan.base is None or plan.candidate_units is None or plan.segment_len is None: + raise ValueError("segment evaluation requires a segment-based plan") + + parts = [] + chunk_operands = {} + for unit in np.flatnonzero(plan.candidate_units): + start = int(unit) * plan.segment_len + stop = min(start + plan.segment_len, plan.base.shape[0]) + cslice = (slice(start, stop, 1),) + get_chunk_operands(operands, cslice, chunk_operands, plan.base.shape) + result, _ = _get_result(expression, chunk_operands, ne_args, where) + if len(result) > 0: + parts.append(np.require(result, requirements="C")) + + if parts: + return np.concatenate(parts) + return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) + + +def _gather_positions(where_x, positions: np.ndarray) -> np.ndarray: + if len(positions) == 0: + return np.empty(0, dtype=_where_output_dtype(where_x)) + + positions = np.asarray(positions, dtype=np.int64) + breaks = np.nonzero(np.diff(positions) != 1)[0] + 1 + runs = np.split(positions, breaks) + parts = [] + for run in runs: + start = int(run[0]) + stop = int(run[-1]) + 1 + parts.append(where_x[start:stop]) + return np.concatenate(parts) if len(parts) > 1 else parts[0] + + +def evaluate_full_query(where: dict, plan: IndexPlan) -> np.ndarray: + if plan.exact_positions is None: + raise ValueError("full evaluation requires exact positions") + return _gather_positions(where["_where_x"], plan.exact_positions) + + def 
will_use_index(expr) -> bool: where = getattr(expr, "_where_args", None) return plan_query(expr.expression, expr.operands, where).usable @@ -574,7 +917,12 @@ def explain_query(expr) -> dict: "will_use_index": plan.usable, "reason": plan.reason, "field": plan.field, - "candidate_chunks": plan.selected_chunks, - "total_chunks": plan.total_chunks, + "kind": None if plan.descriptor is None else plan.descriptor["kind"], + "level": plan.level, + "candidate_units": plan.selected_units, + "total_units": plan.total_units, + "candidate_chunks": plan.selected_units, + "total_chunks": plan.total_units, + "exact_rows": None if plan.exact_positions is None else len(plan.exact_positions), "descriptor": plan.descriptor, } diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 45d6f889..17242895 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1838,10 +1838,15 @@ def slices_eval( # noqa: C901 else np.asarray(shape) ) index_plan = None - if where is not None and len(where) == 1 and use_index: + if where is not None and len(where) == 1 and use_index and _slice == (): from . 
import indexing index_plan = indexing.plan_query(expression, operands, where, use_index=use_index) + if index_plan.usable and not (_indices or _order): + if index_plan.exact_positions is not None: + return indexing.evaluate_full_query(where, index_plan) + if index_plan.level not in (None, "chunk"): + return indexing.evaluate_segment_query(expression, operands, ne_args, where, index_plan) for chunk_slice in intersecting_chunks: # Check whether current cslice intersects with _slice @@ -1857,7 +1862,12 @@ def slices_eval( # noqa: C901 offset = tuple(s.start for s in cslice) # offset for the udf cslice_shape = tuple(s.stop - s.start for s in cslice) len_chunk = math.prod(cslice_shape) - if index_plan is not None and index_plan.usable and not index_plan.candidate_chunks[nchunk]: + if ( + index_plan is not None + and index_plan.usable + and index_plan.level == "chunk" + and not index_plan.candidate_units[nchunk] + ): if _indices or _order: leninputs += len_chunk continue diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 0c114fd1..53ed2247 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -6,23 +6,25 @@ ####################################################################### import numpy as np +import pytest import blosc2 -def test_scalar_zone_map_index_matches_scan(): +@pytest.mark.parametrize("kind", ["ultralight", "light", "medium", "full"]) +def test_scalar_index_matches_scan(kind): data = np.arange(200_000, dtype=np.int64) arr = blosc2.asarray(data, chunks=(10_000,), blocks=(2_000,)) - descriptor = arr.create_index() + descriptor = arr.create_index(kind=kind) - assert descriptor["kind"] == "zone-map" + assert descriptor["kind"] == kind assert descriptor["field"] is None assert len(arr.indexes) == 1 expr = ((arr >= 120_000) & (arr < 125_000)).where(arr) assert expr.will_use_index() is True explanation = expr.explain() - assert explanation["candidate_chunks"] < explanation["total_chunks"] + assert 
explanation["candidate_units"] < explanation["total_units"] or explanation["level"] == "full" indexed = expr.compute()[:] scanned = expr.compute(_use_index=False)[:] @@ -30,14 +32,15 @@ def test_scalar_zone_map_index_matches_scan(): np.testing.assert_array_equal(indexed, data[(data >= 120_000) & (data < 125_000)]) -def test_structured_field_index_matches_scan(): +@pytest.mark.parametrize("kind", ["ultralight", "light", "medium", "full"]) +def test_structured_field_index_matches_scan(kind): dtype = np.dtype([("id", np.int64), ("payload", np.float64)]) data = np.empty(120_000, dtype=dtype) data["id"] = np.arange(data.shape[0], dtype=np.int64) data["payload"] = np.linspace(0, 1, data.shape[0], dtype=np.float64) arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) - arr.create_index(field="id") + arr.create_index(field="id", kind=kind) expr = blosc2.lazyexpr("(id >= 48_000) & (id < 51_000)", arr.fields).where(arr) assert expr.will_use_index() is True @@ -52,13 +55,13 @@ def test_persistent_index_survives_reopen(tmp_path): path = tmp_path / "indexed_array.b2nd" data = np.arange(80_000, dtype=np.int64) arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(8_000,), blocks=(2_000,)) - descriptor = arr.create_index() + descriptor = arr.create_index(kind="full") - assert descriptor["summary"]["path"] is not None + assert descriptor["full"]["values_path"] is not None reopened = blosc2.open(path, mode="a") assert len(reopened.indexes) == 1 - assert reopened.indexes[0]["summary"]["path"] == descriptor["summary"]["path"] + assert reopened.indexes[0]["full"]["values_path"] == descriptor["full"]["values_path"] expr = (reopened >= 72_000).where(reopened) assert expr.will_use_index() is True @@ -68,7 +71,7 @@ def test_persistent_index_survives_reopen(tmp_path): def test_mutation_marks_index_stale_and_rebuild_restores_it(): data = np.arange(50_000, dtype=np.int64) arr = blosc2.asarray(data, chunks=(5_000,), blocks=(1_000,)) - arr.create_index() + 
arr.create_index(kind="full") arr[:25] = -1 assert arr.indexes[0]["stale"] is True From 43c73f01cfa27d326b981fa26d376d0eac0fe546 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 04:32:18 +0200 Subject: [PATCH 03/68] Implement chunk-aware and reduced-order NDArray indexing Add a first modern indexing engine for 1-D NDArray objects and structured fields, inspired by OPSI but adapted to Blosc2 chunk/block storage. Introduce four index kinds: - ultralight: chunk zone maps - light: chunk + block zone maps - medium: block-partitioned reduced-order exact index - full: global sorted values + logical positions Improve query execution by: - making full retrieval chunk-aware for scattered hits - making medium use per-block sorted values plus compact local offsets - integrating index planning into LazyExpr.where(...) - exposing will_use_index() and explain() helpers Add correctness coverage for scalar, structured, persistent, mutation, and random-distribution cases. Extend the benchmark to compare index kinds across distributions, report cold vs warm query timings, footprint metrics, reusable on-disk outputs, and configurable query width / repeat counts. 
--- bench/ndarray/index_query_bench.py | 297 ++++++++++++++++-------- src/blosc2/indexing.py | 360 ++++++++++++++++++++++------- tests/ndarray/test_indexing.py | 35 ++- 3 files changed, 510 insertions(+), 182 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 45b8c6a5..f29f5d7b 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -21,7 +21,7 @@ SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) CHUNK_LEN = 100_000 BLOCK_LEN = 20_000 -REPEATS = 5 +DEFAULT_REPEATS = 3 KINDS = ("ultralight", "light", "medium", "full") DISTS = ("sorted", "block-shuffled", "random") RNG_SEED = 0 @@ -68,6 +68,14 @@ def build_persistent_array(data: np.ndarray, path: Path) -> blosc2.NDArray: return blosc2.asarray(data, urlpath=path, mode="w", chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) +def base_array_path(size_dir: Path, size: int, dist: str) -> Path: + return size_dir / f"size_{size}_{dist}.b2nd" + + +def indexed_array_path(size_dir: Path, size: int, dist: str, kind: str) -> Path: + return size_dir / f"size_{size}_{dist}.{kind}.b2nd" + + def benchmark_once(expr, *, use_index: bool) -> tuple[float, int]: start = time.perf_counter() result = expr.compute(_use_index=use_index)[:] @@ -84,6 +92,18 @@ def index_sizes(descriptor: dict) -> tuple[int, int]: if level_info["path"]: disk += os.path.getsize(level_info["path"]) + reduced = descriptor.get("reduced") + if reduced is not None: + values = blosc2.open(reduced["values_path"]) + positions = blosc2.open(reduced["positions_path"]) + offsets = blosc2.open(reduced["offsets_path"]) + logical += values.shape[0] * values.dtype.itemsize + logical += positions.shape[0] * positions.dtype.itemsize + logical += offsets.shape[0] * offsets.dtype.itemsize + disk += os.path.getsize(reduced["values_path"]) + disk += os.path.getsize(reduced["positions_path"]) + disk += os.path.getsize(reduced["offsets_path"]) + full = descriptor.get("full") if full is not None: values 
= blosc2.open(full["values_path"]) @@ -95,13 +115,52 @@ def index_sizes(descriptor: dict) -> tuple[int, int]: return logical, disk -def benchmark_size(size: int, size_dir: Path, dist: str) -> list[dict]: - data = make_source_data(size, dist) - arr = build_persistent_array(data, size_dir / f"size_{size}_{dist}.b2nd") - del data +def _source_data_factory(size: int, dist: str): + data = None + + def get_data() -> np.ndarray: + nonlocal data + if data is None: + data = make_source_data(size, dist) + return data + + return get_data + + +def _valid_index_descriptor(arr: blosc2.NDArray, kind: str) -> dict | None: + for descriptor in arr.indexes: + if descriptor.get("field") == "id" and descriptor.get("kind") == kind and not descriptor.get("stale", False): + return descriptor + return None + + +def _open_or_build_persistent_array(path: Path, get_data) -> blosc2.NDArray: + if path.exists(): + return blosc2.open(path, mode="a") + blosc2.remove_urlpath(path) + return build_persistent_array(get_data(), path) + + +def _open_or_build_indexed_array(path: Path, get_data, kind: str) -> tuple[blosc2.NDArray, float]: + if path.exists(): + arr = blosc2.open(path, mode="a") + if _valid_index_descriptor(arr, kind) is not None: + return arr, 0.0 + if arr.indexes: + arr.drop_index(field="id") + blosc2.remove_urlpath(path) + + arr = build_persistent_array(get_data(), path) + build_start = time.perf_counter() + arr.create_index(field="id", kind=kind) + return arr, time.perf_counter() - build_start + + +def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int) -> list[dict]: + get_data = _source_data_factory(size, dist) + arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist), get_data) lo = size // 2 - width = 2_500 - hi = min(size, lo + width) + hi = min(size, lo + query_width) expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields).where(arr) base_bytes = size * arr.dtype.itemsize compressed_base_bytes = os.path.getsize(arr.urlpath) @@ 
-110,18 +169,11 @@ def benchmark_size(size: int, size_dir: Path, dist: str) -> list[dict]: rows = [] for kind in KINDS: - if arr.indexes: - arr.drop_index(field="id") - build_start = time.perf_counter() - arr.create_index(field="id", kind=kind) - build_time = time.perf_counter() - build_start - explanation = expr.explain() - logical_index_bytes, disk_index_bytes = index_sizes(arr.indexes[0]) - - warm_index, index_len = benchmark_once(expr, use_index=True) - del warm_index - index_runs = [benchmark_once(expr, use_index=True)[0] for _ in range(REPEATS)] - index_ms = statistics.median(index_runs) * 1_000 + idx_arr, build_time = _open_or_build_indexed_array(indexed_array_path(size_dir, size, dist, kind), get_data, kind) + idx_expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", idx_arr.fields).where(idx_arr) + explanation = idx_expr.explain() + logical_index_bytes, disk_index_bytes = index_sizes(idx_arr.indexes[0]) + cold_time, index_len = benchmark_once(idx_expr, use_index=True) rows.append( { @@ -133,8 +185,10 @@ def benchmark_size(size: int, size_dir: Path, dist: str) -> list[dict]: "build_s": build_time, "create_idx_ms": build_time * 1_000, "scan_ms": scan_ms, - "index_ms": index_ms, - "speedup": scan_ms / index_ms, + "cold_ms": cold_time * 1_000, + "cold_speedup": scan_ms / (cold_time * 1_000), + "warm_ms": None, + "warm_speedup": None, "candidate_units": explanation["candidate_units"], "total_units": explanation["total_units"], "logical_index_bytes": logical_index_bytes, @@ -146,6 +200,20 @@ def benchmark_size(size: int, size_dir: Path, dist: str) -> list[dict]: return rows +def measure_warm_queries(rows: list[dict], size_dir: Path, query_width: int, repeats: int) -> None: + if repeats <= 0: + return + for result in rows: + arr = blosc2.open(indexed_array_path(size_dir, result["size"], result["dist"], result["kind"]), mode="a") + lo = result["size"] // 2 + hi = min(result["size"], lo + query_width) + expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", 
arr.fields).where(arr) + index_runs = [benchmark_once(expr, use_index=True)[0] for _ in range(repeats)] + warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None + result["warm_ms"] = warm_ms + result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms + + def parse_human_size(value: str) -> int: value = value.strip() if not value: @@ -180,6 +248,23 @@ def parse_args() -> argparse.Namespace: type=parse_human_size, help="Benchmark a single array size. Supports suffixes like 1k, 1K, 1M, 1G.", ) + parser.add_argument( + "--query-width", + type=parse_human_size, + default=1_000, + help="Width of the range predicate. Supports suffixes like 1k, 1K, 1M, 1G. Default: 1000.", + ) + parser.add_argument( + "--repeats", + type=int, + default=DEFAULT_REPEATS, + help="Number of repeated warm-query measurements after the first cold query. Default: 3.", + ) + parser.add_argument( + "--outdir", + type=Path, + help="Directory where benchmark arrays and index sidecars should be written and kept.", + ) parser.add_argument( "--dist", choices=(*DISTS, "all"), @@ -191,88 +276,104 @@ def parse_args() -> argparse.Namespace: def main() -> None: args = parse_args() + if args.repeats < 0: + raise SystemExit("--repeats must be >= 0") sizes = (args.size,) if args.size is not None else SIZES dists = DISTS if args.dist == "all" else (args.dist,) - with tempfile.TemporaryDirectory() as tmpdir: - size_dir = Path(tmpdir) - all_results = [] - print("Structured range-query benchmark across index kinds") - print(f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={REPEATS}, dist={args.dist}") - print( - "size,dist,kind,level,query_rows,build_s,create_idx_ms,scan_ms,index_ms,speedup," - "candidate_units,total_units,logical_index_bytes,disk_index_bytes,index_pct,index_pct_disk" + if args.outdir is None: + with tempfile.TemporaryDirectory() as tmpdir: + run_benchmarks(sizes, dists, Path(tmpdir), args.dist, args.query_width, args.repeats) + else: + 
args.outdir.mkdir(parents=True, exist_ok=True) + run_benchmarks(sizes, dists, args.outdir, args.dist, args.query_width, args.repeats) + + +def run_benchmarks( + sizes: tuple[int, ...], + dists: tuple[str, ...], + size_dir: Path, + dist_label: str, + query_width: int, + repeats: int, +) -> None: + all_results = [] + print("Structured range-query benchmark across index kinds") + print( + f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, dist={dist_label}, " + f"query_width={query_width:,}" + ) + for dist in dists: + for size in sizes: + size_results = benchmark_size(size, size_dir, dist, query_width) + all_results.extend(size_results) + + print() + print("Cold Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("kind", lambda result: result["kind"]), + ("level", lambda result: result["level"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ("index_pct", lambda result: f"{result['index_pct']:.4f}%"), + ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), + ], + ) + if repeats > 0: + measure_warm_queries(all_results, size_dir, query_width, repeats) + print() + print("Warm Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("kind", lambda result: result["kind"]), + ("level", lambda result: result["level"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not 
None else "-"), + ( + "speedup", + lambda result: f"{result['warm_speedup']:.2f}x" + if result["warm_speedup"] is not None + else "-", + ), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ("index_pct", lambda result: f"{result['index_pct']:.4f}%"), + ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), + ], ) - for dist in dists: - for size in sizes: - size_results = benchmark_size(size, size_dir, dist) - all_results.extend(size_results) - for result in size_results: - print( - f"{result['size']}," - f"{result['dist']}," - f"{result['kind']}," - f"{result['level']}," - f"{result['query_rows']}," - f"{result['build_s']:.4f}," - f"{result['create_idx_ms']:.3f}," - f"{result['scan_ms']:.3f}," - f"{result['index_ms']:.3f}," - f"{result['speedup']:.2f}," - f"{result['candidate_units']}," - f"{result['total_units']}," - f"{result['logical_index_bytes']}," - f"{result['disk_index_bytes']}," - f"{result['index_pct']:.4f}," - f"{result['index_pct_disk']:.4f}" - ) - print() - print("Table") - headers = [ - "rows", - "dist", - "kind", - "level", - "create_idx_ms", - "scan_ms", - "index_ms", - "speedup", - "logical_bytes", - "disk_bytes", - "index_pct", - "index_pct_disk", - ] - table_rows = [] - for result in all_results: - table_rows.append( - [ - f"{result['size']:,}", - result["dist"], - result["kind"], - result["level"], - f"{result['create_idx_ms']:.3f}", - f"{result['scan_ms']:.3f}", - f"{result['index_ms']:.3f}", - f"{result['speedup']:.2f}x", - f"{result['logical_index_bytes']:,}", - f"{result['disk_index_bytes']:,}", - f"{result['index_pct']:.4f}%", - f"{result['index_pct_disk']:.4f}%", - ] - ) - - widths = [len(header) for header in headers] - for row in table_rows: - widths = [max(width, len(cell)) for width, cell in zip(widths, row, strict=True)] - - def format_row(row: list[str]) -> str: - return " ".join(cell.ljust(width) for cell, width in zip(row, 
widths, strict=True)) - - print(format_row(headers)) - print(format_row(["-" * width for width in widths])) - for row in table_rows: - print(format_row(row)) + +def _format_row(cells: list[str], widths: list[int]) -> str: + return " ".join(cell.ljust(width) for cell, width in zip(cells, widths, strict=True)) + + +def _table_rows(results: list[dict], columns: list[tuple[str, callable]]) -> tuple[list[str], list[list[str]], list[int]]: + headers = [header for header, _ in columns] + widths = [len(header) for header in headers] + rows = [[formatter(result) for _, formatter in columns] for result in results] + for row in rows: + widths = [max(width, len(cell)) for width, cell in zip(widths, row, strict=True)] + return headers, rows, widths + + +def print_table(results: list[dict], columns: list[tuple[str, callable]]) -> None: + headers, rows, widths = _table_rows(results, columns) + print(_format_row(headers, widths)) + print(_format_row(["-" * width for width in widths], widths)) + for row in rows: + print(_format_row(row, widths)) if __name__ == "__main__": diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index d31416cc..29e86fcd 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -63,7 +63,10 @@ class ExactPredicatePlan: base: blosc2.NDArray descriptor: dict field: str | None - intervals: list[tuple[int, int]] + lower: object | None = None + lower_inclusive: bool = True + upper: object | None = None + upper_inclusive: bool = True def _default_index_store() -> dict: @@ -91,6 +94,8 @@ def _copy_nested_dict(value: dict | None) -> dict | None: def _copy_descriptor(descriptor: dict) -> dict: copied = descriptor.copy() copied["levels"] = _copy_nested_dict(descriptor.get("levels")) + if descriptor.get("reduced") is not None: + copied["reduced"] = descriptor["reduced"].copy() if descriptor.get("full") is not None: copied["full"] = descriptor["full"].copy() return copied @@ -287,6 +292,57 @@ def _build_full_descriptor( } +def 
_position_dtype(max_value: int) -> np.dtype: + if max_value <= np.iinfo(np.uint8).max: + return np.dtype(np.uint8) + if max_value <= np.iinfo(np.uint16).max: + return np.dtype(np.uint16) + if max_value <= np.iinfo(np.uint32).max: + return np.dtype(np.uint32) + return np.dtype(np.uint64) + + +def _build_reduced_descriptor( + array: blosc2.NDArray, + field: str | None, + kind: str, + values: np.ndarray, + persistent: bool, +) -> dict: + block_len = int(array.blocks[0]) + nblocks = math.ceil(values.shape[0] / block_len) + position_dtype = _position_dtype(block_len - 1) + offsets = np.empty(nblocks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty_like(values) + positions = np.empty(values.shape[0], dtype=position_dtype) + cursor = 0 + + for block_id in range(nblocks): + start = block_id * block_len + stop = min(start + block_len, values.shape[0]) + block = values[start:stop] + order = np.argsort(block, kind="stable") + block_size = stop - start + next_cursor = cursor + block_size + sorted_values[cursor:next_cursor] = block[order] + positions[cursor:next_cursor] = order.astype(position_dtype, copy=False) + cursor = next_cursor + offsets[block_id + 1] = cursor + + values_sidecar = _store_array_sidecar(array, field, kind, "reduced", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar( + array, field, kind, "reduced", "positions", positions, persistent + ) + offsets_sidecar = _store_array_sidecar(array, field, kind, "reduced", "offsets", offsets, persistent) + return { + "block_len": block_len, + "values_path": values_sidecar["path"], + "positions_path": positions_sidecar["path"], + "offsets_path": offsets_sidecar["path"], + } + + def _build_descriptor( array: blosc2.NDArray, field: str | None, @@ -297,6 +353,7 @@ def _build_descriptor( name: str | None, dtype: np.dtype, levels: dict, + reduced: dict | None, full: dict | None, ) -> dict: return { @@ -313,6 +370,7 @@ def _build_descriptor( "chunks": tuple(array.chunks), "blocks": 
tuple(array.blocks), "levels": levels, + "reduced": reduced, "full": full, } @@ -338,9 +396,10 @@ def create_index( values = _values_for_index(array, field) levels = _build_levels_descriptor(array, field, kind, dtype, values, persistent) + reduced = _build_reduced_descriptor(array, field, kind, values, persistent) if kind == "medium" else None full = _build_full_descriptor(array, field, kind, values, persistent) if kind == "full" else None descriptor = _build_descriptor( - array, field, kind, optlevel, granularity, persistent, name, dtype, levels, full + array, field, kind, optlevel, granularity, persistent, name, dtype, levels, reduced, full ) store = _load_store(array) @@ -353,8 +412,7 @@ def create_csindex(array: blosc2.NDArray, field: str | None = None, **kwargs) -> return create_index(array, field=field, kind="full", **kwargs) -def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> None: - store = _load_store(array) +def _resolve_index_token(store: dict, field: str | None, name: str | None) -> str: token = _field_token(field) if field is not None or name is None else None if token is None: for key, descriptor in store["indexes"].items(): @@ -363,31 +421,38 @@ def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None break if token is None or token not in store["indexes"]: raise KeyError("index not found") + return token + +def _remove_sidecar_path(path: str | None) -> None: + if path: + blosc2.remove_urlpath(path) + + +def _drop_descriptor_sidecars(descriptor: dict) -> None: + for level_info in descriptor["levels"].values(): + _remove_sidecar_path(level_info["path"]) + if descriptor.get("reduced") is not None: + _remove_sidecar_path(descriptor["reduced"]["values_path"]) + _remove_sidecar_path(descriptor["reduced"]["positions_path"]) + _remove_sidecar_path(descriptor["reduced"]["offsets_path"]) + if descriptor.get("full") is not None: + _remove_sidecar_path(descriptor["full"]["values_path"]) + 
_remove_sidecar_path(descriptor["full"]["positions_path"]) + + +def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> None: + store = _load_store(array) + token = _resolve_index_token(store, field, name) descriptor = store["indexes"].pop(token) _save_store(array, store) _clear_cached_data(array, descriptor["field"]) - for level_info in descriptor["levels"].values(): - if level_info["path"]: - blosc2.remove_urlpath(level_info["path"]) - if descriptor.get("full") is not None: - if descriptor["full"]["values_path"]: - blosc2.remove_urlpath(descriptor["full"]["values_path"]) - if descriptor["full"]["positions_path"]: - blosc2.remove_urlpath(descriptor["full"]["positions_path"]) + _drop_descriptor_sidecars(descriptor) def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> dict: store = _load_store(array) - token = _field_token(field) if field is not None or name is None else None - if token is None: - for key, descriptor in store["indexes"].items(): - if descriptor.get("name") == name: - token = key - field = descriptor["field"] - break - if token is None or token not in store["indexes"]: - raise KeyError("index not found") + token = _resolve_index_token(store, field, name) descriptor = store["indexes"][token] drop_index(array, field=descriptor["field"], name=descriptor["name"]) return create_index( @@ -444,6 +509,20 @@ def _load_full_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarr return values, positions +def _load_reduced_arrays( + array: blosc2.NDArray, descriptor: dict +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + reduced = descriptor.get("reduced") + if reduced is None: + raise RuntimeError("reduced index metadata is not available") + values = _load_array_sidecar(array, descriptor["field"], "reduced", "values", reduced["values_path"]) + positions = _load_array_sidecar( + array, descriptor["field"], "reduced", "positions", reduced["positions_path"] + ) + offsets = 
_load_array_sidecar(array, descriptor["field"], "reduced", "offsets", reduced["offsets_path"]) + return values, positions, offsets + + def _normalize_scalar(value, dtype: np.dtype): if isinstance(value, np.generic): return value.item() @@ -684,78 +763,104 @@ def _plan_exact_compare(node: ast.Compare, operands: dict) -> ExactPredicatePlan return None base, field, op, value = target descriptor = _descriptor_for(base, field) - if descriptor is None or descriptor.get("kind") != "full": + if descriptor is None or descriptor.get("kind") not in {"medium", "full"}: return None - dtype = np.dtype(descriptor["dtype"]) try: - sorted_values, _ = _load_full_arrays(base, descriptor) - intervals = _intervals_from_sorted(sorted_values, op, value, dtype) + value = _normalize_scalar(value, np.dtype(descriptor["dtype"])) except (RuntimeError, ValueError, TypeError): return None - return ExactPredicatePlan(base=base, descriptor=descriptor, field=field, intervals=intervals) + if op == "==": + return ExactPredicatePlan( + base=base, + descriptor=descriptor, + field=field, + lower=value, + lower_inclusive=True, + upper=value, + upper_inclusive=True, + ) + if op == ">": + return ExactPredicatePlan( + base=base, descriptor=descriptor, field=field, lower=value, lower_inclusive=False + ) + if op == ">=": + return ExactPredicatePlan( + base=base, descriptor=descriptor, field=field, lower=value, lower_inclusive=True + ) + if op == "<": + return ExactPredicatePlan( + base=base, descriptor=descriptor, field=field, upper=value, upper_inclusive=False + ) + if op == "<=": + return ExactPredicatePlan( + base=base, descriptor=descriptor, field=field, upper=value, upper_inclusive=True + ) + return None def _same_base(left: ExactPredicatePlan, right: ExactPredicatePlan) -> bool: return left.base is right.base and left.field == right.field -def _normalize_intervals(intervals: list[tuple[int, int]]) -> list[tuple[int, int]]: - if not intervals: - return [] - intervals = sorted(intervals) - normalized = 
[intervals[0]] - for lo, hi in intervals[1:]: - prev_lo, prev_hi = normalized[-1] - if lo <= prev_hi: - normalized[-1] = (prev_lo, max(prev_hi, hi)) - else: - normalized.append((lo, hi)) - return normalized - - -def _intersect_intervals( - left_intervals: list[tuple[int, int]], right_intervals: list[tuple[int, int]] -) -> list[tuple[int, int]]: - intersections = [] - left = _normalize_intervals(left_intervals) - right = _normalize_intervals(right_intervals) - i = j = 0 - while i < len(left) and j < len(right): - lo = max(left[i][0], right[j][0]) - hi = min(left[i][1], right[j][1]) - if lo < hi: - intersections.append((lo, hi)) - if left[i][1] <= right[j][1]: - i += 1 - else: - j += 1 - return intersections +def _merge_lower_bound( + left: object | None, left_inclusive: bool, right: object | None, right_inclusive: bool +) -> tuple[object | None, bool]: + if left is None: + return right, right_inclusive + if right is None: + return left, left_inclusive + if left < right: + return right, right_inclusive + if left > right: + return left, left_inclusive + return left, left_inclusive and right_inclusive + + +def _merge_upper_bound( + left: object | None, left_inclusive: bool, right: object | None, right_inclusive: bool +) -> tuple[object | None, bool]: + if left is None: + return right, right_inclusive + if right is None: + return left, left_inclusive + if left < right: + return left, left_inclusive + if left > right: + return right, right_inclusive + return left, left_inclusive and right_inclusive def _merge_exact_plans( left: ExactPredicatePlan, right: ExactPredicatePlan, op: str ) -> ExactPredicatePlan | None: - if not _same_base(left, right): + if op != "and" or not _same_base(left, right): return None - if op == "and": - intervals = _intersect_intervals(left.intervals, right.intervals) - else: - intervals = _normalize_intervals(left.intervals + right.intervals) + lower, lower_inclusive = _merge_lower_bound( + left.lower, left.lower_inclusive, right.lower, 
right.lower_inclusive + ) + upper, upper_inclusive = _merge_upper_bound( + left.upper, left.upper_inclusive, right.upper, right.upper_inclusive + ) return ExactPredicatePlan( - base=left.base, descriptor=left.descriptor, field=left.field, intervals=intervals + base=left.base, + descriptor=left.descriptor, + field=left.field, + lower=lower, + lower_inclusive=lower_inclusive, + upper=upper, + upper_inclusive=upper_inclusive, ) def _plan_exact_boolop(node: ast.BoolOp, operands: dict) -> ExactPredicatePlan | None: - op = "and" if isinstance(node.op, ast.And) else "or" if isinstance(node.op, ast.Or) else None - if op is None: + if not isinstance(node.op, ast.And): return None plans = [_plan_exact_node(value, operands) for value in node.values] if any(plan is None for plan in plans): return None plan = plans[0] for other in plans[1:]: - merged = _merge_exact_plans(plan, other, op) + merged = _merge_exact_plans(plan, other, "and") if merged is None: return None plan = merged @@ -763,17 +868,13 @@ def _plan_exact_boolop(node: ast.BoolOp, operands: dict) -> ExactPredicatePlan | def _plan_exact_bitop(node: ast.BinOp, operands: dict) -> ExactPredicatePlan | None: - if isinstance(node.op, ast.BitAnd): - op = "and" - elif isinstance(node.op, ast.BitOr): - op = "or" - else: + if not isinstance(node.op, ast.BitAnd): return None left = _plan_exact_node(node.left, operands) right = _plan_exact_node(node.right, operands) if left is None or right is None: return None - return _merge_exact_plans(left, right, op) + return _merge_exact_plans(left, right, "and") def _plan_exact_node(node: ast.AST, operands: dict) -> ExactPredicatePlan | None: @@ -786,12 +887,81 @@ def _plan_exact_node(node: ast.AST, operands: dict) -> ExactPredicatePlan | None return None -def _positions_from_intervals(plan: ExactPredicatePlan) -> np.ndarray: - _, positions = _load_full_arrays(plan.base, plan.descriptor) - if not plan.intervals: +def _range_is_empty(plan: ExactPredicatePlan) -> bool: + if plan.lower is 
None or plan.upper is None: + return False + if plan.lower < plan.upper: + return False + if plan.lower > plan.upper: + return True + return not (plan.lower_inclusive and plan.upper_inclusive) + + +def _candidate_units_from_exact_plan( + summaries: np.ndarray, dtype: np.dtype, plan: ExactPredicatePlan +) -> np.ndarray: + candidate_units = np.ones(len(summaries), dtype=bool) + if plan.lower is not None: + lower_op = ">=" if plan.lower_inclusive else ">" + candidate_units &= _candidate_units_from_summary(summaries, lower_op, plan.lower, dtype) + if plan.upper is not None: + upper_op = "<=" if plan.upper_inclusive else "<" + candidate_units &= _candidate_units_from_summary(summaries, upper_op, plan.upper, dtype) + return candidate_units + + +def _search_bounds(values: np.ndarray, plan: ExactPredicatePlan) -> tuple[int, int]: + lo = 0 + hi = len(values) + if plan.lower is not None: + side = "left" if plan.lower_inclusive else "right" + lo = int(np.searchsorted(values, plan.lower, side=side)) + if plan.upper is not None: + side = "right" if plan.upper_inclusive else "left" + hi = int(np.searchsorted(values, plan.upper, side=side)) + return lo, hi + + +def _exact_positions_from_full( + array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan +) -> np.ndarray: + if _range_is_empty(plan): + return np.empty(0, dtype=np.int64) + sorted_values, positions = _load_full_arrays(array, descriptor) + lo, hi = _search_bounds(sorted_values, plan) + if lo >= hi: + return np.empty(0, dtype=np.int64) + return np.sort(positions[lo:hi], kind="stable") + + +def _exact_positions_from_reduced( + array: blosc2.NDArray, descriptor: dict, dtype: np.dtype, plan: ExactPredicatePlan +) -> np.ndarray: + if _range_is_empty(plan): + return np.empty(0, dtype=np.int64) + + summaries = _load_level_summaries(array, descriptor, "block") + candidate_blocks = _candidate_units_from_exact_plan(summaries, dtype, plan) + if not np.any(candidate_blocks): + return np.empty(0, dtype=np.int64) + + 
sorted_values, local_positions, offsets = _load_reduced_arrays(array, descriptor) + block_len = int(descriptor["reduced"]["block_len"]) + parts = [] + for block_id in np.flatnonzero(candidate_blocks): + start = int(offsets[block_id]) + stop = int(offsets[block_id + 1]) + block_values = sorted_values[start:stop] + lo, hi = _search_bounds(block_values, plan) + if lo >= hi: + continue + absolute = block_id * block_len + local = local_positions[start + lo : start + hi].astype(np.int64, copy=False) + parts.append(absolute + local) + + if not parts: return np.empty(0, dtype=np.int64) - selected = [positions[lo:hi] for lo, hi in plan.intervals] - merged = np.concatenate(selected) if len(selected) > 1 else selected[0] + merged = np.concatenate(parts) if len(parts) > 1 else parts[0] return np.sort(merged, kind="stable") @@ -808,14 +978,21 @@ def plan_query(expression: str, operands: dict, where: dict | None, *, use_index exact_plan = _plan_exact_node(tree.body, operands) if exact_plan is not None: - exact_positions = _positions_from_intervals(exact_plan) + kind = exact_plan.descriptor["kind"] + dtype = np.dtype(exact_plan.descriptor["dtype"]) + if kind == "full": + exact_positions = _exact_positions_from_full(exact_plan.base, exact_plan.descriptor, exact_plan) + else: + exact_positions = _exact_positions_from_reduced( + exact_plan.base, exact_plan.descriptor, dtype, exact_plan + ) return IndexPlan( True, - "full index selected", + f"{kind} exact index selected", descriptor=_copy_descriptor(exact_plan.descriptor), base=exact_plan.base, field=exact_plan.field, - level="full", + level=kind, total_units=exact_plan.base.shape[0], selected_units=len(exact_positions), exact_positions=exact_positions, @@ -899,9 +1076,32 @@ def _gather_positions(where_x, positions: np.ndarray) -> np.ndarray: return np.concatenate(parts) if len(parts) > 1 else parts[0] +def _gather_positions_by_chunk(where_x, positions: np.ndarray, chunk_len: int) -> np.ndarray: + if len(positions) == 0: + return 
np.empty(0, dtype=_where_output_dtype(where_x)) + + positions = np.asarray(positions, dtype=np.int64) + output = np.empty(len(positions), dtype=_where_output_dtype(where_x)) + chunk_ids = positions // chunk_len + breaks = np.nonzero(np.diff(chunk_ids) != 0)[0] + 1 + start_idx = 0 + for stop_idx in (*breaks, len(positions)): + chunk_positions = positions[start_idx:stop_idx] + chunk_id = int(chunk_ids[start_idx]) + chunk_start = chunk_id * chunk_len + chunk_stop = chunk_start + chunk_len + chunk_values = where_x[chunk_start:chunk_stop] + local_positions = chunk_positions - chunk_start + output[start_idx:stop_idx] = chunk_values[local_positions] + start_idx = stop_idx + return output + + def evaluate_full_query(where: dict, plan: IndexPlan) -> np.ndarray: if plan.exact_positions is None: raise ValueError("full evaluation requires exact positions") + if plan.base is not None: + return _gather_positions_by_chunk(where["_where_x"], plan.exact_positions, int(plan.base.chunks[0])) return _gather_positions(where["_where_x"], plan.exact_positions) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 53ed2247..32522e88 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -51,17 +51,44 @@ def test_structured_field_index_matches_scan(kind): np.testing.assert_array_equal(indexed, data[(data["id"] >= 48_000) & (data["id"] < 51_000)]) -def test_persistent_index_survives_reopen(tmp_path): +@pytest.mark.parametrize("kind", ["medium", "full"]) +def test_random_field_index_matches_scan(kind): + rng = np.random.default_rng(0) + dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.zeros(150_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + rng.shuffle(data["id"]) + + arr = blosc2.asarray(data, chunks=(15_000,), blocks=(3_000,)) + arr.create_index(field="id", kind=kind) + + expr = blosc2.lazyexpr("(id >= 70_000) & (id < 71_200)", arr.fields).where(arr) + assert 
expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data["id"] >= 70_000) & (data["id"] < 71_200)]) + + +@pytest.mark.parametrize("kind", ["medium", "full"]) +def test_persistent_index_survives_reopen(tmp_path, kind): path = tmp_path / "indexed_array.b2nd" data = np.arange(80_000, dtype=np.int64) arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(8_000,), blocks=(2_000,)) - descriptor = arr.create_index(kind="full") + descriptor = arr.create_index(kind=kind) - assert descriptor["full"]["values_path"] is not None + if kind == "medium": + assert descriptor["reduced"]["values_path"] is not None + else: + assert descriptor["full"]["values_path"] is not None reopened = blosc2.open(path, mode="a") assert len(reopened.indexes) == 1 - assert reopened.indexes[0]["full"]["values_path"] == descriptor["full"]["values_path"] + if kind == "medium": + assert reopened.indexes[0]["reduced"]["values_path"] == descriptor["reduced"]["values_path"] + else: + assert reopened.indexes[0]["full"]["values_path"] == descriptor["full"]["values_path"] expr = (reopened >= 72_000).where(reopened) assert expr.will_use_index() is True From 241d1251fd6e198f030c9c5fa08a422eed3592f5 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 05:54:27 +0200 Subject: [PATCH 04/68] New get_1d_span_numpy for reading single blocks --- bench/ndarray/index_query_bench.py | 14 +++---- src/blosc2/blosc2_ext.pyx | 60 ++++++++++++++++++++++++++++++ src/blosc2/indexing.py | 50 +++++++++++++++++++++++++ tests/ndarray/test_indexing.py | 20 ++++++++++ 4 files changed, 135 insertions(+), 9 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index f29f5d7b..46566b2e 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -180,7 +180,6 @@ def benchmark_size(size: int, 
size_dir: Path, dist: str, query_width: int) -> li "size": size, "dist": dist, "kind": kind, - "level": explanation["level"], "query_rows": index_len, "build_s": build_time, "create_idx_ms": build_time * 1_000, @@ -195,19 +194,18 @@ def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int) -> li "disk_index_bytes": disk_index_bytes, "index_pct": logical_index_bytes / base_bytes * 100, "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, + "_arr": idx_arr, + "_expr": idx_expr, } ) return rows -def measure_warm_queries(rows: list[dict], size_dir: Path, query_width: int, repeats: int) -> None: +def measure_warm_queries(rows: list[dict], repeats: int) -> None: if repeats <= 0: return for result in rows: - arr = blosc2.open(indexed_array_path(size_dir, result["size"], result["dist"], result["kind"]), mode="a") - lo = result["size"] // 2 - hi = min(result["size"], lo + query_width) - expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields).where(arr) + expr = result["_expr"] index_runs = [benchmark_once(expr, use_index=True)[0] for _ in range(repeats)] warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None result["warm_ms"] = warm_ms @@ -316,7 +314,6 @@ def run_benchmarks( ("rows", lambda result: f"{result['size']:,}"), ("dist", lambda result: result["dist"]), ("kind", lambda result: result["kind"]), - ("level", lambda result: result["level"]), ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), @@ -328,7 +325,7 @@ def run_benchmarks( ], ) if repeats > 0: - measure_warm_queries(all_results, size_dir, query_width, repeats) + measure_warm_queries(all_results, repeats) print() print("Warm Query Table") print_table( @@ -337,7 +334,6 @@ def run_benchmarks( ("rows", lambda result: f"{result['size']:,}"), ("dist", lambda result: result["dist"]), ("kind", lambda result: result["kind"]), - ("level", 
lambda result: result["level"]), ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 2ba002e5..2cb46cd8 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -3235,6 +3235,66 @@ cdef class NDArray: return arr + def get_1d_span_numpy(self, arr, int64_t nchunk, int32_t start, int32_t nitems): + if self.ndim != 1: + raise ValueError("get_1d_span_numpy is only supported for 1-D arrays") + if nchunk < 0 or nchunk >= self.array.sc.nchunks: + raise IndexError("chunk index out of range") + if start < 0 or nitems < 0: + raise ValueError("start and nitems must be >= 0") + if start + nitems > self.array.chunknitems: + raise ValueError("requested span exceeds chunk size") + + cdef uint8_t *chunk = NULL + cdef c_bool needs_free + cdef int32_t chunk_nbytes + cdef int32_t chunk_cbytes + cdef int32_t block_nbytes + cdef blosc2_context *dctx = self.array.sc.dctx + cdef Py_buffer view + cdef int rc + cdef c_bool owns_dctx = False + + rc = blosc2_schunk_get_chunk(self.array.sc, nchunk, &chunk, &needs_free) + if rc < 0: + raise RuntimeError("Error while getting the chunk") + + rc = blosc2_cbuffer_sizes(chunk, &chunk_nbytes, &chunk_cbytes, &block_nbytes) + if rc < 0: + if needs_free: + free(chunk) + raise RuntimeError("Error while getting compressed buffer sizes") + if start + nitems > chunk_nbytes // self.array.sc.typesize: + if needs_free: + free(chunk) + raise ValueError("requested span exceeds decoded chunk size") + + PyObject_GetBuffer(arr, &view, PyBUF_SIMPLE) + if view.len < nitems * self.array.sc.typesize: + PyBuffer_Release(&view) + if needs_free: + free(chunk) + raise ValueError("destination buffer is smaller than the requested decoded span") + + if dctx == NULL: + dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS) + 
owns_dctx = True + if dctx == NULL: + PyBuffer_Release(&view) + if needs_free: + free(chunk) + raise RuntimeError("Could not create decompression context") + rc = blosc2_getitem_ctx(dctx, chunk, chunk_cbytes, start, nitems, view.buf, view.len) + if owns_dctx: + blosc2_free_ctx(dctx) + PyBuffer_Release(&view) + if needs_free: + free(chunk) + if rc < 0: + raise RuntimeError("Error while decoding the requested span") + + return arr + def get_oindex_numpy(self, arr, key): """ Orthogonal indexing. Key is a tuple of lists of integer indices. diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 29e86fcd..007221c5 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -31,6 +31,7 @@ _IN_MEMORY_INDEXES: dict[int, dict] = {} _DATA_CACHE: dict[tuple[int, str | None, str, str], np.ndarray] = {} +BLOCK_GATHER_POSITIONS_THRESHOLD = 32 @dataclass(slots=True) @@ -1097,10 +1098,59 @@ def _gather_positions_by_chunk(where_x, positions: np.ndarray, chunk_len: int) - return output +def _supports_block_reads(where_x) -> bool: + return isinstance(where_x, blosc2.NDArray) and hasattr(where_x, "get_1d_span_numpy") + + +def _gather_positions_by_block( + where_x, positions: np.ndarray, chunk_len: int, block_len: int, total_len: int +) -> np.ndarray: + if len(positions) == 0: + return np.empty(0, dtype=_where_output_dtype(where_x)) + if not _supports_block_reads(where_x): + return _gather_positions_by_chunk(where_x, positions, chunk_len) + + positions = np.asarray(positions, dtype=np.int64) + output = np.empty(len(positions), dtype=_where_output_dtype(where_x)) + chunk_ids = positions // chunk_len + chunk_breaks = np.nonzero(np.diff(chunk_ids) != 0)[0] + 1 + chunk_start_idx = 0 + for chunk_stop_idx in (*chunk_breaks, len(positions)): + chunk_positions = positions[chunk_start_idx:chunk_stop_idx] + chunk_id = int(chunk_ids[chunk_start_idx]) + chunk_origin = chunk_id * chunk_len + local_positions = chunk_positions - chunk_origin + block_ids = local_positions // 
block_len + unique_blocks = np.unique(block_ids) + if len(unique_blocks) != 1: + chunk_stop = min(chunk_origin + chunk_len, total_len) + chunk_values = where_x[chunk_origin:chunk_stop] + output[chunk_start_idx:chunk_stop_idx] = chunk_values[local_positions] + chunk_start_idx = chunk_stop_idx + continue + + span_start = int(local_positions[0]) + span_stop = int(local_positions[-1]) + 1 + span_items = span_stop - span_start + span_values = np.empty(span_items, dtype=_where_output_dtype(where_x)) + where_x.get_1d_span_numpy(span_values, chunk_id, span_start, span_items) + output[chunk_start_idx:chunk_stop_idx] = span_values[local_positions - span_start] + chunk_start_idx = chunk_stop_idx + return output + + def evaluate_full_query(where: dict, plan: IndexPlan) -> np.ndarray: if plan.exact_positions is None: raise ValueError("full evaluation requires exact positions") if plan.base is not None: + if len(plan.exact_positions) <= BLOCK_GATHER_POSITIONS_THRESHOLD: + return _gather_positions_by_block( + where["_where_x"], + plan.exact_positions, + int(plan.base.chunks[0]), + int(plan.base.blocks[0]), + int(plan.base.shape[0]), + ) return _gather_positions_by_chunk(where["_where_x"], plan.exact_positions, int(plan.base.chunks[0])) return _gather_positions(where["_where_x"], plan.exact_positions) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 32522e88..14dfd335 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -71,6 +71,26 @@ def test_random_field_index_matches_scan(kind): np.testing.assert_array_equal(indexed, data[(data["id"] >= 70_000) & (data["id"] < 71_200)]) +@pytest.mark.parametrize("kind", ["medium", "full"]) +def test_random_field_point_query_matches_scan(kind): + rng = np.random.default_rng(1) + dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.zeros(200_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + rng.shuffle(data["id"]) + + arr = 
blosc2.asarray(data, chunks=(20_000,), blocks=(4_000,)) + arr.create_index(field="id", kind=kind) + + expr = blosc2.lazyexpr("(id >= 123_456) & (id < 123_457)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data["id"] >= 123_456) & (data["id"] < 123_457)]) + + @pytest.mark.parametrize("kind", ["medium", "full"]) def test_persistent_index_survives_reopen(tmp_path, kind): path = tmp_path / "indexed_array.b2nd" From 43e8e67b1910b4237519cf655108398b3de406d9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 06:03:00 +0200 Subject: [PATCH 05/68] Use lazychunks for avoiding a full chunk load --- src/blosc2/blosc2_ext.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 2cb46cd8..50888a9c 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -3255,9 +3255,9 @@ cdef class NDArray: cdef int rc cdef c_bool owns_dctx = False - rc = blosc2_schunk_get_chunk(self.array.sc, nchunk, &chunk, &needs_free) + rc = blosc2_schunk_get_lazychunk(self.array.sc, nchunk, &chunk, &needs_free) if rc < 0: - raise RuntimeError("Error while getting the chunk") + raise RuntimeError("Error while getting the lazy chunk") rc = blosc2_cbuffer_sizes(chunk, &chunk_nbytes, &chunk_cbytes, &block_nbytes) if rc < 0: From bf4771c1b034ba8601b3ffa1a15ad1bd25d419ec Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 06:24:36 +0200 Subject: [PATCH 06/68] Speed up indexed point queries and benchmark the public query path Cache persisted index descriptors per array to avoid repeated vlmeta loads during indexed queries, and keep lazy-chunk span reads for the block-aware gather path. 
This reduces planner overhead substantially for tiny exact-hit queries: - _load_store() becomes effectively free after the first lookup - plan_query() drops from about 0.27 ms to about 0.02 ms - arr[cond][:] on 10M random/full point queries drops to ~0.24 ms Update the benchmark to measure the clearer public indexed idiom: - keep scan baseline with cond.where(arr).compute(_use_index=False)[:] - use arr[cond][:] for indexed timings This makes benchmark results closer to real user code and shows the actual public-query latency improvements more accurately. --- bench/ndarray/index_query_bench.py | 28 +++++++++++++++++++--------- src/blosc2/indexing.py | 19 +++++++++++++------ 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 46566b2e..2be6de6b 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -76,9 +76,16 @@ def indexed_array_path(size_dir: Path, size: int, dist: str, kind: str) -> Path: return size_dir / f"size_{size}_{dist}.{kind}.b2nd" -def benchmark_once(expr, *, use_index: bool) -> tuple[float, int]: +def benchmark_scan_once(expr) -> tuple[float, int]: start = time.perf_counter() - result = expr.compute(_use_index=use_index)[:] + result = expr.compute(_use_index=False)[:] + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def benchmark_index_once(arr: blosc2.NDArray, cond) -> tuple[float, int]: + start = time.perf_counter() + result = arr[cond][:] elapsed = time.perf_counter() - start return elapsed, len(result) @@ -161,19 +168,21 @@ def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int) -> li arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist), get_data) lo = size // 2 hi = min(size, lo + query_width) - expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields).where(arr) + condition = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields) + expr = 
condition.where(arr) base_bytes = size * arr.dtype.itemsize compressed_base_bytes = os.path.getsize(arr.urlpath) - scan_ms = benchmark_once(expr, use_index=False)[0] * 1_000 + scan_ms = benchmark_scan_once(expr)[0] * 1_000 rows = [] for kind in KINDS: idx_arr, build_time = _open_or_build_indexed_array(indexed_array_path(size_dir, size, dist, kind), get_data, kind) - idx_expr = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", idx_arr.fields).where(idx_arr) + idx_cond = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", idx_arr.fields) + idx_expr = idx_cond.where(idx_arr) explanation = idx_expr.explain() logical_index_bytes, disk_index_bytes = index_sizes(idx_arr.indexes[0]) - cold_time, index_len = benchmark_once(idx_expr, use_index=True) + cold_time, index_len = benchmark_index_once(idx_arr, idx_cond) rows.append( { @@ -195,7 +204,7 @@ def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int) -> li "index_pct": logical_index_bytes / base_bytes * 100, "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, "_arr": idx_arr, - "_expr": idx_expr, + "_cond": idx_cond, } ) return rows @@ -205,8 +214,9 @@ def measure_warm_queries(rows: list[dict], repeats: int) -> None: if repeats <= 0: return for result in rows: - expr = result["_expr"] - index_runs = [benchmark_once(expr, use_index=True)[0] for _ in range(repeats)] + arr = result["_arr"] + cond = result["_cond"] + index_runs = [benchmark_index_once(arr, cond)[0] for _ in range(repeats)] warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None result["warm_ms"] = warm_ms result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 007221c5..04ac9272 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -107,26 +107,33 @@ def _is_persistent_array(array: blosc2.NDArray) -> bool: def _load_store(array: blosc2.NDArray) -> dict: + key = _array_key(array) + cached = 
_IN_MEMORY_INDEXES.get(key) + if cached is not None: + return cached + if _is_persistent_array(array): try: store = array.schunk.vlmeta[INDEXES_VLMETA_KEY] except KeyError: - return _default_index_store() + store = _default_index_store() if not isinstance(store, dict): - return _default_index_store() + store = _default_index_store() store.setdefault("version", INDEX_FORMAT_VERSION) store.setdefault("indexes", {}) - return store - return _IN_MEMORY_INDEXES.get(_array_key(array), _default_index_store()) + else: + store = _default_index_store() + + _IN_MEMORY_INDEXES[key] = store + return store def _save_store(array: blosc2.NDArray, store: dict) -> None: store.setdefault("version", INDEX_FORMAT_VERSION) store.setdefault("indexes", {}) + _IN_MEMORY_INDEXES[_array_key(array)] = store if _is_persistent_array(array): array.schunk.vlmeta[INDEXES_VLMETA_KEY] = store - else: - _IN_MEMORY_INDEXES[_array_key(array)] = store def _supported_index_dtype(dtype: np.dtype) -> bool: From cb2a0cd11ac5d113ca4992235b0987b8923d95df Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 07:39:35 +0200 Subject: [PATCH 07/68] New (preliminary) algorithm for light indexes --- bench/ndarray/index_query_bench.py | 16 +- src/blosc2/indexing.py | 235 +++++++++++++++++++++++++++-- src/blosc2/lazyexpr.py | 2 + tests/ndarray/test_indexing.py | 14 +- 4 files changed, 245 insertions(+), 22 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 2be6de6b..3a532d19 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -17,6 +17,7 @@ import numpy as np import blosc2 +from blosc2 import indexing as blosc2_indexing SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) CHUNK_LEN = 100_000 @@ -99,6 +100,13 @@ def index_sizes(descriptor: dict) -> tuple[int, int]: if level_info["path"]: disk += os.path.getsize(level_info["path"]) + light = descriptor.get("light") + if light is not None: + for key in 
("values_path", "bucket_positions_path", "offsets_path"): + array = blosc2.open(light[key]) + logical += int(np.prod(array.shape)) * array.dtype.itemsize + disk += os.path.getsize(light[key]) + reduced = descriptor.get("reduced") if reduced is not None: values = blosc2.open(reduced["values_path"]) @@ -136,7 +144,13 @@ def get_data() -> np.ndarray: def _valid_index_descriptor(arr: blosc2.NDArray, kind: str) -> dict | None: for descriptor in arr.indexes: - if descriptor.get("field") == "id" and descriptor.get("kind") == kind and not descriptor.get("stale", False): + if descriptor.get("version") != blosc2_indexing.INDEX_FORMAT_VERSION: + continue + if ( + descriptor.get("field") == "id" + and descriptor.get("kind") == kind + and not descriptor.get("stale", False) + ): return descriptor return None diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 04ac9272..4745b0d0 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -17,7 +17,7 @@ import blosc2 INDEXES_VLMETA_KEY = "blosc2_indexes" -INDEX_FORMAT_VERSION = 1 +INDEX_FORMAT_VERSION = 2 FLAG_ALL_NAN = np.uint8(1 << 0) FLAG_HAS_NAN = np.uint8(1 << 1) @@ -47,6 +47,13 @@ class IndexPlan: total_units: int = 0 selected_units: int = 0 exact_positions: np.ndarray | None = None + bucket_masks: np.ndarray | None = None + bucket_len: int | None = None + block_len: int | None = None + lower: object | None = None + lower_inclusive: bool = True + upper: object | None = None + upper_inclusive: bool = True @dataclass(slots=True) @@ -95,6 +102,8 @@ def _copy_nested_dict(value: dict | None) -> dict | None: def _copy_descriptor(descriptor: dict) -> dict: copied = descriptor.copy() copied["levels"] = _copy_nested_dict(descriptor.get("levels")) + if descriptor.get("light") is not None: + copied["light"] = descriptor["light"].copy() if descriptor.get("reduced") is not None: copied["reduced"] = descriptor["reduced"].copy() if descriptor.get("full") is not None: @@ -351,6 +360,61 @@ def 
_build_reduced_descriptor( } +def _light_bucket_count(block_len: int) -> int: + return max(1, min(64, block_len)) + + +def _pack_bucket_mask(bucket_ids: np.ndarray) -> np.uint64: + mask = np.uint64(0) + for bucket_id in np.unique(bucket_ids): + mask |= np.uint64(1) << np.uint64(int(bucket_id)) + return mask + + +def _build_light_descriptor( + array: blosc2.NDArray, + field: str | None, + kind: str, + values: np.ndarray, + persistent: bool, +) -> dict: + block_len = int(array.blocks[0]) + bucket_count = _light_bucket_count(block_len) + bucket_len = math.ceil(block_len / bucket_count) + nblocks = math.ceil(values.shape[0] / block_len) + offsets = np.empty(nblocks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty_like(values) + bucket_positions = np.empty(values.shape[0], dtype=np.uint8) + cursor = 0 + + for block_id in range(nblocks): + start = block_id * block_len + stop = min(start + block_len, values.shape[0]) + block = values[start:stop] + order = np.argsort(block, kind="stable") + block_size = stop - start + next_cursor = cursor + block_size + sorted_values[cursor:next_cursor] = block[order] + bucket_positions[cursor:next_cursor] = (order // bucket_len).astype(np.uint8, copy=False) + cursor = next_cursor + offsets[block_id + 1] = cursor + + values_sidecar = _store_array_sidecar(array, field, kind, "light", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar( + array, field, kind, "light", "bucket_positions", bucket_positions, persistent + ) + offsets_sidecar = _store_array_sidecar(array, field, kind, "light", "offsets", offsets, persistent) + return { + "block_len": block_len, + "bucket_count": bucket_count, + "bucket_len": bucket_len, + "values_path": values_sidecar["path"], + "bucket_positions_path": positions_sidecar["path"], + "offsets_path": offsets_sidecar["path"], + } + + def _build_descriptor( array: blosc2.NDArray, field: str | None, @@ -361,6 +425,7 @@ def _build_descriptor( name: str | None, dtype: np.dtype, 
levels: dict, + light: dict | None, reduced: dict | None, full: dict | None, ) -> dict: @@ -378,6 +443,7 @@ def _build_descriptor( "chunks": tuple(array.chunks), "blocks": tuple(array.blocks), "levels": levels, + "light": light, "reduced": reduced, "full": full, } @@ -404,10 +470,11 @@ def create_index( values = _values_for_index(array, field) levels = _build_levels_descriptor(array, field, kind, dtype, values, persistent) + light = _build_light_descriptor(array, field, kind, values, persistent) if kind == "light" else None reduced = _build_reduced_descriptor(array, field, kind, values, persistent) if kind == "medium" else None full = _build_full_descriptor(array, field, kind, values, persistent) if kind == "full" else None descriptor = _build_descriptor( - array, field, kind, optlevel, granularity, persistent, name, dtype, levels, reduced, full + array, field, kind, optlevel, granularity, persistent, name, dtype, levels, light, reduced, full ) store = _load_store(array) @@ -440,6 +507,10 @@ def _remove_sidecar_path(path: str | None) -> None: def _drop_descriptor_sidecars(descriptor: dict) -> None: for level_info in descriptor["levels"].values(): _remove_sidecar_path(level_info["path"]) + if descriptor.get("light") is not None: + _remove_sidecar_path(descriptor["light"]["values_path"]) + _remove_sidecar_path(descriptor["light"]["bucket_positions_path"]) + _remove_sidecar_path(descriptor["light"]["offsets_path"]) if descriptor.get("reduced") is not None: _remove_sidecar_path(descriptor["reduced"]["values_path"]) _remove_sidecar_path(descriptor["reduced"]["positions_path"]) @@ -496,6 +567,10 @@ def _descriptor_for(array: blosc2.NDArray, field: str | None) -> dict | None: descriptor = _load_store(array)["indexes"].get(_field_token(field)) if descriptor is None or descriptor.get("stale", False): return None + if descriptor.get("version") != INDEX_FORMAT_VERSION: + return None + if descriptor.get("kind") == "light" and "values_path" not in descriptor.get("light", {}): + 
return None if tuple(descriptor.get("shape", ())) != tuple(array.shape): return None if tuple(descriptor.get("chunks", ())) != tuple(array.chunks): @@ -531,6 +606,18 @@ def _load_reduced_arrays( return values, positions, offsets +def _load_light_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + light = descriptor.get("light") + if light is None: + raise RuntimeError("light index metadata is not available") + values = _load_array_sidecar(array, descriptor["field"], "light", "values", light["values_path"]) + positions = _load_array_sidecar( + array, descriptor["field"], "light", "bucket_positions", light["bucket_positions_path"] + ) + offsets = _load_array_sidecar(array, descriptor["field"], "light", "offsets", light["offsets_path"]) + return values, positions, offsets + + def _normalize_scalar(value, dtype: np.dtype): if isinstance(value, np.generic): return value.item() @@ -771,7 +858,7 @@ def _plan_exact_compare(node: ast.Compare, operands: dict) -> ExactPredicatePlan return None base, field, op, value = target descriptor = _descriptor_for(base, field) - if descriptor is None or descriptor.get("kind") not in {"medium", "full"}: + if descriptor is None or descriptor.get("kind") not in {"light", "medium", "full"}: return None try: value = _normalize_scalar(value, np.dtype(descriptor["dtype"])) @@ -942,6 +1029,35 @@ def _exact_positions_from_full( return np.sort(positions[lo:hi], kind="stable") +def _bit_count_sum(masks: np.ndarray) -> int: + return sum(int(mask).bit_count() for mask in masks.tolist()) + + +def _bucket_masks_from_light( + array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan +) -> np.ndarray: + if _range_is_empty(plan): + return np.empty(0, dtype=np.uint64) + + summaries = _load_level_summaries(array, descriptor, "block") + dtype = np.dtype(descriptor["dtype"]) + candidate_blocks = _candidate_units_from_exact_plan(summaries, dtype, plan) + if not np.any(candidate_blocks): + return 
np.zeros(len(summaries), dtype=np.uint64) + + sorted_values, bucket_positions, offsets = _load_light_arrays(array, descriptor) + masks = np.zeros(len(summaries), dtype=np.uint64) + for block_id in np.flatnonzero(candidate_blocks): + start = int(offsets[block_id]) + stop = int(offsets[block_id + 1]) + block_values = sorted_values[start:stop] + lo, hi = _search_bounds(block_values, plan) + if lo >= hi: + continue + masks[block_id] = _pack_bucket_mask(bucket_positions[start + lo : start + hi]) + return masks + + def _exact_positions_from_reduced( array: blosc2.NDArray, descriptor: dict, dtype: np.dtype, plan: ExactPredicatePlan ) -> np.ndarray: @@ -987,24 +1103,58 @@ def plan_query(expression: str, operands: dict, where: dict | None, *, use_index exact_plan = _plan_exact_node(tree.body, operands) if exact_plan is not None: kind = exact_plan.descriptor["kind"] - dtype = np.dtype(exact_plan.descriptor["dtype"]) if kind == "full": exact_positions = _exact_positions_from_full(exact_plan.base, exact_plan.descriptor, exact_plan) - else: + return IndexPlan( + True, + f"{kind} exact index selected", + descriptor=_copy_descriptor(exact_plan.descriptor), + base=exact_plan.base, + field=exact_plan.field, + level=kind, + total_units=exact_plan.base.shape[0], + selected_units=len(exact_positions), + exact_positions=exact_positions, + ) + if kind == "medium": + dtype = np.dtype(exact_plan.descriptor["dtype"]) exact_positions = _exact_positions_from_reduced( exact_plan.base, exact_plan.descriptor, dtype, exact_plan ) - return IndexPlan( - True, - f"{kind} exact index selected", - descriptor=_copy_descriptor(exact_plan.descriptor), - base=exact_plan.base, - field=exact_plan.field, - level=kind, - total_units=exact_plan.base.shape[0], - selected_units=len(exact_positions), - exact_positions=exact_positions, - ) + return IndexPlan( + True, + f"{kind} exact index selected", + descriptor=_copy_descriptor(exact_plan.descriptor), + base=exact_plan.base, + field=exact_plan.field, + 
level=kind, + total_units=exact_plan.base.shape[0], + selected_units=len(exact_positions), + exact_positions=exact_positions, + ) + if kind == "light": + bucket_masks = _bucket_masks_from_light(exact_plan.base, exact_plan.descriptor, exact_plan) + light = exact_plan.descriptor["light"] + total_units = len(bucket_masks) * int(light["bucket_count"]) + selected_units = _bit_count_sum(bucket_masks) + if selected_units < total_units: + return IndexPlan( + True, + "light approximate-order index selected", + descriptor=_copy_descriptor(exact_plan.descriptor), + base=exact_plan.base, + field=exact_plan.field, + level=kind, + total_units=total_units, + selected_units=selected_units, + bucket_masks=bucket_masks, + bucket_len=int(light["bucket_len"]), + block_len=int(light["block_len"]), + lower=exact_plan.lower, + lower_inclusive=exact_plan.lower_inclusive, + upper=exact_plan.upper, + upper_inclusive=exact_plan.upper_inclusive, + ) segment_plan = _plan_segment_node(tree.body, operands) if segment_plan is None: @@ -1069,6 +1219,59 @@ def evaluate_segment_query( return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) +def evaluate_light_query( + expression: str, operands: dict, ne_args: dict, where: dict, plan: IndexPlan +) -> np.ndarray: + del expression, operands, ne_args + + if plan.base is None or plan.bucket_masks is None or plan.block_len is None or plan.bucket_len is None: + raise ValueError("light evaluation requires bucket masks and block geometry") + + parts = [] + total_len = int(plan.base.shape[0]) + chunk_len = int(plan.base.chunks[0]) + bucket_count = int(plan.descriptor["light"]["bucket_count"]) + where_x = where["_where_x"] + for block_id, bucket_mask in enumerate(plan.bucket_masks.tolist()): + mask = int(bucket_mask) + if mask == 0: + continue + block_start = block_id * plan.block_len + block_stop = min(block_start + plan.block_len, total_len) + bucket_id = 0 + while bucket_id < bucket_count: + if not ((mask >> bucket_id) & 1): + bucket_id += 1 + 
continue + run_start = bucket_id + bucket_id += 1 + while bucket_id < bucket_count and ((mask >> bucket_id) & 1): + bucket_id += 1 + start = block_start + run_start * plan.bucket_len + stop = min(block_start + bucket_id * plan.bucket_len, block_stop) + if start >= stop: + continue + if _supports_block_reads(where_x): + span = np.empty(stop - start, dtype=where_x.dtype) + chunk_id = start // chunk_len + local_start = start - chunk_id * chunk_len + where_x.get_1d_span_numpy(span, chunk_id, local_start, stop - start) + else: + span = where_x[start:stop] + field_values = span if plan.field is None else span[plan.field] + match = np.ones(len(field_values), dtype=bool) + if plan.lower is not None: + match &= field_values >= plan.lower if plan.lower_inclusive else field_values > plan.lower + if plan.upper is not None: + match &= field_values <= plan.upper if plan.upper_inclusive else field_values < plan.upper + if np.any(match): + parts.append(np.require(span[match], requirements="C")) + + if parts: + return np.concatenate(parts) + return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) + + def _gather_positions(where_x, positions: np.ndarray) -> np.ndarray: if len(positions) == 0: return np.empty(0, dtype=_where_output_dtype(where_x)) diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 17242895..3103ef6b 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1845,6 +1845,8 @@ def slices_eval( # noqa: C901 if index_plan.usable and not (_indices or _order): if index_plan.exact_positions is not None: return indexing.evaluate_full_query(where, index_plan) + if index_plan.bucket_masks is not None: + return indexing.evaluate_light_query(expression, operands, ne_args, where, index_plan) if index_plan.level not in (None, "chunk"): return indexing.evaluate_segment_query(expression, operands, ne_args, where, index_plan) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 14dfd335..d4beb141 100644 --- 
a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -51,7 +51,7 @@ def test_structured_field_index_matches_scan(kind): np.testing.assert_array_equal(indexed, data[(data["id"] >= 48_000) & (data["id"] < 51_000)]) -@pytest.mark.parametrize("kind", ["medium", "full"]) +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_random_field_index_matches_scan(kind): rng = np.random.default_rng(0) dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) @@ -71,7 +71,7 @@ def test_random_field_index_matches_scan(kind): np.testing.assert_array_equal(indexed, data[(data["id"] >= 70_000) & (data["id"] < 71_200)]) -@pytest.mark.parametrize("kind", ["medium", "full"]) +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_random_field_point_query_matches_scan(kind): rng = np.random.default_rng(1) dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) @@ -91,21 +91,25 @@ def test_random_field_point_query_matches_scan(kind): np.testing.assert_array_equal(indexed, data[(data["id"] >= 123_456) & (data["id"] < 123_457)]) -@pytest.mark.parametrize("kind", ["medium", "full"]) +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_persistent_index_survives_reopen(tmp_path, kind): path = tmp_path / "indexed_array.b2nd" data = np.arange(80_000, dtype=np.int64) arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(8_000,), blocks=(2_000,)) descriptor = arr.create_index(kind=kind) - if kind == "medium": + if kind == "light": + assert descriptor["light"]["values_path"] is not None + elif kind == "medium": assert descriptor["reduced"]["values_path"] is not None else: assert descriptor["full"]["values_path"] is not None reopened = blosc2.open(path, mode="a") assert len(reopened.indexes) == 1 - if kind == "medium": + if kind == "light": + assert reopened.indexes[0]["light"]["values_path"] == descriptor["light"]["values_path"] + elif kind == "medium": assert reopened.indexes[0]["reduced"]["values_path"] == 
descriptor["reduced"]["values_path"] else: assert reopened.indexes[0]["full"]["values_path"] == descriptor["full"]["values_path"] From c1e69abea5626898c41cf61323b28a1dd0beaf7e Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 08:09:07 +0200 Subject: [PATCH 08/68] Rework light indexes around coarse block-local order Replace the experimental metadata-only light path with a real block-local reduced/coarse index more in line with OPSI. The new light stores: - block-local sorted values - coarse physical bucket positions for those sorted values - block offsets into the flattened sidecars Query execution now: - prunes with chunk/block summaries - does exact searchsorted() inside each surviving block - builds a coarse bucket mask from matching sorted rows - rechecks only those physical buckets against base data Add an integer-only lossy compression knob for light.values: - light_value_lossy_bits = min(9 - optlevel, dtype.itemsize) - capped to one eighth of the integer width - default optlevel=5 - exact base-row recheck preserves correctness Extend the benchmark with --optlevel and make index reuse optlevel-aware. 
Update tests to cover: - persistent light indexes - lossy integer light correctness --- bench/ndarray/index_query_bench.py | 36 ++++++---- src/blosc2/indexing.py | 111 +++++++++++++++++++++-------- tests/ndarray/test_indexing.py | 21 ++++++ 3 files changed, 127 insertions(+), 41 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 3a532d19..308d3ea6 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -26,6 +26,7 @@ KINDS = ("ultralight", "light", "medium", "full") DISTS = ("sorted", "block-shuffled", "random") RNG_SEED = 0 +DEFAULT_OPLEVEL = 5 def fill_ids(ids: np.ndarray, dist: str, rng: np.random.Generator) -> None: @@ -73,8 +74,8 @@ def base_array_path(size_dir: Path, size: int, dist: str) -> Path: return size_dir / f"size_{size}_{dist}.b2nd" -def indexed_array_path(size_dir: Path, size: int, dist: str, kind: str) -> Path: - return size_dir / f"size_{size}_{dist}.{kind}.b2nd" +def indexed_array_path(size_dir: Path, size: int, dist: str, kind: str, optlevel: int) -> Path: + return size_dir / f"size_{size}_{dist}.{kind}.opt{optlevel}.b2nd" def benchmark_scan_once(expr) -> tuple[float, int]: @@ -142,13 +143,14 @@ def get_data() -> np.ndarray: return get_data -def _valid_index_descriptor(arr: blosc2.NDArray, kind: str) -> dict | None: +def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int) -> dict | None: for descriptor in arr.indexes: if descriptor.get("version") != blosc2_indexing.INDEX_FORMAT_VERSION: continue if ( descriptor.get("field") == "id" and descriptor.get("kind") == kind + and int(descriptor.get("optlevel", -1)) == int(optlevel) and not descriptor.get("stale", False) ): return descriptor @@ -162,10 +164,10 @@ def _open_or_build_persistent_array(path: Path, get_data) -> blosc2.NDArray: return build_persistent_array(get_data(), path) -def _open_or_build_indexed_array(path: Path, get_data, kind: str) -> tuple[blosc2.NDArray, float]: +def 
_open_or_build_indexed_array(path: Path, get_data, kind: str, optlevel: int) -> tuple[blosc2.NDArray, float]: if path.exists(): arr = blosc2.open(path, mode="a") - if _valid_index_descriptor(arr, kind) is not None: + if _valid_index_descriptor(arr, kind, optlevel) is not None: return arr, 0.0 if arr.indexes: arr.drop_index(field="id") @@ -173,11 +175,11 @@ def _open_or_build_indexed_array(path: Path, get_data, kind: str) -> tuple[blosc arr = build_persistent_array(get_data(), path) build_start = time.perf_counter() - arr.create_index(field="id", kind=kind) + arr.create_index(field="id", kind=kind, optlevel=optlevel) return arr, time.perf_counter() - build_start -def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int) -> list[dict]: +def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int, optlevel: int) -> list[dict]: get_data = _source_data_factory(size, dist) arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist), get_data) lo = size // 2 @@ -191,7 +193,9 @@ def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int) -> li rows = [] for kind in KINDS: - idx_arr, build_time = _open_or_build_indexed_array(indexed_array_path(size_dir, size, dist, kind), get_data, kind) + idx_arr, build_time = _open_or_build_indexed_array( + indexed_array_path(size_dir, size, dist, kind, optlevel), get_data, kind, optlevel + ) idx_cond = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", idx_arr.fields) idx_expr = idx_cond.where(idx_arr) explanation = idx_expr.explain() @@ -203,6 +207,7 @@ def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int) -> li "size": size, "dist": dist, "kind": kind, + "optlevel": optlevel, "query_rows": index_len, "build_s": build_time, "create_idx_ms": build_time * 1_000, @@ -287,6 +292,12 @@ def parse_args() -> argparse.Namespace: type=Path, help="Directory where benchmark arrays and index sidecars should be written and kept.", ) + parser.add_argument( + "--optlevel", + 
type=int, + default=DEFAULT_OPLEVEL, + help="Index optlevel to use when creating indexes. Default: 5.", + ) parser.add_argument( "--dist", choices=(*DISTS, "all"), @@ -305,10 +316,10 @@ def main() -> None: if args.outdir is None: with tempfile.TemporaryDirectory() as tmpdir: - run_benchmarks(sizes, dists, Path(tmpdir), args.dist, args.query_width, args.repeats) + run_benchmarks(sizes, dists, Path(tmpdir), args.dist, args.query_width, args.repeats, args.optlevel) else: args.outdir.mkdir(parents=True, exist_ok=True) - run_benchmarks(sizes, dists, args.outdir, args.dist, args.query_width, args.repeats) + run_benchmarks(sizes, dists, args.outdir, args.dist, args.query_width, args.repeats, args.optlevel) def run_benchmarks( @@ -318,16 +329,17 @@ def run_benchmarks( dist_label: str, query_width: int, repeats: int, + optlevel: int, ) -> None: all_results = [] print("Structured range-query benchmark across index kinds") print( f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, dist={dist_label}, " - f"query_width={query_width:,}" + f"query_width={query_width:,}, optlevel={optlevel}" ) for dist in dists: for size in sizes: - size_results = benchmark_size(size, size_dir, dist, query_width) + size_results = benchmark_size(size, size_dir, dist, query_width, optlevel) all_results.extend(size_results) print() diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 4745b0d0..2729ec09 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -319,14 +319,9 @@ def _position_dtype(max_value: int) -> np.dtype: return np.dtype(np.uint64) -def _build_reduced_descriptor( - array: blosc2.NDArray, - field: str | None, - kind: str, - values: np.ndarray, - persistent: bool, -) -> dict: - block_len = int(array.blocks[0]) +def _build_block_sorted_payload( + values: np.ndarray, block_len: int +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.dtype]: nblocks = math.ceil(values.shape[0] / block_len) position_dtype = _position_dtype(block_len - 1) offsets = 
np.empty(nblocks + 1, dtype=np.int64) @@ -347,6 +342,19 @@ def _build_reduced_descriptor( cursor = next_cursor offsets[block_id + 1] = cursor + return sorted_values, positions, offsets, position_dtype + + +def _build_reduced_descriptor( + array: blosc2.NDArray, + field: str | None, + kind: str, + values: np.ndarray, + persistent: bool, +) -> dict: + block_len = int(array.blocks[0]) + sorted_values, positions, offsets, _ = _build_block_sorted_payload(values, block_len) + values_sidecar = _store_array_sidecar(array, field, kind, "reduced", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar( array, field, kind, "reduced", "positions", positions, persistent @@ -371,34 +379,48 @@ def _pack_bucket_mask(bucket_ids: np.ndarray) -> np.uint64: return mask +def _light_value_lossy_bits(dtype: np.dtype, optlevel: int) -> int: + dtype = np.dtype(dtype) + if dtype.kind not in {"i", "u"}: + return 0 + max_bits = dtype.itemsize + return min(max(0, 9 - int(optlevel)), max_bits) + + +def _quantize_integer_array(values: np.ndarray, bits: int) -> np.ndarray: + if bits <= 0: + return values + dtype = np.dtype(values.dtype) + mask = np.asarray(~((1 << bits) - 1), dtype=dtype)[()] + quantized = values.copy() + np.bitwise_and(quantized, mask, out=quantized) + return quantized + + +def _quantize_integer_scalar(value, dtype: np.dtype, bits: int): + scalar = np.asarray(value, dtype=dtype)[()] + if bits <= 0: + return scalar + mask = np.asarray(~((1 << bits) - 1), dtype=dtype)[()] + return np.bitwise_and(scalar, mask, dtype=dtype) + + def _build_light_descriptor( array: blosc2.NDArray, field: str | None, kind: str, values: np.ndarray, + optlevel: int, persistent: bool, ) -> dict: block_len = int(array.blocks[0]) bucket_count = _light_bucket_count(block_len) bucket_len = math.ceil(block_len / bucket_count) - nblocks = math.ceil(values.shape[0] / block_len) - offsets = np.empty(nblocks + 1, dtype=np.int64) - offsets[0] = 0 - sorted_values = np.empty_like(values) - 
bucket_positions = np.empty(values.shape[0], dtype=np.uint8) - cursor = 0 - - for block_id in range(nblocks): - start = block_id * block_len - stop = min(start + block_len, values.shape[0]) - block = values[start:stop] - order = np.argsort(block, kind="stable") - block_size = stop - start - next_cursor = cursor + block_size - sorted_values[cursor:next_cursor] = block[order] - bucket_positions[cursor:next_cursor] = (order // bucket_len).astype(np.uint8, copy=False) - cursor = next_cursor - offsets[block_id + 1] = cursor + value_lossy_bits = _light_value_lossy_bits(values.dtype, optlevel) + sorted_values, positions, offsets, _ = _build_block_sorted_payload(values, block_len) + if value_lossy_bits > 0: + sorted_values = _quantize_integer_array(sorted_values, value_lossy_bits) + bucket_positions = (positions // bucket_len).astype(np.uint8, copy=False) values_sidecar = _store_array_sidecar(array, field, kind, "light", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar( @@ -409,6 +431,7 @@ def _build_light_descriptor( "block_len": block_len, "bucket_count": bucket_count, "bucket_len": bucket_len, + "value_lossy_bits": value_lossy_bits, "values_path": values_sidecar["path"], "bucket_positions_path": positions_sidecar["path"], "offsets_path": offsets_sidecar["path"], @@ -453,7 +476,7 @@ def create_index( array: blosc2.NDArray, field: str | None = None, kind: str = "light", - optlevel: int = 3, + optlevel: int = 5, granularity: str = "chunk", persistent: bool | None = None, name: str | None = None, @@ -470,7 +493,11 @@ def create_index( values = _values_for_index(array, field) levels = _build_levels_descriptor(array, field, kind, dtype, values, persistent) - light = _build_light_descriptor(array, field, kind, values, persistent) if kind == "light" else None + light = ( + _build_light_descriptor(array, field, kind, values, optlevel, persistent) + if kind == "light" + else None + ) reduced = _build_reduced_descriptor(array, field, kind, values, 
persistent) if kind == "medium" else None full = _build_full_descriptor(array, field, kind, values, persistent) if kind == "full" else None descriptor = _build_descriptor( @@ -1046,12 +1073,38 @@ def _bucket_masks_from_light( return np.zeros(len(summaries), dtype=np.uint64) sorted_values, bucket_positions, offsets = _load_light_arrays(array, descriptor) + light = descriptor["light"] + value_lossy_bits = int(light.get("value_lossy_bits", 0)) + dtype = np.dtype(descriptor["dtype"]) masks = np.zeros(len(summaries), dtype=np.uint64) for block_id in np.flatnonzero(candidate_blocks): start = int(offsets[block_id]) stop = int(offsets[block_id + 1]) block_values = sorted_values[start:stop] - lo, hi = _search_bounds(block_values, plan) + if value_lossy_bits > 0 and dtype.kind in {"i", "u"}: + if plan.lower is not None: + if plan.lower_inclusive: + next_lower = plan.lower + else: + max_value = np.iinfo(dtype).max + next_lower = min(int(plan.lower) + 1, max_value) + lower = _quantize_integer_scalar(next_lower, dtype, value_lossy_bits) + lower_inclusive = True + else: + lower = None + lower_inclusive = True + search_plan = ExactPredicatePlan( + base=plan.base, + descriptor=plan.descriptor, + field=plan.field, + lower=lower, + lower_inclusive=lower_inclusive, + upper=plan.upper, + upper_inclusive=plan.upper_inclusive, + ) + lo, hi = _search_bounds(block_values, search_plan) + else: + lo, hi = _search_bounds(block_values, plan) if lo >= hi: continue masks[block_id] = _pack_bucket_mask(bucket_positions[start + lo : start + hi]) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index d4beb141..37212979 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -91,6 +91,27 @@ def test_random_field_point_query_matches_scan(kind): np.testing.assert_array_equal(indexed, data[(data["id"] >= 123_456) & (data["id"] < 123_457)]) +def test_light_lossy_integer_values_match_scan(): + rng = np.random.default_rng(2) + dtype = 
np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.zeros(180_000, dtype=dtype) + data["id"] = np.arange(-90_000, 90_000, dtype=np.int64) + rng.shuffle(data["id"]) + + arr = blosc2.asarray(data, chunks=(18_000,), blocks=(3_000,)) + descriptor = arr.create_index(field="id", kind="light", optlevel=0) + + assert descriptor["light"]["value_lossy_bits"] == 8 + + expr = blosc2.lazyexpr("(id >= -123) & (id < 456)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data["id"] >= -123) & (data["id"] < 456)]) + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_persistent_index_survives_reopen(tmp_path, kind): path = tmp_path / "indexed_array.b2nd" From 61b23387b61fd8d76c4f57ddaedddea4869886a2 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 08:29:39 +0200 Subject: [PATCH 09/68] Add dtype-aware indexing benchmarks and lossy float light indexes Extend the light lossy-value experiment from integers to float32 and float64, while keeping all other non-integer dtypes exact. Use monotonic downward quantization for finite float values so light can still widen bounds safely and preserve correctness via exact base-row rechecks. Update benchmark coverage in both: - bench/ndarray/index_query_bench.py - bench/ndarray/index_query_bench_tables.py Add --dtype to both scripts, defaulting to float64, and make data generation, query construction, and persisted output reuse dtype-aware. This lets us benchmark indexing behavior consistently across boolean, integer, and floating-point columns in both python-blosc2 and PyTables. 
--- bench/ndarray/index_query_bench.py | 134 +++++++++++++++++++++++------ src/blosc2/indexing.py | 102 ++++++++++++++++++++-- tests/ndarray/test_indexing.py | 21 +++++ 3 files changed, 223 insertions(+), 34 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 308d3ea6..ccfdd779 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -9,6 +9,7 @@ import argparse import os +import re import statistics import tempfile import time @@ -29,10 +30,38 @@ DEFAULT_OPLEVEL = 5 -def fill_ids(ids: np.ndarray, dist: str, rng: np.random.Generator) -> None: +def dtype_token(dtype: np.dtype) -> str: + return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") + + +def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + values = np.zeros(size, dtype=dtype) + values[size // 2 :] = True + return values + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + start = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + start = max(int(info.min), -(unique_count // 2)) + positions = np.arange(size, dtype=np.int64) + values = start + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + return np.linspace(-span / 2, span / 2, num=size, endpoint=False, dtype=dtype) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def fill_ids(ids: np.ndarray, ordered_ids: np.ndarray, dist: str, rng: np.random.Generator) -> None: size = ids.shape[0] if dist == "sorted": - ids[:] = np.arange(size, dtype=np.int64) + ids[:] = ordered_ids return if dist == "block-shuffled": @@ -43,22 +72,22 @@ def fill_ids(ids: np.ndarray, dist: str, rng: np.random.Generator) -> None: src_start = int(src_block) * BLOCK_LEN src_stop = 
min(src_start + BLOCK_LEN, size) block_size = src_stop - src_start - ids[dest : dest + block_size] = np.arange(src_start, src_stop, dtype=np.int64) + ids[dest : dest + block_size] = ordered_ids[src_start:src_stop] dest += block_size return if dist == "random": - ids[:] = np.arange(size, dtype=np.int64) + ids[:] = ordered_ids rng.shuffle(ids) return raise ValueError(f"unsupported distribution {dist!r}") -def make_source_data(size: int, dist: str) -> np.ndarray: - dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) +def make_source_data(size: int, dist: str, id_dtype: np.dtype) -> np.ndarray: + dtype = np.dtype([("id", id_dtype), ("payload", np.float32)]) data = np.zeros(size, dtype=dtype) - fill_ids(data["id"], dist, np.random.default_rng(RNG_SEED)) + fill_ids(data["id"], make_ordered_ids(size, id_dtype), dist, np.random.default_rng(RNG_SEED)) return data @@ -70,12 +99,12 @@ def build_persistent_array(data: np.ndarray, path: Path) -> blosc2.NDArray: return blosc2.asarray(data, urlpath=path, mode="w", chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) -def base_array_path(size_dir: Path, size: int, dist: str) -> Path: - return size_dir / f"size_{size}_{dist}.b2nd" +def base_array_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype) -> Path: + return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.b2nd" -def indexed_array_path(size_dir: Path, size: int, dist: str, kind: str, optlevel: int) -> Path: - return size_dir / f"size_{size}_{dist}.{kind}.opt{optlevel}.b2nd" +def indexed_array_path(size_dir: Path, size: int, dist: str, kind: str, optlevel: int, id_dtype: np.dtype) -> Path: + return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{kind}.opt{optlevel}.b2nd" def benchmark_scan_once(expr) -> tuple[float, int]: @@ -131,18 +160,56 @@ def index_sizes(descriptor: dict) -> tuple[int, int]: return logical, disk -def _source_data_factory(size: int, dist: str): +def _source_data_factory(size: int, dist: str, id_dtype: np.dtype): data = None def 
get_data() -> np.ndarray: nonlocal data if data is None: - data = make_source_data(size, dist) + data = make_source_data(size, dist, id_dtype) return data return get_data +def _ordered_ids_factory(size: int, id_dtype: np.dtype): + ordered_ids = None + + def get_ordered_ids() -> np.ndarray: + nonlocal ordered_ids + if ordered_ids is None: + ordered_ids = make_ordered_ids(size, id_dtype) + return ordered_ids + + return get_ordered_ids + + +def _query_bounds(ordered_ids: np.ndarray, query_width: int) -> tuple[object, object]: + if ordered_ids.size == 0: + raise ValueError("benchmark arrays must not be empty") + + lo_idx = ordered_ids.size // 2 + hi_idx = min(ordered_ids.size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_ids[lo_idx].item(), ordered_ids[hi_idx].item() + + +def _literal(value: object, dtype: np.dtype) -> str: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + return "True" if bool(value) else "False" + if dtype.kind == "f": + return repr(float(value)) + if dtype.kind in {"i", "u"}: + return str(int(value)) + raise ValueError(f"unsupported dtype for literal formatting: {dtype}") + + +def _condition_expr(lo: object, hi: object, dtype: np.dtype) -> str: + lo_literal = _literal(lo, dtype) + hi_literal = _literal(hi, dtype) + return f"(id >= {lo_literal}) & (id <= {hi_literal})" + + def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int) -> dict | None: for descriptor in arr.indexes: if descriptor.get("version") != blosc2_indexing.INDEX_FORMAT_VERSION: @@ -179,12 +246,15 @@ def _open_or_build_indexed_array(path: Path, get_data, kind: str, optlevel: int) return arr, time.perf_counter() - build_start -def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int, optlevel: int) -> list[dict]: - get_data = _source_data_factory(size, dist) - arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist), get_data) - lo = size // 2 - hi = min(size, lo + query_width) - condition = 
blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields) +def benchmark_size( + size: int, size_dir: Path, dist: str, query_width: int, optlevel: int, id_dtype: np.dtype +) -> list[dict]: + get_data = _source_data_factory(size, dist, id_dtype) + get_ordered_ids = _ordered_ids_factory(size, id_dtype) + arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist, id_dtype), get_data) + lo, hi = _query_bounds(get_ordered_ids(), query_width) + condition_str = _condition_expr(lo, hi, id_dtype) + condition = blosc2.lazyexpr(condition_str, arr.fields) expr = condition.where(arr) base_bytes = size * arr.dtype.itemsize compressed_base_bytes = os.path.getsize(arr.urlpath) @@ -194,9 +264,9 @@ def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int, optle rows = [] for kind in KINDS: idx_arr, build_time = _open_or_build_indexed_array( - indexed_array_path(size_dir, size, dist, kind, optlevel), get_data, kind, optlevel + indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype), get_data, kind, optlevel ) - idx_cond = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", idx_arr.fields) + idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) idx_expr = idx_cond.where(idx_arr) explanation = idx_expr.explain() logical_index_bytes, disk_index_bytes = index_sizes(idx_arr.indexes[0]) @@ -298,6 +368,11 @@ def parse_args() -> argparse.Namespace: default=DEFAULT_OPLEVEL, help="Index optlevel to use when creating indexes. Default: 5.", ) + parser.add_argument( + "--dtype", + default="float64", + help="NumPy dtype for the indexed field. Examples: float64, float32, int16, bool. 
Default: float64.", + ) parser.add_argument( "--dist", choices=(*DISTS, "all"), @@ -311,15 +386,23 @@ def main() -> None: args = parse_args() if args.repeats < 0: raise SystemExit("--repeats must be >= 0") + try: + id_dtype = np.dtype(args.dtype) + except TypeError as exc: + raise SystemExit(f"unsupported dtype {args.dtype!r}") from exc + if id_dtype.kind not in {"b", "i", "u", "f"}: + raise SystemExit(f"--dtype only supports bool, integer, and floating-point dtypes; got {id_dtype}") sizes = (args.size,) if args.size is not None else SIZES dists = DISTS if args.dist == "all" else (args.dist,) if args.outdir is None: with tempfile.TemporaryDirectory() as tmpdir: - run_benchmarks(sizes, dists, Path(tmpdir), args.dist, args.query_width, args.repeats, args.optlevel) + run_benchmarks( + sizes, dists, Path(tmpdir), args.dist, args.query_width, args.repeats, args.optlevel, id_dtype + ) else: args.outdir.mkdir(parents=True, exist_ok=True) - run_benchmarks(sizes, dists, args.outdir, args.dist, args.query_width, args.repeats, args.optlevel) + run_benchmarks(sizes, dists, args.outdir, args.dist, args.query_width, args.repeats, args.optlevel, id_dtype) def run_benchmarks( @@ -330,16 +413,17 @@ def run_benchmarks( query_width: int, repeats: int, optlevel: int, + id_dtype: np.dtype, ) -> None: all_results = [] print("Structured range-query benchmark across index kinds") print( f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, dist={dist_label}, " - f"query_width={query_width:,}, optlevel={optlevel}" + f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}" ) for dist in dists: for size in sizes: - size_results = benchmark_size(size, size_dir, dist, query_width, optlevel) + size_results = benchmark_size(size, size_dir, dist, query_width, optlevel, id_dtype) all_results.extend(size_results) print() diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 2729ec09..1f7614f1 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py 
@@ -381,9 +381,10 @@ def _pack_bucket_mask(bucket_ids: np.ndarray) -> np.uint64: def _light_value_lossy_bits(dtype: np.dtype, optlevel: int) -> int: dtype = np.dtype(dtype) - if dtype.kind not in {"i", "u"}: + if dtype.kind in {"i", "u"} or dtype == np.dtype(np.float32) or dtype == np.dtype(np.float64): + max_bits = dtype.itemsize + else: return 0 - max_bits = dtype.itemsize return min(max(0, 9 - int(optlevel)), max_bits) @@ -405,6 +406,83 @@ def _quantize_integer_scalar(value, dtype: np.dtype, bits: int): return np.bitwise_and(scalar, mask, dtype=dtype) +def _float_order_uint_dtype(dtype: np.dtype) -> np.dtype: + if dtype == np.dtype(np.float32): + return np.dtype(np.uint32) + if dtype == np.dtype(np.float64): + return np.dtype(np.uint64) + raise TypeError(f"unsupported float dtype {dtype}") + + +def _ordered_uint_from_float(values: np.ndarray) -> np.ndarray: + dtype = np.dtype(values.dtype) + uint_dtype = _float_order_uint_dtype(dtype) + bits = values.view(uint_dtype).copy() + sign_mask = np.asarray(1 << (dtype.itemsize * 8 - 1), dtype=uint_dtype)[()] + negative = (bits & sign_mask) != 0 + bits[negative] = ~bits[negative] + bits[~negative] ^= sign_mask + return bits + + +def _float_from_ordered_uint(ordered: np.ndarray, dtype: np.dtype) -> np.ndarray: + uint_dtype = _float_order_uint_dtype(dtype) + bits = ordered.astype(uint_dtype, copy=True) + sign_mask = np.asarray(1 << (dtype.itemsize * 8 - 1), dtype=uint_dtype)[()] + positive = (bits & sign_mask) != 0 + bits[positive] ^= sign_mask + bits[~positive] = ~bits[~positive] + return bits.view(dtype) + + +def _quantize_float_array(values: np.ndarray, bits: int) -> np.ndarray: + if bits <= 0: + return values + quantized = values.copy() + finite = np.isfinite(quantized) + if not np.any(finite): + return quantized + ordered = _ordered_uint_from_float(quantized[finite]) + uint_dtype = ordered.dtype + mask = np.asarray(np.iinfo(uint_dtype).max ^ ((1 << bits) - 1), dtype=uint_dtype)[()] + np.bitwise_and(ordered, mask, 
out=ordered) + quantized[finite] = _float_from_ordered_uint(ordered, quantized.dtype) + return quantized + + +def _quantize_float_scalar(value, dtype: np.dtype, bits: int): + scalar = np.asarray(value, dtype=dtype)[()] + if bits <= 0 or not np.isfinite(scalar): + return scalar + ordered = _ordered_uint_from_float(np.asarray([scalar], dtype=dtype)) + uint_dtype = ordered.dtype + mask = np.asarray(np.iinfo(uint_dtype).max ^ ((1 << bits) - 1), dtype=uint_dtype)[()] + np.bitwise_and(ordered, mask, out=ordered) + return _float_from_ordered_uint(ordered, dtype)[0] + + +def _quantize_light_values_array(values: np.ndarray, bits: int) -> np.ndarray: + dtype = np.dtype(values.dtype) + if bits <= 0: + return values + if dtype.kind in {"i", "u"}: + return _quantize_integer_array(values, bits) + if dtype == np.dtype(np.float32) or dtype == np.dtype(np.float64): + return _quantize_float_array(values, bits) + return values + + +def _quantize_light_value_scalar(value, dtype: np.dtype, bits: int): + dtype = np.dtype(dtype) + if bits <= 0: + return np.asarray(value, dtype=dtype)[()] + if dtype.kind in {"i", "u"}: + return _quantize_integer_scalar(value, dtype, bits) + if dtype == np.dtype(np.float32) or dtype == np.dtype(np.float64): + return _quantize_float_scalar(value, dtype, bits) + return np.asarray(value, dtype=dtype)[()] + + def _build_light_descriptor( array: blosc2.NDArray, field: str | None, @@ -419,7 +497,7 @@ def _build_light_descriptor( value_lossy_bits = _light_value_lossy_bits(values.dtype, optlevel) sorted_values, positions, offsets, _ = _build_block_sorted_payload(values, block_len) if value_lossy_bits > 0: - sorted_values = _quantize_integer_array(sorted_values, value_lossy_bits) + sorted_values = _quantize_light_values_array(sorted_values, value_lossy_bits) bucket_positions = (positions // bucket_len).astype(np.uint8, copy=False) values_sidecar = _store_array_sidecar(array, field, kind, "light", "values", sorted_values, persistent) @@ -1081,14 +1159,20 @@ def 
_bucket_masks_from_light( start = int(offsets[block_id]) stop = int(offsets[block_id + 1]) block_values = sorted_values[start:stop] - if value_lossy_bits > 0 and dtype.kind in {"i", "u"}: + if value_lossy_bits > 0: if plan.lower is not None: - if plan.lower_inclusive: - next_lower = plan.lower + if dtype.kind in {"i", "u"}: + if plan.lower_inclusive: + next_lower = plan.lower + else: + max_value = np.iinfo(dtype).max + next_lower = min(int(plan.lower) + 1, max_value) else: - max_value = np.iinfo(dtype).max - next_lower = min(int(plan.lower) + 1, max_value) - lower = _quantize_integer_scalar(next_lower, dtype, value_lossy_bits) + if plan.lower_inclusive: + next_lower = plan.lower + else: + next_lower = np.nextafter(np.asarray(plan.lower, dtype=dtype)[()], np.inf) + lower = _quantize_light_value_scalar(next_lower, dtype, value_lossy_bits) lower_inclusive = True else: lower = None diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 37212979..10170825 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -112,6 +112,27 @@ def test_light_lossy_integer_values_match_scan(): np.testing.assert_array_equal(indexed, data[(data["id"] >= -123) & (data["id"] < 456)]) +def test_light_lossy_float_values_match_scan(): + rng = np.random.default_rng(3) + dtype = np.dtype([("x", np.float64), ("payload", np.float32)]) + data = np.zeros(160_000, dtype=dtype) + data["x"] = np.linspace(-5000.0, 5000.0, data.shape[0], dtype=np.float64) + rng.shuffle(data["x"]) + + arr = blosc2.asarray(data, chunks=(16_000,), blocks=(4_000,)) + descriptor = arr.create_index(field="x", kind="light", optlevel=0) + + assert descriptor["light"]["value_lossy_bits"] == 8 + + expr = blosc2.lazyexpr("(x >= -12.5) & (x < 17.25)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, 
def dtype_token(dtype: np.dtype) -> str:
    """Return a filename-safe token for *dtype* (non-alphanumeric runs become ``_``)."""
    return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_")


def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray:
    """Build *size* non-decreasing ids of the requested *dtype*.

    * bool: the first half is ``False``, the second half ``True``.
    * integers: ``size`` evenly spread values.  When the dtype has fewer
      distinct values than ``size``, values repeat and cover the whole range.
    * floats: ``size`` evenly spaced values in ``[-size/2, size/2)``.

    Raises:
        ValueError: for any other dtype kind.
    """
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.bool_):
        values = np.zeros(size, dtype=dtype)
        values[size // 2 :] = True
        return values

    if dtype.kind in {"i", "u"}:
        info = np.iinfo(dtype)
        unique_count = min(size, int(info.max) - int(info.min) + 1)
        # Largest generated value is at most start + unique_count - 1.  Starting at 0
        # would push it past info.max for signed dtypes once size > info.max + 1, and
        # astype() would silently wrap those values around.  Center the range on zero
        # in that case; this can never trigger for unsigned dtypes.
        if unique_count - 1 > int(info.max):
            start = max(int(info.min), -(unique_count // 2))
        else:
            start = 0
        positions = np.arange(size, dtype=np.int64)
        values = start + (positions * unique_count) // size
        return values.astype(dtype, copy=False)

    if dtype.kind == "f":
        span = max(1, size)
        return np.linspace(-span / 2, span / 2, num=size, endpoint=False, dtype=dtype)

    raise ValueError(f"unsupported dtype for benchmark: {dtype}")


def fill_ids(ids: np.ndarray, ordered_ids: np.ndarray, dist: str, rng: np.random.Generator) -> None:
    """Fill *ids* in place with *ordered_ids* arranged according to *dist*.

    ``"sorted"`` copies the values as-is, ``"block-shuffled"`` permutes whole
    ``CHUNK_LEN``-sized blocks while keeping each block's internal order, and
    ``"random"`` shuffles every element.

    Raises:
        ValueError: if *dist* is not one of the three supported names.
    """
    size = ids.shape[0]
    if dist == "sorted":
        ids[:] = ordered_ids
        return

    if dist == "block-shuffled":
        nblocks = (size + CHUNK_LEN - 1) // CHUNK_LEN
        order = rng.permutation(nblocks)
        dest = 0
        for src_block in order:
            src_start = int(src_block) * CHUNK_LEN
            src_stop = min(src_start + CHUNK_LEN, size)
            block_size = src_stop - src_start
            ids[dest : dest + block_size] = ordered_ids[src_start:src_stop]
            dest += block_size
        return

    if dist == "random":
        ids[:] = ordered_ids
        rng.shuffle(ids)
        return

    raise ValueError(f"unsupported distribution {dist!r}")


def make_source_data(size: int, dist: str, id_dtype: np.dtype) -> np.ndarray:
    """Create the structured ``(id, payload)`` array used as benchmark input.

    ``payload`` is left zero-filled; ``id`` is filled from a generator seeded
    with ``RNG_SEED``.
    """
    dtype = np.dtype([("id", id_dtype), ("payload", np.float32)])
    data = np.zeros(size, dtype=dtype)
    fill_ids(data["id"], make_ordered_ids(size, id_dtype), dist, np.random.default_rng(RNG_SEED))
    return data


def _source_data_factory(size: int, dist: str, id_dtype: np.dtype):
    """Return a zero-argument callable that builds the source data once and caches it."""
    data = None

    def get_data() -> np.ndarray:
        nonlocal data
        if data is None:
            data = make_source_data(size, dist, id_dtype)
        return data

    return get_data
def base_table_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype) -> Path:
    """Return the path of the un-indexed HDF5 file for this size/dist/dtype combination."""
    return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.h5"


def indexed_table_path(size_dir: Path, size: int, dist: str, kind: str, id_dtype: np.dtype) -> Path:
    """Return the path of the HDF5 file that carries a *kind* index."""
    return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{kind}.h5"


def build_persistent_table(data: np.ndarray, path: Path) -> tuple[tables.File, tables.Table]:
    """Write *data* to a new HDF5 file at *path* and return the open file and table.

    The caller owns the returned file handle and must close it.  If creating
    or flushing the table fails, the handle is closed here before re-raising
    so it does not leak.
    """
    h5 = tables.open_file(path, mode="w")
    try:
        table = h5.create_table(
            "/",
            TABLE_NAME,
            obj=data,
            filters=DATA_FILTERS,
            expectedrows=len(data),
            chunkshape=CHUNK_LEN,
        )
        h5.flush()
    except BaseException:
        h5.close()
        raise
    return h5, table


def benchmark_once(table: tables.Table, condition: str) -> tuple[float, int]:
    """Time a single ``read_where`` query; return ``(elapsed_seconds, matching_rows)``."""
    start = time.perf_counter()
    result = table.read_where(condition)
    elapsed = time.perf_counter() - start
    return elapsed, len(result)


def pytables_index_sizes(h5: tables.File) -> int:
    """Return the uncompressed byte size of every array under the table's index group.

    Returns 0 when the file has no index group.  Nodes without both ``dtype``
    and ``shape`` attributes (for example groups) are skipped.
    """
    # PyTables keeps a table's indexes in a hidden sibling group named "_i_<table>".
    index_group = f"/_i_{TABLE_NAME}"
    total = 0
    if index_group not in h5:
        return total
    for node in h5.walk_nodes(index_group):
        dtype = getattr(node, "dtype", None)
        shape = getattr(node, "shape", None)
        if dtype is None or shape is None:
            continue
        nitems = 1
        for dim in shape:
            nitems *= int(dim)
        total += nitems * dtype.itemsize
    return total


def _valid_index(table: tables.Table, kind: str) -> bool:
    """Return True when the ``id`` column is indexed with exactly the requested *kind*."""
    if not table.cols.id.is_indexed:
        return False
    return table.colindexes["id"].kind == kind


def _open_or_build_base_table(path: Path, get_data) -> tuple[tables.File, tables.Table]:
    """Reuse the table stored at *path* if it exists, otherwise build it from ``get_data()``."""
    if path.exists():
        h5 = tables.open_file(path, mode="a")
        return h5, getattr(h5.root, TABLE_NAME)
    # exists() is False for a dangling symlink; remove any such leftover before writing.
    path.unlink(missing_ok=True)
    return build_persistent_table(get_data(), path)
h5.close() + path.unlink() + + h5, table = build_persistent_table(get_data(), path) + build_start = time.perf_counter() + table.cols.id.create_index(kind=kind) + h5.flush() + return h5, table, time.perf_counter() - build_start + + +def _query_bounds(ordered_ids: np.ndarray, query_width: int) -> tuple[object, object]: + if ordered_ids.size == 0: + raise ValueError("benchmark arrays must not be empty") + + lo_idx = ordered_ids.size // 2 + hi_idx = min(ordered_ids.size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_ids[lo_idx].item(), ordered_ids[hi_idx].item() + + +def _literal(value: object, dtype: np.dtype) -> str: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + return "True" if bool(value) else "False" + if dtype.kind == "f": + return repr(float(value)) + if dtype.kind in {"i", "u"}: + return str(int(value)) + raise ValueError(f"unsupported dtype for literal formatting: {dtype}") + + +def _condition_expr(lo: object, hi: object, dtype: np.dtype) -> str: + return f"(id >= {_literal(lo, dtype)}) & (id <= {_literal(hi, dtype)})" + + +def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int, id_dtype: np.dtype) -> list[dict]: + get_data = _source_data_factory(size, dist, id_dtype) + get_ordered_ids = _ordered_ids_factory(size, id_dtype) + base_h5, base_table = _open_or_build_base_table(base_table_path(size_dir, size, dist, id_dtype), get_data) + lo, hi = _query_bounds(get_ordered_ids(), query_width) + condition = _condition_expr(lo, hi, id_dtype) + base_bytes = size * np.dtype([("id", id_dtype), ("payload", np.float32)]).itemsize + compressed_base_bytes = os.path.getsize(base_h5.filename) + + scan_ms = benchmark_once(base_table, condition)[0] * 1_000 + + rows = [] + for kind in KINDS: + idx_h5, idx_table, build_time = _open_or_build_indexed_table( + indexed_table_path(size_dir, size, dist, kind, id_dtype), get_data, kind + ) + cold_time, index_len = benchmark_once(idx_table, condition) + indexed_file_bytes = 
os.path.getsize(idx_h5.filename) + disk_index_bytes = max(0, indexed_file_bytes - compressed_base_bytes) + logical_index_bytes = pytables_index_sizes(idx_h5) + + rows.append( + { + "size": size, + "dist": dist, + "kind": kind, + "query_rows": index_len, + "create_idx_ms": build_time * 1_000, + "scan_ms": scan_ms, + "cold_ms": cold_time * 1_000, + "cold_speedup": scan_ms / (cold_time * 1_000), + "warm_ms": None, + "warm_speedup": None, + "logical_index_bytes": logical_index_bytes, + "disk_index_bytes": disk_index_bytes, + "index_pct": logical_index_bytes / base_bytes * 100, + "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, + "_h5": idx_h5, + "_table": idx_table, + "_condition": condition, + } + ) + + base_h5.close() + return rows + + +def measure_warm_queries(rows: list[dict], repeats: int) -> None: + if repeats <= 0: + return + for result in rows: + table = result["_table"] + condition = result["_condition"] + index_runs = [benchmark_once(table, condition)[0] for _ in range(repeats)] + warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None + result["warm_ms"] = warm_ms + result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms + + +def close_rows(rows: list[dict]) -> None: + for result in rows: + h5 = result.pop("_h5", None) + result.pop("_table", None) + result.pop("_condition", None) + if h5 is not None and h5.isopen: + h5.close() + + +def parse_human_size(value: str) -> int: + value = value.strip() + if not value: + raise argparse.ArgumentTypeError("size must not be empty") + + suffixes = {"k": 1_000, "m": 1_000_000, "g": 1_000_000_000} + suffix = value[-1].lower() + if suffix in suffixes: + number = value[:-1] + if not number: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") + try: + parsed = int(number) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + size = parsed * suffixes[suffix] + else: + try: + size = int(value) + except ValueError 
as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + + if size <= 0: + raise argparse.ArgumentTypeError("size must be a positive integer") + return size + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Benchmark PyTables OPSI index kinds.") + parser.add_argument( + "--size", + type=parse_human_size, + help="Benchmark a single array size. Supports suffixes like 1k, 1K, 1M, 1G.", + ) + parser.add_argument( + "--query-width", + type=parse_human_size, + default=1_000, + help="Width of the range predicate. Supports suffixes like 1k, 1K, 1M, 1G. Default: 1000.", + ) + parser.add_argument( + "--repeats", + type=int, + default=DEFAULT_REPEATS, + help="Number of repeated warm-query measurements after the first cold query. Default: 3.", + ) + parser.add_argument( + "--dtype", + default="float64", + help="NumPy dtype for the indexed field. Examples: float64, float32, int16, bool. Default: float64.", + ) + parser.add_argument( + "--dist", + choices=(*DISTS, "all"), + default="all", + help="Data distribution to benchmark. 
Default: all.", + ) + parser.add_argument( + "--outdir", + type=Path, + help="Optional directory to keep and reuse generated HDF5 files.", + ) + return parser.parse_args() + + +def _format_row(cells: list[str], widths: list[int]) -> str: + return " ".join(cell.ljust(width) for cell, width in zip(cells, widths, strict=True)) + + +def print_table(rows: list[dict], columns: list[tuple[str, callable]]) -> None: + header = [name for name, _ in columns] + body = [[formatter(row) for _, formatter in columns] for row in rows] + widths = [len(name) for name in header] + for row in body: + for index, cell in enumerate(row): + widths[index] = max(widths[index], len(cell)) + + print(_format_row(header, widths)) + print(_format_row(["-" * width for width in widths], widths)) + for row in body: + print(_format_row(row, widths)) + + +def run_benchmark() -> None: + args = parse_args() + try: + id_dtype = np.dtype(args.dtype) + except TypeError as exc: + raise SystemExit(f"unsupported dtype {args.dtype!r}") from exc + if id_dtype.kind not in {"b", "i", "u", "f"}: + raise SystemExit(f"--dtype only supports bool, integer, and floating-point dtypes; got {id_dtype}") + sizes = (args.size,) if args.size is not None else SIZES + dists = DISTS if args.dist == "all" else (args.dist,) + dist_label = args.dist + repeats = max(0, args.repeats) + query_width = args.query_width + + if args.outdir is None: + with tempfile.TemporaryDirectory() as tmpdir: + _run_benchmark(Path(tmpdir), sizes, dists, dist_label, repeats, query_width, id_dtype) + else: + size_dir = args.outdir.expanduser() + size_dir.mkdir(parents=True, exist_ok=True) + _run_benchmark(size_dir, sizes, dists, dist_label, repeats, query_width, id_dtype) + + +def _run_benchmark( + size_dir: Path, + sizes: tuple[int, ...], + dists: tuple[str, ...], + dist_label: str, + repeats: int, + query_width: int, + id_dtype: np.dtype, +) -> None: + all_results = [] + print("Structured range-query benchmark across PyTables index kinds") + print( + 
f"chunks={CHUNK_LEN:,}, repeats={repeats}, dist={dist_label}, " + f"query_width={query_width:,}, dtype={id_dtype.name}, complib={DATA_FILTERS.complib}" + ) + try: + for dist in dists: + for size in sizes: + size_results = benchmark_size(size, size_dir, dist, query_width, id_dtype) + all_results.extend(size_results) + + print() + print("Cold Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ("index_pct", lambda result: f"{result['index_pct']:.4f}%"), + ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), + ], + ) + if repeats > 0: + measure_warm_queries(all_results, repeats) + print() + print("Warm Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), + ( + "speedup", + lambda result: f"{result['warm_speedup']:.2f}x" + if result["warm_speedup"] is not None + else "-", + ), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ("index_pct", lambda result: f"{result['index_pct']:.4f}%"), + ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), + ], + ) + finally: + 
close_rows(all_results) + + +if __name__ == "__main__": + run_benchmark() From 60c16683aeb0708ff2d86fba72543919a070eeff Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 09:58:30 +0200 Subject: [PATCH 11/68] Make index builders out-of-core by default - add streaming/out-of-core builders for light, medium, and full indexes - keep in_mem=True as the explicit switch back to in-memory builds - persist and rebuild the chosen build mode in index descriptors - speed up the OOC full builder with chunked external merge runs - fix persistent index cache reuse across reopened arrays - add coverage for OOC persistence, rebuilds, and in-memory override - switch index benchmark CLI to --in-mem with OOC as the default --- bench/ndarray/index_query_bench.py | 54 ++- src/blosc2/indexing.py | 510 +++++++++++++++++++++++++++-- src/blosc2/ndarray.py | 24 ++ tests/ndarray/test_indexing.py | 49 +++ 4 files changed, 595 insertions(+), 42 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index ccfdd779..cde0fd1a 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -103,8 +103,11 @@ def base_array_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype) -> return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.b2nd" -def indexed_array_path(size_dir: Path, size: int, dist: str, kind: str, optlevel: int, id_dtype: np.dtype) -> Path: - return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{kind}.opt{optlevel}.b2nd" +def indexed_array_path( + size_dir: Path, size: int, dist: str, kind: str, optlevel: int, id_dtype: np.dtype, in_mem: bool +) -> Path: + mode = "mem" if in_mem else "ooc" + return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{kind}.opt{optlevel}.{mode}.b2nd" def benchmark_scan_once(expr) -> tuple[float, int]: @@ -210,7 +213,7 @@ def _condition_expr(lo: object, hi: object, dtype: np.dtype) -> str: return f"(id >= {lo_literal}) & (id <= 
{hi_literal})" -def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int) -> dict | None: +def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int, in_mem: bool) -> dict | None: for descriptor in arr.indexes: if descriptor.get("version") != blosc2_indexing.INDEX_FORMAT_VERSION: continue @@ -218,6 +221,7 @@ def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int) -> di descriptor.get("field") == "id" and descriptor.get("kind") == kind and int(descriptor.get("optlevel", -1)) == int(optlevel) + and bool(descriptor.get("ooc", False)) is (not bool(in_mem)) and not descriptor.get("stale", False) ): return descriptor @@ -231,10 +235,12 @@ def _open_or_build_persistent_array(path: Path, get_data) -> blosc2.NDArray: return build_persistent_array(get_data(), path) -def _open_or_build_indexed_array(path: Path, get_data, kind: str, optlevel: int) -> tuple[blosc2.NDArray, float]: +def _open_or_build_indexed_array( + path: Path, get_data, kind: str, optlevel: int, in_mem: bool +) -> tuple[blosc2.NDArray, float]: if path.exists(): arr = blosc2.open(path, mode="a") - if _valid_index_descriptor(arr, kind, optlevel) is not None: + if _valid_index_descriptor(arr, kind, optlevel, in_mem) is not None: return arr, 0.0 if arr.indexes: arr.drop_index(field="id") @@ -242,12 +248,12 @@ def _open_or_build_indexed_array(path: Path, get_data, kind: str, optlevel: int) arr = build_persistent_array(get_data(), path) build_start = time.perf_counter() - arr.create_index(field="id", kind=kind, optlevel=optlevel) + arr.create_index(field="id", kind=kind, optlevel=optlevel, in_mem=in_mem) return arr, time.perf_counter() - build_start def benchmark_size( - size: int, size_dir: Path, dist: str, query_width: int, optlevel: int, id_dtype: np.dtype + size: int, size_dir: Path, dist: str, query_width: int, optlevel: int, id_dtype: np.dtype, in_mem: bool ) -> list[dict]: get_data = _source_data_factory(size, dist, id_dtype) get_ordered_ids = 
_ordered_ids_factory(size, id_dtype) @@ -264,7 +270,11 @@ def benchmark_size( rows = [] for kind in KINDS: idx_arr, build_time = _open_or_build_indexed_array( - indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype), get_data, kind, optlevel + indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype, in_mem), + get_data, + kind, + optlevel, + in_mem, ) idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) idx_expr = idx_cond.where(idx_arr) @@ -278,6 +288,7 @@ def benchmark_size( "dist": dist, "kind": kind, "optlevel": optlevel, + "in_mem": in_mem, "query_rows": index_len, "build_s": build_time, "create_idx_ms": build_time * 1_000, @@ -379,6 +390,12 @@ def parse_args() -> argparse.Namespace: default="sorted", help="Distribution for the indexed field. Use 'all' to benchmark every distribution.", ) + parser.add_argument( + "--in-mem", + action=argparse.BooleanOptionalAction, + default=False, + help="Use the in-memory index builders. Disabled by default; pass --in-mem to force them.", + ) return parser.parse_args() @@ -398,11 +415,21 @@ def main() -> None: if args.outdir is None: with tempfile.TemporaryDirectory() as tmpdir: run_benchmarks( - sizes, dists, Path(tmpdir), args.dist, args.query_width, args.repeats, args.optlevel, id_dtype + sizes, + dists, + Path(tmpdir), + args.dist, + args.query_width, + args.repeats, + args.optlevel, + id_dtype, + args.in_mem, ) else: args.outdir.mkdir(parents=True, exist_ok=True) - run_benchmarks(sizes, dists, args.outdir, args.dist, args.query_width, args.repeats, args.optlevel, id_dtype) + run_benchmarks( + sizes, dists, args.outdir, args.dist, args.query_width, args.repeats, args.optlevel, id_dtype, args.in_mem + ) def run_benchmarks( @@ -414,16 +441,17 @@ def run_benchmarks( repeats: int, optlevel: int, id_dtype: np.dtype, + in_mem: bool, ) -> None: all_results = [] print("Structured range-query benchmark across index kinds") print( f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, 
dist={dist_label}, " - f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}" + f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}" ) for dist in dists: for size in sizes: - size_results = benchmark_size(size, size_dir, dist, query_width, optlevel, id_dtype) + size_results = benchmark_size(size, size_dir, dist, query_width, optlevel, id_dtype, in_mem) all_results.extend(size_results) print() @@ -433,6 +461,7 @@ def run_benchmarks( [ ("rows", lambda result: f"{result['size']:,}"), ("dist", lambda result: result["dist"]), + ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), ("kind", lambda result: result["kind"]), ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), @@ -453,6 +482,7 @@ def run_benchmarks( [ ("rows", lambda result: f"{result['size']:,}"), ("dist", lambda result: result["dist"]), + ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), ("kind", lambda result: result["kind"]), ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 1f7614f1..f98e67f9 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -9,6 +9,7 @@ import ast import math +import tempfile from dataclasses import dataclass from pathlib import Path @@ -32,6 +33,8 @@ _IN_MEMORY_INDEXES: dict[int, dict] = {} _DATA_CACHE: dict[tuple[int, str | None, str, str], np.ndarray] = {} BLOCK_GATHER_POSITIONS_THRESHOLD = 32 +FULL_OOC_RUN_ITEMS = 2_000_000 +FULL_OOC_MERGE_BUFFER_ITEMS = 500_000 @dataclass(slots=True) @@ -77,12 +80,21 @@ class ExactPredicatePlan: upper_inclusive: bool = True +@dataclass(slots=True) +class SortedRun: + values_path: Path + positions_path: Path + length: int + + def _default_index_store() -> dict: return {"version": INDEX_FORMAT_VERSION, "indexes": {}} -def 
_array_key(array: blosc2.NDArray) -> int: - return id(array) +def _array_key(array: blosc2.NDArray) -> tuple[str, str | int]: + if _is_persistent_array(array): + return ("persistent", str(Path(array.urlpath).resolve())) + return ("memory", id(array)) def _field_token(field: str | None) -> str: @@ -211,27 +223,39 @@ def _values_for_index(array: blosc2.NDArray, field: str | None) -> np.ndarray: return values if field is None else values[field] +def _slice_values_for_index(array: blosc2.NDArray, field: str | None, start: int, stop: int) -> np.ndarray: + values = array[start:stop] + return values if field is None else values[field] + + +def _summary_dtype(dtype: np.dtype) -> np.dtype: + return np.dtype([("min", dtype), ("max", dtype), ("flags", np.uint8)]) + + +def _segment_summary(segment: np.ndarray, dtype: np.dtype): + flags = np.uint8(0) + if dtype.kind == "f": + valid = ~np.isnan(segment) + if not np.all(valid): + flags |= FLAG_HAS_NAN + if not np.any(valid): + flags |= FLAG_ALL_NAN + zero = np.zeros((), dtype=dtype)[()] + return zero, zero, flags + segment = segment[valid] + return segment.min(), segment.max(), flags + + def _compute_segment_summaries(values: np.ndarray, dtype: np.dtype, segment_len: int) -> np.ndarray: nsegments = math.ceil(values.shape[0] / segment_len) - summary_dtype = np.dtype([("min", dtype), ("max", dtype), ("flags", np.uint8)]) + summary_dtype = _summary_dtype(dtype) summaries = np.empty(nsegments, dtype=summary_dtype) for idx in range(nsegments): start = idx * segment_len stop = min(start + segment_len, values.shape[0]) segment = values[start:stop] - flags = np.uint8(0) - if dtype.kind == "f": - valid = ~np.isnan(segment) - if not np.all(valid): - flags |= FLAG_HAS_NAN - if not np.any(valid): - flags |= FLAG_ALL_NAN - zero = np.zeros((), dtype=dtype)[()] - summaries[idx] = (zero, zero, flags) - continue - segment = segment[valid] - summaries[idx] = (segment.min(), segment.max(), flags) + summaries[idx] = _segment_summary(segment, dtype) 
return summaries @@ -245,13 +269,17 @@ def _store_array_sidecar( persistent: bool, ) -> dict: cache_key = _data_cache_key(array, field, category, name) - _DATA_CACHE[cache_key] = data if persistent: path = _sidecar_path(array, field, kind, f"{category}.{name}") blosc2.remove_urlpath(path) blosc2.asarray(data, urlpath=path, mode="w") + if isinstance(data, np.memmap): + _DATA_CACHE.pop(cache_key, None) + else: + _DATA_CACHE[cache_key] = data else: path = None + _DATA_CACHE[cache_key] = np.array(data, copy=True) if isinstance(data, np.memmap) else data return {"path": path, "dtype": data.dtype.descr if data.dtype.fields else data.dtype.str} @@ -291,6 +319,34 @@ def _build_levels_descriptor( return levels +def _build_levels_descriptor_ooc( + array: blosc2.NDArray, + field: str | None, + kind: str, + dtype: np.dtype, + persistent: bool, +) -> dict: + levels = {} + size = int(array.shape[0]) + summary_dtype = _summary_dtype(dtype) + for level in SEGMENT_LEVELS_BY_KIND[kind]: + segment_len = _segment_len(array, level) + nsegments = math.ceil(size / segment_len) + summaries = np.empty(nsegments, dtype=summary_dtype) + for idx in range(nsegments): + start = idx * segment_len + stop = min(start + segment_len, size) + summaries[idx] = _segment_summary(_slice_values_for_index(array, field, start, stop), dtype) + sidecar = _store_array_sidecar(array, field, kind, "summary", level, summaries, persistent) + levels[level] = { + "segment_len": segment_len, + "nsegments": len(summaries), + "path": sidecar["path"], + "dtype": sidecar["dtype"], + } + return levels + + def _build_full_descriptor( array: blosc2.NDArray, field: str | None, @@ -319,6 +375,12 @@ def _position_dtype(max_value: int) -> np.dtype: return np.dtype(np.uint64) +def _resolve_ooc_mode(kind: str, in_mem: bool) -> bool: + if kind not in {"light", "medium", "full"}: + return False + return not in_mem + + def _build_block_sorted_payload( values: np.ndarray, block_len: int ) -> tuple[np.ndarray, np.ndarray, np.ndarray, 
np.dtype]: @@ -368,6 +430,54 @@ def _build_reduced_descriptor( } +def _open_temp_memmap(workdir: Path, name: str, dtype: np.dtype, shape: tuple[int, ...]) -> np.memmap: + path = workdir / f"{name}.npy" + return np.lib.format.open_memmap(path, mode="w+", dtype=dtype, shape=shape) + + +def _build_reduced_descriptor_ooc( + array: blosc2.NDArray, + field: str | None, + kind: str, + persistent: bool, + workdir: Path, +) -> dict: + size = int(array.shape[0]) + block_len = int(array.blocks[0]) + nblocks = math.ceil(size / block_len) + position_dtype = _position_dtype(block_len - 1) + offsets = np.empty(nblocks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = _open_temp_memmap( + workdir, f"{kind}_reduced_values", np.dtype(_field_dtype(array, field)), (size,) + ) + positions = _open_temp_memmap(workdir, f"{kind}_reduced_positions", position_dtype, (size,)) + + cursor = 0 + for block_id in range(nblocks): + start = block_id * block_len + stop = min(start + block_len, size) + block = _slice_values_for_index(array, field, start, stop) + order = np.argsort(block, kind="stable") + next_cursor = cursor + (stop - start) + sorted_values[cursor:next_cursor] = block[order] + positions[cursor:next_cursor] = order.astype(position_dtype, copy=False) + cursor = next_cursor + offsets[block_id + 1] = cursor + + values_sidecar = _store_array_sidecar(array, field, kind, "reduced", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar( + array, field, kind, "reduced", "positions", positions, persistent + ) + offsets_sidecar = _store_array_sidecar(array, field, kind, "reduced", "offsets", offsets, persistent) + return { + "block_len": block_len, + "values_path": values_sidecar["path"], + "positions_path": positions_sidecar["path"], + "offsets_path": offsets_sidecar["path"], + } + + def _light_bucket_count(block_len: int) -> int: return max(1, min(64, block_len)) @@ -516,6 +626,288 @@ def _build_light_descriptor( } +def _build_light_descriptor_ooc( + array: 
blosc2.NDArray, + field: str | None, + kind: str, + dtype: np.dtype, + optlevel: int, + persistent: bool, + workdir: Path, +) -> dict: + size = int(array.shape[0]) + block_len = int(array.blocks[0]) + nblocks = math.ceil(size / block_len) + bucket_count = _light_bucket_count(block_len) + bucket_len = math.ceil(block_len / bucket_count) + value_lossy_bits = _light_value_lossy_bits(dtype, optlevel) + offsets = np.empty(nblocks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = _open_temp_memmap(workdir, f"{kind}_light_values", dtype, (size,)) + bucket_positions = _open_temp_memmap(workdir, f"{kind}_light_bucket_positions", np.uint8, (size,)) + + cursor = 0 + for block_id in range(nblocks): + start = block_id * block_len + stop = min(start + block_len, size) + block = _slice_values_for_index(array, field, start, stop) + order = np.argsort(block, kind="stable") + block_values = block[order] + if value_lossy_bits > 0: + block_values = _quantize_light_values_array(block_values, value_lossy_bits) + next_cursor = cursor + (stop - start) + sorted_values[cursor:next_cursor] = block_values + bucket_positions[cursor:next_cursor] = (order // bucket_len).astype(np.uint8, copy=False) + cursor = next_cursor + offsets[block_id + 1] = cursor + + values_sidecar = _store_array_sidecar(array, field, kind, "light", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar( + array, field, kind, "light", "bucket_positions", bucket_positions, persistent + ) + offsets_sidecar = _store_array_sidecar(array, field, kind, "light", "offsets", offsets, persistent) + return { + "block_len": block_len, + "bucket_count": bucket_count, + "bucket_len": bucket_len, + "value_lossy_bits": value_lossy_bits, + "values_path": values_sidecar["path"], + "bucket_positions_path": positions_sidecar["path"], + "offsets_path": offsets_sidecar["path"], + } + + +def _scalar_compare(left, right, dtype: np.dtype) -> int: + dtype = np.dtype(dtype) + if dtype.kind == "f": + left_nan = 
np.isnan(left) + right_nan = np.isnan(right) + if left_nan and right_nan: + return 0 + if left_nan: + return 1 + if right_nan: + return -1 + if left < right: + return -1 + if left > right: + return 1 + return 0 + + +def _pair_le(left_value, left_position: int, right_value, right_position: int, dtype: np.dtype) -> bool: + cmp = _scalar_compare(left_value, right_value, dtype) + if cmp < 0: + return True + if cmp > 0: + return False + return int(left_position) <= int(right_position) + + +def _pair_record_dtype(dtype: np.dtype) -> np.dtype: + return np.dtype([("value", dtype), ("position", np.int64)]) + + +def _pair_records(values: np.ndarray, positions: np.ndarray, dtype: np.dtype) -> np.ndarray: + records = np.empty(values.shape[0], dtype=_pair_record_dtype(dtype)) + records["value"] = values + records["position"] = positions + return records + + +def _merge_sorted_slices( + left_values: np.ndarray, + left_positions: np.ndarray, + right_values: np.ndarray, + right_positions: np.ndarray, + dtype: np.dtype, +) -> tuple[np.ndarray, np.ndarray]: + if left_values.size == 0: + return right_values, right_positions + if right_values.size == 0: + return left_values, left_positions + values = np.concatenate((left_values, right_values)) + positions = np.concatenate((left_positions, right_positions)) + order = np.lexsort((positions, values)) + return values[order], positions[order] + + +def _pair_searchsorted_right(values: np.ndarray, positions: np.ndarray, value, position: int) -> int: + records = _pair_records(values, positions, values.dtype) + needle = np.asarray((value, position), dtype=records.dtype)[()] + return int(np.searchsorted(records, needle, side="right")) + + +def _refill_run_buffer( + values_mm: np.ndarray, positions_mm: np.ndarray, cursor: int, buffer_items: int +) -> tuple[np.ndarray, np.ndarray, int]: + if cursor >= len(values_mm): + return np.empty(0, dtype=values_mm.dtype), np.empty(0, dtype=positions_mm.dtype), cursor + stop = min(cursor + buffer_items, 
len(values_mm)) + return np.asarray(values_mm[cursor:stop]), np.asarray(positions_mm[cursor:stop]), stop + + +def _merge_run_pair( + left: SortedRun, right: SortedRun, workdir: Path, dtype: np.dtype, merge_id: int, buffer_items: int +) -> SortedRun: + left_values_mm = np.load(left.values_path, mmap_mode="r") + left_positions_mm = np.load(left.positions_path, mmap_mode="r") + right_values_mm = np.load(right.values_path, mmap_mode="r") + right_positions_mm = np.load(right.positions_path, mmap_mode="r") + + out_values_path = workdir / f"full_merge_values_{merge_id}.npy" + out_positions_path = workdir / f"full_merge_positions_{merge_id}.npy" + out_values = np.lib.format.open_memmap( + out_values_path, mode="w+", dtype=dtype, shape=(left.length + right.length,) + ) + out_positions = np.lib.format.open_memmap( + out_positions_path, mode="w+", dtype=np.int64, shape=(left.length + right.length,) + ) + + left_cursor = 0 + right_cursor = 0 + out_cursor = 0 + left_values = np.empty(0, dtype=dtype) + left_positions = np.empty(0, dtype=np.int64) + right_values = np.empty(0, dtype=dtype) + right_positions = np.empty(0, dtype=np.int64) + while True: + if left_values.size == 0: + left_values, left_positions, left_cursor = _refill_run_buffer( + left_values_mm, left_positions_mm, left_cursor, buffer_items + ) + if right_values.size == 0: + right_values, right_positions, right_cursor = _refill_run_buffer( + right_values_mm, right_positions_mm, right_cursor, buffer_items + ) + + if left_values.size == 0 and right_values.size == 0: + break + if left_values.size == 0: + take = right_values.size + out_values[out_cursor : out_cursor + take] = right_values + out_positions[out_cursor : out_cursor + take] = right_positions + out_cursor += take + right_values = np.empty(0, dtype=dtype) + right_positions = np.empty(0, dtype=np.int64) + continue + if right_values.size == 0: + take = left_values.size + out_values[out_cursor : out_cursor + take] = left_values + out_positions[out_cursor : 
out_cursor + take] = left_positions + out_cursor += take + left_values = np.empty(0, dtype=dtype) + left_positions = np.empty(0, dtype=np.int64) + continue + + if _pair_le(left_values[-1], left_positions[-1], right_values[-1], right_positions[-1], dtype): + left_cut = left_values.size + right_cut = _pair_searchsorted_right( + right_values, right_positions, left_values[-1], int(left_positions[-1]) + ) + else: + left_cut = _pair_searchsorted_right( + left_values, left_positions, right_values[-1], int(right_positions[-1]) + ) + right_cut = right_values.size + + merged_values, merged_positions = _merge_sorted_slices( + left_values[:left_cut], + left_positions[:left_cut], + right_values[:right_cut], + right_positions[:right_cut], + dtype, + ) + take = merged_values.size + out_values[out_cursor : out_cursor + take] = merged_values + out_positions[out_cursor : out_cursor + take] = merged_positions + out_cursor += take + left_values = left_values[left_cut:] + left_positions = left_positions[left_cut:] + right_values = right_values[right_cut:] + right_positions = right_positions[right_cut:] + + out_values.flush() + out_positions.flush() + del left_values_mm, left_positions_mm, right_values_mm, right_positions_mm, out_values, out_positions + left.values_path.unlink(missing_ok=True) + left.positions_path.unlink(missing_ok=True) + right.values_path.unlink(missing_ok=True) + right.positions_path.unlink(missing_ok=True) + return SortedRun(out_values_path, out_positions_path, out_cursor) + + +def _build_full_descriptor_ooc( + array: blosc2.NDArray, + field: str | None, + kind: str, + dtype: np.dtype, + persistent: bool, + workdir: Path, +) -> dict: + size = int(array.shape[0]) + if size == 0: + sorted_values = np.empty(0, dtype=dtype) + positions = np.empty(0, dtype=np.int64) + values_sidecar = _store_array_sidecar( + array, field, kind, "full", "values", sorted_values, persistent + ) + positions_sidecar = _store_array_sidecar( + array, field, kind, "full", "positions", 
positions, persistent + ) + return { + "values_path": values_sidecar["path"], + "positions_path": positions_sidecar["path"], + } + run_items = max(int(array.chunks[0]), min(size, FULL_OOC_RUN_ITEMS)) + runs = [] + for run_id, start in enumerate(range(0, size, run_items)): + stop = min(start + run_items, size) + values = _slice_values_for_index(array, field, start, stop) + positions = np.arange(start, stop, dtype=np.int64) + order = np.lexsort((positions, values)) + sorted_values = values[order] + sorted_positions = positions[order] + + values_path = workdir / f"full_run_values_{run_id}.npy" + positions_path = workdir / f"full_run_positions_{run_id}.npy" + run_values = np.lib.format.open_memmap(values_path, mode="w+", dtype=dtype, shape=(stop - start,)) + run_positions = np.lib.format.open_memmap( + positions_path, mode="w+", dtype=np.int64, shape=(stop - start,) + ) + run_values[:] = sorted_values + run_positions[:] = sorted_positions + run_values.flush() + run_positions.flush() + del run_values, run_positions + runs.append(SortedRun(values_path, positions_path, stop - start)) + + merge_id = 0 + while len(runs) > 1: + next_runs = [] + for idx in range(0, len(runs), 2): + if idx + 1 >= len(runs): + next_runs.append(runs[idx]) + continue + next_runs.append( + _merge_run_pair( + runs[idx], runs[idx + 1], workdir, dtype, merge_id, FULL_OOC_MERGE_BUFFER_ITEMS + ) + ) + merge_id += 1 + runs = next_runs + + final_run = runs[0] + sorted_values = np.load(final_run.values_path, mmap_mode="r") + positions = np.load(final_run.positions_path, mmap_mode="r") + values_sidecar = _store_array_sidecar(array, field, kind, "full", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar(array, field, kind, "full", "positions", positions, persistent) + return { + "values_path": values_sidecar["path"], + "positions_path": positions_sidecar["path"], + } + + def _build_descriptor( array: blosc2.NDArray, field: str | None, @@ -523,6 +915,7 @@ def _build_descriptor( 
optlevel: int, granularity: str, persistent: bool, + ooc: bool, name: str | None, dtype: np.dtype, levels: dict, @@ -538,6 +931,7 @@ def _build_descriptor( "optlevel": optlevel, "granularity": granularity, "persistent": persistent, + "ooc": ooc, "stale": False, "dtype": np.dtype(dtype).str, "shape": tuple(array.shape), @@ -557,6 +951,7 @@ def create_index( optlevel: int = 5, granularity: str = "chunk", persistent: bool | None = None, + in_mem: bool = False, name: str | None = None, **kwargs, ) -> dict: @@ -568,19 +963,69 @@ def create_index( raise NotImplementedError("only chunk-based array indexes are implemented for now") if persistent is None: persistent = _is_persistent_array(array) - - values = _values_for_index(array, field) - levels = _build_levels_descriptor(array, field, kind, dtype, values, persistent) - light = ( - _build_light_descriptor(array, field, kind, values, optlevel, persistent) - if kind == "light" - else None - ) - reduced = _build_reduced_descriptor(array, field, kind, values, persistent) if kind == "medium" else None - full = _build_full_descriptor(array, field, kind, values, persistent) if kind == "full" else None - descriptor = _build_descriptor( - array, field, kind, optlevel, granularity, persistent, name, dtype, levels, light, reduced, full - ) + use_ooc = _resolve_ooc_mode(kind, in_mem) + + if use_ooc and kind in {"light", "medium", "full"}: + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: + workdir = Path(tmpdir) + levels = _build_levels_descriptor_ooc(array, field, kind, dtype, persistent) + light = ( + _build_light_descriptor_ooc(array, field, kind, dtype, optlevel, persistent, workdir) + if kind == "light" + else None + ) + reduced = ( + _build_reduced_descriptor_ooc(array, field, kind, persistent, workdir) + if kind == "medium" + else None + ) + full = ( + _build_full_descriptor_ooc(array, field, kind, dtype, persistent, workdir) + if kind == "full" + else None + ) + descriptor = _build_descriptor( + 
array, + field, + kind, + optlevel, + granularity, + persistent, + True, + name, + dtype, + levels, + light, + reduced, + full, + ) + else: + values = _values_for_index(array, field) + levels = _build_levels_descriptor(array, field, kind, dtype, values, persistent) + light = ( + _build_light_descriptor(array, field, kind, values, optlevel, persistent) + if kind == "light" + else None + ) + reduced = ( + _build_reduced_descriptor(array, field, kind, values, persistent) if kind == "medium" else None + ) + full = _build_full_descriptor(array, field, kind, values, persistent) if kind == "full" else None + descriptor = _build_descriptor( + array, + field, + kind, + optlevel, + granularity, + persistent, + False, + name, + dtype, + levels, + light, + reduced, + full, + ) store = _load_store(array) store["indexes"][_field_token(field)] = descriptor @@ -593,7 +1038,11 @@ def create_csindex(array: blosc2.NDArray, field: str | None = None, **kwargs) -> def _resolve_index_token(store: dict, field: str | None, name: str | None) -> str: - token = _field_token(field) if field is not None or name is None else None + token = None + if field is not None: + token = _field_token(field) + elif name is None and len(store["indexes"]) == 1: + token = next(iter(store["indexes"])) if token is None: for key, descriptor in store["indexes"].items(): if descriptor.get("name") == name: @@ -646,6 +1095,7 @@ def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | N optlevel=descriptor["optlevel"], granularity=descriptor["granularity"], persistent=descriptor["persistent"], + in_mem=not descriptor.get("ooc", False), name=descriptor["name"], ) diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 257bb22d..351b5c27 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4724,9 +4724,32 @@ def create_index( optlevel: int = 3, granularity: str = "chunk", persistent: bool | None = None, + in_mem: bool = False, name: str | None = None, **kwargs: Any, ) -> 
dict: + """Create an index for a 1-D array or structured field. + + Parameters + ---------- + field : str or None, optional + Field to index for structured dtypes. Use ``None`` to index the array values. + kind : {"ultralight", "light", "medium", "full"}, optional + Index tier to build. + optlevel : int, optional + Optimization level for index payload construction. + granularity : str, optional + Current implementation only supports ``"chunk"``. + persistent : bool or None, optional + Whether index sidecars should be persisted. If ``None``, this follows whether the base array is persistent. + in_mem : bool, optional + Force the in-memory builder. When set to ``True``, index creation materializes the indexed field in RAM and + may allocate additional temporary arrays for sorting, permutations, and block payloads. For large datasets + this can require substantially more memory than the final index itself, so the default is ``False`` and + uses the out-of-core builders for ``light``, ``medium``, and ``full``. + name : str or None, optional + Optional logical name for the index descriptor. + """ from . 
import indexing return indexing.create_index( @@ -4736,6 +4759,7 @@ def create_index( optlevel=optlevel, granularity=granularity, persistent=persistent, + in_mem=in_mem, name=name, **kwargs, ) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 10170825..57a05186 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -161,6 +161,55 @@ def test_persistent_index_survives_reopen(tmp_path, kind): np.testing.assert_array_equal(expr.compute()[:], data[data >= 72_000]) +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_default_ooc_persistent_index_matches_scan_and_rebuilds(tmp_path, kind): + path = tmp_path / f"indexed_ooc_{kind}.b2nd" + rng = np.random.default_rng(7) + dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.zeros(240_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + rng.shuffle(data["id"]) + + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(24_000,), blocks=(4_000,)) + descriptor = arr.create_index(field="id", kind=kind) + + assert descriptor["ooc"] is True + + reopened = blosc2.open(path, mode="a") + assert reopened.indexes[0]["ooc"] is True + + expr = blosc2.lazyexpr("(id >= 123_456) & (id < 124_321)", reopened.fields).where(reopened) + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = data[(data["id"] >= 123_456) & (data["id"] < 124_321)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + rebuilt = reopened.rebuild_index() + assert rebuilt["ooc"] is True + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_small_default_index_builder_uses_ooc(kind): + data = np.arange(100_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(10_000,), blocks=(2_000,)) + + descriptor = arr.create_index(kind=kind) + + assert descriptor["ooc"] is True + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) 
+def test_in_mem_override_disables_ooc_builder(kind): + data = np.arange(120_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) + + descriptor = arr.create_index(kind=kind, in_mem=True) + + assert descriptor["ooc"] is False + + def test_mutation_marks_index_stale_and_rebuild_restores_it(): data = np.arange(50_000, dtype=np.int64) arr = blosc2.asarray(data, chunks=(5_000,), blocks=(1_000,)) From dc0a189c789d1a70a10f385bc86980049971964a Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 13:30:23 +0200 Subject: [PATCH 12/68] Add ordered access and append maintenance to NDArray indexes - reuse full indexes for direct sort(order=...) and indices(order=...) - add itersorted(...) for streaming ordered traversal via full indexes - teach filtered ordered queries to reuse full indexes on the order key - intersect exact positions across multiple indexed fields for AND predicates - add NDArray.append(...) for 1-D arrays - keep light, medium, and full indexes current on append - preserve sorted reads and indexed filtering after append without rebuild - add regression coverage for ordered access, cross-field exact filters, and append maintenance - add examples for sorted iteration and append-aware index maintenance --- examples/ndarray/index_append_maintenance.py | 31 ++ examples/ndarray/index_sorted_iteration.py | 40 ++ src/blosc2/indexing.py | 555 +++++++++++++++++-- src/blosc2/lazyexpr.py | 6 + src/blosc2/ndarray.py | 118 ++++ tests/ndarray/test_indexing.py | 100 ++++ 6 files changed, 796 insertions(+), 54 deletions(-) create mode 100644 examples/ndarray/index_append_maintenance.py create mode 100644 examples/ndarray/index_sorted_iteration.py diff --git a/examples/ndarray/index_append_maintenance.py b/examples/ndarray/index_append_maintenance.py new file mode 100644 index 00000000..5a944d5f --- /dev/null +++ b/examples/ndarray/index_append_maintenance.py @@ -0,0 +1,31 @@ 
+####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import numpy as np + +import blosc2 + +# Intent: show that appending to a 1-D indexed array keeps the index sidecars +# usable, so indexed queries and sorted reads continue to work without an +# explicit rebuild after append(). + +dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) +data = np.array([(2, 20), (0, 0), (3, 30), (1, 10)], dtype=dtype) +arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + +arr.create_csindex("id") + +to_append = np.array([(6, 60), (4, 40), (5, 50)], dtype=dtype) +arr.append(to_append) + +expr = blosc2.lazyexpr("(id >= 4) & (id < 7)", arr.fields).where(arr) + +print("Indexed query after append:") +print(expr.compute()[:]) + +print("\nSorted rows after append:") +print(arr.sort(order="id")[:]) diff --git a/examples/ndarray/index_sorted_iteration.py b/examples/ndarray/index_sorted_iteration.py new file mode 100644 index 00000000..5a562f84 --- /dev/null +++ b/examples/ndarray/index_sorted_iteration.py @@ -0,0 +1,40 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import numpy as np + +import blosc2 + +# Intent: show how a full/csindex can be reused for direct sorted reads, +# sorted logical positions, and streaming ordered iteration. 
+ +dtype = np.dtype([("id", np.int64), ("score", np.float64)]) +data = np.array( + [ + (4, 0.3), + (1, 1.5), + (3, 0.8), + (1, 0.2), + (2, 3.1), + (3, 0.1), + (2, 1.2), + ], + dtype=dtype, +) + +arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) +arr.create_csindex("id") + +print("Sorted rows via full index:") +print(arr.sort(order=["id", "score"])[:]) + +print("\nSorted logical positions:") +print(arr.indices(order=["id", "score"])[:]) + +print("\nIterating in sorted order:") +for row in arr.itersorted(order=["id", "score"], batch_size=3): + print(row) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index f98e67f9..8a9767f4 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -1074,6 +1074,168 @@ def _drop_descriptor_sidecars(descriptor: dict) -> None: _remove_sidecar_path(descriptor["full"]["positions_path"]) +def _replace_levels_descriptor( + array: blosc2.NDArray, descriptor: dict, field: str | None, kind: str, persistent: bool +) -> None: + size = int(array.shape[0]) + for level, level_info in descriptor["levels"].items(): + segment_len = int(level_info["segment_len"]) + start = 0 + summaries = _compute_segment_summaries( + _slice_values_for_index(array, field, start, size), _field_dtype(array, field), segment_len + ) + sidecar = _store_array_sidecar(array, field, kind, "summary", level, summaries, persistent) + level_info["path"] = sidecar["path"] + level_info["dtype"] = sidecar["dtype"] + level_info["nsegments"] = len(summaries) + + +def _replace_levels_descriptor_tail( + array: blosc2.NDArray, descriptor: dict, field: str | None, kind: str, old_size: int, persistent: bool +) -> None: + dtype = _field_dtype(array, field) + new_size = int(array.shape[0]) + for level, level_info in descriptor["levels"].items(): + segment_len = int(level_info["segment_len"]) + start_segment = old_size // segment_len + prefix = _load_level_summaries(array, descriptor, level)[:start_segment] + tail_start = start_segment * segment_len + tail_values = 
_slice_values_for_index(array, field, tail_start, new_size) + tail_summaries = _compute_segment_summaries(tail_values, dtype, segment_len) + summaries = np.concatenate((prefix, tail_summaries)) if len(prefix) else tail_summaries + sidecar = _store_array_sidecar(array, field, kind, "summary", level, summaries, persistent) + level_info["path"] = sidecar["path"] + level_info["dtype"] = sidecar["dtype"] + level_info["nsegments"] = len(summaries) + + +def _replace_reduced_descriptor_tail( + array: blosc2.NDArray, descriptor: dict, field: str | None, old_size: int, persistent: bool +) -> None: + reduced = descriptor["reduced"] + block_len = int(reduced["block_len"]) + start_block = old_size // block_len + block_start = start_block * block_len + tail_values = _slice_values_for_index(array, field, block_start, int(array.shape[0])) + sorted_values_tail, positions_tail, offsets_tail, _ = _build_block_sorted_payload(tail_values, block_len) + + values, positions, offsets = _load_reduced_arrays(array, descriptor) + prefix_items = int(offsets[start_block]) + updated_values = np.concatenate((values[:prefix_items], sorted_values_tail)) + updated_positions = np.concatenate((positions[:prefix_items], positions_tail)) + updated_offsets = np.concatenate((offsets[: start_block + 1], prefix_items + offsets_tail[1:])) + + kind = descriptor["kind"] + values_sidecar = _store_array_sidecar( + array, field, kind, "reduced", "values", updated_values, persistent + ) + positions_sidecar = _store_array_sidecar( + array, field, kind, "reduced", "positions", updated_positions, persistent + ) + offsets_sidecar = _store_array_sidecar( + array, field, kind, "reduced", "offsets", updated_offsets, persistent + ) + reduced["values_path"] = values_sidecar["path"] + reduced["positions_path"] = positions_sidecar["path"] + reduced["offsets_path"] = offsets_sidecar["path"] + + +def _replace_light_descriptor_tail( + array: blosc2.NDArray, descriptor: dict, field: str | None, old_size: int, persistent: bool +) 
-> None: + light = descriptor["light"] + block_len = int(light["block_len"]) + start_block = old_size // block_len + block_start = start_block * block_len + tail_values = _slice_values_for_index(array, field, block_start, int(array.shape[0])) + value_lossy_bits = int(light["value_lossy_bits"]) + bucket_len = int(light["bucket_len"]) + sorted_values_tail, positions_tail, offsets_tail, _ = _build_block_sorted_payload(tail_values, block_len) + if value_lossy_bits > 0: + sorted_values_tail = _quantize_light_values_array(sorted_values_tail, value_lossy_bits) + bucket_positions_tail = (positions_tail // bucket_len).astype(np.uint8, copy=False) + + values, bucket_positions, offsets = _load_light_arrays(array, descriptor) + prefix_items = int(offsets[start_block]) + updated_values = np.concatenate((values[:prefix_items], sorted_values_tail)) + updated_bucket_positions = np.concatenate((bucket_positions[:prefix_items], bucket_positions_tail)) + updated_offsets = np.concatenate((offsets[: start_block + 1], prefix_items + offsets_tail[1:])) + + kind = descriptor["kind"] + values_sidecar = _store_array_sidecar(array, field, kind, "light", "values", updated_values, persistent) + positions_sidecar = _store_array_sidecar( + array, field, kind, "light", "bucket_positions", updated_bucket_positions, persistent + ) + offsets_sidecar = _store_array_sidecar( + array, field, kind, "light", "offsets", updated_offsets, persistent + ) + light["values_path"] = values_sidecar["path"] + light["bucket_positions_path"] = positions_sidecar["path"] + light["offsets_path"] = offsets_sidecar["path"] + + +def _replace_full_descriptor( + array: blosc2.NDArray, + descriptor: dict, + field: str | None, + sorted_values: np.ndarray, + positions: np.ndarray, + persistent: bool, +) -> None: + kind = descriptor["kind"] + values_sidecar = _store_array_sidecar(array, field, kind, "full", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar(array, field, kind, "full", "positions", 
positions, persistent) + descriptor["full"]["values_path"] = values_sidecar["path"] + descriptor["full"]["positions_path"] = positions_sidecar["path"] + + +def _append_full_descriptor( + array: blosc2.NDArray, descriptor: dict, field: str | None, old_size: int, appended_values: np.ndarray +) -> None: + full = descriptor.get("full") + if full is None: + raise RuntimeError("full index metadata is not available") + existing_values, existing_positions = _load_full_arrays(array, descriptor) + appended_positions = np.arange(old_size, old_size + len(appended_values), dtype=np.int64) + order = np.lexsort((appended_positions, appended_values)) + merged_values, merged_positions = _merge_sorted_slices( + existing_values, + existing_positions, + appended_values[order], + appended_positions[order], + np.dtype(descriptor["dtype"]), + ) + _replace_full_descriptor( + array, descriptor, field, merged_values, merged_positions, descriptor["persistent"] + ) + + +def append_to_indexes(array: blosc2.NDArray, old_size: int, appended_values: np.ndarray) -> None: + store = _load_store(array) + if not store["indexes"]: + return + + for descriptor in store["indexes"].values(): + field = descriptor["field"] + kind = descriptor["kind"] + persistent = descriptor["persistent"] + field_values = appended_values if field is None else appended_values[field] + if descriptor.get("stale", False): + continue + if kind == "full": + _append_full_descriptor(array, descriptor, field, old_size, field_values) + elif kind == "medium": + _replace_reduced_descriptor_tail(array, descriptor, field, old_size, persistent) + elif kind == "light": + _replace_light_descriptor_tail(array, descriptor, field, old_size, persistent) + _replace_levels_descriptor_tail(array, descriptor, field, kind, old_size, persistent) + descriptor["shape"] = tuple(array.shape) + descriptor["chunks"] = tuple(array.chunks) + descriptor["blocks"] = tuple(array.blocks) + descriptor["stale"] = False + _save_store(array, store) + + def 
drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> None: store = _load_store(array) token = _resolve_index_token(store, field, name) @@ -1502,6 +1664,31 @@ def _merge_exact_plans( ) +def _plan_exact_conjunction(node: ast.AST, operands: dict) -> list[ExactPredicatePlan] | None: + if isinstance(node, ast.Compare): + plan = _plan_exact_compare(node, operands) + return None if plan is None else [plan] + if isinstance(node, ast.BoolOp): + if not isinstance(node.op, ast.And): + return None + plans = [] + for value in node.values: + subplans = _plan_exact_conjunction(value, operands) + if subplans is None: + return None + plans.extend(subplans) + return plans + if isinstance(node, ast.BinOp): + if not isinstance(node.op, ast.BitAnd): + return None + left = _plan_exact_conjunction(node.left, operands) + right = _plan_exact_conjunction(node.right, operands) + if left is None or right is None: + return None + return left + right + return None + + def _plan_exact_boolop(node: ast.BoolOp, operands: dict) -> ExactPredicatePlan | None: if not isinstance(node.op, ast.And): return None @@ -1676,6 +1863,124 @@ def _exact_positions_from_reduced( return np.sort(merged, kind="stable") +def _exact_positions_from_plan(plan: ExactPredicatePlan) -> np.ndarray | None: + kind = plan.descriptor["kind"] + if kind == "full": + return _exact_positions_from_full(plan.base, plan.descriptor, plan) + if kind == "medium": + return _exact_positions_from_reduced( + plan.base, plan.descriptor, np.dtype(plan.descriptor["dtype"]), plan + ) + return None + + +def _multi_exact_positions(plans: list[ExactPredicatePlan]) -> tuple[blosc2.NDArray, np.ndarray] | None: + if not plans: + return None + base = plans[0].base + merged_by_field: dict[str | None, ExactPredicatePlan] = {} + for plan in plans: + if plan.base is not base: + return None + key = plan.field + current = merged_by_field.get(key) + if current is None: + merged_by_field[key] = plan + continue + merged = 
_merge_exact_plans(current, plan, "and") + if merged is None: + return None + merged_by_field[key] = merged + + exact_arrays = [] + for plan in merged_by_field.values(): + positions = _exact_positions_from_plan(plan) + if positions is None: + return None + exact_arrays.append(np.asarray(positions, dtype=np.int64)) + + result = exact_arrays[0] + for other in exact_arrays[1:]: + result = np.intersect1d(result, other, assume_unique=False) + return base, result + + +def _plan_multi_exact_query(plans: list[ExactPredicatePlan]) -> IndexPlan | None: + multi_exact = _multi_exact_positions(plans) + if multi_exact is None: + return None + base, exact_positions = multi_exact + if len(exact_positions) >= int(base.shape[0]): + return None + return IndexPlan( + True, + "multi-field exact indexes selected", + descriptor=_copy_descriptor(plans[0].descriptor), + base=base, + field=None, + level="exact", + total_units=int(base.shape[0]), + selected_units=len(exact_positions), + exact_positions=exact_positions, + ) + + +def _plan_single_exact_query(exact_plan: ExactPredicatePlan) -> IndexPlan: + kind = exact_plan.descriptor["kind"] + if kind == "full": + exact_positions = _exact_positions_from_full(exact_plan.base, exact_plan.descriptor, exact_plan) + return IndexPlan( + True, + f"{kind} exact index selected", + descriptor=_copy_descriptor(exact_plan.descriptor), + base=exact_plan.base, + field=exact_plan.field, + level=kind, + total_units=exact_plan.base.shape[0], + selected_units=len(exact_positions), + exact_positions=exact_positions, + ) + if kind == "medium": + dtype = np.dtype(exact_plan.descriptor["dtype"]) + exact_positions = _exact_positions_from_reduced( + exact_plan.base, exact_plan.descriptor, dtype, exact_plan + ) + return IndexPlan( + True, + f"{kind} exact index selected", + descriptor=_copy_descriptor(exact_plan.descriptor), + base=exact_plan.base, + field=exact_plan.field, + level=kind, + total_units=exact_plan.base.shape[0], + selected_units=len(exact_positions), + 
exact_positions=exact_positions, + ) + bucket_masks = _bucket_masks_from_light(exact_plan.base, exact_plan.descriptor, exact_plan) + light = exact_plan.descriptor["light"] + total_units = len(bucket_masks) * int(light["bucket_count"]) + selected_units = _bit_count_sum(bucket_masks) + if selected_units < total_units: + return IndexPlan( + True, + "light approximate-order index selected", + descriptor=_copy_descriptor(exact_plan.descriptor), + base=exact_plan.base, + field=exact_plan.field, + level=kind, + total_units=total_units, + selected_units=selected_units, + bucket_masks=bucket_masks, + bucket_len=int(light["bucket_len"]), + block_len=int(light["block_len"]), + lower=exact_plan.lower, + lower_inclusive=exact_plan.lower_inclusive, + upper=exact_plan.upper, + upper_inclusive=exact_plan.upper_inclusive, + ) + return IndexPlan(False, "available exact index does not prune any units for this predicate") + + def plan_query(expression: str, operands: dict, where: dict | None, *, use_index: bool = True) -> IndexPlan: if not use_index: return IndexPlan(False, "index usage disabled for this query") @@ -1687,61 +1992,17 @@ def plan_query(expression: str, operands: dict, where: dict | None, *, use_index except SyntaxError: return IndexPlan(False, "expression is not valid Python syntax for planning") + exact_terms = _plan_exact_conjunction(tree.body, operands) + if exact_terms is not None and len(exact_terms) > 1: + multi_exact_plan = _plan_multi_exact_query(exact_terms) + if multi_exact_plan is not None: + return multi_exact_plan + exact_plan = _plan_exact_node(tree.body, operands) if exact_plan is not None: - kind = exact_plan.descriptor["kind"] - if kind == "full": - exact_positions = _exact_positions_from_full(exact_plan.base, exact_plan.descriptor, exact_plan) - return IndexPlan( - True, - f"{kind} exact index selected", - descriptor=_copy_descriptor(exact_plan.descriptor), - base=exact_plan.base, - field=exact_plan.field, - level=kind, - 
total_units=exact_plan.base.shape[0], - selected_units=len(exact_positions), - exact_positions=exact_positions, - ) - if kind == "medium": - dtype = np.dtype(exact_plan.descriptor["dtype"]) - exact_positions = _exact_positions_from_reduced( - exact_plan.base, exact_plan.descriptor, dtype, exact_plan - ) - return IndexPlan( - True, - f"{kind} exact index selected", - descriptor=_copy_descriptor(exact_plan.descriptor), - base=exact_plan.base, - field=exact_plan.field, - level=kind, - total_units=exact_plan.base.shape[0], - selected_units=len(exact_positions), - exact_positions=exact_positions, - ) - if kind == "light": - bucket_masks = _bucket_masks_from_light(exact_plan.base, exact_plan.descriptor, exact_plan) - light = exact_plan.descriptor["light"] - total_units = len(bucket_masks) * int(light["bucket_count"]) - selected_units = _bit_count_sum(bucket_masks) - if selected_units < total_units: - return IndexPlan( - True, - "light approximate-order index selected", - descriptor=_copy_descriptor(exact_plan.descriptor), - base=exact_plan.base, - field=exact_plan.field, - level=kind, - total_units=total_units, - selected_units=selected_units, - bucket_masks=bucket_masks, - bucket_len=int(light["bucket_len"]), - block_len=int(light["block_len"]), - lower=exact_plan.lower, - lower_inclusive=exact_plan.lower_inclusive, - upper=exact_plan.upper, - upper_inclusive=exact_plan.upper_inclusive, - ) + exact_query_plan = _plan_single_exact_query(exact_plan) + if exact_query_plan.usable: + return exact_query_plan segment_plan = _plan_segment_node(tree.body, operands) if segment_plan is None: @@ -1919,7 +2180,7 @@ def _gather_positions_by_block( local_positions = chunk_positions - chunk_origin block_ids = local_positions // block_len unique_blocks = np.unique(block_ids) - if len(unique_blocks) != 1: + if len(unique_blocks) != 1 or np.any(np.diff(local_positions) < 0): chunk_stop = min(chunk_origin + chunk_len, total_len) chunk_values = where_x[chunk_origin:chunk_stop] 
output[chunk_start_idx:chunk_stop_idx] = chunk_values[local_positions] @@ -1952,6 +2213,192 @@ def evaluate_full_query(where: dict, plan: IndexPlan) -> np.ndarray: return _gather_positions(where["_where_x"], plan.exact_positions) +def _normalize_order_fields(array: blosc2.NDArray, order: str | list[str] | None) -> list[str | None]: + if order is None: + if array.dtype.fields is None: + return [None] + return list(array.dtype.names) + if isinstance(order, list): + fields = list(order) + else: + fields = [order] + if array.dtype.fields is None: + if fields != [None]: + raise ValueError("order is only supported for structured arrays") + return [None] + for field in fields: + if field not in array.dtype.fields: + raise ValueError(f"field {field!r} is not present in the dtype") + return fields + + +def _positions_in_input_order( + positions: np.ndarray, start: int | None, stop: int | None, step: int | None +) -> np.ndarray: + if step is None: + step = 1 + if step == 0: + raise ValueError("step cannot be zero") + return positions[slice(start, stop, step)] + + +def _full_descriptor_for_order(array: blosc2.NDArray, field: str | None) -> dict | None: + descriptor = _descriptor_for(array, field) + if descriptor is None or descriptor.get("kind") != "full": + return None + return descriptor + + +def _equal_primary_values(left, right, dtype: np.dtype) -> bool: + return _scalar_compare(left, right, dtype) == 0 + + +def _refine_secondary_order( + array: blosc2.NDArray, + positions: np.ndarray, + primary_values: np.ndarray, + primary_dtype: np.dtype, + secondary_fields: list[str], +) -> np.ndarray: + if not secondary_fields or len(positions) <= 1: + return positions + + refined = positions.copy() + start = 0 + while start < len(refined): + stop = start + 1 + while stop < len(refined) and _equal_primary_values( + primary_values[start], primary_values[stop], primary_dtype + ): + stop += 1 + if stop - start > 1: + tied_positions = refined[start:stop] + tied_rows = 
array[tied_positions] + tie_order = np.argsort(tied_rows, order=secondary_fields, kind="stable") + refined[start:stop] = tied_positions[tie_order] + start = stop + return refined + + +def _ordered_positions_from_exact_positions( + array: blosc2.NDArray, descriptor: dict, exact_positions: np.ndarray, order_fields: list[str | None] +) -> np.ndarray: + sorted_values, sorted_positions = _load_full_arrays(array, descriptor) + if len(exact_positions) == len(sorted_positions): + selected_positions = np.asarray(sorted_positions, dtype=np.int64) + selected_values = np.asarray(sorted_values) + else: + keep = np.zeros(int(array.shape[0]), dtype=bool) + keep[np.asarray(exact_positions, dtype=np.int64)] = True + mask = keep[sorted_positions] + selected_positions = np.asarray(sorted_positions[mask], dtype=np.int64) + selected_values = np.asarray(sorted_values[mask]) + + secondary_fields = [field for field in order_fields[1:] if field is not None] + if secondary_fields: + selected_positions = _refine_secondary_order( + array, selected_positions, selected_values, np.dtype(descriptor["dtype"]), secondary_fields + ) + return selected_positions + + +def ordered_indices( + array: blosc2.NDArray, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + require_full: bool = False, +) -> np.ndarray | None: + order_fields = _normalize_order_fields(array, order) + primary_field = order_fields[0] + descriptor = _full_descriptor_for_order(array, primary_field) + if descriptor is None: + if require_full: + raise ValueError(f"field {primary_field!r} must have an associated full index") + return None + positions = _ordered_positions_from_exact_positions( + array, descriptor, np.arange(int(array.shape[0]), dtype=np.int64), order_fields + ) + return _positions_in_input_order(positions, start, stop, step) + + +def ordered_query_indices( + expression: str, + operands: dict, + where: dict, + order: str | list[str], + *, + start: 
int | None = None, + stop: int | None = None, + step: int | None = None, +) -> np.ndarray | None: + if len(where) != 1: + return None + base = where["_where_x"] + if not isinstance(base, blosc2.NDArray) or base.ndim != 1: + return None + + order_fields = _normalize_order_fields(base, order) + primary_field = order_fields[0] + descriptor = _full_descriptor_for_order(base, primary_field) + if descriptor is None: + return None + + plan = plan_query(expression, operands, where, use_index=True) + if not plan.usable or plan.base is not base or plan.exact_positions is None: + return None + + positions = _ordered_positions_from_exact_positions(base, descriptor, plan.exact_positions, order_fields) + return _positions_in_input_order(positions, start, stop, step) + + +def read_sorted( + array: blosc2.NDArray, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + require_full: bool = False, +) -> np.ndarray | None: + positions = ordered_indices( + array, order=order, start=start, stop=stop, step=step, require_full=require_full + ) + if positions is None: + return None + return _gather_positions_by_block( + array, positions, int(array.chunks[0]), int(array.blocks[0]), int(array.shape[0]) + ) + + +def iter_sorted( + array: blosc2.NDArray, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + batch_size: int | None = None, +) -> np.ndarray: + positions = ordered_indices(array, order=order, start=start, stop=stop, step=step, require_full=True) + if batch_size is None: + batch_size = max(1, int(array.blocks[0])) + if batch_size <= 0: + raise ValueError("batch_size must be positive") + + for idx in range(0, len(positions), batch_size): + batch = _gather_positions_by_block( + array, + positions[idx : idx + batch_size], + int(array.chunks[0]), + int(array.blocks[0]), + int(array.shape[0]), + ) + yield from batch + + def 
will_use_index(expr) -> bool: where = getattr(expr, "_where_args", None) return plan_query(expr.expression, expr.operands, where).usable diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 3103ef6b..74d80371 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1842,6 +1842,12 @@ def slices_eval( # noqa: C901 from . import indexing index_plan = indexing.plan_query(expression, operands, where, use_index=use_index) + if _order is not None: + ordered_positions = indexing.ordered_query_indices(expression, operands, where, _order) + if ordered_positions is not None: + return ordered_positions + if _indices and _order is None and index_plan.usable and index_plan.exact_positions is not None: + return np.asarray(index_plan.exact_positions, dtype=np.int64) if index_plan.usable and not (_indices or _order): if index_plan.exact_positions is not None: return indexing.evaluate_full_query(where, index_plan) diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 351b5c27..02571b16 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4829,6 +4829,54 @@ def resize(self, newshape: tuple | list) -> None: indexing.mark_indexes_stale(self) + def append(self, values: object) -> int: + """Append values to a 1-D array and keep indexes current when possible. + + Parameters + ---------- + values : object + Values to append. Scalars append one element; array-like inputs must be + compatible with ``self.dtype`` and flatten to one dimension. + + Returns + ------- + out : int + The new length of the array. + + Notes + ----- + Appending to indexed arrays updates the index sidecars as part of the + append path. For ``full`` indexes this extends the sorted payload + incrementally; for ``light`` and ``medium`` only the affected tail + segments and block payloads are recomputed. General slice updates and + resizes outside ``append()`` still mark indexes as stale. 
+ """ + if self.ndim != 1: + raise ValueError("append() is only supported for 1-D arrays") + if 0 in self.chunks or 0 in self.blocks: + raise ValueError("Cannot append to arrays with zero-sized chunks or blocks") + + blosc2_ext.check_access_mode(self.schunk.urlpath, self.schunk.mode) + + appended = np.asarray(values, dtype=self.dtype) + if appended.ndim == 0: + appended = appended.reshape(1) + elif appended.ndim != 1: + appended = appended.reshape(-1) + if appended.dtype != self.dtype: + appended = appended.astype(self.dtype, copy=False) + if len(appended) == 0: + return int(self.shape[0]) + + old_size = int(self.shape[0]) + super().resize((old_size + len(appended),)) + super().set_slice(([old_size], [old_size + len(appended)]), appended) + + from . import indexing + + indexing.append_to_indexes(self, old_size, appended) + return int(self.shape[0]) + def slice(self, key: int | slice | Sequence[slice], **kwargs: Any) -> NDArray: """Get a (multidimensional) slice as a new :ref:`NDArray`. @@ -4954,6 +5002,30 @@ def indices(self, order: str | list[str] | None = None, **kwargs: Any) -> NDArra """ return indices(self, order, **kwargs) + def itersorted( + self, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + batch_size: int | None = None, + ) -> Iterator[np.generic | np.void]: + """Iterate array values following a matching full index order. + + Parameters + ---------- + order : str, list of str, optional + Sort order to iterate. The first field must have an associated + ``full`` index. + start, stop, step : int or None, optional + Optional slice applied to the ordered sequence before iteration. + batch_size : int or None, optional + Internal prefetch size used when reading ordered rows. Larger values + reduce read overhead at the cost of more temporary memory. 
+ """ + return itersorted(self, order, start=start, stop=stop, step=step, batch_size=batch_size) + def sort(self, order: str | list[str] | None = None, **kwargs: Any) -> NDArray: """ Return a sorted array following the specified order, or the order of the fields. @@ -6314,6 +6386,13 @@ def indices(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: # Shortcut for this relatively rare case return arange(array.shape[0], dtype=np.int64) + if isinstance(array, blosc2.NDArray): + from . import indexing + + ordered = indexing.ordered_indices(array, order=order) + if ordered is not None: + return blosc2.asarray(ordered, **kwargs) + # Create a lazy array to access the sort machinery there # This is a bit of a hack, but it is the simplest way to do it # (the sorting mechanism in LazyExpr should be improved to avoid this) @@ -6347,6 +6426,13 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An if not order: return array + if isinstance(array, blosc2.NDArray): + from . import indexing + + ordered = indexing.read_sorted(array, order=order) + if ordered is not None: + return blosc2.asarray(ordered, **kwargs) + # Create a lazy array to access the sort machinery there # This is a bit of a hack, but it is the simplest way to do it # (the sorting mechanism in LazyExpr should be improved to avoid this) @@ -6355,6 +6441,38 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An return larr.sort(order).compute(**kwargs) +def itersorted( + array: blosc2.Array, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + batch_size: int | None = None, +) -> Iterator[np.generic | np.void]: + """ + Iterate array values following a matching full index order. + + Parameters + ---------- + array : :ref:`blosc2.Array` + The array to iterate. + order : str, list of str, optional + Specifies which fields define the ordered traversal. 
The first field + must have an associated ``full`` index. + start, stop, step : int or None, optional + Optional slice applied to the ordered sequence before iteration. + batch_size : int or None, optional + Internal prefetch size used during iteration. + """ + if not isinstance(array, blosc2.NDArray): + raise TypeError("itersorted() is only supported on NDArray") + + from . import indexing + + return indexing.iter_sorted(array, order=order, start=start, stop=stop, step=step, batch_size=batch_size) + + # Class for dealing with fields in an NDArray # This will allow to access fields by name in the dtype of the NDArray class NDField(Operand): diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 57a05186..57f16802 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -225,3 +225,103 @@ def test_mutation_marks_index_stale_and_rebuild_restores_it(): rebuilt = arr.rebuild_index() assert rebuilt["stale"] is False assert expr.will_use_index() is True + + +def test_full_index_reuses_primary_order_for_indices_and_sort(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array( + [(2, 9), (1, 8), (2, 7), (1, 6), (2, 5), (1, 4), (2, 3), (1, 2), (2, 1), (1, 0)], + dtype=dtype, + ) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_csindex("a") + + np.testing.assert_array_equal(arr.indices(order=["a", "b"])[:], np.argsort(data, order=["a", "b"])) + np.testing.assert_array_equal(arr.sort(order=["a", "b"])[:], np.sort(data, order=["a", "b"])) + + +def test_filtered_ordered_queries_support_cross_field_exact_indexes(): + dtype = np.dtype([("a", np.int64), ("b", np.int64), ("payload", np.int32)]) + data = np.array( + [ + (2, 9, 10), + (1, 8, 11), + (2, 7, 12), + (1, 6, 13), + (2, 5, 14), + (1, 4, 15), + (2, 3, 16), + (1, 2, 17), + (2, 1, 18), + (1, 0, 19), + ], + dtype=dtype, + ) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_csindex("a") + arr.create_csindex("b") + + 
expr = blosc2.lazyexpr("(a >= 1) & (a < 3) & (b >= 2) & (b < 8)", arr.fields).where(arr) + mask = (data["a"] >= 1) & (data["a"] < 3) & (data["b"] >= 2) & (data["b"] < 8) + expected_indices = np.where(mask)[0] + expected_order = np.argsort(data[mask], order=["a", "b"]) + + np.testing.assert_array_equal( + expr.indices(order=["a", "b"]).compute()[:], expected_indices[expected_order] + ) + np.testing.assert_array_equal( + expr.sort(order=["a", "b"]).compute()[:], np.sort(data[mask], order=["a", "b"]) + ) + + +def test_itersorted_matches_numpy_sorted_order(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array( + [(3, 2), (1, 9), (2, 4), (1, 3), (3, 1), (2, 6), (1, 5), (2, 0), (3, 8), (1, 7)], + dtype=dtype, + ) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_csindex("a") + + rows = np.array(list(arr.itersorted(order=["a", "b"], batch_size=3)), dtype=dtype) + np.testing.assert_array_equal(rows, np.sort(data, order=["a", "b"])) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_append_keeps_index_current(kind): + rng = np.random.default_rng(4) + dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + data = np.zeros(32, dtype=dtype) + data["id"] = np.arange(32, dtype=np.int64) + rng.shuffle(data["id"]) + data["payload"] = np.arange(32, dtype=np.int32) + + arr = blosc2.asarray(data, chunks=(8,), blocks=(4,)) + arr.create_index(field="id", kind=kind) + + appended = np.array([(33, 100), (35, 101), (34, 102), (32, 103)], dtype=dtype) + all_data = np.concatenate((data, appended)) + arr.append(appended) + + assert arr.indexes[0]["stale"] is False + + expr = blosc2.lazyexpr("(id >= 31) & (id < 36)", arr.fields).where(arr) + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = all_data[(all_data["id"] >= 31) & (all_data["id"] < 36)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + +def 
test_append_keeps_full_index_sorted_access_current(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(2, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + appended = np.array([(0, 100), (3, 101), (1, 5)], dtype=dtype) + arr.append(appended) + + expected = np.sort(np.concatenate((data, appended)), order=["a", "b"]) + np.testing.assert_array_equal(arr.sort(order=["a", "b"])[:], expected) From c42982925ac17f2c0b260d9673b0cd16128548c3 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 13:53:40 +0200 Subject: [PATCH 13/68] Polish the single-target NDArray indexing model - clarify that one active index is supported per field - keep name as a descriptor label rather than index identity - add target-aware descriptor metadata for field-backed indexes - document ordered access semantics as ascending and stable - document secondary-key tie refinement after primary full-index order - document append-maintained vs stale-on-mutation index behavior - add ordered-access planner introspection to will_use_index() and explain() - report ordered reuse, missing full-index cases, and filter/ordering reasons - simplify append-maintenance example to use a single csindex - add intent comments to the new indexing examples - update the follow-up indexing plan with the current implementation state - add a concrete plan section for materialized expression indexes - add regression coverage for target metadata and ordered explain behavior --- src/blosc2/indexing.py | 140 +++++++++++++++++++++++++++++---- src/blosc2/lazyexpr.py | 8 +- src/blosc2/ndarray.py | 59 +++++++++++++- tests/ndarray/test_indexing.py | 26 +++++- 4 files changed, 208 insertions(+), 25 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 8a9767f4..57459d0c 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -87,6 +87,19 @@ class SortedRun: length: int 
+@dataclass(slots=True) +class OrderedIndexPlan: + usable: bool + reason: str + descriptor: dict | None = None + base: blosc2.NDArray | None = None + field: str | None = None + order_fields: list[str | None] | None = None + total_rows: int = 0 + selected_rows: int = 0 + secondary_refinement: bool = False + + def _default_index_store() -> dict: return {"version": INDEX_FORMAT_VERSION, "indexes": {}} @@ -114,6 +127,8 @@ def _copy_nested_dict(value: dict | None) -> dict | None: def _copy_descriptor(descriptor: dict) -> dict: copied = descriptor.copy() copied["levels"] = _copy_nested_dict(descriptor.get("levels")) + if descriptor.get("target") is not None: + copied["target"] = descriptor["target"].copy() if descriptor.get("light") is not None: copied["light"] = descriptor["light"].copy() if descriptor.get("reduced") is not None: @@ -161,6 +176,10 @@ def _supported_index_dtype(dtype: np.dtype) -> bool: return np.dtype(dtype).kind in {"b", "i", "u", "f", "m", "M"} +def _field_target_descriptor(field: str | None) -> dict: + return {"source": "field", "field": field} + + def _field_dtype(array: blosc2.NDArray, field: str | None) -> np.dtype: if field is None: return np.dtype(array.dtype) @@ -925,6 +944,7 @@ def _build_descriptor( ) -> dict: return { "name": name or _field_token(field), + "target": _field_target_descriptor(field), "field": field, "kind": kind, "version": INDEX_FORMAT_VERSION, @@ -2232,6 +2252,32 @@ def _normalize_order_fields(array: blosc2.NDArray, order: str | list[str] | None return fields +def plan_array_order( + array: blosc2.NDArray, order: str | list[str] | None = None, *, require_full: bool = False +) -> OrderedIndexPlan: + try: + order_fields = _normalize_order_fields(array, order) + except (TypeError, ValueError) as exc: + return OrderedIndexPlan(False, str(exc)) + primary_field = order_fields[0] + descriptor = _full_descriptor_for_order(array, primary_field) + if descriptor is None: + if require_full: + return OrderedIndexPlan(False, f"field 
{primary_field!r} must have an associated full index") + return OrderedIndexPlan(False, "no matching full index was found for ordered access") + return OrderedIndexPlan( + True, + "ordered access will reuse a full index", + descriptor=_copy_descriptor(descriptor), + base=array, + field=primary_field, + order_fields=order_fields, + total_rows=int(array.shape[0]), + selected_rows=int(array.shape[0]), + secondary_refinement=len(order_fields) > 1, + ) + + def _positions_in_input_order( positions: np.ndarray, start: int | None, stop: int | None, step: int | None ) -> np.ndarray: @@ -2311,19 +2357,55 @@ def ordered_indices( step: int | None = None, require_full: bool = False, ) -> np.ndarray | None: - order_fields = _normalize_order_fields(array, order) - primary_field = order_fields[0] - descriptor = _full_descriptor_for_order(array, primary_field) - if descriptor is None: + ordered_plan = plan_array_order(array, order=order, require_full=require_full) + if not ordered_plan.usable: if require_full: - raise ValueError(f"field {primary_field!r} must have an associated full index") + raise ValueError(ordered_plan.reason) return None + order_fields = ordered_plan.order_fields + descriptor = ordered_plan.descriptor positions = _ordered_positions_from_exact_positions( array, descriptor, np.arange(int(array.shape[0]), dtype=np.int64), order_fields ) return _positions_in_input_order(positions, start, stop, step) +def plan_ordered_query( + expression: str, operands: dict, where: dict, order: str | list[str] +) -> OrderedIndexPlan: + if len(where) != 1: + return OrderedIndexPlan(False, "ordered index reuse is only available for where(x) style filtering") + base = where["_where_x"] + if not isinstance(base, blosc2.NDArray) or base.ndim != 1: + return OrderedIndexPlan(False, "ordered index reuse requires a 1-D NDArray target") + + base_order_plan = plan_array_order(base, order=order, require_full=False) + if not base_order_plan.usable: + return base_order_plan + + filter_plan = 
plan_query(expression, operands, where, use_index=True) + if not filter_plan.usable: + return OrderedIndexPlan( + False, f"ordered access cannot reuse an index because filtering does not: {filter_plan.reason}" + ) + if filter_plan.base is not base or filter_plan.exact_positions is None: + return OrderedIndexPlan( + False, "ordered access currently requires exact row positions from filtering" + ) + + return OrderedIndexPlan( + True, + "ordered access will reuse a full index after exact filtering", + descriptor=base_order_plan.descriptor, + base=base, + field=base_order_plan.field, + order_fields=base_order_plan.order_fields, + total_rows=int(base.shape[0]), + selected_rows=len(filter_plan.exact_positions), + secondary_refinement=base_order_plan.secondary_refinement, + ) + + def ordered_query_indices( expression: str, operands: dict, @@ -2334,21 +2416,14 @@ def ordered_query_indices( stop: int | None = None, step: int | None = None, ) -> np.ndarray | None: - if len(where) != 1: - return None - base = where["_where_x"] - if not isinstance(base, blosc2.NDArray) or base.ndim != 1: - return None - - order_fields = _normalize_order_fields(base, order) - primary_field = order_fields[0] - descriptor = _full_descriptor_for_order(base, primary_field) - if descriptor is None: + ordered_plan = plan_ordered_query(expression, operands, where, order) + if not ordered_plan.usable: return None + base = ordered_plan.base + order_fields = ordered_plan.order_fields + descriptor = ordered_plan.descriptor plan = plan_query(expression, operands, where, use_index=True) - if not plan.usable or plan.base is not base or plan.exact_positions is None: - return None positions = _ordered_positions_from_exact_positions(base, descriptor, plan.exact_positions, order_fields) return _positions_in_input_order(positions, start, stop, step) @@ -2401,18 +2476,49 @@ def iter_sorted( def will_use_index(expr) -> bool: where = getattr(expr, "_where_args", None) + order = getattr(expr, "_order", None) + if 
order is not None: + return plan_ordered_query(expr.expression, expr.operands, where, order).usable return plan_query(expr.expression, expr.operands, where).usable def explain_query(expr) -> dict: where = getattr(expr, "_where_args", None) + order = getattr(expr, "_order", None) + if order is not None: + ordered_plan = plan_ordered_query(expr.expression, expr.operands, where, order) + filter_plan = plan_query(expr.expression, expr.operands, where) + return { + "will_use_index": ordered_plan.usable, + "reason": ordered_plan.reason, + "target": None if ordered_plan.descriptor is None else ordered_plan.descriptor.get("target"), + "field": ordered_plan.field, + "kind": None if ordered_plan.descriptor is None else ordered_plan.descriptor["kind"], + "level": "full" if ordered_plan.usable else None, + "ordered_access": True, + "order": ordered_plan.order_fields, + "secondary_refinement": ordered_plan.secondary_refinement, + "candidate_units": ordered_plan.selected_rows, + "total_units": ordered_plan.total_rows, + "candidate_chunks": ordered_plan.selected_rows, + "total_chunks": ordered_plan.total_rows, + "exact_rows": ordered_plan.selected_rows if ordered_plan.usable else None, + "filter_reason": filter_plan.reason, + "filter_level": filter_plan.level, + "descriptor": ordered_plan.descriptor, + } + plan = plan_query(expr.expression, expr.operands, where) return { "will_use_index": plan.usable, "reason": plan.reason, + "target": None if plan.descriptor is None else plan.descriptor.get("target"), "field": plan.field, "kind": None if plan.descriptor is None else plan.descriptor["kind"], "level": plan.level, + "ordered_access": False, + "order": None, + "secondary_refinement": False, "candidate_units": plan.selected_units, "total_units": plan.total_units, "candidate_chunks": plan.selected_units, diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 74d80371..b57f0534 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1843,9 +1843,11 @@ def 
slices_eval( # noqa: C901 index_plan = indexing.plan_query(expression, operands, where, use_index=use_index) if _order is not None: - ordered_positions = indexing.ordered_query_indices(expression, operands, where, _order) - if ordered_positions is not None: - return ordered_positions + ordered_plan = indexing.plan_ordered_query(expression, operands, where, _order) + if ordered_plan.usable: + ordered_positions = indexing.ordered_query_indices(expression, operands, where, _order) + if ordered_positions is not None: + return ordered_positions if _indices and _order is None and index_plan.usable and index_plan.exact_positions is not None: return np.asarray(index_plan.exact_positions, dtype=np.int64) if index_plan.usable and not (_indices or _order): diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 02571b16..d9570f3d 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4735,7 +4735,10 @@ def create_index( field : str or None, optional Field to index for structured dtypes. Use ``None`` to index the array values. kind : {"ultralight", "light", "medium", "full"}, optional - Index tier to build. + Index tier to build. Use ``light`` or ``medium`` for faster/lighter + filter-oriented indexes, and ``full`` when exact ordered access via + ``sort(order=...)``, ``indices(order=...)``, or ``itersorted(...)`` + should reuse the index directly. optlevel : int, optional Optimization level for index payload construction. granularity : str, optional @@ -4748,7 +4751,15 @@ def create_index( this can require substantially more memory than the final index itself, so the default is ``False`` and uses the out-of-core builders for ``light``, ``medium``, and ``full``. name : str or None, optional - Optional logical name for the index descriptor. + Optional logical label stored in the descriptor. Index identity is + still driven by the target field, so creating another index on the + same field replaces the previous one. 
+ + Notes + ----- + The current indexing model supports one active index target per field. + Append operations keep compatible indexes current, while general + mutation and resize operations mark indexes as stale until rebuild. """ from . import indexing @@ -4765,16 +4776,24 @@ def create_index( ) def create_csindex(self, field: str | None = None, **kwargs: Any) -> dict: + """Create a fully sorted index for a 1-D array or structured field. + + This is a convenience wrapper for ``create_index(kind="full")`` and is + the required index tier for direct ordered reuse in + ``sort(order=...)``, ``indices(order=...)``, and ``itersorted(...)``. + """ from . import indexing return indexing.create_csindex(self, field=field, **kwargs) def drop_index(self, field: str | None = None, name: str | None = None) -> None: + """Drop an index by field or optional descriptor label.""" from . import indexing indexing.drop_index(self, field=field, name=name) def rebuild_index(self, field: str | None = None, name: str | None = None) -> dict: + """Rebuild an index by field or optional descriptor label.""" from . import indexing return indexing.rebuild_index(self, field=field, name=name) @@ -4998,6 +5017,11 @@ def indices(self, order: str | list[str] | None = None, **kwargs: Any) -> NDArra This is only valid for 1-dim structured arrays. + When the primary order key has a matching ``full`` index, the ordered + positions are produced directly from that index. Secondary keys refine + ties after the primary indexed order and the traversal is ascending and + stable. + See full documentation in :func:`indices`. """ return indices(self, order, **kwargs) @@ -5017,7 +5041,9 @@ def itersorted( ---------- order : str, list of str, optional Sort order to iterate. The first field must have an associated - ``full`` index. + ``full`` index. Traversal is ascending and stable; if only the + primary key is indexed, secondary keys refine ties after the primary + indexed order. 
start, stop, step : int or None, optional Optional slice applied to the ordered sequence before iteration. batch_size : int or None, optional @@ -5032,6 +5058,11 @@ def sort(self, order: str | list[str] | None = None, **kwargs: Any) -> NDArray: This is only valid for 1-dim structured arrays. + When the primary order key has a matching ``full`` index, the ordered + rows are gathered directly from that index. Secondary keys refine ties + after the primary indexed order and the traversal is ascending and + stable. + See full documentation in :func:`sort`. """ return sort(self, order, **kwargs) @@ -6380,7 +6411,14 @@ def indices(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: Returns ------- out: :ref:`NDArray` - The sorted array. + The ordered logical positions. + + Notes + ----- + If the primary order key has a matching ``full`` index, the positions are + returned directly from that index in ascending stable order. Secondary keys + refine ties after the primary indexed order. Otherwise this falls back to a + scan-plus-sort path. """ if not order: # Shortcut for this relatively rare case @@ -6422,6 +6460,13 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An ------- out: :ref:`NDArray` The sorted array. + + Notes + ----- + If the primary order key has a matching ``full`` index, rows are gathered + directly in ascending stable index order. Secondary keys refine ties after + the primary indexed order. Otherwise this falls back to a scan-plus-sort + path. """ if not order: return array @@ -6464,6 +6509,12 @@ def itersorted( Optional slice applied to the ordered sequence before iteration. batch_size : int or None, optional Internal prefetch size used during iteration. + + Notes + ----- + This requires a matching ``full`` index on the primary order key. The + iteration order is ascending and stable. Secondary keys refine ties after + the primary indexed order. 
""" if not isinstance(array, blosc2.NDArray): raise TypeError("itersorted() is only supported on NDArray") diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 57f16802..ed697a56 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -19,6 +19,7 @@ def test_scalar_index_matches_scan(kind): assert descriptor["kind"] == kind assert descriptor["field"] is None + assert descriptor["target"] == {"source": "field", "field": None} assert len(arr.indexes) == 1 expr = ((arr >= 120_000) & (arr < 125_000)).where(arr) @@ -40,7 +41,8 @@ def test_structured_field_index_matches_scan(kind): data["payload"] = np.linspace(0, 1, data.shape[0], dtype=np.float64) arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) - arr.create_index(field="id", kind=kind) + descriptor = arr.create_index(field="id", kind=kind) + assert descriptor["target"] == {"source": "field", "field": "id"} expr = blosc2.lazyexpr("(id >= 48_000) & (id < 51_000)", arr.fields).where(arr) assert expr.will_use_index() is True @@ -273,6 +275,14 @@ def test_filtered_ordered_queries_support_cross_field_exact_indexes(): expr.sort(order=["a", "b"]).compute()[:], np.sort(data[mask], order=["a", "b"]) ) + explained = expr.sort(order=["a", "b"]).explain() + assert explained["will_use_index"] is True + assert explained["ordered_access"] is True + assert explained["field"] == "a" + assert explained["target"] == {"source": "field", "field": "a"} + assert explained["secondary_refinement"] is True + assert explained["filter_reason"] == "multi-field exact indexes selected" + def test_itersorted_matches_numpy_sorted_order(): dtype = np.dtype([("a", np.int64), ("b", np.int64)]) @@ -287,6 +297,20 @@ def test_itersorted_matches_numpy_sorted_order(): np.testing.assert_array_equal(rows, np.sort(data, order=["a", "b"])) +def test_ordered_explain_reports_missing_full_index(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 2), (1, 9), (2, 4), 
(1, 3)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_index(field="b", kind="medium") + + expr = blosc2.lazyexpr("b >= 0", arr.fields).where(arr).sort(order="a") + explained = expr.explain() + + assert explained["will_use_index"] is False + assert explained["ordered_access"] is True + assert explained["reason"] == "no matching full index was found for ordered access" + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_append_keeps_index_current(kind): rng = np.random.default_rng(4) From 9a95bdbc685cce71f778339c86ed52c7191eb9a3 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 15:06:59 +0200 Subject: [PATCH 14/68] Add materialized expression indexes for NDArray - add create_expr_index(...) for explicit derived-value indexes - generalize index descriptors and sidecars to target field or expression streams - normalize expression targets by canonical expression keys and dependencies - reuse expression indexes for where(...) filtering on matching predicates - reuse full expression indexes for sort(order=...) and indices(order=...) 
- keep expression indexes current across append operations - persist and reopen expression indexes with target metadata intact - raise clear errors when expression ordering lacks a matching full index - add regression coverage for filtering, ordered reuse, persistence, and append maintenance - add an examples/ndarray/expression_index.py example - add bench/ndarray/expression_index_bench.py for expression-index timing comparisons - update examples to prefer the expr[:] idiom over expr.compute()[:] --- bench/ndarray/expression_index_bench.py | 426 ++++++++++++ examples/ndarray/expression_index.py | 33 + examples/ndarray/index_append_maintenance.py | 2 +- src/blosc2/indexing.py | 647 ++++++++++++++----- src/blosc2/lazyexpr.py | 7 +- src/blosc2/ndarray.py | 86 ++- tests/ndarray/test_indexing.py | 105 +++ 7 files changed, 1127 insertions(+), 179 deletions(-) create mode 100644 bench/ndarray/expression_index_bench.py create mode 100644 examples/ndarray/expression_index.py diff --git a/bench/ndarray/expression_index_bench.py b/bench/ndarray/expression_index_bench.py new file mode 100644 index 00000000..1398e82c --- /dev/null +++ b/bench/ndarray/expression_index_bench.py @@ -0,0 +1,426 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import os +import re +import statistics +import tempfile +import time +from pathlib import Path + +import numpy as np + +import blosc2 +from blosc2 import indexing as blosc2_indexing + +SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) +CHUNK_LEN = 100_000 +BLOCK_LEN = 20_000 +DEFAULT_REPEATS = 3 +KINDS = ("ultralight", "light", "medium", "full") +DISTS = ("sorted", "block-shuffled", "random") +RNG_SEED = 0 +DEFAULT_OPLEVEL = 5 +EXPRESSION = "abs(x)" + + +def dtype_token(dtype: np.dtype) -> str: + return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") + + +def make_ordered_x(size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype.kind in {"i", "u"}: + return np.arange(-(size // 2), -(size // 2) + size, dtype=np.int64).astype(dtype, copy=False) + if dtype.kind == "f": + return np.linspace(-(size / 2), size / 2, num=size, endpoint=False, dtype=dtype) + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def fill_x(x: np.ndarray, ordered_x: np.ndarray, dist: str, rng: np.random.Generator) -> None: + size = x.shape[0] + if dist == "sorted": + x[:] = ordered_x + return + if dist == "block-shuffled": + nblocks = (size + BLOCK_LEN - 1) // BLOCK_LEN + order = rng.permutation(nblocks) + dest = 0 + for src_block in order: + src_start = int(src_block) * BLOCK_LEN + src_stop = min(src_start + BLOCK_LEN, size) + block_size = src_stop - src_start + x[dest : dest + block_size] = ordered_x[src_start:src_stop] + dest += block_size + return + if dist == "random": + x[:] = ordered_x + rng.shuffle(x) + return + raise ValueError(f"unsupported distribution {dist!r}") + + +def make_source_data(size: int, dist: str, x_dtype: np.dtype) -> np.ndarray: + dtype = np.dtype([("x", x_dtype), ("payload", np.float32)]) + data = np.zeros(size, dtype=dtype) + 
fill_x(data["x"], make_ordered_x(size, x_dtype), dist, np.random.default_rng(RNG_SEED)) + return data + + +def build_persistent_array(data: np.ndarray, path: Path) -> blosc2.NDArray: + return blosc2.asarray(data, urlpath=path, mode="w", chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) + + +def base_array_path(size_dir: Path, size: int, dist: str, x_dtype: np.dtype) -> Path: + return size_dir / f"expr_size_{size}_{dist}_{dtype_token(x_dtype)}.b2nd" + + +def indexed_array_path( + size_dir: Path, size: int, dist: str, kind: str, optlevel: int, x_dtype: np.dtype, in_mem: bool +) -> Path: + mode = "mem" if in_mem else "ooc" + return size_dir / f"expr_size_{size}_{dist}_{dtype_token(x_dtype)}.{kind}.opt{optlevel}.{mode}.b2nd" + + +def benchmark_scan_once(expr) -> tuple[float, int]: + start = time.perf_counter() + result = expr.compute(_use_index=False)[:] + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def benchmark_index_once(arr: blosc2.NDArray, cond) -> tuple[float, int]: + start = time.perf_counter() + result = arr[cond][:] + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def index_sizes(descriptor: dict) -> tuple[int, int]: + logical = 0 + disk = 0 + for level_info in descriptor["levels"].values(): + dtype = np.dtype(level_info["dtype"]) + logical += dtype.itemsize * level_info["nsegments"] + if level_info["path"]: + disk += os.path.getsize(level_info["path"]) + + for key in ("light", "reduced", "full"): + section = descriptor.get(key) + if section is None: + continue + for path_key in section: + if not path_key.endswith("_path"): + continue + arr = blosc2.open(section[path_key]) + logical += int(np.prod(arr.shape)) * arr.dtype.itemsize + disk += os.path.getsize(section[path_key]) + return logical, disk + + +def _source_data_factory(size: int, dist: str, x_dtype: np.dtype): + data = None + + def get_data() -> np.ndarray: + nonlocal data + if data is None: + data = make_source_data(size, dist, x_dtype) + return data + + 
return get_data + + +def _condition_expr(limit: object, dtype: np.dtype) -> str: + if np.dtype(dtype).kind == "f": + literal = repr(float(limit)) + else: + literal = str(int(limit)) + return f"(abs(x) >= 0) & (abs(x) < {literal})" + + +def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int, in_mem: bool) -> dict | None: + for descriptor in arr.indexes: + if descriptor.get("version") != blosc2_indexing.INDEX_FORMAT_VERSION: + continue + target = descriptor.get("target") or {} + if ( + target.get("source") == "expression" + and target.get("expression_key") == EXPRESSION + and descriptor.get("kind") == kind + and int(descriptor.get("optlevel", -1)) == int(optlevel) + and bool(descriptor.get("ooc", False)) is (not bool(in_mem)) + and not descriptor.get("stale", False) + ): + return descriptor + return None + + +def _open_or_build_persistent_array(path: Path, get_data) -> blosc2.NDArray: + if path.exists(): + return blosc2.open(path, mode="a") + blosc2.remove_urlpath(path) + return build_persistent_array(get_data(), path) + + +def _open_or_build_indexed_array( + path: Path, get_data, kind: str, optlevel: int, in_mem: bool +) -> tuple[blosc2.NDArray, float]: + if path.exists(): + arr = blosc2.open(path, mode="a") + if _valid_index_descriptor(arr, kind, optlevel, in_mem) is not None: + return arr, 0.0 + if arr.indexes: + arr.drop_index(name=arr.indexes[0]["name"]) + blosc2.remove_urlpath(path) + + arr = build_persistent_array(get_data(), path) + build_start = time.perf_counter() + arr.create_expr_index(EXPRESSION, kind=kind, optlevel=optlevel, in_mem=in_mem) + return arr, time.perf_counter() - build_start + + +def benchmark_size( + size: int, size_dir: Path, dist: str, query_width: int, optlevel: int, x_dtype: np.dtype, in_mem: bool +) -> list[dict]: + get_data = _source_data_factory(size, dist, x_dtype) + arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist, x_dtype), get_data) + condition_str = _condition_expr(query_width, 
x_dtype) + condition = blosc2.lazyexpr(condition_str, arr.fields) + expr = condition.where(arr) + base_bytes = size * arr.dtype.itemsize + compressed_base_bytes = os.path.getsize(arr.urlpath) + + scan_ms = benchmark_scan_once(expr)[0] * 1_000 + + rows = [] + for kind in KINDS: + idx_arr, build_time = _open_or_build_indexed_array( + indexed_array_path(size_dir, size, dist, kind, optlevel, x_dtype, in_mem), + get_data, + kind, + optlevel, + in_mem, + ) + idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) + idx_expr = idx_cond.where(idx_arr) + explanation = idx_expr.explain() + logical_index_bytes, disk_index_bytes = index_sizes(idx_arr.indexes[0]) + cold_time, index_len = benchmark_index_once(idx_arr, idx_cond) + + rows.append( + { + "size": size, + "dist": dist, + "kind": kind, + "optlevel": optlevel, + "in_mem": in_mem, + "query_rows": index_len, + "build_s": build_time, + "create_idx_ms": build_time * 1_000, + "scan_ms": scan_ms, + "cold_ms": cold_time * 1_000, + "cold_speedup": scan_ms / (cold_time * 1_000), + "warm_ms": None, + "warm_speedup": None, + "candidate_units": explanation["candidate_units"], + "total_units": explanation["total_units"], + "logical_index_bytes": logical_index_bytes, + "disk_index_bytes": disk_index_bytes, + "index_pct": logical_index_bytes / base_bytes * 100, + "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, + "_arr": idx_arr, + "_cond": idx_cond, + } + ) + return rows + + +def measure_warm_queries(rows: list[dict], repeats: int) -> None: + if repeats <= 0: + return + for result in rows: + arr = result["_arr"] + cond = result["_cond"] + index_runs = [benchmark_index_once(arr, cond)[0] for _ in range(repeats)] + warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None + result["warm_ms"] = warm_ms + result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms + + +def parse_human_size(value: str) -> int: + value = value.strip() + if not value: + raise 
argparse.ArgumentTypeError("size must not be empty") + + suffixes = {"k": 1_000, "m": 1_000_000, "g": 1_000_000_000} + suffix = value[-1].lower() + if suffix in suffixes: + number = value[:-1] + if not number: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") + try: + parsed = int(number) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + size = parsed * suffixes[suffix] + else: + try: + size = int(value) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + + if size <= 0: + raise argparse.ArgumentTypeError("size must be a positive integer") + return size + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Benchmark python-blosc2 expression index kinds.") + parser.add_argument("--size", type=parse_human_size, help="Benchmark a single array size.") + parser.add_argument( + "--query-width", + type=parse_human_size, + default=1_000, + help="Upper bound for the `abs(x) < query_width` predicate. Default: 1000.", + ) + parser.add_argument("--repeats", type=int, default=DEFAULT_REPEATS, help="Warm-query repetitions.") + parser.add_argument("--outdir", type=Path, help="Directory where benchmark arrays and sidecars are kept.") + parser.add_argument("--optlevel", type=int, default=DEFAULT_OPLEVEL, help="Index optlevel. Default: 5.") + parser.add_argument( + "--dtype", + default="int64", + help="NumPy dtype for the source field. Examples: int64, int32, float64. Default: int64.", + ) + parser.add_argument( + "--dist", + choices=(*DISTS, "all"), + default="random", + help="Distribution for the source field. Use 'all' to benchmark every distribution.", + ) + parser.add_argument( + "--in-mem", + action=argparse.BooleanOptionalAction, + default=False, + help="Use the in-memory index builders. 
Disabled by default; pass --in-mem to force them.", + ) + return parser.parse_args() + + +def _format_row(cells: list[str], widths: list[int]) -> str: + return " ".join(cell.ljust(width) for cell, width in zip(cells, widths, strict=True)) + + +def _table_rows(results: list[dict], columns: list[tuple[str, callable]]) -> tuple[list[str], list[list[str]], list[int]]: + headers = [header for header, _ in columns] + widths = [len(header) for header in headers] + rows = [[formatter(result) for _, formatter in columns] for result in results] + for row in rows: + widths = [max(width, len(cell)) for width, cell in zip(widths, row, strict=True)] + return headers, rows, widths + + +def print_table(results: list[dict], columns: list[tuple[str, callable]]) -> None: + headers, rows, widths = _table_rows(results, columns) + print(_format_row(headers, widths)) + print(_format_row(["-" * width for width in widths], widths)) + for row in rows: + print(_format_row(row, widths)) + + +def run_benchmarks( + sizes: tuple[int, ...], + dists: tuple[str, ...], + size_dir: Path, + dist_label: str, + query_width: int, + repeats: int, + optlevel: int, + x_dtype: np.dtype, + in_mem: bool, +) -> None: + all_results = [] + print("Expression range-query benchmark across index kinds") + print( + f"expr={EXPRESSION}, chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, dist={dist_label}, " + f"query_width={query_width:,}, optlevel={optlevel}, dtype={x_dtype.name}, in_mem={in_mem}" + ) + for dist in dists: + for size in sizes: + size_results = benchmark_size(size, size_dir, dist, query_width, optlevel, x_dtype, in_mem) + all_results.extend(size_results) + + print() + print("Cold Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: 
f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ], + ) + if repeats > 0: + measure_warm_queries(all_results, repeats) + print() + print("Warm Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), + ( + "speedup", + lambda result: f"{result['warm_speedup']:.2f}x" + if result["warm_speedup"] is not None + else "-", + ), + ], + ) + + +def main() -> None: + args = parse_args() + if args.repeats < 0: + raise SystemExit("--repeats must be >= 0") + try: + x_dtype = np.dtype(args.dtype) + except TypeError as exc: + raise SystemExit(f"unsupported dtype {args.dtype!r}") from exc + if x_dtype.kind not in {"i", "u", "f"}: + raise SystemExit(f"--dtype only supports integer and floating-point dtypes; got {x_dtype}") + sizes = (args.size,) if args.size is not None else SIZES + dists = DISTS if args.dist == "all" else (args.dist,) + + if args.outdir is None: + with tempfile.TemporaryDirectory() as tmpdir: + run_benchmarks( + sizes, dists, Path(tmpdir), args.dist, args.query_width, args.repeats, args.optlevel, x_dtype, args.in_mem + ) + else: + args.outdir.mkdir(parents=True, exist_ok=True) + run_benchmarks( + sizes, dists, args.outdir, args.dist, args.query_width, args.repeats, args.optlevel, x_dtype, args.in_mem + ) + + +if __name__ == "__main__": + main() 
diff --git a/examples/ndarray/expression_index.py b/examples/ndarray/expression_index.py new file mode 100644 index 00000000..3b2992b3 --- /dev/null +++ b/examples/ndarray/expression_index.py @@ -0,0 +1,33 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import numpy as np + +import blosc2 + +# Intent: show how to build an index on a derived expression stream and +# reuse it for both filtering and direct ordered reads. + +dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) +data = np.array( + [(-8, 0), (5, 1), (-2, 2), (11, 3), (3, 4), (-3, 5), (2, 6), (-5, 7)], + dtype=dtype, +) + +arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) +arr.create_expr_index("abs(x)", kind="full", name="abs_x") + +expr = blosc2.lazyexpr("(abs(x) >= 2) & (abs(x) < 8)", arr.fields).where(arr) + +print("Expression-indexed filter result:") +print(expr[:]) + +print("\nRows ordered by abs(x) via the full expression index:") +print(arr.sort(order="abs(x)")[:]) + +print("\nFiltered rows ordered by abs(x):") +print(expr.sort(order="abs(x)")[:]) diff --git a/examples/ndarray/index_append_maintenance.py b/examples/ndarray/index_append_maintenance.py index 5a944d5f..21076e48 100644 --- a/examples/ndarray/index_append_maintenance.py +++ b/examples/ndarray/index_append_maintenance.py @@ -25,7 +25,7 @@ expr = blosc2.lazyexpr("(id >= 4) & (id < 7)", arr.fields).where(arr) print("Indexed query after append:") -print(expr.compute()[:]) +print(expr[:]) print("\nSorted rows after append:") print(arr.sort(order="id")[:]) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 57459d0c..a5b12e64 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -8,7 +8,9 @@ from __future__ import annotations import ast +import hashlib import math +import re import 
tempfile from dataclasses import dataclass from pathlib import Path @@ -18,7 +20,8 @@ import blosc2 INDEXES_VLMETA_KEY = "blosc2_indexes" -INDEX_FORMAT_VERSION = 2 +INDEX_FORMAT_VERSION = 3 +SELF_TARGET_NAME = "__self__" FLAG_ALL_NAN = np.uint8(1 << 0) FLAG_HAS_NAN = np.uint8(1 << 1) @@ -37,12 +40,17 @@ FULL_OOC_MERGE_BUFFER_ITEMS = 500_000 +def _sanitize_token(token: str) -> str: + return re.sub(r"[^0-9A-Za-z_.-]+", "_", token) + + @dataclass(slots=True) class IndexPlan: usable: bool reason: str descriptor: dict | None = None base: blosc2.NDArray | None = None + target: dict | None = None field: str | None = None level: str | None = None segment_len: int | None = None @@ -64,6 +72,7 @@ class SegmentPredicatePlan: base: blosc2.NDArray candidate_units: np.ndarray descriptor: dict + target: dict field: str | None level: str segment_len: int @@ -73,6 +82,7 @@ class SegmentPredicatePlan: class ExactPredicatePlan: base: blosc2.NDArray descriptor: dict + target: dict field: str | None lower: object | None = None lower_inclusive: bool = True @@ -114,6 +124,16 @@ def _field_token(field: str | None) -> str: return "__self__" if field is None else field +def _target_token(target: dict) -> str: + source = target.get("source") + if source == "field": + return _field_token(target.get("field")) + if source == "expression": + digest = hashlib.sha1(target["expression_key"].encode("utf-8")).hexdigest()[:12] + return f"__expr__{digest}" + raise ValueError(f"unsupported index target source {source!r}") + + def _copy_nested_dict(value: dict | None) -> dict | None: if value is None: return None @@ -180,6 +200,19 @@ def _field_target_descriptor(field: str | None) -> dict: return {"source": "field", "field": field} +def _expression_target_descriptor(expression: str, expression_key: str, dependencies: list[str]) -> dict: + return { + "source": "expression", + "expression": expression, + "expression_key": expression_key, + "dependencies": list(dependencies), + } + + +def 
_target_field(target: dict) -> str | None: + return target.get("field") if target.get("source") == "field" else None + + def _field_dtype(array: blosc2.NDArray, field: str | None) -> np.dtype: if field is None: return np.dtype(array.dtype) @@ -201,6 +234,68 @@ def _validate_index_target(array: blosc2.NDArray, field: str | None) -> np.dtype return dtype +class _OperandCanonicalizer(ast.NodeTransformer): + def __init__(self, operands: dict): + self.operands = operands + self.base: blosc2.NDArray | None = None + self.dependencies: list[str] = [] + self.valid = True + + def visit_Name(self, node: ast.Name) -> ast.AST: + operand = self.operands.get(node.id) + if operand is None: + return node + target = _operand_target(operand) + if target is None: + self.valid = False + return node + base, field = target + if self.base is None: + self.base = base + elif self.base is not base: + self.valid = False + return node + canonical = SELF_TARGET_NAME if field is None else field + self.dependencies.append(canonical) + return ast.copy_location(ast.Name(id=canonical, ctx=node.ctx), node) + + +def _normalize_expression_node( + node: ast.AST, operands: dict +) -> tuple[blosc2.NDArray, str, list[str]] | None: + canonicalizer = _OperandCanonicalizer(operands) + normalized = canonicalizer.visit( + ast.fix_missing_locations(ast.parse(ast.unparse(node), mode="eval")).body + ) + if not canonicalizer.valid or canonicalizer.base is None or not canonicalizer.dependencies: + return None + dependencies = list(dict.fromkeys(canonicalizer.dependencies)) + return canonicalizer.base, ast.unparse(normalized), dependencies + + +def _normalize_expression_target(expression: str, operands: dict) -> tuple[blosc2.NDArray, dict, np.dtype]: + try: + tree = ast.parse(expression, mode="eval") + except SyntaxError as exc: + raise ValueError("expression is not valid Python syntax") from exc + + normalized = _normalize_expression_node(tree.body, operands) + if normalized is None: + raise ValueError("expression 
indexes require operands from a single 1-D NDArray target") + base, expression_key, dependencies = normalized + if base.ndim != 1: + raise ValueError("expression indexes are only supported on 1-D NDArray objects") + target = _expression_target_descriptor(expression, expression_key, dependencies) + sample_stop = min(int(base.shape[0]), max(1, int(base.blocks[0]) if base.blocks else 1)) + sample = _slice_values_for_target(base, target, 0, sample_stop) + dtype = np.dtype(sample.dtype) + if sample.ndim != 1: + raise ValueError("expression indexes require expressions returning a 1-D scalar stream") + if not _supported_index_dtype(dtype): + raise TypeError(f"dtype {dtype} is not supported by the current index engine") + return base, target, dtype + + def _sanitize_sidecar_root(urlpath: str | Path) -> tuple[Path, str]: path = Path(urlpath) suffix = "".join(path.suffixes) @@ -208,10 +303,9 @@ def _sanitize_sidecar_root(urlpath: str | Path) -> tuple[Path, str]: return path, root -def _sidecar_path(array: blosc2.NDArray, field: str | None, kind: str, name: str) -> str: +def _sidecar_path(array: blosc2.NDArray, token: str, kind: str, name: str) -> str: path, root = _sanitize_sidecar_root(array.urlpath) - token = _field_token(field) - return str(path.with_name(f"{root}.__index__.{token}.{kind}.{name}.b2nd")) + return str(path.with_name(f"{root}.__index__.{_sanitize_token(token)}.{kind}.{name}.b2nd")) def _segment_len(array: blosc2.NDArray, level: str) -> int: @@ -224,27 +318,47 @@ def _segment_len(array: blosc2.NDArray, level: str) -> int: raise ValueError(f"unknown level {level!r}") -def _data_cache_key( - array: blosc2.NDArray, field: str | None, category: str, name: str -) -> tuple[int, str | None, str, str]: - return (_array_key(array), field, category, name) +def _data_cache_key(array: blosc2.NDArray, token: str, category: str, name: str): + return (_array_key(array), token, category, name) -def _clear_cached_data(array: blosc2.NDArray, field: str | None) -> None: - 
prefix = (_array_key(array), field) +def _clear_cached_data(array: blosc2.NDArray, token: str) -> None: + prefix = (_array_key(array), token) keys = [key for key in _DATA_CACHE if key[:2] == prefix] for key in keys: _DATA_CACHE.pop(key, None) -def _values_for_index(array: blosc2.NDArray, field: str | None) -> np.ndarray: - values = array[:] - return values if field is None else values[field] +def _operands_for_dependencies(values: np.ndarray, dependencies: list[str]) -> dict[str, np.ndarray]: + operands = {} + for dependency in dependencies: + if dependency == SELF_TARGET_NAME: + operands[dependency] = values + else: + operands[dependency] = values[dependency] + return operands + + +def _values_from_numpy_target(values: np.ndarray, target: dict) -> np.ndarray: + if target["source"] == "field": + field = target.get("field") + return values if field is None else values[field] + if target["source"] == "expression": + from .lazyexpr import ne_evaluate + + result = ne_evaluate( + target["expression_key"], _operands_for_dependencies(values, target["dependencies"]) + ) + return np.asarray(result) + raise ValueError(f"unsupported index target source {target['source']!r}") + + +def _values_for_target(array: blosc2.NDArray, target: dict) -> np.ndarray: + return _slice_values_for_target(array, target, 0, int(array.shape[0])) -def _slice_values_for_index(array: blosc2.NDArray, field: str | None, start: int, stop: int) -> np.ndarray: - values = array[start:stop] - return values if field is None else values[field] +def _slice_values_for_target(array: blosc2.NDArray, target: dict, start: int, stop: int) -> np.ndarray: + return _values_from_numpy_target(array[start:stop], target) def _summary_dtype(dtype: np.dtype) -> np.dtype: @@ -280,16 +394,16 @@ def _compute_segment_summaries(values: np.ndarray, dtype: np.dtype, segment_len: def _store_array_sidecar( array: blosc2.NDArray, - field: str | None, + token: str, kind: str, category: str, name: str, data: np.ndarray, persistent: 
bool, ) -> dict: - cache_key = _data_cache_key(array, field, category, name) + cache_key = _data_cache_key(array, token, category, name) if persistent: - path = _sidecar_path(array, field, kind, f"{category}.{name}") + path = _sidecar_path(array, token, kind, f"{category}.{name}") blosc2.remove_urlpath(path) blosc2.asarray(data, urlpath=path, mode="w") if isinstance(data, np.memmap): @@ -303,9 +417,9 @@ def _store_array_sidecar( def _load_array_sidecar( - array: blosc2.NDArray, field: str | None, category: str, name: str, path: str | None + array: blosc2.NDArray, token: str, category: str, name: str, path: str | None ) -> np.ndarray: - cache_key = _data_cache_key(array, field, category, name) + cache_key = _data_cache_key(array, token, category, name) cached = _DATA_CACHE.get(cache_key) if cached is not None: return cached @@ -318,7 +432,8 @@ def _load_array_sidecar( def _build_levels_descriptor( array: blosc2.NDArray, - field: str | None, + target: dict, + token: str, kind: str, dtype: np.dtype, values: np.ndarray, @@ -328,7 +443,7 @@ def _build_levels_descriptor( for level in SEGMENT_LEVELS_BY_KIND[kind]: segment_len = _segment_len(array, level) summaries = _compute_segment_summaries(values, dtype, segment_len) - sidecar = _store_array_sidecar(array, field, kind, "summary", level, summaries, persistent) + sidecar = _store_array_sidecar(array, token, kind, "summary", level, summaries, persistent) levels[level] = { "segment_len": segment_len, "nsegments": len(summaries), @@ -340,7 +455,8 @@ def _build_levels_descriptor( def _build_levels_descriptor_ooc( array: blosc2.NDArray, - field: str | None, + target: dict, + token: str, kind: str, dtype: np.dtype, persistent: bool, @@ -355,8 +471,8 @@ def _build_levels_descriptor_ooc( for idx in range(nsegments): start = idx * segment_len stop = min(start + segment_len, size) - summaries[idx] = _segment_summary(_slice_values_for_index(array, field, start, stop), dtype) - sidecar = _store_array_sidecar(array, field, kind, 
"summary", level, summaries, persistent) + summaries[idx] = _segment_summary(_slice_values_for_target(array, target, start, stop), dtype) + sidecar = _store_array_sidecar(array, token, kind, "summary", level, summaries, persistent) levels[level] = { "segment_len": segment_len, "nsegments": len(summaries), @@ -368,7 +484,7 @@ def _build_levels_descriptor_ooc( def _build_full_descriptor( array: blosc2.NDArray, - field: str | None, + token: str, kind: str, values: np.ndarray, persistent: bool, @@ -376,8 +492,8 @@ def _build_full_descriptor( order = np.argsort(values, kind="stable") positions = order.astype(np.int64, copy=False) sorted_values = values[order] - values_sidecar = _store_array_sidecar(array, field, kind, "full", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar(array, field, kind, "full", "positions", positions, persistent) + values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) return { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], @@ -428,7 +544,7 @@ def _build_block_sorted_payload( def _build_reduced_descriptor( array: blosc2.NDArray, - field: str | None, + token: str, kind: str, values: np.ndarray, persistent: bool, @@ -436,11 +552,11 @@ def _build_reduced_descriptor( block_len = int(array.blocks[0]) sorted_values, positions, offsets, _ = _build_block_sorted_payload(values, block_len) - values_sidecar = _store_array_sidecar(array, field, kind, "reduced", "values", sorted_values, persistent) + values_sidecar = _store_array_sidecar(array, token, kind, "reduced", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar( - array, field, kind, "reduced", "positions", positions, persistent + array, token, kind, "reduced", "positions", positions, persistent ) - offsets_sidecar = _store_array_sidecar(array, field, kind, 
"reduced", "offsets", offsets, persistent) + offsets_sidecar = _store_array_sidecar(array, token, kind, "reduced", "offsets", offsets, persistent) return { "block_len": block_len, "values_path": values_sidecar["path"], @@ -456,8 +572,10 @@ def _open_temp_memmap(workdir: Path, name: str, dtype: np.dtype, shape: tuple[in def _build_reduced_descriptor_ooc( array: blosc2.NDArray, - field: str | None, + target: dict, + token: str, kind: str, + dtype: np.dtype, persistent: bool, workdir: Path, ) -> dict: @@ -467,16 +585,14 @@ def _build_reduced_descriptor_ooc( position_dtype = _position_dtype(block_len - 1) offsets = np.empty(nblocks + 1, dtype=np.int64) offsets[0] = 0 - sorted_values = _open_temp_memmap( - workdir, f"{kind}_reduced_values", np.dtype(_field_dtype(array, field)), (size,) - ) + sorted_values = _open_temp_memmap(workdir, f"{kind}_reduced_values", dtype, (size,)) positions = _open_temp_memmap(workdir, f"{kind}_reduced_positions", position_dtype, (size,)) cursor = 0 for block_id in range(nblocks): start = block_id * block_len stop = min(start + block_len, size) - block = _slice_values_for_index(array, field, start, stop) + block = _slice_values_for_target(array, target, start, stop) order = np.argsort(block, kind="stable") next_cursor = cursor + (stop - start) sorted_values[cursor:next_cursor] = block[order] @@ -484,11 +600,11 @@ def _build_reduced_descriptor_ooc( cursor = next_cursor offsets[block_id + 1] = cursor - values_sidecar = _store_array_sidecar(array, field, kind, "reduced", "values", sorted_values, persistent) + values_sidecar = _store_array_sidecar(array, token, kind, "reduced", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar( - array, field, kind, "reduced", "positions", positions, persistent + array, token, kind, "reduced", "positions", positions, persistent ) - offsets_sidecar = _store_array_sidecar(array, field, kind, "reduced", "offsets", offsets, persistent) + offsets_sidecar = _store_array_sidecar(array, token, 
kind, "reduced", "offsets", offsets, persistent) return { "block_len": block_len, "values_path": values_sidecar["path"], @@ -614,7 +730,7 @@ def _quantize_light_value_scalar(value, dtype: np.dtype, bits: int): def _build_light_descriptor( array: blosc2.NDArray, - field: str | None, + token: str, kind: str, values: np.ndarray, optlevel: int, @@ -629,11 +745,11 @@ def _build_light_descriptor( sorted_values = _quantize_light_values_array(sorted_values, value_lossy_bits) bucket_positions = (positions // bucket_len).astype(np.uint8, copy=False) - values_sidecar = _store_array_sidecar(array, field, kind, "light", "values", sorted_values, persistent) + values_sidecar = _store_array_sidecar(array, token, kind, "light", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar( - array, field, kind, "light", "bucket_positions", bucket_positions, persistent + array, token, kind, "light", "bucket_positions", bucket_positions, persistent ) - offsets_sidecar = _store_array_sidecar(array, field, kind, "light", "offsets", offsets, persistent) + offsets_sidecar = _store_array_sidecar(array, token, kind, "light", "offsets", offsets, persistent) return { "block_len": block_len, "bucket_count": bucket_count, @@ -647,7 +763,8 @@ def _build_light_descriptor( def _build_light_descriptor_ooc( array: blosc2.NDArray, - field: str | None, + target: dict, + token: str, kind: str, dtype: np.dtype, optlevel: int, @@ -669,7 +786,7 @@ def _build_light_descriptor_ooc( for block_id in range(nblocks): start = block_id * block_len stop = min(start + block_len, size) - block = _slice_values_for_index(array, field, start, stop) + block = _slice_values_for_target(array, target, start, stop) order = np.argsort(block, kind="stable") block_values = block[order] if value_lossy_bits > 0: @@ -680,11 +797,11 @@ def _build_light_descriptor_ooc( cursor = next_cursor offsets[block_id + 1] = cursor - values_sidecar = _store_array_sidecar(array, field, kind, "light", "values", sorted_values, 
persistent) + values_sidecar = _store_array_sidecar(array, token, kind, "light", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar( - array, field, kind, "light", "bucket_positions", bucket_positions, persistent + array, token, kind, "light", "bucket_positions", bucket_positions, persistent ) - offsets_sidecar = _store_array_sidecar(array, field, kind, "light", "offsets", offsets, persistent) + offsets_sidecar = _store_array_sidecar(array, token, kind, "light", "offsets", offsets, persistent) return { "block_len": block_len, "bucket_count": bucket_count, @@ -858,7 +975,8 @@ def _merge_run_pair( def _build_full_descriptor_ooc( array: blosc2.NDArray, - field: str | None, + target: dict, + token: str, kind: str, dtype: np.dtype, persistent: bool, @@ -869,10 +987,10 @@ def _build_full_descriptor_ooc( sorted_values = np.empty(0, dtype=dtype) positions = np.empty(0, dtype=np.int64) values_sidecar = _store_array_sidecar( - array, field, kind, "full", "values", sorted_values, persistent + array, token, kind, "full", "values", sorted_values, persistent ) positions_sidecar = _store_array_sidecar( - array, field, kind, "full", "positions", positions, persistent + array, token, kind, "full", "positions", positions, persistent ) return { "values_path": values_sidecar["path"], @@ -882,7 +1000,7 @@ def _build_full_descriptor_ooc( runs = [] for run_id, start in enumerate(range(0, size, run_items)): stop = min(start + run_items, size) - values = _slice_values_for_index(array, field, start, stop) + values = _slice_values_for_target(array, target, start, stop) positions = np.arange(start, stop, dtype=np.int64) order = np.lexsort((positions, values)) sorted_values = values[order] @@ -919,8 +1037,8 @@ def _build_full_descriptor_ooc( final_run = runs[0] sorted_values = np.load(final_run.values_path, mmap_mode="r") positions = np.load(final_run.positions_path, mmap_mode="r") - values_sidecar = _store_array_sidecar(array, field, kind, "full", "values", 
sorted_values, persistent) - positions_sidecar = _store_array_sidecar(array, field, kind, "full", "positions", positions, persistent) + values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) return { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], @@ -929,7 +1047,8 @@ def _build_full_descriptor_ooc( def _build_descriptor( array: blosc2.NDArray, - field: str | None, + target: dict, + token: str, kind: str, optlevel: int, granularity: str, @@ -943,9 +1062,11 @@ def _build_descriptor( full: dict | None, ) -> dict: return { - "name": name or _field_token(field), - "target": _field_target_descriptor(field), - "field": field, + "name": name + or (target["expression"] if target["source"] == "expression" else _field_token(target.get("field"))), + "token": token, + "target": target.copy(), + "field": _target_field(target), "kind": kind, "version": INDEX_FORMAT_VERSION, "optlevel": optlevel, @@ -977,6 +1098,107 @@ def create_index( ) -> dict: del kwargs dtype = _validate_index_target(array, field) + target = _field_target_descriptor(field) + token = _target_token(target) + if kind not in SEGMENT_LEVELS_BY_KIND: + raise NotImplementedError(f"unsupported index kind {kind!r}") + if granularity != "chunk": + raise NotImplementedError("only chunk-based array indexes are implemented for now") + if persistent is None: + persistent = _is_persistent_array(array) + use_ooc = _resolve_ooc_mode(kind, in_mem) + + if use_ooc and kind in {"light", "medium", "full"}: + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: + workdir = Path(tmpdir) + levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent) + light = ( + _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, workdir) + if kind == "light" + else None + ) + reduced = 
( + _build_reduced_descriptor_ooc(array, target, token, kind, dtype, persistent, workdir) + if kind == "medium" + else None + ) + full = ( + _build_full_descriptor_ooc(array, target, token, kind, dtype, persistent, workdir) + if kind == "full" + else None + ) + descriptor = _build_descriptor( + array, + target, + token, + kind, + optlevel, + granularity, + persistent, + True, + name, + dtype, + levels, + light, + reduced, + full, + ) + else: + values = _values_for_target(array, target) + levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent) + light = ( + _build_light_descriptor(array, token, kind, values, optlevel, persistent) + if kind == "light" + else None + ) + reduced = ( + _build_reduced_descriptor(array, token, kind, values, persistent) if kind == "medium" else None + ) + full = _build_full_descriptor(array, token, kind, values, persistent) if kind == "full" else None + descriptor = _build_descriptor( + array, + target, + token, + kind, + optlevel, + granularity, + persistent, + False, + name, + dtype, + levels, + light, + reduced, + full, + ) + + store = _load_store(array) + store["indexes"][token] = descriptor + _save_store(array, store) + return _copy_descriptor(descriptor) + + +def create_expr_index( + array: blosc2.NDArray, + expression: str, + *, + operands: dict | None = None, + kind: str = "light", + optlevel: int = 5, + granularity: str = "chunk", + persistent: bool | None = None, + in_mem: bool = False, + name: str | None = None, + **kwargs, +) -> dict: + del kwargs + if operands is None: + operands = array.fields if array.dtype.fields is not None else {"value": array} + base, target, dtype = _normalize_expression_target(expression, operands) + if base is not array: + raise ValueError( + "expression index operands must resolve to the same array passed to create_expr_index()" + ) if kind not in SEGMENT_LEVELS_BY_KIND: raise NotImplementedError(f"unsupported index kind {kind!r}") if granularity != "chunk": @@ 
-984,29 +1206,31 @@ def create_index( if persistent is None: persistent = _is_persistent_array(array) use_ooc = _resolve_ooc_mode(kind, in_mem) + token = _target_token(target) if use_ooc and kind in {"light", "medium", "full"}: with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: workdir = Path(tmpdir) - levels = _build_levels_descriptor_ooc(array, field, kind, dtype, persistent) + levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent) light = ( - _build_light_descriptor_ooc(array, field, kind, dtype, optlevel, persistent, workdir) + _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, workdir) if kind == "light" else None ) reduced = ( - _build_reduced_descriptor_ooc(array, field, kind, persistent, workdir) + _build_reduced_descriptor_ooc(array, target, token, kind, dtype, persistent, workdir) if kind == "medium" else None ) full = ( - _build_full_descriptor_ooc(array, field, kind, dtype, persistent, workdir) + _build_full_descriptor_ooc(array, target, token, kind, dtype, persistent, workdir) if kind == "full" else None ) descriptor = _build_descriptor( array, - field, + target, + token, kind, optlevel, granularity, @@ -1020,20 +1244,21 @@ def create_index( full, ) else: - values = _values_for_index(array, field) - levels = _build_levels_descriptor(array, field, kind, dtype, values, persistent) + values = _values_for_target(array, target) + levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent) light = ( - _build_light_descriptor(array, field, kind, values, optlevel, persistent) + _build_light_descriptor(array, token, kind, values, optlevel, persistent) if kind == "light" else None ) reduced = ( - _build_reduced_descriptor(array, field, kind, values, persistent) if kind == "medium" else None + _build_reduced_descriptor(array, token, kind, values, persistent) if kind == "medium" else None ) - full = _build_full_descriptor(array, field, kind, values, 
persistent) if kind == "full" else None + full = _build_full_descriptor(array, token, kind, values, persistent) if kind == "full" else None descriptor = _build_descriptor( array, - field, + target, + token, kind, optlevel, granularity, @@ -1048,7 +1273,7 @@ def create_index( ) store = _load_store(array) - store["indexes"][_field_token(field)] = descriptor + store["indexes"][token] = descriptor _save_store(array, store) return _copy_descriptor(descriptor) @@ -1094,49 +1319,53 @@ def _drop_descriptor_sidecars(descriptor: dict) -> None: _remove_sidecar_path(descriptor["full"]["positions_path"]) -def _replace_levels_descriptor( - array: blosc2.NDArray, descriptor: dict, field: str | None, kind: str, persistent: bool -) -> None: +def _replace_levels_descriptor(array: blosc2.NDArray, descriptor: dict, kind: str, persistent: bool) -> None: size = int(array.shape[0]) + target = descriptor["target"] + token = descriptor["token"] for level, level_info in descriptor["levels"].items(): segment_len = int(level_info["segment_len"]) start = 0 summaries = _compute_segment_summaries( - _slice_values_for_index(array, field, start, size), _field_dtype(array, field), segment_len + _slice_values_for_target(array, target, start, size), np.dtype(descriptor["dtype"]), segment_len ) - sidecar = _store_array_sidecar(array, field, kind, "summary", level, summaries, persistent) + sidecar = _store_array_sidecar(array, token, kind, "summary", level, summaries, persistent) level_info["path"] = sidecar["path"] level_info["dtype"] = sidecar["dtype"] level_info["nsegments"] = len(summaries) def _replace_levels_descriptor_tail( - array: blosc2.NDArray, descriptor: dict, field: str | None, kind: str, old_size: int, persistent: bool + array: blosc2.NDArray, descriptor: dict, kind: str, old_size: int, persistent: bool ) -> None: - dtype = _field_dtype(array, field) + target = descriptor["target"] + token = descriptor["token"] + dtype = np.dtype(descriptor["dtype"]) new_size = int(array.shape[0]) for 
level, level_info in descriptor["levels"].items(): segment_len = int(level_info["segment_len"]) start_segment = old_size // segment_len prefix = _load_level_summaries(array, descriptor, level)[:start_segment] tail_start = start_segment * segment_len - tail_values = _slice_values_for_index(array, field, tail_start, new_size) + tail_values = _slice_values_for_target(array, target, tail_start, new_size) tail_summaries = _compute_segment_summaries(tail_values, dtype, segment_len) summaries = np.concatenate((prefix, tail_summaries)) if len(prefix) else tail_summaries - sidecar = _store_array_sidecar(array, field, kind, "summary", level, summaries, persistent) + sidecar = _store_array_sidecar(array, token, kind, "summary", level, summaries, persistent) level_info["path"] = sidecar["path"] level_info["dtype"] = sidecar["dtype"] level_info["nsegments"] = len(summaries) def _replace_reduced_descriptor_tail( - array: blosc2.NDArray, descriptor: dict, field: str | None, old_size: int, persistent: bool + array: blosc2.NDArray, descriptor: dict, old_size: int, persistent: bool ) -> None: reduced = descriptor["reduced"] + target = descriptor["target"] + token = descriptor["token"] block_len = int(reduced["block_len"]) start_block = old_size // block_len block_start = start_block * block_len - tail_values = _slice_values_for_index(array, field, block_start, int(array.shape[0])) + tail_values = _slice_values_for_target(array, target, block_start, int(array.shape[0])) sorted_values_tail, positions_tail, offsets_tail, _ = _build_block_sorted_payload(tail_values, block_len) values, positions, offsets = _load_reduced_arrays(array, descriptor) @@ -1147,13 +1376,13 @@ def _replace_reduced_descriptor_tail( kind = descriptor["kind"] values_sidecar = _store_array_sidecar( - array, field, kind, "reduced", "values", updated_values, persistent + array, token, kind, "reduced", "values", updated_values, persistent ) positions_sidecar = _store_array_sidecar( - array, field, kind, "reduced", 
"positions", updated_positions, persistent + array, token, kind, "reduced", "positions", updated_positions, persistent ) offsets_sidecar = _store_array_sidecar( - array, field, kind, "reduced", "offsets", updated_offsets, persistent + array, token, kind, "reduced", "offsets", updated_offsets, persistent ) reduced["values_path"] = values_sidecar["path"] reduced["positions_path"] = positions_sidecar["path"] @@ -1161,13 +1390,15 @@ def _replace_reduced_descriptor_tail( def _replace_light_descriptor_tail( - array: blosc2.NDArray, descriptor: dict, field: str | None, old_size: int, persistent: bool + array: blosc2.NDArray, descriptor: dict, old_size: int, persistent: bool ) -> None: light = descriptor["light"] + target = descriptor["target"] + token = descriptor["token"] block_len = int(light["block_len"]) start_block = old_size // block_len block_start = start_block * block_len - tail_values = _slice_values_for_index(array, field, block_start, int(array.shape[0])) + tail_values = _slice_values_for_target(array, target, block_start, int(array.shape[0])) value_lossy_bits = int(light["value_lossy_bits"]) bucket_len = int(light["bucket_len"]) sorted_values_tail, positions_tail, offsets_tail, _ = _build_block_sorted_payload(tail_values, block_len) @@ -1182,12 +1413,12 @@ def _replace_light_descriptor_tail( updated_offsets = np.concatenate((offsets[: start_block + 1], prefix_items + offsets_tail[1:])) kind = descriptor["kind"] - values_sidecar = _store_array_sidecar(array, field, kind, "light", "values", updated_values, persistent) + values_sidecar = _store_array_sidecar(array, token, kind, "light", "values", updated_values, persistent) positions_sidecar = _store_array_sidecar( - array, field, kind, "light", "bucket_positions", updated_bucket_positions, persistent + array, token, kind, "light", "bucket_positions", updated_bucket_positions, persistent ) offsets_sidecar = _store_array_sidecar( - array, field, kind, "light", "offsets", updated_offsets, persistent + array, 
token, kind, "light", "offsets", updated_offsets, persistent ) light["values_path"] = values_sidecar["path"] light["bucket_positions_path"] = positions_sidecar["path"] @@ -1197,20 +1428,20 @@ def _replace_light_descriptor_tail( def _replace_full_descriptor( array: blosc2.NDArray, descriptor: dict, - field: str | None, sorted_values: np.ndarray, positions: np.ndarray, persistent: bool, ) -> None: kind = descriptor["kind"] - values_sidecar = _store_array_sidecar(array, field, kind, "full", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar(array, field, kind, "full", "positions", positions, persistent) + token = descriptor["token"] + values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) + positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) descriptor["full"]["values_path"] = values_sidecar["path"] descriptor["full"]["positions_path"] = positions_sidecar["path"] def _append_full_descriptor( - array: blosc2.NDArray, descriptor: dict, field: str | None, old_size: int, appended_values: np.ndarray + array: blosc2.NDArray, descriptor: dict, old_size: int, appended_values: np.ndarray ) -> None: full = descriptor.get("full") if full is None: @@ -1225,9 +1456,7 @@ def _append_full_descriptor( appended_positions[order], np.dtype(descriptor["dtype"]), ) - _replace_full_descriptor( - array, descriptor, field, merged_values, merged_positions, descriptor["persistent"] - ) + _replace_full_descriptor(array, descriptor, merged_values, merged_positions, descriptor["persistent"]) def append_to_indexes(array: blosc2.NDArray, old_size: int, appended_values: np.ndarray) -> None: @@ -1236,19 +1465,19 @@ def append_to_indexes(array: blosc2.NDArray, old_size: int, appended_values: np. 
return for descriptor in store["indexes"].values(): - field = descriptor["field"] kind = descriptor["kind"] persistent = descriptor["persistent"] - field_values = appended_values if field is None else appended_values[field] + target = descriptor["target"] + target_values = _values_from_numpy_target(appended_values, target) if descriptor.get("stale", False): continue if kind == "full": - _append_full_descriptor(array, descriptor, field, old_size, field_values) + _append_full_descriptor(array, descriptor, old_size, target_values) elif kind == "medium": - _replace_reduced_descriptor_tail(array, descriptor, field, old_size, persistent) + _replace_reduced_descriptor_tail(array, descriptor, old_size, persistent) elif kind == "light": - _replace_light_descriptor_tail(array, descriptor, field, old_size, persistent) - _replace_levels_descriptor_tail(array, descriptor, field, kind, old_size, persistent) + _replace_light_descriptor_tail(array, descriptor, old_size, persistent) + _replace_levels_descriptor_tail(array, descriptor, kind, old_size, persistent) descriptor["shape"] = tuple(array.shape) descriptor["chunks"] = tuple(array.chunks) descriptor["blocks"] = tuple(array.blocks) @@ -1261,7 +1490,7 @@ def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None token = _resolve_index_token(store, field, name) descriptor = store["indexes"].pop(token) _save_store(array, store) - _clear_cached_data(array, descriptor["field"]) + _clear_cached_data(array, descriptor["token"]) _drop_descriptor_sidecars(descriptor) @@ -1270,6 +1499,19 @@ def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | N token = _resolve_index_token(store, field, name) descriptor = store["indexes"][token] drop_index(array, field=descriptor["field"], name=descriptor["name"]) + if descriptor["target"]["source"] == "expression": + operands = array.fields if array.dtype.fields is not None else {SELF_TARGET_NAME: array} + return create_expr_index( + array, + 
descriptor["target"]["expression_key"], + operands=operands, + kind=descriptor["kind"], + optlevel=descriptor["optlevel"], + granularity=descriptor["granularity"], + persistent=descriptor["persistent"], + in_mem=not descriptor.get("ooc", False), + name=descriptor["name"], + ) return create_index( array, field=descriptor["field"], @@ -1301,7 +1543,11 @@ def mark_indexes_stale(array: blosc2.NDArray) -> None: def _descriptor_for(array: blosc2.NDArray, field: str | None) -> dict | None: - descriptor = _load_store(array)["indexes"].get(_field_token(field)) + return _descriptor_for_target(array, _field_target_descriptor(field)) + + +def _descriptor_for_target(array: blosc2.NDArray, target: dict) -> dict | None: + descriptor = _load_store(array)["indexes"].get(_target_token(target)) if descriptor is None or descriptor.get("stale", False): return None if descriptor.get("version") != INDEX_FORMAT_VERSION: @@ -1317,15 +1563,15 @@ def _descriptor_for(array: blosc2.NDArray, field: str | None) -> dict | None: def _load_level_summaries(array: blosc2.NDArray, descriptor: dict, level: str) -> np.ndarray: level_info = descriptor["levels"][level] - return _load_array_sidecar(array, descriptor["field"], "summary", level, level_info["path"]) + return _load_array_sidecar(array, descriptor["token"], "summary", level, level_info["path"]) def _load_full_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: full = descriptor.get("full") if full is None: raise RuntimeError("full index metadata is not available") - values = _load_array_sidecar(array, descriptor["field"], "full", "values", full["values_path"]) - positions = _load_array_sidecar(array, descriptor["field"], "full", "positions", full["positions_path"]) + values = _load_array_sidecar(array, descriptor["token"], "full", "values", full["values_path"]) + positions = _load_array_sidecar(array, descriptor["token"], "full", "positions", full["positions_path"]) return values, positions @@ -1335,11 +1581,11 @@ 
def _load_reduced_arrays( reduced = descriptor.get("reduced") if reduced is None: raise RuntimeError("reduced index metadata is not available") - values = _load_array_sidecar(array, descriptor["field"], "reduced", "values", reduced["values_path"]) + values = _load_array_sidecar(array, descriptor["token"], "reduced", "values", reduced["values_path"]) positions = _load_array_sidecar( - array, descriptor["field"], "reduced", "positions", reduced["positions_path"] + array, descriptor["token"], "reduced", "positions", reduced["positions_path"] ) - offsets = _load_array_sidecar(array, descriptor["field"], "reduced", "offsets", reduced["offsets_path"]) + offsets = _load_array_sidecar(array, descriptor["token"], "reduced", "offsets", reduced["offsets_path"]) return values, positions, offsets @@ -1347,11 +1593,11 @@ def _load_light_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndar light = descriptor.get("light") if light is None: raise RuntimeError("light index metadata is not available") - values = _load_array_sidecar(array, descriptor["field"], "light", "values", light["values_path"]) + values = _load_array_sidecar(array, descriptor["token"], "light", "values", light["values_path"]) positions = _load_array_sidecar( - array, descriptor["field"], "light", "bucket_positions", light["bucket_positions_path"] + array, descriptor["token"], "light", "bucket_positions", light["bucket_positions_path"] ) - offsets = _load_array_sidecar(array, descriptor["field"], "light", "offsets", light["offsets_path"]) + offsets = _load_array_sidecar(array, descriptor["token"], "light", "offsets", light["offsets_path"]) return values, positions, offsets @@ -1443,39 +1689,48 @@ def _compare_operator(node: ast.AST) -> str | None: return None +def _compare_target_from_node(node: ast.AST, operands: dict) -> tuple[blosc2.NDArray, dict] | None: + if isinstance(node, ast.Name): + operand = operands.get(node.id) + target = _operand_target(operand) if operand is not None else None + if 
target is None: + return None + base, field = target + if base.ndim != 1: + return None + return base, _field_target_descriptor(field) + + normalized = _normalize_expression_node(node, operands) + if normalized is None: + return None + base, expression_key, dependencies = normalized + return base, _expression_target_descriptor(ast.unparse(node), expression_key, dependencies) + + def _target_from_compare( node: ast.Compare, operands: dict -) -> tuple[blosc2.NDArray, str | None, str, object] | None: +) -> tuple[blosc2.NDArray, dict, str, object] | None: if len(node.ops) != 1 or len(node.comparators) != 1: return None op = _compare_operator(node.ops[0]) if op is None: return None - left_target = operands.get(node.left.id) if isinstance(node.left, ast.Name) else None - right_target = ( - operands.get(node.comparators[0].id) if isinstance(node.comparators[0], ast.Name) else None - ) - try: + left_target = _compare_target_from_node(node.left, operands) + right_target = _compare_target_from_node(node.comparators[0], operands) if left_target is not None: value = _literal_value(node.comparators[0]) - target = _operand_target(left_target) elif right_target is not None: value = _literal_value(node.left) - target = _operand_target(right_target) op = _flip_operator(op) else: return None except ValueError: return None - if target is None: - return None - base, field = target - if base.ndim != 1: - return None - return base, field, op, value + base, target = left_target if left_target is not None else right_target + return base, target, op, value def _finest_level(descriptor: dict) -> str: @@ -1487,8 +1742,8 @@ def _plan_segment_compare(node: ast.Compare, operands: dict) -> SegmentPredicate target = _target_from_compare(node, operands) if target is None: return None - base, field, op, value = target - descriptor = _descriptor_for(base, field) + base, target_info, op, value = target + descriptor = _descriptor_for_target(base, target_info) if descriptor is None: return None level = 
_finest_level(descriptor) @@ -1503,7 +1758,8 @@ def _plan_segment_compare(node: ast.Compare, operands: dict) -> SegmentPredicate base=base, candidate_units=candidate_units, descriptor=descriptor, - field=field, + target=target_info, + field=_target_field(target_info), level=level, segment_len=level_info["segment_len"], ) @@ -1531,6 +1787,7 @@ def _merge_segment_plans( base=left.base, candidate_units=candidate_units, descriptor=left.descriptor, + target=left.target, field=left.field, level=left.level, segment_len=left.segment_len, @@ -1593,8 +1850,8 @@ def _plan_exact_compare(node: ast.Compare, operands: dict) -> ExactPredicatePlan target = _target_from_compare(node, operands) if target is None: return None - base, field, op, value = target - descriptor = _descriptor_for(base, field) + base, target_info, op, value = target + descriptor = _descriptor_for_target(base, target_info) if descriptor is None or descriptor.get("kind") not in {"light", "medium", "full"}: return None try: @@ -1605,7 +1862,8 @@ def _plan_exact_compare(node: ast.Compare, operands: dict) -> ExactPredicatePlan return ExactPredicatePlan( base=base, descriptor=descriptor, - field=field, + target=target_info, + field=_target_field(target_info), lower=value, lower_inclusive=True, upper=value, @@ -1613,25 +1871,45 @@ def _plan_exact_compare(node: ast.Compare, operands: dict) -> ExactPredicatePlan ) if op == ">": return ExactPredicatePlan( - base=base, descriptor=descriptor, field=field, lower=value, lower_inclusive=False + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + lower=value, + lower_inclusive=False, ) if op == ">=": return ExactPredicatePlan( - base=base, descriptor=descriptor, field=field, lower=value, lower_inclusive=True + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + lower=value, + lower_inclusive=True, ) if op == "<": return ExactPredicatePlan( - base=base, descriptor=descriptor, field=field, 
upper=value, upper_inclusive=False + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + upper=value, + upper_inclusive=False, ) if op == "<=": return ExactPredicatePlan( - base=base, descriptor=descriptor, field=field, upper=value, upper_inclusive=True + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + upper=value, + upper_inclusive=True, ) return None def _same_base(left: ExactPredicatePlan, right: ExactPredicatePlan) -> bool: - return left.base is right.base and left.field == right.field + return left.base is right.base and left.descriptor["token"] == right.descriptor["token"] def _merge_lower_bound( @@ -1676,6 +1954,7 @@ def _merge_exact_plans( return ExactPredicatePlan( base=left.base, descriptor=left.descriptor, + target=left.target, field=left.field, lower=lower, lower_inclusive=lower_inclusive, @@ -1837,6 +2116,7 @@ def _bucket_masks_from_light( search_plan = ExactPredicatePlan( base=plan.base, descriptor=plan.descriptor, + target=plan.target, field=plan.field, lower=lower, lower_inclusive=lower_inclusive, @@ -1898,22 +2178,22 @@ def _multi_exact_positions(plans: list[ExactPredicatePlan]) -> tuple[blosc2.NDAr if not plans: return None base = plans[0].base - merged_by_field: dict[str | None, ExactPredicatePlan] = {} + merged_by_target: dict[str, ExactPredicatePlan] = {} for plan in plans: if plan.base is not base: return None - key = plan.field - current = merged_by_field.get(key) + key = plan.descriptor["token"] + current = merged_by_target.get(key) if current is None: - merged_by_field[key] = plan + merged_by_target[key] = plan continue merged = _merge_exact_plans(current, plan, "and") if merged is None: return None - merged_by_field[key] = merged + merged_by_target[key] = merged exact_arrays = [] - for plan in merged_by_field.values(): + for plan in merged_by_target.values(): positions = _exact_positions_from_plan(plan) if positions is None: return None @@ -1937,6 
+2217,7 @@ def _plan_multi_exact_query(plans: list[ExactPredicatePlan]) -> IndexPlan | None "multi-field exact indexes selected", descriptor=_copy_descriptor(plans[0].descriptor), base=base, + target=plans[0].descriptor.get("target"), field=None, level="exact", total_units=int(base.shape[0]), @@ -1954,6 +2235,7 @@ def _plan_single_exact_query(exact_plan: ExactPredicatePlan) -> IndexPlan: f"{kind} exact index selected", descriptor=_copy_descriptor(exact_plan.descriptor), base=exact_plan.base, + target=exact_plan.descriptor.get("target"), field=exact_plan.field, level=kind, total_units=exact_plan.base.shape[0], @@ -1970,6 +2252,7 @@ def _plan_single_exact_query(exact_plan: ExactPredicatePlan) -> IndexPlan: f"{kind} exact index selected", descriptor=_copy_descriptor(exact_plan.descriptor), base=exact_plan.base, + target=exact_plan.descriptor.get("target"), field=exact_plan.field, level=kind, total_units=exact_plan.base.shape[0], @@ -1986,6 +2269,7 @@ def _plan_single_exact_query(exact_plan: ExactPredicatePlan) -> IndexPlan: "light approximate-order index selected", descriptor=_copy_descriptor(exact_plan.descriptor), base=exact_plan.base, + target=exact_plan.descriptor.get("target"), field=exact_plan.field, level=kind, total_units=total_units, @@ -2036,6 +2320,7 @@ def plan_query(expression: str, operands: dict, where: dict | None, *, use_index "available index does not prune any units for this predicate", descriptor=_copy_descriptor(segment_plan.descriptor), base=segment_plan.base, + target=segment_plan.descriptor.get("target"), field=segment_plan.field, level=segment_plan.level, segment_len=segment_plan.segment_len, @@ -2049,6 +2334,7 @@ def plan_query(expression: str, operands: dict, where: dict | None, *, use_index f"{segment_plan.level} summaries selected", descriptor=_copy_descriptor(segment_plan.descriptor), base=segment_plan.base, + target=segment_plan.descriptor.get("target"), field=segment_plan.field, level=segment_plan.level, 
segment_len=segment_plan.segment_len, @@ -2087,7 +2373,7 @@ def evaluate_segment_query( return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) -def evaluate_light_query( +def evaluate_light_query( # noqa: C901 expression: str, operands: dict, ne_args: dict, where: dict, plan: IndexPlan ) -> np.ndarray: del expression, operands, ne_args @@ -2126,7 +2412,10 @@ def evaluate_light_query( where_x.get_1d_span_numpy(span, chunk_id, local_start, stop - start) else: span = where_x[start:stop] - field_values = span if plan.field is None else span[plan.field] + if plan.target is not None and plan.target.get("source") == "expression": + field_values = _values_from_numpy_target(span, plan.target) + else: + field_values = span if plan.field is None else span[plan.field] match = np.ones(len(field_values), dtype=bool) if plan.lower is not None: match &= field_values >= plan.lower if plan.lower_inclusive else field_values > plan.lower @@ -2233,37 +2522,65 @@ def evaluate_full_query(where: dict, plan: IndexPlan) -> np.ndarray: return _gather_positions(where["_where_x"], plan.exact_positions) -def _normalize_order_fields(array: blosc2.NDArray, order: str | list[str] | None) -> list[str | None]: +def _normalize_primary_order_target(array: blosc2.NDArray, order: str | None) -> tuple[dict, str | None]: + if order is None: + return _field_target_descriptor(None), None + if array.dtype.fields is not None and order in array.dtype.fields: + return _field_target_descriptor(order), order + operands = array.fields if array.dtype.fields is not None else {SELF_TARGET_NAME: array} + base, target, _ = _normalize_expression_target(order, operands) + if base is not array: + raise ValueError("ordered expressions must resolve to the target array") + return target, None + + +def _normalize_order_fields( + array: blosc2.NDArray, order: str | list[str] | None +) -> tuple[dict, list[str | None]]: if order is None: if array.dtype.fields is None: - return [None] - return list(array.dtype.names) + 
return _field_target_descriptor(None), [None] + return _field_target_descriptor(array.dtype.names[0]), list(array.dtype.names) if isinstance(order, list): fields = list(order) else: fields = [order] - if array.dtype.fields is None: - if fields != [None]: - raise ValueError("order is only supported for structured arrays") - return [None] - for field in fields: - if field not in array.dtype.fields: - raise ValueError(f"field {field!r} is not present in the dtype") - return fields + primary_target, primary_field = _normalize_primary_order_target(array, fields[0]) + normalized_order = [primary_field if primary_field is not None else fields[0]] + if len(fields) > 1: + if array.dtype.fields is None: + raise ValueError("secondary order keys are only supported for structured arrays") + for field in fields[1:]: + if field not in array.dtype.fields: + raise ValueError(f"field {field!r} is not present in the dtype") + normalized_order.extend(fields[1:]) + return primary_target, normalized_order + + +def is_expression_order(array: blosc2.NDArray, order: str | list[str] | None) -> bool: + if order is None: + return False + primary = order[0] if isinstance(order, list) else order + try: + target, _ = _normalize_primary_order_target(array, primary) + except (TypeError, ValueError): + return False + return target["source"] == "expression" def plan_array_order( array: blosc2.NDArray, order: str | list[str] | None = None, *, require_full: bool = False ) -> OrderedIndexPlan: try: - order_fields = _normalize_order_fields(array, order) + primary_target, order_fields = _normalize_order_fields(array, order) except (TypeError, ValueError) as exc: return OrderedIndexPlan(False, str(exc)) - primary_field = order_fields[0] - descriptor = _full_descriptor_for_order(array, primary_field) + primary_field = _target_field(primary_target) + descriptor = _full_descriptor_for_order(array, primary_target) if descriptor is None: if require_full: - return OrderedIndexPlan(False, f"field 
{primary_field!r} must have an associated full index") + label = primary_field if primary_field is not None else primary_target.get("expression") + return OrderedIndexPlan(False, f"order target {label!r} must have an associated full index") return OrderedIndexPlan(False, "no matching full index was found for ordered access") return OrderedIndexPlan( True, @@ -2288,8 +2605,8 @@ def _positions_in_input_order( return positions[slice(start, stop, step)] -def _full_descriptor_for_order(array: blosc2.NDArray, field: str | None) -> dict | None: - descriptor = _descriptor_for(array, field) +def _full_descriptor_for_order(array: blosc2.NDArray, target: dict) -> dict | None: + descriptor = _descriptor_for_target(array, target) if descriptor is None or descriptor.get("kind") != "full": return None return descriptor diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index b57f0534..07ca8d70 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1824,7 +1824,10 @@ def slices_eval( # noqa: C901 # Get the dtype of the array to sort dtype_ = operands["_where_x"].dtype # Now, use only the fields that are necessary for the sorting - dtype_ = np.dtype([(f, dtype_[f]) for f in _order]) + if dtype_.fields is not None and all(f in dtype_.fields for f in _order): + dtype_ = np.dtype([(f, dtype_[f]) for f in _order]) + else: + dtype_ = np.dtype(np.int64) # Iterate over the operands and get the chunks chunk_operands = {} @@ -1848,6 +1851,8 @@ def slices_eval( # noqa: C901 ordered_positions = indexing.ordered_query_indices(expression, operands, where, _order) if ordered_positions is not None: return ordered_positions + elif indexing.is_expression_order(where["_where_x"], _order): + raise ValueError("expression order requires a matching full expression index") if _indices and _order is None and index_plan.usable and index_plan.exact_positions is not None: return np.asarray(index_plan.exact_positions, dtype=np.int64) if index_plan.usable and not (_indices or _order): 
diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index d9570f3d..91c57a07 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4786,6 +4786,60 @@ def create_csindex(self, field: str | None = None, **kwargs: Any) -> dict: return indexing.create_csindex(self, field=field, **kwargs) + def create_expr_index( + self, + expression: str, + *, + operands: dict | None = None, + kind: str = "light", + optlevel: int = 3, + granularity: str = "chunk", + persistent: bool | None = None, + in_mem: bool = False, + name: str | None = None, + **kwargs: Any, + ) -> dict: + """Create an index on a derived 1-D expression stream. + + Parameters + ---------- + expression : str + Deterministic scalar expression to materialize and index. Structured + arrays typically use field names directly, such as ``"abs(x)"`` or + ``"a + b"``. For plain 1-D arrays, provide ``operands`` explicitly + or use the default ``"value"`` name. + operands : dict or None, optional + Operand mapping used for normalization and evaluation. When omitted, + structured arrays default to ``self.fields`` and plain arrays use + ``{"value": self}``. + kind, optlevel, granularity, persistent, in_mem, name + Same meaning as in :meth:`create_index`. Setting ``in_mem=True`` + materializes the derived expression stream in RAM and can allocate + additional temporary arrays for sorting and block payloads, so the + default remains ``False`` and uses the out-of-core builders for + ``light``, ``medium``, and ``full``. + + Notes + ----- + Expression indexes are matched by normalized expression identity. The + current implementation supports one active index target per normalized + expression key. + """ + from . 
import indexing + + return indexing.create_expr_index( + self, + expression, + operands=operands, + kind=kind, + optlevel=optlevel, + granularity=granularity, + persistent=persistent, + in_mem=in_mem, + name=name, + **kwargs, + ) + def drop_index(self, field: str | None = None, name: str | None = None) -> None: """Drop an index by field or optional descriptor label.""" from . import indexing @@ -6402,9 +6456,10 @@ def indices(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: The (structured) array to be sorted. order: str, list of str, optional Specifies which fields to compare first, second, etc. A single - field can be specified as a string. Not all fields need to be - specified, only the ones by which the array is to be sorted. - If None, the array is not sorted. + field can be specified as a string. The primary order key may also be + an indexed expression such as ``"abs(x)"`` when a matching ``full`` + expression index exists. Not all fields need to be specified, only the + ones by which the array is to be sorted. If None, the array is not sorted. kwargs: Any, optional Keyword arguments that are supported by the :func:`empty` constructor. @@ -6415,9 +6470,10 @@ def indices(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: Notes ----- - If the primary order key has a matching ``full`` index, the positions are - returned directly from that index in ascending stable order. Secondary keys - refine ties after the primary indexed order. Otherwise this falls back to a + If the primary order key has a matching ``full`` field or expression index, + the positions are returned directly from that index in ascending stable + order. Secondary keys refine ties after the primary indexed order. + Field-based orders without a matching full index fall back to a scan-plus-sort path. 
""" if not order: @@ -6430,6 +6486,8 @@ def indices(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: ordered = indexing.ordered_indices(array, order=order) if ordered is not None: return blosc2.asarray(ordered, **kwargs) + if indexing.is_expression_order(array, order): + raise ValueError("expression order requires a matching full expression index") # Create a lazy array to access the sort machinery there # This is a bit of a hack, but it is the simplest way to do it @@ -6451,8 +6509,10 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An The (structured) array to be sorted. order: str, list of str, optional Specifies which fields to compare first, second, etc. A single - field can be specified as a string. Not all fields need to be - specified, only the ones by which the array is to be sorted. + field can be specified as a string. The primary order key may also be + an indexed expression such as ``"abs(x)"`` when a matching ``full`` + expression index exists. Not all fields need to be specified, only the + ones by which the array is to be sorted. kwargs: Any, optional Keyword arguments that are supported by the :func:`empty` constructor. @@ -6463,10 +6523,10 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An Notes ----- - If the primary order key has a matching ``full`` index, rows are gathered - directly in ascending stable index order. Secondary keys refine ties after - the primary indexed order. Otherwise this falls back to a scan-plus-sort - path. + If the primary order key has a matching ``full`` field or expression index, + rows are gathered directly in ascending stable index order. Secondary keys + refine ties after the primary indexed order. Field-based orders without a + matching full index fall back to a scan-plus-sort path. 
""" if not order: return array @@ -6477,6 +6537,8 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An ordered = indexing.read_sorted(array, order=order) if ordered is not None: return blosc2.asarray(ordered, **kwargs) + if indexing.is_expression_order(array, order): + raise ValueError("expression order requires a matching full expression index") # Create a lazy array to access the sort machinery there # This is a bit of a hack, but it is the simplest way to do it diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index ed697a56..fc1a141e 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -349,3 +349,108 @@ def test_append_keeps_full_index_sorted_access_current(): expected = np.sort(np.concatenate((data, appended)), order=["a", "b"]) np.testing.assert_array_equal(arr.sort(order=["a", "b"])[:], expected) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_expression_index_matches_scan(kind): + rng = np.random.default_rng(9) + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.zeros(150_000, dtype=dtype) + data["x"] = np.arange(-75_000, 75_000, dtype=np.int64) + rng.shuffle(data["x"]) + data["payload"] = np.arange(data.shape[0], dtype=np.int32) + + arr = blosc2.asarray(data, chunks=(15_000,), blocks=(3_000,)) + descriptor = arr.create_expr_index("abs(x)", kind=kind) + + assert descriptor["target"]["source"] == "expression" + assert descriptor["target"]["expression_key"] == "abs(x)" + assert descriptor["target"]["dependencies"] == ["x"] + + expr = blosc2.lazyexpr("(abs(x) >= 123) & (abs(x) < 456)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = data[(np.abs(data["x"]) >= 123) & (np.abs(data["x"]) < 456)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + +def 
test_full_expression_index_reuses_ordered_access(): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array( + [(-8, 0), (5, 1), (-2, 2), (11, 3), (3, 4), (-3, 5), (2, 6), (-5, 7)], + dtype=dtype, + ) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind="full", name="abs_x") + + expected_positions = np.argsort(np.abs(data["x"]), kind="stable") + np.testing.assert_array_equal(arr.indices(order="abs(x)")[:], expected_positions) + np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], data[expected_positions]) + + expr = blosc2.lazyexpr("(abs(x) >= 2) & (abs(x) < 8)", arr.fields).where(arr) + mask = (np.abs(data["x"]) >= 2) & (np.abs(data["x"]) < 8) + filtered_positions = np.where(mask)[0] + filtered_order = np.argsort(np.abs(data["x"][mask]), kind="stable") + np.testing.assert_array_equal( + expr.indices(order="abs(x)").compute()[:], filtered_positions[filtered_order] + ) + np.testing.assert_array_equal( + expr.sort(order="abs(x)").compute()[:], data[filtered_positions[filtered_order]] + ) + + explained = expr.sort(order="abs(x)").explain() + assert explained["will_use_index"] is True + assert explained["ordered_access"] is True + assert explained["target"]["source"] == "expression" + assert explained["target"]["expression_key"] == "abs(x)" + + +def test_persistent_expression_index_survives_reopen(tmp_path): + path = tmp_path / "expr_indexed_array.b2nd" + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.zeros(80_000, dtype=dtype) + data["x"] = np.arange(-40_000, 40_000, dtype=np.int64) + data["payload"] = np.arange(data.shape[0], dtype=np.int32) + + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(8_000,), blocks=(2_000,)) + descriptor = arr.create_expr_index("abs(x)", kind="medium") + + reopened = blosc2.open(path, mode="a") + assert reopened.indexes[0]["target"]["source"] == "expression" + assert reopened.indexes[0]["target"]["expression_key"] == "abs(x)" + assert 
reopened.indexes[0]["reduced"]["values_path"] == descriptor["reduced"]["values_path"] + + expr = blosc2.lazyexpr("(abs(x) >= 777) & (abs(x) < 999)", reopened.fields).where(reopened) + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_append_keeps_expression_index_current(kind): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array([(-10, 0), (7, 1), (-3, 2), (1, 3), (-6, 4), (9, 5)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind=kind) + + appended = np.array([(-4, 6), (12, 7), (-11, 8), (5, 9)], dtype=dtype) + all_data = np.concatenate((data, appended)) + arr.append(appended) + + assert arr.indexes[0]["stale"] is False + + expr = blosc2.lazyexpr("(abs(x) >= 4) & (abs(x) < 12)", arr.fields).where(arr) + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = all_data[(np.abs(all_data["x"]) >= 4) & (np.abs(all_data["x"]) < 12)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + if kind == "full": + expected_positions = np.argsort(np.abs(all_data["x"]), kind="stable") + np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], all_data[expected_positions]) From c1c89254516018477147bcf5db5106f57286ecc5 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 2 Apr 2026 16:52:40 +0200 Subject: [PATCH 15/68] Add append-run maintenance for full indexes Keep append-heavy full indexes cheap by storing each appended tail as a sorted run instead of rewriting the compact base sidecars on every append. Teach full loads to merge compact base + append runs on demand, with cache reuse for repeated reads, and clean up run sidecars correctly on replace/drop. 
Extend full descriptor metadata with run tracking while keeping the prototype index format version unchanged. Add regression tests for repeated appends on field and expression full indexes, including persistent reopen. --- src/blosc2/indexing.py | 107 +++++++++++++++++++++++++++++---- tests/ndarray/test_indexing.py | 69 +++++++++++++++++++++ 2 files changed, 165 insertions(+), 11 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index a5b12e64..d2202018 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -20,7 +20,7 @@ import blosc2 INDEXES_VLMETA_KEY = "blosc2_indexes" -INDEX_FORMAT_VERSION = 3 +INDEX_FORMAT_VERSION = 1 SELF_TARGET_NAME = "__self__" FLAG_ALL_NAN = np.uint8(1 << 0) @@ -155,6 +155,8 @@ def _copy_descriptor(descriptor: dict) -> dict: copied["reduced"] = descriptor["reduced"].copy() if descriptor.get("full") is not None: copied["full"] = descriptor["full"].copy() + if "runs" in copied["full"]: + copied["full"]["runs"] = [run.copy() for run in copied["full"]["runs"]] return copied @@ -497,6 +499,8 @@ def _build_full_descriptor( return { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], + "runs": [], + "next_run_id": 0, } @@ -995,6 +999,8 @@ def _build_full_descriptor_ooc( return { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], + "runs": [], + "next_run_id": 0, } run_items = max(int(array.chunks[0]), min(size, FULL_OOC_RUN_ITEMS)) runs = [] @@ -1042,6 +1048,8 @@ def _build_full_descriptor_ooc( return { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], + "runs": [], + "next_run_id": 0, } @@ -1317,6 +1325,9 @@ def _drop_descriptor_sidecars(descriptor: dict) -> None: if descriptor.get("full") is not None: _remove_sidecar_path(descriptor["full"]["values_path"]) _remove_sidecar_path(descriptor["full"]["positions_path"]) + for run in descriptor["full"].get("runs", ()): + _remove_sidecar_path(run.get("values_path")) + 
_remove_sidecar_path(run.get("positions_path")) def _replace_levels_descriptor(array: blosc2.NDArray, descriptor: dict, kind: str, persistent: bool) -> None: @@ -1434,10 +1445,41 @@ def _replace_full_descriptor( ) -> None: kind = descriptor["kind"] token = descriptor["token"] + full = descriptor["full"] + for run in full.get("runs", ()): + _remove_sidecar_path(run.get("values_path")) + _remove_sidecar_path(run.get("positions_path")) + _clear_cached_data(array, token) values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) - descriptor["full"]["values_path"] = values_sidecar["path"] - descriptor["full"]["positions_path"] = positions_sidecar["path"] + full["values_path"] = values_sidecar["path"] + full["positions_path"] = positions_sidecar["path"] + full["runs"] = [] + full["next_run_id"] = 0 + + +def _store_full_run_descriptor( + array: blosc2.NDArray, + descriptor: dict, + run_id: int, + sorted_values: np.ndarray, + positions: np.ndarray, +) -> dict: + kind = descriptor["kind"] + token = descriptor["token"] + persistent = descriptor["persistent"] + values_sidecar = _store_array_sidecar( + array, token, kind, "full_run", f"{run_id}.values", sorted_values, persistent + ) + positions_sidecar = _store_array_sidecar( + array, token, kind, "full_run", f"{run_id}.positions", positions, persistent + ) + return { + "id": run_id, + "length": len(sorted_values), + "values_path": values_sidecar["path"], + "positions_path": positions_sidecar["path"], + } def _append_full_descriptor( @@ -1446,17 +1488,21 @@ def _append_full_descriptor( full = descriptor.get("full") if full is None: raise RuntimeError("full index metadata is not available") - existing_values, existing_positions = _load_full_arrays(array, descriptor) appended_positions = np.arange(old_size, old_size + len(appended_values), dtype=np.int64) order = 
np.lexsort((appended_positions, appended_values)) - merged_values, merged_positions = _merge_sorted_slices( - existing_values, - existing_positions, + run_id = int(full.get("next_run_id", 0)) + run = _store_full_run_descriptor( + array, + descriptor, + run_id, appended_values[order], appended_positions[order], - np.dtype(descriptor["dtype"]), ) - _replace_full_descriptor(array, descriptor, merged_values, merged_positions, descriptor["persistent"]) + runs = list(full.get("runs", ())) + runs.append(run) + full["runs"] = runs + full["next_run_id"] = run_id + 1 + _clear_full_merge_cache(array, descriptor["token"]) def append_to_indexes(array: blosc2.NDArray, old_size: int, appended_values: np.ndarray) -> None: @@ -1566,12 +1612,51 @@ def _load_level_summaries(array: blosc2.NDArray, descriptor: dict, level: str) - return _load_array_sidecar(array, descriptor["token"], "summary", level, level_info["path"]) +def _full_merge_cache_key(array: blosc2.NDArray, token: str, name: str): + return _data_cache_key(array, token, "full_merged", name) + + +def _clear_full_merge_cache(array: blosc2.NDArray, token: str) -> None: + _DATA_CACHE.pop(_full_merge_cache_key(array, token, "values"), None) + _DATA_CACHE.pop(_full_merge_cache_key(array, token, "positions"), None) + + +def _load_full_run_arrays( + array: blosc2.NDArray, descriptor: dict, run: dict +) -> tuple[np.ndarray, np.ndarray]: + run_id = int(run["id"]) + token = descriptor["token"] + values = _load_array_sidecar(array, token, "full_run", f"{run_id}.values", run["values_path"]) + positions = _load_array_sidecar(array, token, "full_run", f"{run_id}.positions", run["positions_path"]) + return values, positions + + def _load_full_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: full = descriptor.get("full") if full is None: raise RuntimeError("full index metadata is not available") - values = _load_array_sidecar(array, descriptor["token"], "full", "values", full["values_path"]) - positions = 
_load_array_sidecar(array, descriptor["token"], "full", "positions", full["positions_path"]) + token = descriptor["token"] + runs = full.get("runs", ()) + if runs: + cached_values = _DATA_CACHE.get(_full_merge_cache_key(array, token, "values")) + cached_positions = _DATA_CACHE.get(_full_merge_cache_key(array, token, "positions")) + if cached_values is not None and cached_positions is not None: + return cached_values, cached_positions + + values = _load_array_sidecar(array, token, "full", "values", full["values_path"]) + positions = _load_array_sidecar(array, token, "full", "positions", full["positions_path"]) + if runs: + dtype = np.dtype(descriptor["dtype"]) + merged_values = values + merged_positions = positions + for run in runs: + run_values, run_positions = _load_full_run_arrays(array, descriptor, run) + merged_values, merged_positions = _merge_sorted_slices( + merged_values, merged_positions, run_values, run_positions, dtype + ) + _DATA_CACHE[_full_merge_cache_key(array, token, "values")] = merged_values + _DATA_CACHE[_full_merge_cache_key(array, token, "positions")] = merged_positions + return merged_values, merged_positions return values, positions diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index fc1a141e..5506e099 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -351,6 +351,52 @@ def test_append_keeps_full_index_sorted_access_current(): np.testing.assert_array_equal(arr.sort(order=["a", "b"])[:], expected) +def test_repeated_appends_keep_full_index_current(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + batches = [ + np.array([(0, 100), (3, 101)], dtype=dtype), + np.array([(2, 102), (1, 103), (4, 104)], dtype=dtype), + ] + expected = data + for nrun, batch in enumerate(batches, start=1): + arr.append(batch) + expected = 
np.concatenate((expected, batch)) + assert len(arr.indexes[0]["full"]["runs"]) == nrun + + expr = blosc2.lazyexpr("(a >= 1) & (a < 4)", arr.fields).where(arr) + assert expr.will_use_index() is True + + expected_mask = (expected["a"] >= 1) & (expected["a"] < 4) + np.testing.assert_array_equal(arr.sort(order=["a", "b"])[:], np.sort(expected, order=["a", "b"])) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def test_persistent_full_index_runs_survive_reopen(tmp_path): + path = tmp_path / "full_index_runs.b2nd" + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + batch1 = np.array([(0, 100), (3, 101)], dtype=dtype) + batch2 = np.array([(2, 102), (1, 103), (4, 104)], dtype=dtype) + expected = np.concatenate((data, batch1, batch2)) + arr.append(batch1) + arr.append(batch2) + + reopened = blosc2.open(path, mode="a") + assert len(reopened.indexes[0]["full"]["runs"]) == 2 + + expr = blosc2.lazyexpr("(a >= 1) & (a < 4)", reopened.fields).where(reopened) + expected_mask = (expected["a"] >= 1) & (expected["a"] < 4) + np.testing.assert_array_equal(reopened.sort(order=["a", "b"])[:], np.sort(expected, order=["a", "b"])) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_expression_index_matches_scan(kind): rng = np.random.default_rng(9) @@ -454,3 +500,26 @@ def test_append_keeps_expression_index_current(kind): if kind == "full": expected_positions = np.argsort(np.abs(all_data["x"]), kind="stable") np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], all_data[expected_positions]) + + +def test_repeated_appends_keep_full_expression_index_current(): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array([(-10, 0), (7, 1), (-3, 2), (1, 3)], 
dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind="full") + + batches = [ + np.array([(-4, 4), (12, 5)], dtype=dtype), + np.array([(-11, 6), (5, 7)], dtype=dtype), + ] + expected = data + for nrun, batch in enumerate(batches, start=1): + arr.append(batch) + expected = np.concatenate((expected, batch)) + assert len(arr.indexes[0]["full"]["runs"]) == nrun + + expr = blosc2.lazyexpr("(abs(x) >= 4) & (abs(x) < 12)", arr.fields).where(arr) + expected_mask = (np.abs(expected["x"]) >= 4) & (np.abs(expected["x"]) < 12) + expected_positions = np.argsort(np.abs(expected["x"]), kind="stable") + np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], expected[expected_positions]) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) From 7027fef081d637e02cef7cb762d7b400c9dfe89d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 3 Apr 2026 13:27:22 +0200 Subject: [PATCH 16/68] Improve full-index selective lookup with L1 and L2 caches --- bench/ndarray/expression_index_bench.py | 69 ++++- bench/ndarray/index_query_bench.py | 61 +++- src/blosc2/indexing.py | 378 +++++++++++++++++++++++- src/blosc2/ndarray.py | 6 + tests/ndarray/test_indexing.py | 87 ++++++ 5 files changed, 583 insertions(+), 18 deletions(-) diff --git a/bench/ndarray/expression_index_bench.py b/bench/ndarray/expression_index_bench.py index 1398e82c..87d8ba8c 100644 --- a/bench/ndarray/expression_index_bench.py +++ b/bench/ndarray/expression_index_bench.py @@ -29,6 +29,7 @@ RNG_SEED = 0 DEFAULT_OPLEVEL = 5 EXPRESSION = "abs(x)" +FULL_QUERY_MODES = ("auto", "selective-ooc", "whole-load") def dtype_token(dtype: np.dtype) -> str: @@ -103,6 +104,21 @@ def benchmark_index_once(arr: blosc2.NDArray, cond) -> tuple[float, int]: return elapsed, len(result) +def _with_full_query_mode(full_query_mode: str): + class _FullQueryModeScope: + def __enter__(self): + self.previous = os.environ.get("BLOSC2_FULL_EXACT_QUERY_MODE") + 
os.environ["BLOSC2_FULL_EXACT_QUERY_MODE"] = full_query_mode + + def __exit__(self, exc_type, exc, tb): + if self.previous is None: + os.environ.pop("BLOSC2_FULL_EXACT_QUERY_MODE", None) + else: + os.environ["BLOSC2_FULL_EXACT_QUERY_MODE"] = self.previous + + return _FullQueryModeScope() + + def index_sizes(descriptor: dict) -> tuple[int, int]: logical = 0 disk = 0 @@ -187,7 +203,14 @@ def _open_or_build_indexed_array( def benchmark_size( - size: int, size_dir: Path, dist: str, query_width: int, optlevel: int, x_dtype: np.dtype, in_mem: bool + size: int, + size_dir: Path, + dist: str, + query_width: int, + optlevel: int, + x_dtype: np.dtype, + in_mem: bool, + full_query_mode: str, ) -> list[dict]: get_data = _source_data_factory(size, dist, x_dtype) arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist, x_dtype), get_data) @@ -210,9 +233,10 @@ def benchmark_size( ) idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) idx_expr = idx_cond.where(idx_arr) - explanation = idx_expr.explain() + with _with_full_query_mode(full_query_mode): + explanation = idx_expr.explain() + cold_time, index_len = benchmark_index_once(idx_arr, idx_cond) logical_index_bytes, disk_index_bytes = index_sizes(idx_arr.indexes[0]) - cold_time, index_len = benchmark_index_once(idx_arr, idx_cond) rows.append( { @@ -231,6 +255,8 @@ def benchmark_size( "warm_speedup": None, "candidate_units": explanation["candidate_units"], "total_units": explanation["total_units"], + "lookup_path": explanation.get("lookup_path"), + "full_query_mode": full_query_mode, "logical_index_bytes": logical_index_bytes, "disk_index_bytes": disk_index_bytes, "index_pct": logical_index_bytes / base_bytes * 100, @@ -248,7 +274,8 @@ def measure_warm_queries(rows: list[dict], repeats: int) -> None: for result in rows: arr = result["_arr"] cond = result["_cond"] - index_runs = [benchmark_index_once(arr, cond)[0] for _ in range(repeats)] + with _with_full_query_mode(result["full_query_mode"]): + index_runs 
= [benchmark_index_once(arr, cond)[0] for _ in range(repeats)] warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None result["warm_ms"] = warm_ms result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms @@ -310,6 +337,12 @@ def parse_args() -> argparse.Namespace: default=False, help="Use the in-memory index builders. Disabled by default; pass --in-mem to force them.", ) + parser.add_argument( + "--full-query-mode", + choices=FULL_QUERY_MODES, + default="auto", + help="How full exact queries should run during the benchmark: auto, selective-ooc, or whole-load.", + ) return parser.parse_args() @@ -344,16 +377,18 @@ def run_benchmarks( optlevel: int, x_dtype: np.dtype, in_mem: bool, + full_query_mode: str, ) -> None: all_results = [] print("Expression range-query benchmark across index kinds") print( f"expr={EXPRESSION}, chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, dist={dist_label}, " - f"query_width={query_width:,}, optlevel={optlevel}, dtype={x_dtype.name}, in_mem={in_mem}" + f"query_width={query_width:,}, optlevel={optlevel}, dtype={x_dtype.name}, in_mem={in_mem}, " + f"full_query_mode={full_query_mode}" ) for dist in dists: for size in sizes: - size_results = benchmark_size(size, size_dir, dist, query_width, optlevel, x_dtype, in_mem) + size_results = benchmark_size(size, size_dir, dist, query_width, optlevel, x_dtype, in_mem, full_query_mode) all_results.extend(size_results) print() @@ -413,12 +448,30 @@ def main() -> None: if args.outdir is None: with tempfile.TemporaryDirectory() as tmpdir: run_benchmarks( - sizes, dists, Path(tmpdir), args.dist, args.query_width, args.repeats, args.optlevel, x_dtype, args.in_mem + sizes, + dists, + Path(tmpdir), + args.dist, + args.query_width, + args.repeats, + args.optlevel, + x_dtype, + args.in_mem, + args.full_query_mode, ) else: args.outdir.mkdir(parents=True, exist_ok=True) run_benchmarks( - sizes, dists, args.outdir, args.dist, args.query_width, args.repeats, 
args.optlevel, x_dtype, args.in_mem + sizes, + dists, + args.outdir, + args.dist, + args.query_width, + args.repeats, + args.optlevel, + x_dtype, + args.in_mem, + args.full_query_mode, ) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index cde0fd1a..7b0cf496 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -28,6 +28,7 @@ DISTS = ("sorted", "block-shuffled", "random") RNG_SEED = 0 DEFAULT_OPLEVEL = 5 +FULL_QUERY_MODES = ("auto", "selective-ooc", "whole-load") def dtype_token(dtype: np.dtype) -> str: @@ -124,6 +125,21 @@ def benchmark_index_once(arr: blosc2.NDArray, cond) -> tuple[float, int]: return elapsed, len(result) +def _with_full_query_mode(full_query_mode: str): + class _FullQueryModeScope: + def __enter__(self): + self.previous = os.environ.get("BLOSC2_FULL_EXACT_QUERY_MODE") + os.environ["BLOSC2_FULL_EXACT_QUERY_MODE"] = full_query_mode + + def __exit__(self, exc_type, exc, tb): + if self.previous is None: + os.environ.pop("BLOSC2_FULL_EXACT_QUERY_MODE", None) + else: + os.environ["BLOSC2_FULL_EXACT_QUERY_MODE"] = self.previous + + return _FullQueryModeScope() + + def index_sizes(descriptor: dict) -> tuple[int, int]: logical = 0 disk = 0 @@ -253,7 +269,14 @@ def _open_or_build_indexed_array( def benchmark_size( - size: int, size_dir: Path, dist: str, query_width: int, optlevel: int, id_dtype: np.dtype, in_mem: bool + size: int, + size_dir: Path, + dist: str, + query_width: int, + optlevel: int, + id_dtype: np.dtype, + in_mem: bool, + full_query_mode: str, ) -> list[dict]: get_data = _source_data_factory(size, dist, id_dtype) get_ordered_ids = _ordered_ids_factory(size, id_dtype) @@ -278,9 +301,10 @@ def benchmark_size( ) idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) idx_expr = idx_cond.where(idx_arr) - explanation = idx_expr.explain() + with _with_full_query_mode(full_query_mode): + explanation = idx_expr.explain() + cold_time, index_len = 
benchmark_index_once(idx_arr, idx_cond) logical_index_bytes, disk_index_bytes = index_sizes(idx_arr.indexes[0]) - cold_time, index_len = benchmark_index_once(idx_arr, idx_cond) rows.append( { @@ -299,6 +323,8 @@ def benchmark_size( "warm_speedup": None, "candidate_units": explanation["candidate_units"], "total_units": explanation["total_units"], + "lookup_path": explanation.get("lookup_path"), + "full_query_mode": full_query_mode, "logical_index_bytes": logical_index_bytes, "disk_index_bytes": disk_index_bytes, "index_pct": logical_index_bytes / base_bytes * 100, @@ -316,7 +342,8 @@ def measure_warm_queries(rows: list[dict], repeats: int) -> None: for result in rows: arr = result["_arr"] cond = result["_cond"] - index_runs = [benchmark_index_once(arr, cond)[0] for _ in range(repeats)] + with _with_full_query_mode(result["full_query_mode"]): + index_runs = [benchmark_index_once(arr, cond)[0] for _ in range(repeats)] warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None result["warm_ms"] = warm_ms result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms @@ -396,6 +423,12 @@ def parse_args() -> argparse.Namespace: default=False, help="Use the in-memory index builders. 
Disabled by default; pass --in-mem to force them.", ) + parser.add_argument( + "--full-query-mode", + choices=FULL_QUERY_MODES, + default="auto", + help="How full exact queries should run during the benchmark: auto, selective-ooc, or whole-load.", + ) return parser.parse_args() @@ -424,11 +457,21 @@ def main() -> None: args.optlevel, id_dtype, args.in_mem, + args.full_query_mode, ) else: args.outdir.mkdir(parents=True, exist_ok=True) run_benchmarks( - sizes, dists, args.outdir, args.dist, args.query_width, args.repeats, args.optlevel, id_dtype, args.in_mem + sizes, + dists, + args.outdir, + args.dist, + args.query_width, + args.repeats, + args.optlevel, + id_dtype, + args.in_mem, + args.full_query_mode, ) @@ -442,16 +485,20 @@ def run_benchmarks( optlevel: int, id_dtype: np.dtype, in_mem: bool, + full_query_mode: str, ) -> None: all_results = [] print("Structured range-query benchmark across index kinds") print( f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, dist={dist_label}, " - f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}" + f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}, " + f"full_query_mode={full_query_mode}" ) for dist in dists: for size in sizes: - size_results = benchmark_size(size, size_dir, dist, query_width, optlevel, id_dtype, in_mem) + size_results = benchmark_size( + size, size_dir, dist, query_width, optlevel, id_dtype, in_mem, full_query_mode + ) all_results.extend(size_results) print() diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index d2202018..939e117c 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -10,6 +10,7 @@ import ast import hashlib import math +import os import re import tempfile from dataclasses import dataclass @@ -35,9 +36,11 @@ _IN_MEMORY_INDEXES: dict[int, dict] = {} _DATA_CACHE: dict[tuple[int, str | None, str, str], np.ndarray] = {} +_SIDECAR_HANDLE_CACHE: dict[tuple[int, str | None, str, 
str], object] = {} BLOCK_GATHER_POSITIONS_THRESHOLD = 32 FULL_OOC_RUN_ITEMS = 2_000_000 FULL_OOC_MERGE_BUFFER_ITEMS = 500_000 +FULL_SELECTIVE_OOC_MAX_SPANS = 128 def _sanitize_token(token: str) -> str: @@ -329,6 +332,25 @@ def _clear_cached_data(array: blosc2.NDArray, token: str) -> None: keys = [key for key in _DATA_CACHE if key[:2] == prefix] for key in keys: _DATA_CACHE.pop(key, None) + handle_keys = [key for key in _SIDECAR_HANDLE_CACHE if key[:2] == prefix] + for key in handle_keys: + _SIDECAR_HANDLE_CACHE.pop(key, None) + + +def _sidecar_handle_cache_key(array: blosc2.NDArray, token: str, category: str, name: str): + return (_array_key(array), token, category, name) + + +def _open_sidecar_handle(array: blosc2.NDArray, token: str, category: str, name: str, path: str | None): + cache_key = _sidecar_handle_cache_key(array, token, category, name) + cached = _SIDECAR_HANDLE_CACHE.get(cache_key) + if cached is not None: + return cached + if path is None: + raise RuntimeError("sidecar handle path is not available") + handle = blosc2.open(path) + _SIDECAR_HANDLE_CACHE[cache_key] = handle + return handle def _operands_for_dependencies(values: np.ndarray, dependencies: list[str]) -> dict[str, np.ndarray]: @@ -367,6 +389,10 @@ def _summary_dtype(dtype: np.dtype) -> np.dtype: return np.dtype([("min", dtype), ("max", dtype), ("flags", np.uint8)]) +def _boundary_dtype(dtype: np.dtype) -> np.dtype: + return np.dtype([("start", dtype), ("end", dtype)]) + + def _segment_summary(segment: np.ndarray, dtype: np.dtype): flags = np.uint8(0) if dtype.kind == "f": @@ -394,6 +420,18 @@ def _compute_segment_summaries(values: np.ndarray, dtype: np.dtype, segment_len: return summaries +def _compute_sorted_boundaries(values: np.ndarray, dtype: np.dtype, segment_len: int) -> np.ndarray: + nsegments = math.ceil(values.shape[0] / segment_len) + boundaries = np.empty(nsegments, dtype=_boundary_dtype(dtype)) + + for idx in range(nsegments): + start = idx * segment_len + stop = min(start + 
segment_len, values.shape[0]) + segment = values[start:stop] + boundaries[idx] = (segment[0], segment[-1]) + return boundaries + + def _store_array_sidecar( array: blosc2.NDArray, token: str, @@ -484,6 +522,38 @@ def _build_levels_descriptor_ooc( return levels +def _sidecar_storage_geometry( + path: str | None, fallback_chunk_len: int, fallback_block_len: int +) -> tuple[int, int]: + if path is None: + return fallback_chunk_len, fallback_block_len + sidecar = blosc2.open(path) + return int(sidecar.chunks[0]), int(sidecar.blocks[0]) + + +def _rebuild_full_navigation_sidecars( + array: blosc2.NDArray, + token: str, + kind: str, + full: dict, + sorted_values: np.ndarray, + persistent: bool, +) -> None: + chunk_len, block_len = _sidecar_storage_geometry( + full.get("values_path"), int(array.chunks[0]), int(array.blocks[0]) + ) + l1 = _compute_sorted_boundaries(sorted_values, np.dtype(sorted_values.dtype), chunk_len) + l2 = _compute_sorted_boundaries(sorted_values, np.dtype(sorted_values.dtype), block_len) + l1_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l1", l1, persistent) + l2_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l2", l2, persistent) + full["l1_path"] = l1_sidecar["path"] + full["l2_path"] = l2_sidecar["path"] + full["sidecar_chunk_len"] = int(chunk_len) + full["sidecar_block_len"] = int(block_len) + full["l1_dtype"] = l1_sidecar["dtype"] + full["l2_dtype"] = l2_sidecar["dtype"] + + def _build_full_descriptor( array: blosc2.NDArray, token: str, @@ -496,12 +566,14 @@ def _build_full_descriptor( sorted_values = values[order] values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) - return { + full = { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], "runs": [], "next_run_id": 0, } + _rebuild_full_navigation_sidecars(array, token, kind, 
full, sorted_values, persistent) + return full def _position_dtype(max_value: int) -> np.dtype: @@ -996,12 +1068,14 @@ def _build_full_descriptor_ooc( positions_sidecar = _store_array_sidecar( array, token, kind, "full", "positions", positions, persistent ) - return { + full = { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], "runs": [], "next_run_id": 0, } + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) + return full run_items = max(int(array.chunks[0]), min(size, FULL_OOC_RUN_ITEMS)) runs = [] for run_id, start in enumerate(range(0, size, run_items)): @@ -1045,12 +1119,14 @@ def _build_full_descriptor_ooc( positions = np.load(final_run.positions_path, mmap_mode="r") values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) - return { + full = { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], "runs": [], "next_run_id": 0, } + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) + return full def _build_descriptor( @@ -1325,6 +1401,8 @@ def _drop_descriptor_sidecars(descriptor: dict) -> None: if descriptor.get("full") is not None: _remove_sidecar_path(descriptor["full"]["values_path"]) _remove_sidecar_path(descriptor["full"]["positions_path"]) + _remove_sidecar_path(descriptor["full"].get("l1_path")) + _remove_sidecar_path(descriptor["full"].get("l2_path")) for run in descriptor["full"].get("runs", ()): _remove_sidecar_path(run.get("values_path")) _remove_sidecar_path(run.get("positions_path")) @@ -1449,6 +1527,8 @@ def _replace_full_descriptor( for run in full.get("runs", ()): _remove_sidecar_path(run.get("values_path")) _remove_sidecar_path(run.get("positions_path")) + _remove_sidecar_path(full.get("l1_path")) + _remove_sidecar_path(full.get("l2_path")) 
_clear_cached_data(array, token) values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) @@ -1456,6 +1536,7 @@ def _replace_full_descriptor( full["positions_path"] = positions_sidecar["path"] full["runs"] = [] full["next_run_id"] = 0 + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) def _store_full_run_descriptor( @@ -1570,6 +1651,127 @@ def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | N ) +def _copy_sidecar_to_temp_run(path: str, length: int, dtype: np.dtype, workdir: Path, prefix: str) -> Path: + sidecar = blosc2.open(path) + out_path = workdir / f"{prefix}.npy" + output = np.lib.format.open_memmap(out_path, mode="w+", dtype=dtype, shape=(length,)) + chunk_len = int(sidecar.chunks[0]) + for chunk_id, start in enumerate(range(0, length, chunk_len)): + stop = min(start + chunk_len, length) + span = np.empty(stop - start, dtype=dtype) + sidecar.get_1d_span_numpy(span, chunk_id, 0, stop - start) + output[start:stop] = span + output.flush() + del output + return out_path + + +def _materialize_sorted_run( + values: np.ndarray, + positions: np.ndarray, + length: int, + value_dtype: np.dtype, + workdir: Path, + prefix: str, +) -> SortedRun: + values_path = workdir / f"{prefix}.values.npy" + positions_path = workdir / f"{prefix}.positions.npy" + run_values = np.lib.format.open_memmap(values_path, mode="w+", dtype=value_dtype, shape=(length,)) + run_positions = np.lib.format.open_memmap(positions_path, mode="w+", dtype=np.int64, shape=(length,)) + run_values[:] = values + run_positions[:] = positions + run_values.flush() + run_positions.flush() + del run_values, run_positions + return SortedRun(values_path, positions_path, length) + + +def _full_compaction_runs(array: blosc2.NDArray, descriptor: dict, workdir: Path) -> list[SortedRun]: + full = 
descriptor["full"] + dtype = np.dtype(descriptor["dtype"]) + token = descriptor["token"] + runs = [] + if full["values_path"] is not None and full["positions_path"] is not None: + length = int(array.shape[0]) - sum(int(run["length"]) for run in full.get("runs", ())) + base_values_path = _copy_sidecar_to_temp_run( + full["values_path"], length, dtype, workdir, "compact_base_values" + ) + base_positions_path = _copy_sidecar_to_temp_run( + full["positions_path"], length, np.dtype(np.int64), workdir, "compact_base_positions" + ) + runs.append(SortedRun(base_values_path, base_positions_path, length)) + else: + values = _load_array_sidecar(array, token, "full", "values", full["values_path"]) + positions = _load_array_sidecar(array, token, "full", "positions", full["positions_path"]) + runs.append(_materialize_sorted_run(values, positions, len(values), dtype, workdir, "compact_base")) + + for run in full.get("runs", ()): + run_length = int(run["length"]) + run_id = int(run["id"]) + if run["values_path"] is not None and run["positions_path"] is not None: + run_values_path = _copy_sidecar_to_temp_run( + run["values_path"], run_length, dtype, workdir, f"run_{run_id}_values" + ) + run_positions_path = _copy_sidecar_to_temp_run( + run["positions_path"], run_length, np.dtype(np.int64), workdir, f"run_{run_id}_positions" + ) + runs.append(SortedRun(run_values_path, run_positions_path, run_length)) + continue + run_values, run_positions = _load_full_run_arrays(array, descriptor, run) + runs.append( + _materialize_sorted_run(run_values, run_positions, run_length, dtype, workdir, f"run_{run_id}") + ) + return runs + + +def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> dict: + store = _load_store(array) + token = _resolve_index_token(store, field, name) + descriptor = store["indexes"][token] + if descriptor["kind"] != "full": + raise NotImplementedError("compact_index() is currently only implemented for full indexes") + if 
descriptor.get("stale", False): + raise RuntimeError("cannot compact a stale index; rebuild it first") + + full = descriptor["full"] + if not full.get("runs"): + if full.get("l1_path") is None or full.get("l2_path") is None: + sorted_values, positions = _load_full_arrays(array, descriptor) + _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) + _clear_full_merge_cache(array, descriptor["token"]) + _save_store(array, store) + return _copy_descriptor(descriptor) + + dtype = np.dtype(descriptor["dtype"]) + with tempfile.TemporaryDirectory(prefix="blosc2-index-compact-") as tmpdir: + workdir = Path(tmpdir) + runs = _full_compaction_runs(array, descriptor, workdir) + merge_id = 0 + while len(runs) > 1: + next_runs = [] + for idx in range(0, len(runs), 2): + if idx + 1 >= len(runs): + next_runs.append(runs[idx]) + continue + next_runs.append( + _merge_run_pair( + runs[idx], runs[idx + 1], workdir, dtype, merge_id, FULL_OOC_MERGE_BUFFER_ITEMS + ) + ) + merge_id += 1 + runs = next_runs + final_run = runs[0] + sorted_values = np.load(final_run.values_path, mmap_mode="r") + positions = np.load(final_run.positions_path, mmap_mode="r") + _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) + final_run.values_path.unlink(missing_ok=True) + final_run.positions_path.unlink(missing_ok=True) + + _clear_full_merge_cache(array, descriptor["token"]) + _save_store(array, store) + return _copy_descriptor(descriptor) + + def get_indexes(array: blosc2.NDArray) -> list[dict]: store = _load_store(array) return [_copy_descriptor(store["indexes"][key]) for key in sorted(store["indexes"])] @@ -1631,6 +1833,32 @@ def _load_full_run_arrays( return values, positions +def _load_full_navigation_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: + full = descriptor.get("full") + if full is None: + raise RuntimeError("full index metadata is not available") + l1_path = 
full.get("l1_path") + l2_path = full.get("l2_path") + if l1_path is None or l2_path is None: + raise RuntimeError("full index navigation metadata is not available") + token = descriptor["token"] + l1 = _load_array_sidecar(array, token, "full_nav", "l1", l1_path) + l2 = _load_array_sidecar(array, token, "full_nav", "l2", l2_path) + return l1, l2 + + +def _load_full_sidecar_handles(array: blosc2.NDArray, descriptor: dict): + full = descriptor.get("full") + if full is None: + raise RuntimeError("full index metadata is not available") + token = descriptor["token"] + values_sidecar = _open_sidecar_handle(array, token, "full_handle", "values", full["values_path"]) + positions_sidecar = _open_sidecar_handle( + array, token, "full_handle", "positions", full["positions_path"] + ) + return values_sidecar, positions_sidecar + + def _load_full_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: full = descriptor.get("full") if full is None: @@ -2143,11 +2371,126 @@ def _search_bounds(values: np.ndarray, plan: ExactPredicatePlan) -> tuple[int, i return lo, hi +def _candidate_units_from_boundaries(boundaries: np.ndarray, plan: ExactPredicatePlan) -> np.ndarray: + if len(boundaries) == 0: + return np.zeros(0, dtype=bool) + starts = boundaries["start"] + ends = boundaries["end"] + candidate = np.ones(len(boundaries), dtype=bool) + if plan.lower is not None: + candidate &= ends >= plan.lower if plan.lower_inclusive else ends > plan.lower + if plan.upper is not None: + candidate &= starts <= plan.upper if plan.upper_inclusive else starts < plan.upper + return candidate + + +def _full_query_mode_override() -> str: + mode = os.getenv("BLOSC2_FULL_EXACT_QUERY_MODE", "auto").strip().lower() + if mode not in {"auto", "selective-ooc", "whole-load"}: + return "auto" + return mode + + +def _contiguous_true_runs(mask: np.ndarray) -> list[tuple[int, int]]: + true_ids = np.flatnonzero(mask) + if len(true_ids) == 0: + return [] + breaks = 
np.nonzero(np.diff(true_ids) != 1)[0] + 1 + runs = [] + start = 0 + for stop in (*breaks, len(true_ids)): + part = true_ids[start:stop] + runs.append((int(part[0]), int(part[-1]) + 1)) + start = stop + return runs + + +def _full_supports_selective_ooc_lookup(array: blosc2.NDArray, descriptor: dict) -> bool: + full = descriptor.get("full") + if full is None or full.get("runs"): + return False + if not descriptor.get("persistent", False): + return False + if full.get("values_path") is None or full.get("positions_path") is None: + return False + if full.get("l1_path") is None or full.get("l2_path") is None: + return False + try: + values_sidecar, positions_sidecar = _load_full_sidecar_handles(array, descriptor) + except Exception: + return False + return ( + _supports_block_reads(array) + and _supports_block_reads(values_sidecar) + and _supports_block_reads(positions_sidecar) + ) + + +def _exact_positions_from_full_selective_ooc( + array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan +) -> np.ndarray: + full = descriptor["full"] + l1, l2 = _load_full_navigation_arrays(array, descriptor) + candidate_chunks = _candidate_units_from_boundaries(l1, plan) + if not np.any(candidate_chunks): + return np.empty(0, dtype=np.int64) + + candidate_blocks = _candidate_units_from_boundaries(l2, plan) + if not np.any(candidate_blocks): + return np.empty(0, dtype=np.int64) + + values_sidecar, positions_sidecar = _load_full_sidecar_handles(array, descriptor) + dtype = np.dtype(descriptor["dtype"]) + chunk_len = int(full["sidecar_chunk_len"]) + block_len = int(full["sidecar_block_len"]) + size = int(descriptor["shape"][0]) + parts = [] + span_count = 0 + + for chunk_id in np.flatnonzero(candidate_chunks): + chunk_start = int(chunk_id) * chunk_len + chunk_stop = min(chunk_start + chunk_len, size) + first_block = chunk_start // block_len + nblocks = math.ceil((chunk_stop - chunk_start) / block_len) + block_mask = np.asarray(candidate_blocks[first_block : first_block + 
nblocks], dtype=bool) + if not np.any(block_mask): + continue + span_runs = _contiguous_true_runs(block_mask) + span_count += len(span_runs) + if span_count > FULL_SELECTIVE_OOC_MAX_SPANS: + raise RuntimeError("too many candidate spans for selective full lookup") + + for block_start_idx, block_stop_idx in span_runs: + span_start = chunk_start + block_start_idx * block_len + span_stop = min(chunk_start + block_stop_idx * block_len, chunk_stop) + local_start = span_start - chunk_start + span_items = span_stop - span_start + span_values = np.empty(span_items, dtype=dtype) + values_sidecar.get_1d_span_numpy(span_values, int(chunk_id), local_start, span_items) + lo, hi = _search_bounds(span_values, plan) + if lo >= hi: + continue + matched = np.empty(hi - lo, dtype=np.int64) + positions_sidecar.get_1d_span_numpy(matched, int(chunk_id), local_start + lo, hi - lo) + parts.append(matched) + + if not parts: + return np.empty(0, dtype=np.int64) + positions = np.concatenate(parts) if len(parts) > 1 else parts[0] + return np.sort(positions.astype(np.int64, copy=False), kind="stable") + + def _exact_positions_from_full( array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan ) -> np.ndarray: if _range_is_empty(plan): return np.empty(0, dtype=np.int64) + mode = _full_query_mode_override() + if mode != "whole-load" and _full_supports_selective_ooc_lookup(array, descriptor): + try: + return _exact_positions_from_full_selective_ooc(array, descriptor, plan) + except RuntimeError: + pass sorted_values, positions = _load_full_arrays(array, descriptor) lo, hi = _search_bounds(sorted_values, plan) if lo >= hi: @@ -2619,6 +2962,31 @@ def _normalize_primary_order_target(array: blosc2.NDArray, order: str | None) -> return target, None +def _full_run_count(descriptor: dict | None) -> int: + if descriptor is None or descriptor.get("full") is None: + return 0 + return len(descriptor["full"].get("runs", ())) + + +def _full_lookup_path(descriptor: dict | None, *, ordered: bool) -> str 
| None: + if descriptor is None or descriptor.get("kind") != "full": + return None + if _full_run_count(descriptor): + return "in-memory-merge" + if ordered: + return "in-memory-order" + mode = _full_query_mode_override() + if mode == "whole-load": + return "whole-load" + if ( + descriptor.get("persistent") + and descriptor["full"].get("l1_path") + and descriptor["full"].get("l2_path") + ): + return "compact-selective-ooc" + return "in-memory" + + def _normalize_order_fields( array: blosc2.NDArray, order: str | list[str] | None ) -> tuple[dict, list[str | None]]: @@ -2907,6 +3275,8 @@ def explain_query(expr) -> dict: "exact_rows": ordered_plan.selected_rows if ordered_plan.usable else None, "filter_reason": filter_plan.reason, "filter_level": filter_plan.level, + "full_runs": _full_run_count(ordered_plan.descriptor), + "lookup_path": _full_lookup_path(ordered_plan.descriptor, ordered=True), "descriptor": ordered_plan.descriptor, } @@ -2926,5 +3296,7 @@ def explain_query(expr) -> dict: "candidate_chunks": plan.selected_units, "total_chunks": plan.total_units, "exact_rows": None if plan.exact_positions is None else len(plan.exact_positions), + "full_runs": _full_run_count(plan.descriptor), + "lookup_path": _full_lookup_path(plan.descriptor, ordered=False), "descriptor": plan.descriptor, } diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 91c57a07..63a1f5a3 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4852,6 +4852,12 @@ def rebuild_index(self, field: str | None = None, name: str | None = None) -> di return indexing.rebuild_index(self, field=field, name=name) + def compact_index(self, field: str | None = None, name: str | None = None) -> dict: + """Compact a ``full`` index by merging the compact base and append runs.""" + from . import indexing + + return indexing.compact_index(self, field=field, name=name) + @property def indexes(self) -> list[dict]: from . 
import indexing diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 5506e099..06af76d6 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -397,6 +397,31 @@ def test_persistent_full_index_runs_survive_reopen(tmp_path): np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) +def test_persistent_compact_full_exact_query_avoids_whole_sidecar_load(monkeypatch, tmp_path): + path = tmp_path / "full_selective_ooc.b2nd" + rng = np.random.default_rng(12) + data = np.arange(120_000, dtype=np.int64) + rng.shuffle(data) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(12_000,), blocks=(2_000,)) + arr.create_csindex() + + reopened = blosc2.open(path, mode="a") + indexing = __import__("blosc2.indexing", fromlist=["_load_array_sidecar"]) + original_load = indexing._load_array_sidecar + + def guarded_load(array, token, category, name, sidecar_path): + if category == "full" and name in {"values", "positions"}: + raise AssertionError("compact full exact lookup should not whole-load full sidecars") + return original_load(array, token, category, name, sidecar_path) + + monkeypatch.setattr(indexing, "_load_array_sidecar", guarded_load) + + expr = ((reopened >= 50_000) & (reopened < 50_010)).where(reopened) + explained = expr.explain() + assert explained["lookup_path"] == "compact-selective-ooc" + np.testing.assert_array_equal(expr.compute()[:], data[(data >= 50_000) & (data < 50_010)]) + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_expression_index_matches_scan(kind): rng = np.random.default_rng(9) @@ -523,3 +548,65 @@ def test_repeated_appends_keep_full_expression_index_current(): expected_positions = np.argsort(np.abs(expected["x"]), kind="stable") np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], expected[expected_positions]) np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def 
test_compact_full_index_clears_runs_and_preserves_results(tmp_path): + path = tmp_path / "compact_full_runs.b2nd" + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + batch1 = np.array([(0, 100), (3, 101)], dtype=dtype) + batch2 = np.array([(2, 102), (1, 103), (4, 104)], dtype=dtype) + expected = np.concatenate((data, batch1, batch2)) + arr.append(batch1) + arr.append(batch2) + + before = arr.indexes[0] + assert len(before["full"]["runs"]) == 2 + run_paths = [(run["values_path"], run["positions_path"]) for run in before["full"]["runs"]] + + compacted = arr.compact_index("a") + assert compacted["kind"] == "full" + assert compacted["full"]["runs"] == [] + assert compacted["full"]["l1_path"] is not None + assert compacted["full"]["l2_path"] is not None + + reopened = blosc2.open(path, mode="a") + assert reopened.indexes[0]["full"]["runs"] == [] + for values_path, positions_path in run_paths: + with pytest.raises(FileNotFoundError): + blosc2.open(values_path) + with pytest.raises(FileNotFoundError): + blosc2.open(positions_path) + + expr = blosc2.lazyexpr("(a >= 1) & (a < 4)", reopened.fields).where(reopened) + explained = expr.explain() + assert explained["full_runs"] == 0 + assert explained["lookup_path"] == "compact-selective-ooc" + expected_mask = (expected["a"] >= 1) & (expected["a"] < 4) + np.testing.assert_array_equal(reopened.sort(order=["a", "b"])[:], np.sort(expected, order=["a", "b"])) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def test_compact_full_expression_index_preserves_results(): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array([(-10, 0), (7, 1), (-3, 2), (1, 3)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind="full") + + batch1 = np.array([(-4, 4), 
(12, 5)], dtype=dtype) + batch2 = np.array([(-11, 6), (5, 7)], dtype=dtype) + expected = np.concatenate((data, batch1, batch2)) + arr.append(batch1) + arr.append(batch2) + + compacted = arr.compact_index() + assert compacted["full"]["runs"] == [] + + expr = blosc2.lazyexpr("(abs(x) >= 4) & (abs(x) < 12)", arr.fields).where(arr) + expected_mask = (np.abs(expected["x"]) >= 4) & (np.abs(expected["x"]) < 12) + expected_positions = np.argsort(np.abs(expected["x"]), kind="stable") + np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], expected[expected_positions]) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) From da8785e30e495457e434088fd84fd5a3a123d43f Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 3 Apr 2026 13:44:15 +0200 Subject: [PATCH 17/68] Add bounded run fallback and document compact_index API --- doc/reference/ndarray.rst | 16 ++++ src/blosc2/indexing.py | 163 ++++++++++++++++++++++++++++++++- src/blosc2/ndarray.py | 46 +++++++++- tests/ndarray/test_indexing.py | 49 ++++++++++ 4 files changed, 271 insertions(+), 3 deletions(-) diff --git a/doc/reference/ndarray.rst b/doc/reference/ndarray.rst index c70ea255..36a21408 100644 --- a/doc/reference/ndarray.rst +++ b/doc/reference/ndarray.rst @@ -31,6 +31,22 @@ In addition, all the functions from the :ref:`Lazy Functions ` s .. automethod:: __getitem__ .. automethod:: __setitem__ + Index Methods + ------------- + + The following methods are part of the public NDArray indexing lifecycle. + Use ``create_index`` / ``create_expr_index`` to build indexes, + ``rebuild_index`` when a stale index must be refreshed after unsupported + mutations, and ``compact_index`` to consolidate append-heavy ``full`` + indexes explicitly. + + .. automethod:: create_index + .. automethod:: create_csindex + .. automethod:: create_expr_index + .. automethod:: drop_index + .. automethod:: rebuild_index + .. automethod:: compact_index + Constructors ------------ .. 
_NDArrayConstructors: diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 939e117c..be6cb445 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -41,6 +41,8 @@ FULL_OOC_RUN_ITEMS = 2_000_000 FULL_OOC_MERGE_BUFFER_ITEMS = 500_000 FULL_SELECTIVE_OOC_MAX_SPANS = 128 +FULL_RUN_BOUNDED_FALLBACK_RUNS = 8 +FULL_RUN_BOUNDED_FALLBACK_ITEMS = 1_000_000 def _sanitize_token(token: str) -> str: @@ -1859,6 +1861,18 @@ def _load_full_sidecar_handles(array: blosc2.NDArray, descriptor: dict): return values_sidecar, positions_sidecar +def _load_full_run_sidecar_handles(array: blosc2.NDArray, descriptor: dict, run: dict): + run_id = int(run["id"]) + token = descriptor["token"] + values_sidecar = _open_sidecar_handle( + array, token, "full_run_handle", f"{run_id}.values", run["values_path"] + ) + positions_sidecar = _open_sidecar_handle( + array, token, "full_run_handle", f"{run_id}.positions", run["positions_path"] + ) + return values_sidecar, positions_sidecar + + def _load_full_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: full = descriptor.get("full") if full is None: @@ -2384,6 +2398,18 @@ def _candidate_units_from_boundaries(boundaries: np.ndarray, plan: ExactPredicat return candidate +def _full_runs_need_bounded_fallback(descriptor: dict) -> bool: + full = descriptor.get("full") + if full is None: + return False + runs = tuple(full.get("runs", ())) + if not runs: + return False + if len(runs) >= FULL_RUN_BOUNDED_FALLBACK_RUNS: + return True + return sum(int(run["length"]) for run in runs) >= FULL_RUN_BOUNDED_FALLBACK_ITEMS + + def _full_query_mode_override() -> str: mode = os.getenv("BLOSC2_FULL_EXACT_QUERY_MODE", "auto").strip().lower() if mode not in {"auto", "selective-ooc", "whole-load"}: @@ -2405,6 +2431,35 @@ def _contiguous_true_runs(mask: np.ndarray) -> list[tuple[int, int]]: return runs +def _sorted_chunk_boundaries_from_handle( + array: blosc2.NDArray, + token: str, + category: str, + name: str, + 
values_sidecar, + dtype: np.dtype, +) -> np.ndarray: + cache_key = _data_cache_key(array, token, category, name) + cached = _DATA_CACHE.get(cache_key) + if cached is not None: + return cached + + size = int(values_sidecar.shape[0]) + chunk_len = int(values_sidecar.chunks[0]) + nchunks = math.ceil(size / chunk_len) + boundaries = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + start_value = np.empty(1, dtype=dtype) + end_value = np.empty(1, dtype=dtype) + for chunk_id in range(nchunks): + chunk_start = chunk_id * chunk_len + chunk_stop = min(chunk_start + chunk_len, size) + values_sidecar.get_1d_span_numpy(start_value, chunk_id, 0, 1) + values_sidecar.get_1d_span_numpy(end_value, chunk_id, chunk_stop - chunk_start - 1, 1) + boundaries[chunk_id] = (start_value[0], end_value[0]) + _DATA_CACHE[cache_key] = boundaries + return boundaries + + def _full_supports_selective_ooc_lookup(array: blosc2.NDArray, descriptor: dict) -> bool: full = descriptor.get("full") if full is None or full.get("runs"): @@ -2426,7 +2481,39 @@ def _full_supports_selective_ooc_lookup(array: blosc2.NDArray, descriptor: dict) ) -def _exact_positions_from_full_selective_ooc( +def _exact_positions_from_sorted_chunks( + values_sidecar, + positions_sidecar, + boundaries: np.ndarray, + plan: ExactPredicatePlan, + chunk_len: int, + dtype: np.dtype, +) -> np.ndarray: + candidate_chunks = _candidate_units_from_boundaries(boundaries, plan) + if not np.any(candidate_chunks): + return np.empty(0, dtype=np.int64) + + parts = [] + size = int(values_sidecar.shape[0]) + for chunk_id in np.flatnonzero(candidate_chunks): + chunk_start = int(chunk_id) * chunk_len + chunk_stop = min(chunk_start + chunk_len, size) + span_items = chunk_stop - chunk_start + span_values = np.empty(span_items, dtype=dtype) + values_sidecar.get_1d_span_numpy(span_values, int(chunk_id), 0, span_items) + lo, hi = _search_bounds(span_values, plan) + if lo >= hi: + continue + matched = np.empty(hi - lo, dtype=np.int64) + 
positions_sidecar.get_1d_span_numpy(matched, int(chunk_id), lo, hi - lo) + parts.append(matched) + + if not parts: + return np.empty(0, dtype=np.int64) + return np.concatenate(parts) if len(parts) > 1 else parts[0] + + +def _exact_positions_from_compact_full_base( array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan ) -> np.ndarray: full = descriptor["full"] @@ -2443,7 +2530,7 @@ def _exact_positions_from_full_selective_ooc( dtype = np.dtype(descriptor["dtype"]) chunk_len = int(full["sidecar_chunk_len"]) block_len = int(full["sidecar_block_len"]) - size = int(descriptor["shape"][0]) + size = int(values_sidecar.shape[0]) parts = [] span_count = 0 @@ -2480,12 +2567,82 @@ def _exact_positions_from_full_selective_ooc( return np.sort(positions.astype(np.int64, copy=False), kind="stable") +def _exact_positions_from_full_runs_bounded( + array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan +) -> np.ndarray: + full = descriptor["full"] + dtype = np.dtype(descriptor["dtype"]) + parts = [] + + base_descriptor = descriptor.copy() + base_full = full.copy() + base_full["runs"] = [] + base_descriptor["full"] = base_full + if _full_supports_selective_ooc_lookup(array, base_descriptor): + base_positions = _exact_positions_from_compact_full_base(array, base_descriptor, plan) + if len(base_positions): + parts.append(base_positions) + else: + base_values = _load_array_sidecar(array, descriptor["token"], "full", "values", full["values_path"]) + base_positions = _load_array_sidecar( + array, descriptor["token"], "full", "positions", full["positions_path"] + ) + lo, hi = _search_bounds(base_values, plan) + if lo < hi: + parts.append(base_positions[lo:hi].astype(np.int64, copy=False)) + + for run in full.get("runs", ()): + if run.get("values_path") is None or run.get("positions_path") is None: + run_values, raw_run_positions = _load_full_run_arrays(array, descriptor, run) + lo, hi = _search_bounds(run_values, plan) + run_positions = ( + np.empty(0, 
dtype=np.int64) + if lo >= hi + else raw_run_positions[lo:hi].astype(np.int64, copy=False) + ) + else: + run_values_sidecar, run_positions_sidecar = _load_full_run_sidecar_handles( + array, descriptor, run + ) + chunk_boundaries = _sorted_chunk_boundaries_from_handle( + array, + descriptor["token"], + "full_run_bounds", + f"{int(run['id'])}.chunks", + run_values_sidecar, + dtype, + ) + run_positions = _exact_positions_from_sorted_chunks( + run_values_sidecar, + run_positions_sidecar, + chunk_boundaries, + plan, + int(run_values_sidecar.chunks[0]), + dtype, + ) + if len(run_positions): + parts.append(run_positions) + + if not parts: + return np.empty(0, dtype=np.int64) + positions = np.concatenate(parts) if len(parts) > 1 else parts[0] + return np.sort(positions.astype(np.int64, copy=False), kind="stable") + + +def _exact_positions_from_full_selective_ooc( + array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan +) -> np.ndarray: + return _exact_positions_from_compact_full_base(array, descriptor, plan) + + def _exact_positions_from_full( array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan ) -> np.ndarray: if _range_is_empty(plan): return np.empty(0, dtype=np.int64) mode = _full_query_mode_override() + if mode != "whole-load" and _full_runs_need_bounded_fallback(descriptor): + return _exact_positions_from_full_runs_bounded(array, descriptor, plan) if mode != "whole-load" and _full_supports_selective_ooc_lookup(array, descriptor): try: return _exact_positions_from_full_selective_ooc(array, descriptor, plan) @@ -2972,6 +3129,8 @@ def _full_lookup_path(descriptor: dict | None, *, ordered: bool) -> str | None: if descriptor is None or descriptor.get("kind") != "full": return None if _full_run_count(descriptor): + if not ordered and _full_runs_need_bounded_fallback(descriptor): + return "run-bounded-ooc" return "in-memory-merge" if ordered: return "in-memory-order" diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 63a1f5a3..5c796c34 
100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4853,7 +4853,51 @@ def rebuild_index(self, field: str | None = None, name: str | None = None) -> di return indexing.rebuild_index(self, field=field, name=name) def compact_index(self, field: str | None = None, name: str | None = None) -> dict: - """Compact a ``full`` index by merging the compact base and append runs.""" + """Compact a ``full`` index by merging its compact base and append runs. + + Parameters + ---------- + field : str or None, optional + Structured field identifying the target ``full`` index. Use + ``None`` to compact the value index for a plain 1-D array. + name : str or None, optional + Optional logical index label. When omitted and the array has a + single index, that index is selected automatically. + + Returns + ------- + out : dict + The updated index descriptor after compaction. + + Notes + ----- + This is currently implemented only for ``kind="full"`` indexes. It is + a structural maintenance operation: the compact base sidecars and any + pending append runs are merged into one compact ``full.values`` sidecar + and one compact ``full.positions`` sidecar. For persistent indexes, the + compact lookup metadata is rebuilt as part of the process and + ``full["runs"]`` becomes empty afterwards. + + Compaction does not change query results. It is useful after many + append operations, where ``full`` maintenance stays cheap on append by + recording sorted runs but later queries may still have extra work until + the index is consolidated explicitly. 
+ + Examples + -------- + >>> import blosc2 + >>> import numpy as np + >>> dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + >>> data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + >>> arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + >>> _ = arr.create_index(field="id", kind="full") + >>> _ = arr.append(np.array([(0, 100), (3, 101)], dtype=dtype)) + >>> len(arr.indexes[0]["full"]["runs"]) + 1 + >>> compacted = arr.compact_index("id") + >>> compacted["full"]["runs"] + [] + """ from . import indexing return indexing.compact_index(self, field=field, name=name) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 06af76d6..7ef0b1dc 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -610,3 +610,52 @@ def test_compact_full_expression_index_preserves_results(): expected_positions = np.argsort(np.abs(expected["x"]), kind="stable") np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], expected[expected_positions]) np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def test_persistent_large_run_full_query_uses_bounded_fallback(monkeypatch, tmp_path): + path = tmp_path / "large_run_fallback.b2nd" + dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + data = np.array([(10, 0), (20, 1), (30, 2), (40, 3)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(4,), blocks=(2,)) + arr.create_index(field="id", kind="full") + + for run in range(8): + batch = np.array([(100 + run, 10 + run)], dtype=dtype) + arr.append(batch) + + reopened = blosc2.open(path, mode="a") + indexing = __import__("blosc2.indexing", fromlist=["_load_full_arrays"]) + + def guarded_load_full_arrays(*args, **kwargs): + raise AssertionError("large-run bounded fallback should avoid _load_full_arrays") + + monkeypatch.setattr(indexing, "_load_full_arrays", guarded_load_full_arrays) + expr = blosc2.lazyexpr("(id >= 103) & (id <= 106)", 
reopened.fields).where(reopened) + explained = expr.explain() + assert explained["lookup_path"] == "run-bounded-ooc" + snapshot = reopened[:] + expected = snapshot[(snapshot["id"] >= 103) & (snapshot["id"] <= 106)] + np.testing.assert_array_equal(expr.compute()[:], expected) + + +def test_large_run_full_expression_query_uses_bounded_fallback(monkeypatch): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array([(-10, 0), (7, 1), (-3, 2), (1, 3)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind="full") + + for run, value in enumerate(range(20, 28)): + arr.append(np.array([(value, 10 + run)], dtype=dtype)) + + indexing = __import__("blosc2.indexing", fromlist=["_load_full_arrays"]) + + def guarded_load_full_arrays(*args, **kwargs): + raise AssertionError("large-run bounded fallback should avoid _load_full_arrays") + + monkeypatch.setattr(indexing, "_load_full_arrays", guarded_load_full_arrays) + expr = blosc2.lazyexpr("(abs(x) >= 22) & (abs(x) <= 25)", arr.fields).where(arr) + explained = expr.explain() + assert explained["lookup_path"] == "run-bounded-ooc" + snapshot = arr[:] + expected = snapshot[(np.abs(snapshot["x"]) >= 22) & (np.abs(snapshot["x"]) <= 25)] + np.testing.assert_array_equal(expr.compute()[:], expected) From 6b80e29f400a821045c9ef78778db88a8f8cf043 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 3 Apr 2026 14:14:09 +0200 Subject: [PATCH 18/68] Add tutorial on indexes --- doc/getting_started/tutorials.rst | 1 + .../tutorials/14.indexing-arrays.ipynb | 563 ++++++++++++++++++ src/blosc2/ndarray.py | 2 +- 3 files changed, 565 insertions(+), 1 deletion(-) create mode 100644 doc/getting_started/tutorials/14.indexing-arrays.ipynb diff --git a/doc/getting_started/tutorials.rst b/doc/getting_started/tutorials.rst index 563ba8ea..44bfdca6 100644 --- a/doc/getting_started/tutorials.rst +++ b/doc/getting_started/tutorials.rst @@ -17,3 +17,4 @@ Tutorials 
tutorials/09.ucodecs-ufilters tutorials/10.prefilters tutorials/11.vlarray + tutorials/14.indexing-arrays diff --git a/doc/getting_started/tutorials/14.indexing-arrays.ipynb b/doc/getting_started/tutorials/14.indexing-arrays.ipynb new file mode 100644 index 00000000..a7515c19 --- /dev/null +++ b/doc/getting_started/tutorials/14.indexing-arrays.ipynb @@ -0,0 +1,563 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b5f43e5fb3d24d4e", + "metadata": {}, + "source": [ + "# Indexing Arrays\n", + "\n", + "Blosc2 can attach indexes to 1-D `NDArray` objects and to fields inside 1-D structured arrays. These indexes accelerate selective queries, and `full` indexes can also drive ordered access directly through `sort(order=...)`, `indices(order=...)`, and `itersorted(...)`.\n", + "\n", + "This tutorial covers:\n", + "\n", + "- how to create field and expression indexes,\n", + "- how to tell whether a query is using an index,\n", + "- what sort of acceleration different index kinds can deliver on a selective query,\n", + "- how index persistence works,\n", + "- when to rebuild indexes,\n", + "- and a recommended workflow for keeping append-heavy `full` indexes compact.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2b6f2bb4ad3a4cb8", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "8c510216bc394cf9", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:11:20.790474Z", + "start_time": "2026-04-03T12:11:20.514656Z" + } + }, + "outputs": [], + "source": [ + "import statistics\n", + "import time\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "\n", + "import blosc2\n", + "\n", + "\n", + "def show_index_summary(label, descriptor):\n", + " print(\n", + " f\"{label}: kind={descriptor['kind']}, persistent={descriptor['persistent']}, \"\n", + " f\"ooc={descriptor['ooc']}, stale={descriptor['stale']}\"\n", + " )\n", + "\n", + "\n", + "def explain_subset(expr):\n", + " info = 
expr.explain()\n", + " keep = {}\n", + " for key in (\"will_use_index\", \"reason\", \"kind\", \"level\", \"lookup_path\", \"full_runs\"):\n", + " if key in info:\n", + " keep[key] = info[key]\n", + " return keep\n", + "\n", + "\n", + "def median_ms(func, repeats=5, warmup=1):\n", + " for _ in range(warmup):\n", + " func()\n", + " samples = []\n", + " for _ in range(repeats):\n", + " t0 = time.perf_counter()\n", + " func()\n", + " samples.append((time.perf_counter() - t0) * 1e3)\n", + " return statistics.median(samples)\n", + "\n", + "\n", + "paths = [\n", + " Path(\"indexing_tutorial_medium.b2nd\"),\n", + " Path(\"indexing_tutorial_append_full.b2nd\"),\n", + "]\n", + "for path in paths:\n", + " blosc2.remove_urlpath(path)" + ] + }, + { + "cell_type": "markdown", + "id": "28fbc94b52634f32", + "metadata": {}, + "source": [ + "## Index kinds and how to create them\n", + "\n", + "Blosc2 currently supports four index kinds:\n", + "\n", + "- `ultralight`: compact summaries only,\n", + "- `light`: summary levels plus lightweight per-block payloads,\n", + "- `medium`: richer exact-filter payloads,\n", + "- `full`: globally sorted payloads for exact filtering and ordered reuse.\n", + "\n", + "There is one active index per target field or expression. If you create another index on the same target, it replaces the previous one. The easiest way to compare kinds is to build them on separate arrays.\n", + "\n", + "This example uses ten million rows so the timing differences are visible without turning the tutorial into a long benchmark." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "d1a5a37585a045ca", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:12:14.968055Z", + "start_time": "2026-04-03T12:12:02.218201Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ultralight: kind=ultralight, persistent=False, ooc=False, stale=False\n", + "light: kind=light, persistent=False, ooc=True, stale=False\n", + "medium: kind=medium, persistent=False, ooc=True, stale=False\n", + "full: kind=full, persistent=False, ooc=True, stale=False\n" + ] + } + ], + "source": [ + "N_ROWS = 10_000_000\n", + "QUERY_TEXT = \"(id >= -25.0) & (id < 25.0)\"\n", + "\n", + "rng = np.random.default_rng(0)\n", + "dtype = np.dtype([(\"id\", np.float64), (\"payload\", np.int32)])\n", + "data = np.zeros(N_ROWS, dtype=dtype)\n", + "# Build a predictable id column, then shuffle it so the source data is not already ordered.\n", + "data[\"id\"] = np.arange(data.shape[0], dtype=np.float64) - data.shape[0] / 2\n", + "rng.shuffle(data[\"id\"])\n", + "data[\"payload\"] = np.arange(data.shape[0], dtype=np.int32)\n", + "\n", + "chunks = (250_000,)\n", + "blocks = (50_000,)\n", + "\n", + "indexed_arrays = {}\n", + "for kind in (\"ultralight\", \"light\", \"medium\", \"full\"):\n", + " arr = blosc2.asarray(data.copy(), chunks=chunks, blocks=blocks)\n", + " descriptor = arr.create_index(field=\"id\", kind=kind)\n", + " indexed_arrays[kind] = arr\n", + " show_index_summary(kind, descriptor)" + ] + }, + { + "cell_type": "markdown", + "id": "bc1cc9b122fe4052", + "metadata": {}, + "source": [ + "## Using an index for filtering\n", + "\n", + "Range predicates are planned automatically when you use `where(...)`. You can inspect the plan with `explain()` and compare the indexed result with a scan by passing `_use_index=False` to `compute()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "f1b3aaec965b42d6", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:12:20.542680Z", + "start_time": "2026-04-03T12:12:20.096087Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'medium', 'level': 'exact', 'lookup_path': None, 'full_runs': 0}\n", + "Matched rows: 50\n" + ] + } + ], + "source": [ + "medium_arr = indexed_arrays[\"medium\"]\n", + "expr = blosc2.lazyexpr(QUERY_TEXT, medium_arr.fields).where(medium_arr)\n", + "\n", + "print(explain_subset(expr))\n", + "\n", + "indexed = expr.compute()[:]\n", + "scanned = expr.compute(_use_index=False)[:]\n", + "np.testing.assert_array_equal(indexed, scanned)\n", + "print(f\"Matched rows: {len(indexed)}\")" + ] + }, + { + "cell_type": "markdown", + "id": "1db4bd16a95a48dd", + "metadata": {}, + "source": [ + "### Timing the query with and without indexes\n", + "\n", + "The next cell measures the same selective predicate on all four index kinds and compares it with a forced full scan. On this exact workload, `medium` and `full` usually show the clearest benefit because they carry richer exact-filter payloads." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c9e932b7561b4ff4", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:12:32.747545Z", + "start_time": "2026-04-03T12:12:24.632866Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selective filter over 10,000,000 rows\n", + "kind scan_ms index_ms speedup\n", + "ultralight 363.919 363.733 1.00x\n", + "light 363.916 22.456 16.21x\n", + "medium 366.537 24.952 14.69x\n", + "full 365.223 23.544 15.51x\n" + ] + } + ], + "source": [ + "timing_rows = []\n", + "expected = None\n", + "for kind, arr in indexed_arrays.items():\n", + " expr = blosc2.lazyexpr(QUERY_TEXT, arr.fields).where(arr)\n", + " result = expr.compute()[:]\n", + " if expected is None:\n", + " expected = result\n", + " else:\n", + " np.testing.assert_array_equal(result, expected)\n", + "\n", + " scan_ms = median_ms(lambda expr=expr: expr.compute(_use_index=False)[:], repeats=3)\n", + " index_ms = median_ms(lambda expr=expr: expr.compute()[:], repeats=3)\n", + " timing_rows.append((kind, scan_ms, index_ms, scan_ms / index_ms))\n", + "\n", + "print(f\"Selective filter over {N_ROWS:,} rows\")\n", + "print(f\"{'kind':<12} {'scan_ms':>10} {'index_ms':>10} {'speedup':>10}\")\n", + "for kind, scan_ms, index_ms, speedup in timing_rows:\n", + " print(f\"{kind:<12} {scan_ms:10.3f} {index_ms:10.3f} {speedup:10.2f}x\")" + ] + }, + { + "cell_type": "markdown", + "id": "7679d86361304087", + "metadata": {}, + "source": [ + "## `full` indexes and ordered access\n", + "\n", + "A `full` index stores a global sorted payload. This is the required index tier for direct ordered reuse. `create_csindex()` is just a convenience wrapper for `create_index(kind=\"full\")`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "9ffcb0d8d06a4daa", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:13:02.734653Z", + "start_time": "2026-04-03T12:13:02.675861Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sorted positions: [7 5 3 1 6 4 2 0]\n", + "Sorted rows:\n", + "[(1, 2) (1, 4) (1, 6) (1, 8) (2, 3) (2, 5) (2, 7) (2, 9)]\n" + ] + } + ], + "source": [ + "ordered_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int64)])\n", + "ordered_data = np.array(\n", + " [(2, 9), (1, 8), (2, 7), (1, 6), (2, 5), (1, 4), (2, 3), (1, 2)],\n", + " dtype=ordered_dtype,\n", + ")\n", + "ordered_arr = blosc2.asarray(ordered_data, chunks=(4,), blocks=(2,))\n", + "ordered_arr.create_csindex(\"id\")\n", + "\n", + "print(\"Sorted positions:\", ordered_arr.indices(order=[\"id\", \"payload\"])[:])\n", + "print(\"Sorted rows:\")\n", + "print(ordered_arr.sort(order=[\"id\", \"payload\"])[:])" + ] + }, + { + "cell_type": "markdown", + "id": "a77189a036524546", + "metadata": {}, + "source": [ + "## Expression indexes\n", + "\n", + "You can also index a deterministic scalar expression stream. Expression indexes are matched by normalized expression identity, so the same expression can be reused for filtering and ordered access." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7d337ce2f9fb4f32", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:13:10.701850Z", + "start_time": "2026-04-03T12:13:10.650458Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'in-memory', 'full_runs': 0}\n", + "Expression-order positions: [2 6 4 5 1 7]\n" + ] + } + ], + "source": [ + "expr_dtype = np.dtype([(\"x\", np.int64), (\"payload\", np.int32)])\n", + "expr_data = np.array([(-8, 0), (5, 1), (-2, 2), (11, 3), (3, 4), (-3, 5), (2, 6), (-5, 7)], dtype=expr_dtype)\n", + "expr_arr = blosc2.asarray(expr_data, chunks=(4,), blocks=(2,))\n", + "expr_arr.create_expr_index(\"abs(x)\", kind=\"full\", name=\"abs_x\")\n", + "\n", + "ordered_expr = blosc2.lazyexpr(\"(abs(x) >= 2) & (abs(x) < 8)\", expr_arr.fields).where(expr_arr)\n", + "print(explain_subset(ordered_expr))\n", + "print(\"Expression-order positions:\", ordered_expr.indices(order=\"abs(x)\").compute()[:])" + ] + }, + { + "cell_type": "markdown", + "id": "0a0a629ffed5480d", + "metadata": {}, + "source": [ + "## Persistence: automatic or manual?\n", + "\n", + "Index persistence follows the base array by default:\n", + "\n", + "- for a persistent array (`urlpath=...`), `persistent=None` means the index sidecars are persisted automatically,\n", + "- for an in-memory array, the index lives only in memory,\n", + "- on a persistent array, `persistent=False` keeps the index process-local instead of writing sidecars.\n", + "\n", + "In practice, if you want an index to survive reopen, persist the array and use the default behavior." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0be5f512928f48db", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:13:16.392311Z", + "start_time": "2026-04-03T12:13:13.976166Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "persistent medium: kind=medium, persistent=True, ooc=True, stale=False\n", + "Reopened index count: 1\n", + "Persisted sidecar path: indexing_tutorial_medium.__index__.id.medium.reduced.values.b2nd\n" + ] + } + ], + "source": [ + "persistent_arr = blosc2.asarray(data, urlpath=paths[0], mode=\"w\", chunks=chunks, blocks=blocks)\n", + "persistent_descriptor = persistent_arr.create_index(field=\"id\", kind=\"medium\")\n", + "show_index_summary(\"persistent medium\", persistent_descriptor)\n", + "\n", + "reopened = blosc2.open(paths[0], mode=\"a\")\n", + "print(f\"Reopened index count: {len(reopened.indexes)}\")\n", + "print(f\"Persisted sidecar path: {reopened.indexes[0]['reduced']['values_path']}\")" + ] + }, + { + "cell_type": "markdown", + "id": "5bfb14d1e0f945b7", + "metadata": {}, + "source": [ + "## When to rebuild an index\n", + "\n", + "Appending is special-cased and keeps compatible indexes current. General mutation and resize operations do not. After unsupported mutations, the index is marked stale and should be refreshed explicitly with `rebuild_index()`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "11f0cd1b910b409a", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:13:18.234684Z", + "start_time": "2026-04-03T12:13:18.166991Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stale after direct mutation: True\n", + "Stale after rebuild: False\n" + ] + } + ], + "source": [ + "mutable_arr = blosc2.asarray(np.arange(20, dtype=np.int64), chunks=(10,), blocks=(5,))\n", + "mutable_arr.create_index(kind=\"full\")\n", + "mutable_arr[:3] = -1\n", + "\n", + "print(\"Stale after direct mutation:\", mutable_arr.indexes[0][\"stale\"])\n", + "mutable_arr.rebuild_index()\n", + "print(\"Stale after rebuild:\", mutable_arr.indexes[0][\"stale\"])" + ] + }, + { + "cell_type": "markdown", + "id": "328a2c209dc246ba", + "metadata": {}, + "source": [ + "## Recommended workflow for append-heavy `full` indexes\n", + "\n", + "Appending to a `full` index is intentionally cheap: appended tails become sorted runs instead of forcing an immediate rewrite of the compact base sidecars.\n", + "\n", + "That means the recommended workflow is:\n", + "\n", + "1. create a persistent `full` index once,\n", + "2. append freely during ingestion,\n", + "3. let queries keep working while runs accumulate,\n", + "4. call `compact_index()` after ingestion windows or before latency-sensitive read phases.\n", + "\n", + "The next example uses a larger append-heavy array and times the same selective query before and after compaction. The exact query path reports whether it is using a compact lookup layout or a run-aware fallback. After compaction, `full[\"runs\"]` becomes empty again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "2e1a47a9cf7246e6", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:13:30.781045Z", + "start_time": "2026-04-03T12:13:28.516376Z" + } + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'run-bounded-ooc', 'full_runs': 40}\n", + "Pending runs: 40\n", + "Median query time before compaction: 3.250 ms\n", + "After compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'compact-selective-ooc', 'full_runs': 0}\n", + "Pending runs: 0\n", + "Median query time after compaction: 0.939 ms\n", + "Speedup after compaction: 3.46x\n" + ] + } + ], + "source": [ + "append_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int32)])\n", + "base_rows = 200_000\n", + "append_batch = 500\n", + "num_runs = 40\n", + "\n", + "append_data = np.zeros(base_rows, dtype=append_dtype)\n", + "append_data[\"id\"] = np.arange(base_rows, dtype=np.int64)\n", + "append_data[\"payload\"] = np.arange(base_rows, dtype=np.int32)\n", + "\n", + "append_arr = blosc2.asarray(append_data, urlpath=paths[1], mode=\"w\", chunks=(20_000,), blocks=(4_000,))\n", + "append_arr.create_index(field=\"id\", kind=\"full\")\n", + "\n", + "for run in range(num_runs):\n", + " start = 300_000 + run * append_batch\n", + " batch = np.zeros(append_batch, dtype=append_dtype)\n", + " batch[\"id\"] = np.arange(start, start + append_batch, dtype=np.int64)\n", + " batch[\"payload\"] = np.arange(append_batch, dtype=np.int32)\n", + " append_arr.append(batch)\n", + "\n", + "append_query = \"(id >= 310_000) & (id < 310_020)\"\n", + "append_expr = blosc2.lazyexpr(append_query, append_arr.fields).where(append_arr)\n", + "before_info = explain_subset(append_expr)\n", + "before_ms = median_ms(lambda: 
append_expr.compute()[:], repeats=5)\n", + "print(\"Before compaction:\", before_info)\n", + "print(\"Pending runs:\", len(append_arr.indexes[0][\"full\"][\"runs\"]))\n", + "print(f\"Median query time before compaction: {before_ms:.3f} ms\")\n", + "\n", + "append_arr.compact_index(\"id\")\n", + "append_expr = blosc2.lazyexpr(append_query, append_arr.fields).where(append_arr)\n", + "after_info = explain_subset(append_expr)\n", + "after_ms = median_ms(lambda: append_expr.compute()[:], repeats=5)\n", + "print(\"After compaction:\", after_info)\n", + "print(\"Pending runs:\", len(append_arr.indexes[0][\"full\"][\"runs\"]))\n", + "print(f\"Median query time after compaction: {after_ms:.3f} ms\")\n", + "print(f\"Speedup after compaction: {before_ms / after_ms:.2f}x\")" + ] + }, + { + "cell_type": "markdown", + "id": "1eb8f667d1ff4aba", + "metadata": {}, + "source": [ + "## Practical guidance\n", + "\n", + "- Use `medium` when your main goal is faster selective filtering.\n", + "- Use `full` when you also want ordered reuse through `sort(order=...)`, `indices(order=...)`, or `itersorted(...)`.\n", + "- Persist the base array if you want indexes to survive reopen automatically.\n", + "- After unsupported mutations, use `rebuild_index()`.\n", + "- For append-heavy `full` indexes, compact explicitly at convenient maintenance boundaries instead of on every append.\n", + "- Measure your own workload: compact indexes, predicate selectivity, and ordered access needs all affect which kind is best.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "9833102355db4ec0", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-03T12:13:36.744016Z", + "start_time": "2026-04-03T12:13:36.726709Z" + } + }, + "outputs": [], + "source": [ + "for path in paths:\n", + " blosc2.remove_urlpath(path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17489b2c3d2ac57", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 5c796c34..12c01481 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4721,7 +4721,7 @@ def create_index( self, field: str | None = None, kind: str = "light", - optlevel: int = 3, + optlevel: int = 5, granularity: str = "chunk", persistent: bool | None = None, in_mem: bool = False, From 1dd1f25086b2d797305bb726627969bf915a81e9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 3 Apr 2026 14:17:13 +0200 Subject: [PATCH 19/68] Docstrings for LazyExpr.explain() --- src/blosc2/indexing.py | 7 +++++++ src/blosc2/lazyexpr.py | 22 ++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index be6cb445..7ff8779e 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -3412,6 +3412,13 @@ def will_use_index(expr) -> bool: def explain_query(expr) -> dict: + """Return planning details for a lazy query. + + This is an internal helper behind :meth:`blosc2.LazyExpr.explain`. The + returned mapping summarizes whether indexing can be used, which index kind + was selected, and additional diagnostics such as candidate counts and the + lookup path chosen for ``full`` indexes. 
+ """ where = getattr(expr, "_where_args", None) order = getattr(expr, "_order", None) if order is not None: diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 07ca8d70..fd6a158d 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -3728,6 +3728,28 @@ def will_use_index(self) -> bool: return indexing.will_use_index(self) def explain(self) -> dict: + """Explain how this lazy query will be executed. + + Returns a dictionary describing the planner decision for the current + query. Typical fields include whether an index will be used, the chosen + index kind and level, candidate counts, and the lookup path selected + for ``full`` indexes. + + Returns: + dict: Query planning metadata for the current expression. + + Examples: + >>> import numpy as np + >>> import blosc2 + >>> arr = blosc2.asarray(np.arange(10)) + >>> _ = arr.create_index(kind="full") + >>> expr = blosc2.lazyexpr("(a >= 3) & (a < 6)", {"a": arr}).where(arr) + >>> info = expr.explain() + >>> info["will_use_index"] + True + >>> info["kind"] + 'full' + """ from . import indexing return indexing.explain_query(self) From 2e4785ee9890f58c09481642a2e7828de8084c95 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 4 Apr 2026 09:22:35 +0200 Subject: [PATCH 20/68] Rework light/medium indexes to chunk-local OPSI-style layout Replace the old block-local persistent payload format for light and medium with a chunk-local canonical layout using fully sorted chunk payloads, per-chunk offsets, chunk-level L1 boundaries, and persistent intrachunk L2 navigation sidecars. Update the builders, loaders, rebuild/append paths, and descriptor validation so rebuilt light and medium indexes only use the new chunk-local-v1 format and drop reliance on the old block-flattened payload assumptions. Add new persistent exact-query paths for light and medium that use chunk-level pruning plus L2-guided selective reads through sidecar span helpers, while preserving scan-equivalent output order. 
Switch light to chunk-local bucket geometry derived from the payload block length, allow wider bucket dtypes, and keep medium positions chunk-local instead of block-local. Improve explain() reporting for the new OOC lookup path with lookup_path="chunk-nav-ooc" and navigation candidate counts. --- src/blosc2/indexing.py | 883 +++++++++++++++++++++++++-------- tests/ndarray/test_indexing.py | 26 + 2 files changed, 691 insertions(+), 218 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 7ff8779e..f752ba2b 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -65,11 +65,16 @@ class IndexPlan: exact_positions: np.ndarray | None = None bucket_masks: np.ndarray | None = None bucket_len: int | None = None + chunk_len: int | None = None block_len: int | None = None lower: object | None = None lower_inclusive: bool = True upper: object | None = None upper_inclusive: bool = True + candidate_chunks: int = 0 + candidate_nav_segments: int = 0 + candidate_base_spans: int = 0 + lookup_path: str | None = None @dataclass(slots=True) @@ -442,12 +447,20 @@ def _store_array_sidecar( name: str, data: np.ndarray, persistent: bool, + *, + chunks: tuple[int, ...] | None = None, + blocks: tuple[int, ...] 
| None = None, ) -> dict: cache_key = _data_cache_key(array, token, category, name) if persistent: path = _sidecar_path(array, token, kind, f"{category}.{name}") blosc2.remove_urlpath(path) - blosc2.asarray(data, urlpath=path, mode="w") + kwargs = {"urlpath": path, "mode": "w"} + if chunks is not None: + kwargs["chunks"] = chunks + if blocks is not None: + kwargs["blocks"] = blocks + blosc2.asarray(data, **kwargs) if isinstance(data, np.memmap): _DATA_CACHE.pop(cache_key, None) else: @@ -627,20 +640,30 @@ def _build_reduced_descriptor( values: np.ndarray, persistent: bool, ) -> dict: - block_len = int(array.blocks[0]) - sorted_values, positions, offsets, _ = _build_block_sorted_payload(values, block_len) - - values_sidecar = _store_array_sidecar(array, token, kind, "reduced", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar( - array, token, kind, "reduced", "positions", positions, persistent + chunk_len = int(array.chunks[0]) + nav_segment_len = int(array.blocks[0]) + sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload( + values, chunk_len, nav_segment_len ) - offsets_sidecar = _store_array_sidecar(array, token, kind, "reduced", "offsets", offsets, persistent) - return { - "block_len": block_len, - "values_path": values_sidecar["path"], - "positions_path": positions_sidecar["path"], - "offsets_path": offsets_sidecar["path"], - } + l1 = _compute_sorted_boundaries(sorted_values, np.dtype(values.dtype), chunk_len) + reduced = _chunk_index_payload_storage( + array, + token, + kind, + "reduced", + "values", + sorted_values, + "positions", + positions, + offsets, + l1, + l2, + persistent, + chunk_len, + nav_segment_len, + ) + reduced["position_dtype"] = positions.dtype.str + return reduced def _open_temp_memmap(workdir: Path, name: str, dtype: np.dtype, shape: tuple[int, ...]) -> np.memmap: @@ -648,49 +671,207 @@ def _open_temp_memmap(workdir: Path, name: str, dtype: np.dtype, shape: tuple[in return 
np.lib.format.open_memmap(path, mode="w+", dtype=dtype, shape=shape) -def _build_reduced_descriptor_ooc( +def _segment_row_count(chunk_len: int, nav_segment_len: int) -> int: + return max(1, math.ceil(chunk_len / nav_segment_len)) + + +def _sidecar_block_len(sidecar: dict, fallback_block_len: int) -> int: + path = sidecar.get("path") + if path is None: + return fallback_block_len + return int(blosc2.open(path).blocks[0]) + + +def _build_chunk_sorted_payload( + values: np.ndarray, + chunk_len: int, + nav_segment_len: int, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.dtype]: + size = values.shape[0] + nchunks = math.ceil(size / chunk_len) + position_dtype = _position_dtype(chunk_len - 1) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty_like(values) + positions = np.empty(size, dtype=position_dtype) + l1 = np.empty(nchunks, dtype=_boundary_dtype(values.dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(values.dtype)) + + cursor = 0 + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk = values[start:stop] + order = np.argsort(chunk, kind="stable") + chunk_size = stop - start + next_cursor = cursor + chunk_size + chunk_sorted = chunk[order] + sorted_values[cursor:next_cursor] = chunk_sorted + positions[cursor:next_cursor] = order.astype(position_dtype, copy=False) + offsets[chunk_id + 1] = next_cursor + l1[chunk_id] = (chunk_sorted[0], chunk_sorted[-1]) + + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = cursor + segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, next_cursor) + l2[row_start + segment_id] = (sorted_values[seg_start], sorted_values[seg_stop - 1]) + for segment_id in range(segment_count, 
nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return sorted_values, positions, offsets, l2, position_dtype + + +def _build_chunk_sorted_payload_ooc( array: blosc2.NDArray, target: dict, - token: str, - kind: str, dtype: np.dtype, - persistent: bool, workdir: Path, -) -> dict: + prefix: str, + chunk_len: int, + nav_segment_len: int, +) -> tuple[np.memmap, np.memmap, np.ndarray, np.ndarray, np.dtype]: size = int(array.shape[0]) - block_len = int(array.blocks[0]) - nblocks = math.ceil(size / block_len) - position_dtype = _position_dtype(block_len - 1) - offsets = np.empty(nblocks + 1, dtype=np.int64) + nchunks = math.ceil(size / chunk_len) + position_dtype = _position_dtype(chunk_len - 1) + offsets = np.empty(nchunks + 1, dtype=np.int64) offsets[0] = 0 - sorted_values = _open_temp_memmap(workdir, f"{kind}_reduced_values", dtype, (size,)) - positions = _open_temp_memmap(workdir, f"{kind}_reduced_positions", position_dtype, (size,)) + sorted_values = _open_temp_memmap(workdir, f"{prefix}_values", dtype, (size,)) + positions = _open_temp_memmap(workdir, f"{prefix}_positions", position_dtype, (size,)) + l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) cursor = 0 - for block_id in range(nblocks): - start = block_id * block_len - stop = min(start + block_len, size) - block = _slice_values_for_target(array, target, start, stop) - order = np.argsort(block, kind="stable") - next_cursor = cursor + (stop - start) - sorted_values[cursor:next_cursor] = block[order] + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk = _slice_values_for_target(array, target, start, stop) + order = np.argsort(chunk, kind="stable") + chunk_size = stop - start + next_cursor = cursor + chunk_size + chunk_sorted = chunk[order] + 
sorted_values[cursor:next_cursor] = chunk_sorted positions[cursor:next_cursor] = order.astype(position_dtype, copy=False) + offsets[chunk_id + 1] = next_cursor + l1[chunk_id] = (chunk_sorted[0], chunk_sorted[-1]) + + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = cursor + segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, next_cursor) + l2[row_start + segment_id] = (sorted_values[seg_start], sorted_values[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] cursor = next_cursor - offsets[block_id + 1] = cursor - values_sidecar = _store_array_sidecar(array, token, kind, "reduced", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar( - array, token, kind, "reduced", "positions", positions, persistent + return sorted_values, positions, offsets, l2, position_dtype + + +def _chunk_index_payload_storage( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + payload_name: str, + payload: np.ndarray, + aux_name: str, + aux_payload: np.ndarray, + offsets: np.ndarray, + l1: np.ndarray, + l2: np.ndarray, + persistent: bool, + chunk_len: int, + nav_segment_len: int, +) -> dict: + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + payload_sidecar = _store_array_sidecar( + array, + token, + kind, + category, + payload_name, + payload, + persistent, + chunks=(chunk_len,), + blocks=(nav_segment_len,), + ) + aux_sidecar = _store_array_sidecar( + array, + token, + kind, + category, + aux_name, + aux_payload, + persistent, + chunks=(chunk_len,), + blocks=(nav_segment_len,), + ) + offsets_sidecar = _store_array_sidecar(array, token, kind, category, "offsets", offsets, persistent) + l1_sidecar = _store_array_sidecar(array, token, kind, f"{category}_nav", "l1", l1, persistent) + l2_sidecar = 
_store_array_sidecar( + array, + token, + kind, + f"{category}_nav", + "l2", + l2, + persistent, + chunks=(nsegments_per_chunk,), + blocks=(min(nsegments_per_chunk, max(1, nsegments_per_chunk)),), ) - offsets_sidecar = _store_array_sidecar(array, token, kind, "reduced", "offsets", offsets, persistent) return { - "block_len": block_len, - "values_path": values_sidecar["path"], - "positions_path": positions_sidecar["path"], + "layout": "chunk-local-v1", + "chunk_len": chunk_len, + "nav_segment_len": nav_segment_len, + "nsegments_per_chunk": nsegments_per_chunk, + "values_path": payload_sidecar["path"], + f"{aux_name}_path": aux_sidecar["path"], "offsets_path": offsets_sidecar["path"], + "l1_path": l1_sidecar["path"], + "l2_path": l2_sidecar["path"], } +def _build_reduced_descriptor_ooc( + array: blosc2.NDArray, + target: dict, + token: str, + kind: str, + dtype: np.dtype, + persistent: bool, + workdir: Path, +) -> dict: + chunk_len = int(array.chunks[0]) + nav_segment_len = int(array.blocks[0]) + sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload_ooc( + array, target, dtype, workdir, f"{kind}_reduced", chunk_len, nav_segment_len + ) + l1 = _compute_sorted_boundaries(np.asarray(sorted_values), dtype, chunk_len) + reduced = _chunk_index_payload_storage( + array, + token, + kind, + "reduced", + "values", + sorted_values, + "positions", + positions, + offsets, + l1, + l2, + persistent, + chunk_len, + nav_segment_len, + ) + reduced["position_dtype"] = positions.dtype.str + return reduced + + def _light_bucket_count(block_len: int) -> int: return max(1, min(64, block_len)) @@ -814,29 +995,40 @@ def _build_light_descriptor( optlevel: int, persistent: bool, ) -> dict: - block_len = int(array.blocks[0]) - bucket_count = _light_bucket_count(block_len) - bucket_len = math.ceil(block_len / bucket_count) + chunk_len = int(array.chunks[0]) + nav_segment_len = int(array.blocks[0]) + bucket_len = max(1, math.ceil(nav_segment_len / 64)) + bucket_count = 
math.ceil(chunk_len / bucket_len) value_lossy_bits = _light_value_lossy_bits(values.dtype, optlevel) - sorted_values, positions, offsets, _ = _build_block_sorted_payload(values, block_len) + sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload( + values, chunk_len, nav_segment_len + ) if value_lossy_bits > 0: sorted_values = _quantize_light_values_array(sorted_values, value_lossy_bits) - bucket_positions = (positions // bucket_len).astype(np.uint8, copy=False) - - values_sidecar = _store_array_sidecar(array, token, kind, "light", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar( - array, token, kind, "light", "bucket_positions", bucket_positions, persistent + bucket_dtype = _position_dtype(bucket_count - 1) + bucket_positions = (positions // bucket_len).astype(bucket_dtype, copy=False) + l1 = _compute_sorted_boundaries(sorted_values, np.dtype(sorted_values.dtype), chunk_len) + light = _chunk_index_payload_storage( + array, + token, + kind, + "light", + "values", + sorted_values, + "bucket_positions", + bucket_positions, + offsets, + l1, + l2, + persistent, + chunk_len, + nav_segment_len, ) - offsets_sidecar = _store_array_sidecar(array, token, kind, "light", "offsets", offsets, persistent) - return { - "block_len": block_len, - "bucket_count": bucket_count, - "bucket_len": bucket_len, - "value_lossy_bits": value_lossy_bits, - "values_path": values_sidecar["path"], - "bucket_positions_path": positions_sidecar["path"], - "offsets_path": offsets_sidecar["path"], - } + light["bucket_count"] = bucket_count + light["bucket_len"] = bucket_len + light["value_lossy_bits"] = value_lossy_bits + light["bucket_dtype"] = bucket_positions.dtype.str + return light def _build_light_descriptor_ooc( @@ -849,46 +1041,43 @@ def _build_light_descriptor_ooc( persistent: bool, workdir: Path, ) -> dict: - size = int(array.shape[0]) - block_len = int(array.blocks[0]) - nblocks = math.ceil(size / block_len) - bucket_count = 
_light_bucket_count(block_len) - bucket_len = math.ceil(block_len / bucket_count) + chunk_len = int(array.chunks[0]) + nav_segment_len = int(array.blocks[0]) + bucket_len = max(1, math.ceil(nav_segment_len / 64)) + bucket_count = math.ceil(chunk_len / bucket_len) value_lossy_bits = _light_value_lossy_bits(dtype, optlevel) - offsets = np.empty(nblocks + 1, dtype=np.int64) - offsets[0] = 0 - sorted_values = _open_temp_memmap(workdir, f"{kind}_light_values", dtype, (size,)) - bucket_positions = _open_temp_memmap(workdir, f"{kind}_light_bucket_positions", np.uint8, (size,)) - - cursor = 0 - for block_id in range(nblocks): - start = block_id * block_len - stop = min(start + block_len, size) - block = _slice_values_for_target(array, target, start, stop) - order = np.argsort(block, kind="stable") - block_values = block[order] - if value_lossy_bits > 0: - block_values = _quantize_light_values_array(block_values, value_lossy_bits) - next_cursor = cursor + (stop - start) - sorted_values[cursor:next_cursor] = block_values - bucket_positions[cursor:next_cursor] = (order // bucket_len).astype(np.uint8, copy=False) - cursor = next_cursor - offsets[block_id + 1] = cursor - - values_sidecar = _store_array_sidecar(array, token, kind, "light", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar( - array, token, kind, "light", "bucket_positions", bucket_positions, persistent + sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload_ooc( + array, target, dtype, workdir, f"{kind}_light", chunk_len, nav_segment_len ) - offsets_sidecar = _store_array_sidecar(array, token, kind, "light", "offsets", offsets, persistent) - return { - "block_len": block_len, - "bucket_count": bucket_count, - "bucket_len": bucket_len, - "value_lossy_bits": value_lossy_bits, - "values_path": values_sidecar["path"], - "bucket_positions_path": positions_sidecar["path"], - "offsets_path": offsets_sidecar["path"], - } + if value_lossy_bits > 0: + sorted_values[:] = 
_quantize_light_values_array(np.asarray(sorted_values), value_lossy_bits) + bucket_dtype = _position_dtype(bucket_count - 1) + bucket_positions = _open_temp_memmap( + workdir, f"{kind}_light_bucket_positions", bucket_dtype, positions.shape + ) + bucket_positions[:] = (np.asarray(positions) // bucket_len).astype(bucket_dtype, copy=False) + l1 = _compute_sorted_boundaries(np.asarray(sorted_values), dtype, chunk_len) + light = _chunk_index_payload_storage( + array, + token, + kind, + "light", + "values", + sorted_values, + "bucket_positions", + bucket_positions, + offsets, + l1, + l2, + persistent, + chunk_len, + nav_segment_len, + ) + light["bucket_count"] = bucket_count + light["bucket_len"] = bucket_len + light["value_lossy_bits"] = value_lossy_bits + light["bucket_dtype"] = bucket_positions.dtype.str + return light def _scalar_compare(left, right, dtype: np.dtype) -> int: @@ -1396,10 +1585,14 @@ def _drop_descriptor_sidecars(descriptor: dict) -> None: _remove_sidecar_path(descriptor["light"]["values_path"]) _remove_sidecar_path(descriptor["light"]["bucket_positions_path"]) _remove_sidecar_path(descriptor["light"]["offsets_path"]) + _remove_sidecar_path(descriptor["light"].get("l1_path")) + _remove_sidecar_path(descriptor["light"].get("l2_path")) if descriptor.get("reduced") is not None: _remove_sidecar_path(descriptor["reduced"]["values_path"]) _remove_sidecar_path(descriptor["reduced"]["positions_path"]) _remove_sidecar_path(descriptor["reduced"]["offsets_path"]) + _remove_sidecar_path(descriptor["reduced"].get("l1_path")) + _remove_sidecar_path(descriptor["reduced"].get("l2_path")) if descriptor.get("full") is not None: _remove_sidecar_path(descriptor["full"]["values_path"]) _remove_sidecar_path(descriptor["full"]["positions_path"]) @@ -1450,70 +1643,34 @@ def _replace_levels_descriptor_tail( def _replace_reduced_descriptor_tail( array: blosc2.NDArray, descriptor: dict, old_size: int, persistent: bool ) -> None: - reduced = descriptor["reduced"] + del old_size 
target = descriptor["target"] - token = descriptor["token"] - block_len = int(reduced["block_len"]) - start_block = old_size // block_len - block_start = start_block * block_len - tail_values = _slice_values_for_target(array, target, block_start, int(array.shape[0])) - sorted_values_tail, positions_tail, offsets_tail, _ = _build_block_sorted_payload(tail_values, block_len) - - values, positions, offsets = _load_reduced_arrays(array, descriptor) - prefix_items = int(offsets[start_block]) - updated_values = np.concatenate((values[:prefix_items], sorted_values_tail)) - updated_positions = np.concatenate((positions[:prefix_items], positions_tail)) - updated_offsets = np.concatenate((offsets[: start_block + 1], prefix_items + offsets_tail[1:])) - - kind = descriptor["kind"] - values_sidecar = _store_array_sidecar( - array, token, kind, "reduced", "values", updated_values, persistent - ) - positions_sidecar = _store_array_sidecar( - array, token, kind, "reduced", "positions", updated_positions, persistent - ) - offsets_sidecar = _store_array_sidecar( - array, token, kind, "reduced", "offsets", updated_offsets, persistent + reduced = descriptor["reduced"] + for key in ("values_path", "positions_path", "offsets_path", "l1_path", "l2_path"): + _remove_sidecar_path(reduced.get(key)) + rebuilt = _build_reduced_descriptor( + array, descriptor["token"], descriptor["kind"], _values_for_target(array, target), persistent ) - reduced["values_path"] = values_sidecar["path"] - reduced["positions_path"] = positions_sidecar["path"] - reduced["offsets_path"] = offsets_sidecar["path"] + descriptor["reduced"] = rebuilt def _replace_light_descriptor_tail( array: blosc2.NDArray, descriptor: dict, old_size: int, persistent: bool ) -> None: - light = descriptor["light"] + del old_size target = descriptor["target"] - token = descriptor["token"] - block_len = int(light["block_len"]) - start_block = old_size // block_len - block_start = start_block * block_len - tail_values = 
_slice_values_for_target(array, target, block_start, int(array.shape[0])) - value_lossy_bits = int(light["value_lossy_bits"]) - bucket_len = int(light["bucket_len"]) - sorted_values_tail, positions_tail, offsets_tail, _ = _build_block_sorted_payload(tail_values, block_len) - if value_lossy_bits > 0: - sorted_values_tail = _quantize_light_values_array(sorted_values_tail, value_lossy_bits) - bucket_positions_tail = (positions_tail // bucket_len).astype(np.uint8, copy=False) - - values, bucket_positions, offsets = _load_light_arrays(array, descriptor) - prefix_items = int(offsets[start_block]) - updated_values = np.concatenate((values[:prefix_items], sorted_values_tail)) - updated_bucket_positions = np.concatenate((bucket_positions[:prefix_items], bucket_positions_tail)) - updated_offsets = np.concatenate((offsets[: start_block + 1], prefix_items + offsets_tail[1:])) - - kind = descriptor["kind"] - values_sidecar = _store_array_sidecar(array, token, kind, "light", "values", updated_values, persistent) - positions_sidecar = _store_array_sidecar( - array, token, kind, "light", "bucket_positions", updated_bucket_positions, persistent - ) - offsets_sidecar = _store_array_sidecar( - array, token, kind, "light", "offsets", updated_offsets, persistent + light = descriptor["light"] + for key in ("values_path", "bucket_positions_path", "offsets_path", "l1_path", "l2_path"): + _remove_sidecar_path(light.get(key)) + rebuilt = _build_light_descriptor( + array, + descriptor["token"], + descriptor["kind"], + _values_for_target(array, target), + descriptor["optlevel"], + persistent, ) - light["values_path"] = values_sidecar["path"] - light["bucket_positions_path"] = positions_sidecar["path"] - light["offsets_path"] = offsets_sidecar["path"] + descriptor["light"] = rebuilt def _replace_full_descriptor( @@ -1802,8 +1959,14 @@ def _descriptor_for_target(array: blosc2.NDArray, target: dict) -> dict | None: return None if descriptor.get("version") != INDEX_FORMAT_VERSION: return None - 
if descriptor.get("kind") == "light" and "values_path" not in descriptor.get("light", {}): - return None + if descriptor.get("kind") == "light": + light = descriptor.get("light", {}) + if light.get("layout") != "chunk-local-v1" or "values_path" not in light: + return None + if descriptor.get("kind") == "medium": + reduced = descriptor.get("reduced", {}) + if reduced.get("layout") != "chunk-local-v1" or "values_path" not in reduced: + return None if tuple(descriptor.get("shape", ())) != tuple(array.shape): return None if tuple(descriptor.get("chunks", ())) != tuple(array.chunks): @@ -1916,6 +2079,39 @@ def _load_reduced_arrays( return values, positions, offsets +def _load_reduced_navigation_arrays( + array: blosc2.NDArray, descriptor: dict +) -> tuple[np.ndarray, np.ndarray]: + reduced = descriptor.get("reduced") + if reduced is None: + raise RuntimeError("reduced index metadata is not available") + token = descriptor["token"] + l1 = _load_array_sidecar(array, token, "reduced_nav", "l1", reduced["l1_path"]) + l2 = _load_array_sidecar(array, token, "reduced_nav", "l2", reduced["l2_path"]) + return l1, l2 + + +def _load_reduced_l1_array(array: blosc2.NDArray, descriptor: dict) -> np.ndarray: + reduced = descriptor.get("reduced") + if reduced is None: + raise RuntimeError("reduced index metadata is not available") + token = descriptor["token"] + return _load_array_sidecar(array, token, "reduced_nav", "l1", reduced["l1_path"]) + + +def _load_reduced_sidecar_handles(array: blosc2.NDArray, descriptor: dict): + reduced = descriptor.get("reduced") + if reduced is None: + raise RuntimeError("reduced index metadata is not available") + token = descriptor["token"] + values_sidecar = _open_sidecar_handle(array, token, "reduced_handle", "values", reduced["values_path"]) + positions_sidecar = _open_sidecar_handle( + array, token, "reduced_handle", "positions", reduced["positions_path"] + ) + l2_sidecar = _open_sidecar_handle(array, token, "reduced_nav_handle", "l2", 
reduced["l2_path"]) + return values_sidecar, positions_sidecar, l2_sidecar + + def _load_light_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray, np.ndarray]: light = descriptor.get("light") if light is None: @@ -1928,6 +2124,37 @@ def _load_light_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndar return values, positions, offsets +def _load_light_navigation_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: + light = descriptor.get("light") + if light is None: + raise RuntimeError("light index metadata is not available") + token = descriptor["token"] + l1 = _load_array_sidecar(array, token, "light_nav", "l1", light["l1_path"]) + l2 = _load_array_sidecar(array, token, "light_nav", "l2", light["l2_path"]) + return l1, l2 + + +def _load_light_l1_array(array: blosc2.NDArray, descriptor: dict) -> np.ndarray: + light = descriptor.get("light") + if light is None: + raise RuntimeError("light index metadata is not available") + token = descriptor["token"] + return _load_array_sidecar(array, token, "light_nav", "l1", light["l1_path"]) + + +def _load_light_sidecar_handles(array: blosc2.NDArray, descriptor: dict): + light = descriptor.get("light") + if light is None: + raise RuntimeError("light index metadata is not available") + token = descriptor["token"] + values_sidecar = _open_sidecar_handle(array, token, "light_handle", "values", light["values_path"]) + bucket_sidecar = _open_sidecar_handle( + array, token, "light_handle", "bucket_positions", light["bucket_positions_path"] + ) + l2_sidecar = _open_sidecar_handle(array, token, "light_nav_handle", "l2", light["l2_path"]) + return values_sidecar, bucket_sidecar, l2_sidecar + + def _normalize_scalar(value, dtype: np.dtype): if isinstance(value, np.generic): return value.item() @@ -2655,31 +2882,223 @@ def _exact_positions_from_full( return np.sort(positions[lo:hi], kind="stable") +def _chunk_nav_supports_selective_ooc_lookup(array: blosc2.NDArray, 
descriptor: dict, kind: str) -> bool: + if descriptor.get("kind") != kind or not descriptor.get("persistent", False): + return False + meta = descriptor.get("light" if kind == "light" else "reduced") + if meta is None or meta.get("layout") != "chunk-local-v1": + return False + required_paths = ("values_path", "l1_path", "l2_path") + if any(meta.get(name) is None for name in required_paths): + return False + if kind == "light": + if meta.get("bucket_positions_path") is None: + return False + try: + values_sidecar, bucket_sidecar, l2_sidecar = _load_light_sidecar_handles(array, descriptor) + except Exception: + return False + return ( + _supports_block_reads(array) + and _supports_block_reads(values_sidecar) + and _supports_block_reads(bucket_sidecar) + and _supports_block_reads(l2_sidecar) + ) + if meta.get("positions_path") is None: + return False + try: + values_sidecar, positions_sidecar, l2_sidecar = _load_reduced_sidecar_handles(array, descriptor) + except Exception: + return False + return ( + _supports_block_reads(array) + and _supports_block_reads(values_sidecar) + and _supports_block_reads(positions_sidecar) + and _supports_block_reads(l2_sidecar) + ) + + +def _chunk_nav_candidate_runs( + l2_row: np.ndarray, segment_count: int, plan: ExactPredicatePlan +) -> tuple[list[tuple[int, int]], int]: + segment_mask = _candidate_units_from_boundaries(l2_row[:segment_count], plan) + if not np.any(segment_mask): + return [], 0 + runs = _contiguous_true_runs(segment_mask) + return runs, int(np.count_nonzero(segment_mask)) + + +def _light_search_plan( + plan: ExactPredicatePlan, dtype: np.dtype, value_lossy_bits: int +) -> ExactPredicatePlan: + if value_lossy_bits <= 0 or plan.lower is None: + return plan + if dtype.kind in {"i", "u"}: + next_lower = plan.lower if plan.lower_inclusive else min(int(plan.lower) + 1, np.iinfo(dtype).max) + else: + next_lower = ( + plan.lower + if plan.lower_inclusive + else np.nextafter(np.asarray(plan.lower, dtype=dtype)[()], np.inf) + ) 
+ return ExactPredicatePlan( + base=plan.base, + descriptor=plan.descriptor, + target=plan.target, + field=plan.field, + lower=_quantize_light_value_scalar(next_lower, dtype, value_lossy_bits), + lower_inclusive=True, + upper=plan.upper, + upper_inclusive=plan.upper_inclusive, + ) + + +def _bucket_masks_from_light_chunk_nav_ooc( + array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan +) -> tuple[np.ndarray, int, int]: + light = descriptor["light"] + offsets = _load_array_sidecar(array, descriptor["token"], "light", "offsets", light["offsets_path"]) + l1 = _load_light_l1_array(array, descriptor) + candidate_chunks = _candidate_units_from_boundaries(l1, plan) + bucket_masks = np.zeros((len(l1), int(light["bucket_count"])), dtype=bool) + if not np.any(candidate_chunks): + return bucket_masks, 0, 0 + + values_sidecar, bucket_sidecar, l2_sidecar = _load_light_sidecar_handles(array, descriptor) + dtype = np.dtype(descriptor["dtype"]) + chunk_len = int(light["chunk_len"]) + nav_segment_len = int(light["nav_segment_len"]) + nsegments_per_chunk = int(light["nsegments_per_chunk"]) + bucket_dtype = np.dtype(light.get("bucket_dtype", np.uint16)) + value_lossy_bits = int(light.get("value_lossy_bits", 0)) + search_plan = _light_search_plan(plan, dtype, value_lossy_bits) + total_candidate_segments = 0 + l2_row = np.empty(nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + span_values = np.empty(chunk_len, dtype=dtype) + bucket_ids = np.empty(chunk_len, dtype=bucket_dtype) + + for chunk_id in np.flatnonzero(candidate_chunks): + chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id]) + segment_count = _segment_row_count(chunk_items, nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk) + segment_runs, candidate_segments = _chunk_nav_candidate_runs(l2_row, segment_count, plan) + total_candidate_segments += candidate_segments + if not segment_runs: + continue + + for seg_start_idx, seg_stop_idx in segment_runs: + local_start 
= seg_start_idx * nav_segment_len + local_stop = min(seg_stop_idx * nav_segment_len, chunk_items) + span_items = local_stop - local_start + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, int(chunk_id), local_start, span_items) + lo, hi = _search_bounds(values_view, search_plan) + if lo >= hi: + continue + bucket_view = bucket_ids[: hi - lo] + bucket_sidecar.get_1d_span_numpy(bucket_view, int(chunk_id), local_start + lo, hi - lo) + bucket_masks[int(chunk_id), bucket_view.astype(np.intp, copy=False)] = True + + return bucket_masks, int(np.count_nonzero(candidate_chunks)), total_candidate_segments + + +def _exact_positions_from_reduced_chunk_nav_ooc( + array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan +) -> tuple[np.ndarray, int, int]: + reduced = descriptor["reduced"] + offsets = _load_array_sidecar(array, descriptor["token"], "reduced", "offsets", reduced["offsets_path"]) + l1 = _load_reduced_l1_array(array, descriptor) + candidate_chunks = _candidate_units_from_boundaries(l1, plan) + if not np.any(candidate_chunks): + return np.empty(0, dtype=np.int64), 0, 0 + + values_sidecar, positions_sidecar, l2_sidecar = _load_reduced_sidecar_handles(array, descriptor) + dtype = np.dtype(descriptor["dtype"]) + chunk_len = int(reduced["chunk_len"]) + nav_segment_len = int(reduced["nav_segment_len"]) + nsegments_per_chunk = int(reduced["nsegments_per_chunk"]) + local_position_dtype = np.dtype(reduced.get("position_dtype", np.uint32)) + parts = [] + total_candidate_segments = 0 + l2_row = np.empty(nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + span_values = np.empty(chunk_len, dtype=dtype) + local_positions = np.empty(chunk_len, dtype=local_position_dtype) + + for chunk_id in np.flatnonzero(candidate_chunks): + chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id]) + segment_count = _segment_row_count(chunk_items, nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk) + 
segment_runs, candidate_segments = _chunk_nav_candidate_runs(l2_row, segment_count, plan) + total_candidate_segments += candidate_segments + if not segment_runs: + continue + + for seg_start_idx, seg_stop_idx in segment_runs: + local_start = seg_start_idx * nav_segment_len + local_stop = min(seg_stop_idx * nav_segment_len, chunk_items) + span_items = local_stop - local_start + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, int(chunk_id), local_start, span_items) + lo, hi = _search_bounds(values_view, plan) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, int(chunk_id), local_start + lo, hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + + if not parts: + return np.empty(0, dtype=np.int64), int(np.count_nonzero(candidate_chunks)), total_candidate_segments + positions = np.concatenate(parts) if len(parts) > 1 else parts[0] + return ( + np.sort(positions, kind="stable"), + int(np.count_nonzero(candidate_chunks)), + total_candidate_segments, + ) + + def _bit_count_sum(masks: np.ndarray) -> int: + if masks.dtype == bool: + return int(np.count_nonzero(masks)) return sum(int(mask).bit_count() for mask in masks.tolist()) def _bucket_masks_from_light( array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan -) -> np.ndarray: +) -> tuple[np.ndarray, int, int]: if _range_is_empty(plan): - return np.empty(0, dtype=np.uint64) + return np.empty((0, 0), dtype=bool), 0, 0 - summaries = _load_level_summaries(array, descriptor, "block") + if _chunk_nav_supports_selective_ooc_lookup(array, descriptor, "light"): + return _bucket_masks_from_light_chunk_nav_ooc(array, descriptor, plan) + + summaries = _load_level_summaries(array, descriptor, "chunk") dtype = np.dtype(descriptor["dtype"]) - candidate_blocks = _candidate_units_from_exact_plan(summaries, dtype, plan) - if not np.any(candidate_blocks): - return 
np.zeros(len(summaries), dtype=np.uint64) + candidate_chunks = _candidate_units_from_exact_plan(summaries, dtype, plan) + light = descriptor["light"] + chunk_len = int(light["chunk_len"]) + bucket_count = int(light["bucket_count"]) + bucket_masks = np.zeros((len(summaries), bucket_count), dtype=bool) + if not np.any(candidate_chunks): + return bucket_masks, 0, 0 sorted_values, bucket_positions, offsets = _load_light_arrays(array, descriptor) - light = descriptor["light"] value_lossy_bits = int(light.get("value_lossy_bits", 0)) - dtype = np.dtype(descriptor["dtype"]) - masks = np.zeros(len(summaries), dtype=np.uint64) - for block_id in np.flatnonzero(candidate_blocks): - start = int(offsets[block_id]) - stop = int(offsets[block_id + 1]) - block_values = sorted_values[start:stop] + nav_segment_len = int(light["nav_segment_len"]) + nsegments_per_chunk = int(light["nsegments_per_chunk"]) + l2 = _load_light_navigation_arrays(array, descriptor)[1] + total_candidate_segments = 0 + + for chunk_id in np.flatnonzero(candidate_chunks): + start = int(offsets[chunk_id]) + stop = int(offsets[chunk_id + 1]) + chunk_values = sorted_values[start:stop] + row_start = int(chunk_id) * nsegments_per_chunk + row_stop = row_start + _segment_row_count(min(chunk_len, stop - start), nav_segment_len) + segment_mask = _candidate_units_from_boundaries(l2[row_start:row_stop], plan) + total_candidate_segments += int(np.count_nonzero(segment_mask)) + if not np.any(segment_mask): + continue + if value_lossy_bits > 0: if plan.lower is not None: if dtype.kind in {"i", "u"}: @@ -2708,44 +3127,58 @@ def _bucket_masks_from_light( upper=plan.upper, upper_inclusive=plan.upper_inclusive, ) - lo, hi = _search_bounds(block_values, search_plan) + lo, hi = _search_bounds(chunk_values, search_plan) else: - lo, hi = _search_bounds(block_values, plan) + lo, hi = _search_bounds(chunk_values, plan) if lo >= hi: continue - masks[block_id] = _pack_bucket_mask(bucket_positions[start + lo : start + hi]) - return masks 
+ bucket_masks[ + int(chunk_id), np.unique(bucket_positions[start + lo : start + hi].astype(np.int64)) + ] = True + return bucket_masks, int(np.count_nonzero(candidate_chunks)), total_candidate_segments def _exact_positions_from_reduced( array: blosc2.NDArray, descriptor: dict, dtype: np.dtype, plan: ExactPredicatePlan -) -> np.ndarray: +) -> tuple[np.ndarray, int, int]: if _range_is_empty(plan): - return np.empty(0, dtype=np.int64) + return np.empty(0, dtype=np.int64), 0, 0 - summaries = _load_level_summaries(array, descriptor, "block") - candidate_blocks = _candidate_units_from_exact_plan(summaries, dtype, plan) - if not np.any(candidate_blocks): - return np.empty(0, dtype=np.int64) + if _chunk_nav_supports_selective_ooc_lookup(array, descriptor, "medium"): + return _exact_positions_from_reduced_chunk_nav_ooc(array, descriptor, plan) + + summaries = _load_level_summaries(array, descriptor, "chunk") + candidate_chunks = _candidate_units_from_exact_plan(summaries, dtype, plan) + if not np.any(candidate_chunks): + return np.empty(0, dtype=np.int64), 0, 0 sorted_values, local_positions, offsets = _load_reduced_arrays(array, descriptor) - block_len = int(descriptor["reduced"]["block_len"]) + chunk_len = int(descriptor["reduced"]["chunk_len"]) + nav_segment_len = int(descriptor["reduced"]["nav_segment_len"]) + nsegments_per_chunk = int(descriptor["reduced"]["nsegments_per_chunk"]) + l2 = _load_reduced_navigation_arrays(array, descriptor)[1] parts = [] - for block_id in np.flatnonzero(candidate_blocks): - start = int(offsets[block_id]) - stop = int(offsets[block_id + 1]) - block_values = sorted_values[start:stop] - lo, hi = _search_bounds(block_values, plan) + total_candidate_segments = 0 + for chunk_id in np.flatnonzero(candidate_chunks): + start = int(offsets[chunk_id]) + stop = int(offsets[chunk_id + 1]) + chunk_values = sorted_values[start:stop] + row_start = int(chunk_id) * nsegments_per_chunk + row_stop = row_start + _segment_row_count(min(chunk_len, stop - 
start), nav_segment_len) + segment_mask = _candidate_units_from_boundaries(l2[row_start:row_stop], plan) + total_candidate_segments += int(np.count_nonzero(segment_mask)) + if not np.any(segment_mask): + continue + lo, hi = _search_bounds(chunk_values, plan) if lo >= hi: continue - absolute = block_id * block_len local = local_positions[start + lo : start + hi].astype(np.int64, copy=False) - parts.append(absolute + local) + parts.append(chunk_id * chunk_len + local) if not parts: - return np.empty(0, dtype=np.int64) + return np.empty(0, dtype=np.int64), int(np.count_nonzero(candidate_chunks)), total_candidate_segments merged = np.concatenate(parts) if len(parts) > 1 else parts[0] - return np.sort(merged, kind="stable") + return np.sort(merged, kind="stable"), int(np.count_nonzero(candidate_chunks)), total_candidate_segments def _exact_positions_from_plan(plan: ExactPredicatePlan) -> np.ndarray | None: @@ -2755,7 +3188,7 @@ def _exact_positions_from_plan(plan: ExactPredicatePlan) -> np.ndarray | None: if kind == "medium": return _exact_positions_from_reduced( plan.base, plan.descriptor, np.dtype(plan.descriptor["dtype"]), plan - ) + )[0] return None @@ -2797,10 +3230,18 @@ def _plan_multi_exact_query(plans: list[ExactPredicatePlan]) -> IndexPlan | None base, exact_positions = multi_exact if len(exact_positions) >= int(base.shape[0]): return None + descriptor = _copy_descriptor(plans[0].descriptor) + lookup_path = None + if descriptor["kind"] == "medium": + lookup_path = ( + "chunk-nav-ooc" + if _chunk_nav_supports_selective_ooc_lookup(base, descriptor, "medium") + else "chunk-nav" + ) return IndexPlan( True, "multi-field exact indexes selected", - descriptor=_copy_descriptor(plans[0].descriptor), + descriptor=descriptor, base=base, target=plans[0].descriptor.get("target"), field=None, @@ -2808,6 +3249,7 @@ def _plan_multi_exact_query(plans: list[ExactPredicatePlan]) -> IndexPlan | None total_units=int(base.shape[0]), selected_units=len(exact_positions), 
exact_positions=exact_positions, + lookup_path=lookup_path, ) @@ -2829,7 +3271,7 @@ def _plan_single_exact_query(exact_plan: ExactPredicatePlan) -> IndexPlan: ) if kind == "medium": dtype = np.dtype(exact_plan.descriptor["dtype"]) - exact_positions = _exact_positions_from_reduced( + exact_positions, candidate_chunks, candidate_nav_segments = _exact_positions_from_reduced( exact_plan.base, exact_plan.descriptor, dtype, exact_plan ) return IndexPlan( @@ -2843,10 +3285,18 @@ def _plan_single_exact_query(exact_plan: ExactPredicatePlan) -> IndexPlan: total_units=exact_plan.base.shape[0], selected_units=len(exact_positions), exact_positions=exact_positions, + chunk_len=int(exact_plan.descriptor["reduced"]["chunk_len"]), + candidate_chunks=candidate_chunks, + candidate_nav_segments=candidate_nav_segments, + lookup_path="chunk-nav-ooc" + if _chunk_nav_supports_selective_ooc_lookup(exact_plan.base, exact_plan.descriptor, "medium") + else "chunk-nav", ) - bucket_masks = _bucket_masks_from_light(exact_plan.base, exact_plan.descriptor, exact_plan) + bucket_masks, candidate_chunks, candidate_nav_segments = _bucket_masks_from_light( + exact_plan.base, exact_plan.descriptor, exact_plan + ) light = exact_plan.descriptor["light"] - total_units = len(bucket_masks) * int(light["bucket_count"]) + total_units = bucket_masks.size selected_units = _bit_count_sum(bucket_masks) if selected_units < total_units: return IndexPlan( @@ -2861,11 +3311,16 @@ def _plan_single_exact_query(exact_plan: ExactPredicatePlan) -> IndexPlan: selected_units=selected_units, bucket_masks=bucket_masks, bucket_len=int(light["bucket_len"]), - block_len=int(light["block_len"]), + chunk_len=int(light["chunk_len"]), lower=exact_plan.lower, lower_inclusive=exact_plan.lower_inclusive, upper=exact_plan.upper, upper_inclusive=exact_plan.upper_inclusive, + candidate_chunks=candidate_chunks, + candidate_nav_segments=candidate_nav_segments, + lookup_path="chunk-nav-ooc" + if 
_chunk_nav_supports_selective_ooc_lookup(exact_plan.base, exact_plan.descriptor, "light") + else "chunk-nav", ) return IndexPlan(False, "available exact index does not prune any units for this predicate") @@ -2958,43 +3413,33 @@ def evaluate_segment_query( return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) -def evaluate_light_query( # noqa: C901 +def evaluate_light_query( expression: str, operands: dict, ne_args: dict, where: dict, plan: IndexPlan ) -> np.ndarray: del expression, operands, ne_args - if plan.base is None or plan.bucket_masks is None or plan.block_len is None or plan.bucket_len is None: - raise ValueError("light evaluation requires bucket masks and block geometry") + if plan.base is None or plan.bucket_masks is None or plan.chunk_len is None or plan.bucket_len is None: + raise ValueError("light evaluation requires bucket masks and chunk geometry") parts = [] total_len = int(plan.base.shape[0]) chunk_len = int(plan.base.chunks[0]) - bucket_count = int(plan.descriptor["light"]["bucket_count"]) where_x = where["_where_x"] - for block_id, bucket_mask in enumerate(plan.bucket_masks.tolist()): - mask = int(bucket_mask) - if mask == 0: + for chunk_id, bucket_mask in enumerate(plan.bucket_masks): + if not np.any(bucket_mask): continue - block_start = block_id * plan.block_len - block_stop = min(block_start + plan.block_len, total_len) - bucket_id = 0 - while bucket_id < bucket_count: - if not ((mask >> bucket_id) & 1): - bucket_id += 1 - continue - run_start = bucket_id - bucket_id += 1 - while bucket_id < bucket_count and ((mask >> bucket_id) & 1): - bucket_id += 1 - start = block_start + run_start * plan.bucket_len - stop = min(block_start + bucket_id * plan.bucket_len, block_stop) + chunk_start = chunk_id * plan.chunk_len + chunk_stop = min(chunk_start + plan.chunk_len, total_len) + for run_start, run_stop in _contiguous_true_runs(np.asarray(bucket_mask, dtype=bool)): + start = chunk_start + run_start * plan.bucket_len + stop = 
min(chunk_start + run_stop * plan.bucket_len, chunk_stop) if start >= stop: continue if _supports_block_reads(where_x): span = np.empty(stop - start, dtype=where_x.dtype) - chunk_id = start // chunk_len - local_start = start - chunk_id * chunk_len - where_x.get_1d_span_numpy(span, chunk_id, local_start, stop - start) + base_chunk_id = start // chunk_len + local_start = start - base_chunk_id * chunk_len + where_x.get_1d_span_numpy(span, base_chunk_id, local_start, stop - start) else: span = where_x[start:stop] if plan.target is not None and plan.target.get("source") == "expression": @@ -3459,10 +3904,12 @@ def explain_query(expr) -> dict: "secondary_refinement": False, "candidate_units": plan.selected_units, "total_units": plan.total_units, - "candidate_chunks": plan.selected_units, + "candidate_chunks": plan.candidate_chunks if plan.candidate_chunks else plan.selected_units, "total_chunks": plan.total_units, + "candidate_nav_segments": plan.candidate_nav_segments or None, + "candidate_base_spans": plan.candidate_base_spans or None, "exact_rows": None if plan.exact_positions is None else len(plan.exact_positions), "full_runs": _full_run_count(plan.descriptor), - "lookup_path": _full_lookup_path(plan.descriptor, ordered=False), + "lookup_path": plan.lookup_path or _full_lookup_path(plan.descriptor, ordered=False), "descriptor": plan.descriptor, } diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 7ef0b1dc..f882302a 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -192,6 +192,32 @@ def test_default_ooc_persistent_index_matches_scan_and_rebuilds(tmp_path, kind): assert rebuilt["ooc"] is True +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_chunk_local_index_descriptor_and_lookup_path(tmp_path, kind): + path = tmp_path / f"chunk_local_{kind}.b2nd" + rng = np.random.default_rng(11) + data = np.arange(240_000, dtype=np.int64) + rng.shuffle(data) + + arr = blosc2.asarray(data, urlpath=path, 
mode="w", chunks=(24_000,), blocks=(4_000,)) + descriptor = arr.create_index(kind=kind) + meta = descriptor["light"] if kind == "light" else descriptor["reduced"] + + assert meta["layout"] == "chunk-local-v1" + assert meta["chunk_len"] == arr.chunks[0] + assert meta["nav_segment_len"] == arr.blocks[0] + assert meta["l1_path"] is not None + assert meta["l2_path"] is not None + + reopened = blosc2.open(path, mode="a") + expr = (reopened == 123_456).where(reopened) + explanation = expr.explain() + + assert explanation["lookup_path"] == "chunk-nav-ooc" + assert explanation["candidate_nav_segments"] is not None + np.testing.assert_array_equal(expr.compute()[:], data[data == 123_456]) + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_small_default_index_builder_uses_ooc(kind): data = np.arange(100_000, dtype=np.int64) From 27517765b441e3676faab62e993983eb771d5075 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 4 Apr 2026 09:26:18 +0200 Subject: [PATCH 21/68] Fix OOC light/medium append rebuilds --- src/blosc2/indexing.py | 47 ++++++++++++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index f752ba2b..95ed8c0a 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -1648,9 +1648,21 @@ def _replace_reduced_descriptor_tail( reduced = descriptor["reduced"] for key in ("values_path", "positions_path", "offsets_path", "l1_path", "l2_path"): _remove_sidecar_path(reduced.get(key)) - rebuilt = _build_reduced_descriptor( - array, descriptor["token"], descriptor["kind"], _values_for_target(array, target), persistent - ) + if descriptor.get("ooc", False): + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: + rebuilt = _build_reduced_descriptor_ooc( + array, + target, + descriptor["token"], + descriptor["kind"], + np.dtype(descriptor["dtype"]), + persistent, + Path(tmpdir), + ) + else: + rebuilt = _build_reduced_descriptor( 
+ array, descriptor["token"], descriptor["kind"], _values_for_target(array, target), persistent + ) descriptor["reduced"] = rebuilt @@ -1662,14 +1674,27 @@ def _replace_light_descriptor_tail( light = descriptor["light"] for key in ("values_path", "bucket_positions_path", "offsets_path", "l1_path", "l2_path"): _remove_sidecar_path(light.get(key)) - rebuilt = _build_light_descriptor( - array, - descriptor["token"], - descriptor["kind"], - _values_for_target(array, target), - descriptor["optlevel"], - persistent, - ) + if descriptor.get("ooc", False): + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: + rebuilt = _build_light_descriptor_ooc( + array, + target, + descriptor["token"], + descriptor["kind"], + np.dtype(descriptor["dtype"]), + descriptor["optlevel"], + persistent, + Path(tmpdir), + ) + else: + rebuilt = _build_light_descriptor( + array, + descriptor["token"], + descriptor["kind"], + _values_for_target(array, target), + descriptor["optlevel"], + persistent, + ) descriptor["light"] = rebuilt From e401e7a2287b87736792ddc595f38645fef76b7b Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 4 Apr 2026 09:57:03 +0200 Subject: [PATCH 22/68] Tune medium nav density by optlevel --- src/blosc2/indexing.py | 49 +++++++++++++++++++++++++++++----- tests/ndarray/test_indexing.py | 6 ++++- 2 files changed, 47 insertions(+), 8 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 95ed8c0a..4b9d93a9 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -638,10 +638,11 @@ def _build_reduced_descriptor( token: str, kind: str, values: np.ndarray, + optlevel: int, persistent: bool, ) -> dict: chunk_len = int(array.chunks[0]) - nav_segment_len = int(array.blocks[0]) + nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), optlevel) sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload( values, chunk_len, nav_segment_len ) @@ -663,6 +664,7 @@ def 
_build_reduced_descriptor( nav_segment_len, ) reduced["position_dtype"] = positions.dtype.str + reduced["nav_segment_divisor"] = nav_segment_divisor return reduced @@ -682,6 +684,23 @@ def _sidecar_block_len(sidecar: dict, fallback_block_len: int) -> int: return int(blosc2.open(path).blocks[0]) +def _medium_nav_segment_divisor(optlevel: int) -> int: + if optlevel <= 1: + return 1 + if optlevel == 2: + return 2 + if optlevel == 3: + return 4 + if optlevel <= 6: + return 8 + return 16 + + +def _medium_nav_segment_len(block_len: int, optlevel: int) -> tuple[int, int]: + divisor = min(block_len, _medium_nav_segment_divisor(int(optlevel))) + return max(1, block_len // divisor), divisor + + def _build_chunk_sorted_payload( values: np.ndarray, chunk_len: int, @@ -843,11 +862,12 @@ def _build_reduced_descriptor_ooc( token: str, kind: str, dtype: np.dtype, + optlevel: int, persistent: bool, workdir: Path, ) -> dict: chunk_len = int(array.chunks[0]) - nav_segment_len = int(array.blocks[0]) + nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), optlevel) sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload_ooc( array, target, dtype, workdir, f"{kind}_reduced", chunk_len, nav_segment_len ) @@ -869,6 +889,7 @@ def _build_reduced_descriptor_ooc( nav_segment_len, ) reduced["position_dtype"] = positions.dtype.str + reduced["nav_segment_divisor"] = nav_segment_divisor return reduced @@ -1393,7 +1414,9 @@ def create_index( else None ) reduced = ( - _build_reduced_descriptor_ooc(array, target, token, kind, dtype, persistent, workdir) + _build_reduced_descriptor_ooc( + array, target, token, kind, dtype, optlevel, persistent, workdir + ) if kind == "medium" else None ) @@ -1427,7 +1450,9 @@ def create_index( else None ) reduced = ( - _build_reduced_descriptor(array, token, kind, values, persistent) if kind == "medium" else None + _build_reduced_descriptor(array, token, kind, values, optlevel, persistent) + if kind == "medium" + else 
None ) full = _build_full_descriptor(array, token, kind, values, persistent) if kind == "full" else None descriptor = _build_descriptor( @@ -1493,7 +1518,9 @@ def create_expr_index( else None ) reduced = ( - _build_reduced_descriptor_ooc(array, target, token, kind, dtype, persistent, workdir) + _build_reduced_descriptor_ooc( + array, target, token, kind, dtype, optlevel, persistent, workdir + ) if kind == "medium" else None ) @@ -1527,7 +1554,9 @@ def create_expr_index( else None ) reduced = ( - _build_reduced_descriptor(array, token, kind, values, persistent) if kind == "medium" else None + _build_reduced_descriptor(array, token, kind, values, optlevel, persistent) + if kind == "medium" + else None ) full = _build_full_descriptor(array, token, kind, values, persistent) if kind == "full" else None descriptor = _build_descriptor( @@ -1656,12 +1685,18 @@ def _replace_reduced_descriptor_tail( descriptor["token"], descriptor["kind"], np.dtype(descriptor["dtype"]), + descriptor["optlevel"], persistent, Path(tmpdir), ) else: rebuilt = _build_reduced_descriptor( - array, descriptor["token"], descriptor["kind"], _values_for_target(array, target), persistent + array, + descriptor["token"], + descriptor["kind"], + _values_for_target(array, target), + descriptor["optlevel"], + persistent, ) descriptor["reduced"] = rebuilt diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index f882302a..2c8959ba 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -205,10 +205,14 @@ def test_chunk_local_index_descriptor_and_lookup_path(tmp_path, kind): assert meta["layout"] == "chunk-local-v1" assert meta["chunk_len"] == arr.chunks[0] - assert meta["nav_segment_len"] == arr.blocks[0] + expected_nav_len = arr.blocks[0] if kind == "light" else arr.blocks[0] // 8 + assert meta["nav_segment_len"] == expected_nav_len assert meta["l1_path"] is not None assert meta["l2_path"] is not None + if kind == "medium": + assert 
meta["nav_segment_divisor"] == 8 + reopened = blosc2.open(path, mode="a") expr = (reopened == 123_456).where(reopened) explanation = expr.explain() From 87d0b960362f0ca0335f05d37e7b43be8878e5ed Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 4 Apr 2026 09:59:27 +0200 Subject: [PATCH 23/68] Release compaction memmaps before unlink on Windows --- src/blosc2/indexing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 4b9d93a9..f759bd87 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -1983,6 +1983,7 @@ def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | N sorted_values = np.load(final_run.values_path, mmap_mode="r") positions = np.load(final_run.positions_path, mmap_mode="r") _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) + del sorted_values, positions final_run.values_path.unlink(missing_ok=True) final_run.positions_path.unlink(missing_ok=True) From 7fb696d7a71a32ed6573cffd5d21b167fd809413 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 4 Apr 2026 11:40:15 +0200 Subject: [PATCH 24/68] Split indexing accelerators into indexing_ext and widen dtype coverage Move indexing-specific Cython helpers out of blosc2_ext.pyx into the new src/blosc2/indexing_ext.pyx module and wire indexing.py plus the CMake build to use the dedicated extension. Keep the accelerated query paths for light and medium but extend their typed dispatch beyond float64/int64 to cover the core numeric family: float32, float64, int8/16/32/64, and uint8/16/32/64. Retain the existing Python/NumPy fallback for unsupported dtypes. Add dispatch-focused indexing tests covering the accelerated numeric dtypes for medium, representative light numeric paths, and an unsupported float16 fallback case. Fix unsigned light lossy quantization masks so uint* dtypes do not overflow during index build. 
--- CMakeLists.txt | 12 +- src/blosc2/blosc2_ext.pyx | 1 + src/blosc2/indexing.py | 97 ++- src/blosc2/indexing_ext.pyx | 1495 ++++++++++++++++++++++++++++++++ tests/ndarray/test_indexing.py | 60 ++ 5 files changed, 1644 insertions(+), 21 deletions(-) create mode 100644 src/blosc2/indexing_ext.pyx diff --git a/CMakeLists.txt b/CMakeLists.txt index ed326b2a..9397a6db 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,11 +41,20 @@ add_custom_command( DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/blosc2_ext.pyx" VERBATIM) +add_custom_command( + OUTPUT indexing_ext.c + COMMAND Python::Interpreter -m cython + "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/indexing_ext.pyx" --output-file indexing_ext.c + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/indexing_ext.pyx" + VERBATIM) + # ...and add it to the target Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI) +Python_add_library(indexing_ext MODULE indexing_ext.c WITH_SOABI) # We need to link against NumPy target_link_libraries(blosc2_ext PRIVATE Python::NumPy) +target_link_libraries(indexing_ext PRIVATE Python::NumPy) # Fetch and build miniexpr library include(FetchContent) @@ -72,6 +81,7 @@ FetchContent_MakeAvailable(miniexpr) target_link_libraries(blosc2_ext PRIVATE miniexpr_static) target_compile_features(blosc2_ext PRIVATE c_std_11) +target_compile_features(indexing_ext PRIVATE c_std_11) if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang") execute_process( COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir @@ -148,7 +158,7 @@ endif() # Python extension -> site-packages/blosc2 install( - TARGETS blosc2_ext + TARGETS blosc2_ext indexing_ext LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/blosc2 ) diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 50888a9c..c36c51c5 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -59,6 +59,7 @@ ctypedef fused T: int32_t int64_t + cdef extern from "": int printf(const char *format, ...) 
nogil diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index f759bd87..7688e4d9 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -20,6 +20,8 @@ import blosc2 +from . import indexing_ext + INDEXES_VLMETA_KEY = "blosc2_indexes" INDEX_FORMAT_VERSION = 1 SELF_TARGET_NAME = "__self__" @@ -917,7 +919,8 @@ def _quantize_integer_array(values: np.ndarray, bits: int) -> np.ndarray: if bits <= 0: return values dtype = np.dtype(values.dtype) - mask = np.asarray(~((1 << bits) - 1), dtype=dtype)[()] + base_mask = np.iinfo(dtype).max if dtype.kind == "u" else -1 + mask = np.asarray(base_mask ^ ((1 << bits) - 1), dtype=dtype)[()] quantized = values.copy() np.bitwise_and(quantized, mask, out=quantized) return quantized @@ -927,7 +930,8 @@ def _quantize_integer_scalar(value, dtype: np.dtype, bits: int): scalar = np.asarray(value, dtype=dtype)[()] if bits <= 0: return scalar - mask = np.asarray(~((1 << bits) - 1), dtype=dtype)[()] + base_mask = np.iinfo(dtype).max if dtype.kind == "u" else -1 + mask = np.asarray(base_mask ^ ((1 << bits) - 1), dtype=dtype)[()] return np.bitwise_and(scalar, mask, dtype=dtype) @@ -2662,15 +2666,20 @@ def _candidate_units_from_exact_plan( def _search_bounds(values: np.ndarray, plan: ExactPredicatePlan) -> tuple[int, int]: - lo = 0 - hi = len(values) - if plan.lower is not None: - side = "left" if plan.lower_inclusive else "right" - lo = int(np.searchsorted(values, plan.lower, side=side)) - if plan.upper is not None: - side = "right" if plan.upper_inclusive else "left" - hi = int(np.searchsorted(values, plan.upper, side=side)) - return lo, hi + try: + return indexing_ext.index_search_bounds( + values, plan.lower, plan.lower_inclusive, plan.upper, plan.upper_inclusive + ) + except TypeError: + lo = 0 + hi = len(values) + if plan.lower is not None: + side = "left" if plan.lower_inclusive else "right" + lo = int(np.searchsorted(values, plan.lower, side=side)) + if plan.upper is not None: + side = "right" if plan.upper_inclusive 
else "left" + hi = int(np.searchsorted(values, plan.upper, side=side)) + return lo, hi def _candidate_units_from_boundaries(boundaries: np.ndarray, plan: ExactPredicatePlan) -> np.ndarray: @@ -2982,11 +2991,33 @@ def _chunk_nav_supports_selective_ooc_lookup(array: blosc2.NDArray, descriptor: def _chunk_nav_candidate_runs( l2_row: np.ndarray, segment_count: int, plan: ExactPredicatePlan ) -> tuple[list[tuple[int, int]], int]: - segment_mask = _candidate_units_from_boundaries(l2_row[:segment_count], plan) - if not np.any(segment_mask): + segment_lo, segment_hi = _sorted_boundary_search_bounds(l2_row[:segment_count], plan) + if segment_lo >= segment_hi: return [], 0 - runs = _contiguous_true_runs(segment_mask) - return runs, int(np.count_nonzero(segment_mask)) + return [(segment_lo, segment_hi)], segment_hi - segment_lo + + +def _sorted_boundary_search_bounds(boundaries: np.ndarray, plan: ExactPredicatePlan) -> tuple[int, int]: + if len(boundaries) == 0: + return 0, 0 + starts = boundaries["start"] + ends = boundaries["end"] + try: + lo, hi = indexing_ext.index_search_boundary_bounds( + starts, ends, plan.lower, plan.lower_inclusive, plan.upper, plan.upper_inclusive + ) + except TypeError: + lo = 0 + hi = len(boundaries) + if plan.lower is not None: + lo = int(np.searchsorted(ends, plan.lower, side="left" if plan.lower_inclusive else "right")) + if plan.upper is not None: + hi = int(np.searchsorted(starts, plan.upper, side="right" if plan.upper_inclusive else "left")) + if lo < 0: + lo = 0 + if hi > len(boundaries): + hi = len(boundaries) + return lo, hi def _light_search_plan( @@ -3079,13 +3110,39 @@ def _exact_positions_from_reduced_chunk_nav_ooc( nav_segment_len = int(reduced["nav_segment_len"]) nsegments_per_chunk = int(reduced["nsegments_per_chunk"]) local_position_dtype = np.dtype(reduced.get("position_dtype", np.uint32)) - parts = [] - total_candidate_segments = 0 + candidate_chunk_ids = np.flatnonzero(candidate_chunks).astype(np.intp, copy=False) l2_row = 
np.empty(nsegments_per_chunk, dtype=_boundary_dtype(dtype)) span_values = np.empty(chunk_len, dtype=dtype) local_positions = np.empty(chunk_len, dtype=local_position_dtype) - for chunk_id in np.flatnonzero(candidate_chunks): + try: + positions, total_candidate_segments = indexing_ext.index_collect_reduced_chunk_nav_positions( + offsets, + candidate_chunk_ids, + values_sidecar, + positions_sidecar, + l2_sidecar, + l2_row, + span_values, + local_positions, + chunk_len, + nav_segment_len, + nsegments_per_chunk, + plan.lower, + plan.lower_inclusive, + plan.upper, + plan.upper_inclusive, + ) + if len(positions) == 0: + return np.empty(0, dtype=np.int64), int(candidate_chunk_ids.size), total_candidate_segments + return np.sort(positions, kind="stable"), int(candidate_chunk_ids.size), total_candidate_segments + except TypeError: + pass + + parts = [] + total_candidate_segments = 0 + + for chunk_id in candidate_chunk_ids: chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id]) segment_count = _segment_row_count(chunk_items, nav_segment_len) l2_sidecar.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk) @@ -3108,11 +3165,11 @@ def _exact_positions_from_reduced_chunk_nav_ooc( parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) if not parts: - return np.empty(0, dtype=np.int64), int(np.count_nonzero(candidate_chunks)), total_candidate_segments + return np.empty(0, dtype=np.int64), int(candidate_chunk_ids.size), total_candidate_segments positions = np.concatenate(parts) if len(parts) > 1 else parts[0] return ( np.sort(positions, kind="stable"), - int(np.count_nonzero(candidate_chunks)), + int(candidate_chunk_ids.size), total_candidate_segments, ) diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx new file mode 100644 index 00000000..087c7fa0 --- /dev/null +++ b/src/blosc2/indexing_ext.pyx @@ -0,0 +1,1495 @@ +####################################################################### +# Copyright (c) 2019-present, 
Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import numpy as np +cimport numpy as np + +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t + + +cdef inline Py_ssize_t _search_left_float32(np.float32_t[:] values, np.float32_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_float32(np.float32_t[:] values, np.float32_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_left_float64(np.float64_t[:] values, np.float64_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_float64(np.float64_t[:] values, np.float64_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_left_int8(np.int8_t[:] values, np.int8_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_int8(np.int8_t[:] values, np.int8_t target) noexcept nogil: + cdef 
Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_left_int16(np.int16_t[:] values, np.int16_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_int16(np.int16_t[:] values, np.int16_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_left_int32(np.int32_t[:] values, np.int32_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_int32(np.int32_t[:] values, np.int32_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_left_int64(np.int64_t[:] values, np.int64_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_int64(np.int64_t[:] values, np.int64_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if 
values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_left_uint8(np.uint8_t[:] values, np.uint8_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_uint8(np.uint8_t[:] values, np.uint8_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_left_uint16(np.uint16_t[:] values, np.uint16_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_uint16(np.uint16_t[:] values, np.uint16_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_left_uint32(np.uint32_t[:] values, np.uint32_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_uint32(np.uint32_t[:] values, np.uint32_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t 
_search_left_uint64(np.uint64_t[:] values, np.uint64_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] < target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline Py_ssize_t _search_right_uint64(np.uint64_t[:] values, np.uint64_t target) noexcept nogil: + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef Py_ssize_t mid + while lo < hi: + mid = lo + ((hi - lo) >> 1) + if values[mid] <= target: + lo = mid + 1 + else: + hi = mid + return lo + + +cdef inline tuple _search_bounds_float32_impl( + np.ndarray[np.float32_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef np.float32_t lower_v + cdef np.float32_t upper_v + if lower is not None: + lower_v = lower + lo = _search_left_float32(values, lower_v) if lower_inclusive else _search_right_float32(values, lower_v) + if upper is not None: + upper_v = upper + hi = _search_right_float32(values, upper_v) if upper_inclusive else _search_left_float32(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_float32_impl( + np.ndarray[np.float32_t, ndim=1] starts, + np.ndarray[np.float32_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef np.float32_t lower_v + cdef np.float32_t upper_v + if lower is not None: + lower_v = lower + lo = _search_left_float32(ends, lower_v) if lower_inclusive else _search_right_float32(ends, lower_v) + if upper is not None: + upper_v = upper + hi = _search_right_float32(starts, upper_v) if upper_inclusive else _search_left_float32(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_float64_impl( + np.ndarray[np.float64_t, ndim=1] values, + object 
lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef np.float64_t lower_v + cdef np.float64_t upper_v + if lower is not None: + lower_v = lower + lo = _search_left_float64(values, lower_v) if lower_inclusive else _search_right_float64(values, lower_v) + if upper is not None: + upper_v = upper + hi = _search_right_float64(values, upper_v) if upper_inclusive else _search_left_float64(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_float64_impl( + np.ndarray[np.float64_t, ndim=1] starts, + np.ndarray[np.float64_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef np.float64_t lower_v + cdef np.float64_t upper_v + if lower is not None: + lower_v = lower + lo = _search_left_float64(ends, lower_v) if lower_inclusive else _search_right_float64(ends, lower_v) + if upper is not None: + upper_v = upper + hi = _search_right_float64(starts, upper_v) if upper_inclusive else _search_left_float64(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_int8_impl( + np.ndarray[np.int8_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef int lower_i + cdef int upper_i + cdef np.int8_t lower_v + cdef np.int8_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 127: + lo = hi + elif lower_i >= -128: + lower_v = lower_i + lo = _search_left_int8(values, lower_v) if lower_inclusive else _search_right_int8(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -128: + hi = 0 + elif upper_i <= 127: + upper_v = upper_i + hi = _search_right_int8(values, upper_v) if upper_inclusive else _search_left_int8(values, upper_v) + return int(lo), int(hi) + + +cdef 
inline tuple _search_boundary_bounds_int8_impl( + np.ndarray[np.int8_t, ndim=1] starts, + np.ndarray[np.int8_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef int lower_i + cdef int upper_i + cdef np.int8_t lower_v + cdef np.int8_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 127: + lo = hi + elif lower_i >= -128: + lower_v = lower_i + lo = _search_left_int8(ends, lower_v) if lower_inclusive else _search_right_int8(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -128: + hi = 0 + elif upper_i <= 127: + upper_v = upper_i + hi = _search_right_int8(starts, upper_v) if upper_inclusive else _search_left_int8(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_int16_impl( + np.ndarray[np.int16_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef int lower_i + cdef int upper_i + cdef np.int16_t lower_v + cdef np.int16_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 32767: + lo = hi + elif lower_i >= -32768: + lower_v = lower_i + lo = _search_left_int16(values, lower_v) if lower_inclusive else _search_right_int16(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -32768: + hi = 0 + elif upper_i <= 32767: + upper_v = upper_i + hi = _search_right_int16(values, upper_v) if upper_inclusive else _search_left_int16(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_int16_impl( + np.ndarray[np.int16_t, ndim=1] starts, + np.ndarray[np.int16_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef int lower_i + cdef int upper_i + cdef np.int16_t lower_v + cdef 
np.int16_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 32767: + lo = hi + elif lower_i >= -32768: + lower_v = lower_i + lo = _search_left_int16(ends, lower_v) if lower_inclusive else _search_right_int16(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -32768: + hi = 0 + elif upper_i <= 32767: + upper_v = upper_i + hi = _search_right_int16(starts, upper_v) if upper_inclusive else _search_left_int16(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_int32_impl( + np.ndarray[np.int32_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef long long lower_i + cdef long long upper_i + cdef np.int32_t lower_v + cdef np.int32_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 2147483647: + lo = hi + elif lower_i >= -2147483648: + lower_v = lower_i + lo = _search_left_int32(values, lower_v) if lower_inclusive else _search_right_int32(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -2147483648: + hi = 0 + elif upper_i <= 2147483647: + upper_v = upper_i + hi = _search_right_int32(values, upper_v) if upper_inclusive else _search_left_int32(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_int32_impl( + np.ndarray[np.int32_t, ndim=1] starts, + np.ndarray[np.int32_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef long long lower_i + cdef long long upper_i + cdef np.int32_t lower_v + cdef np.int32_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 2147483647: + lo = hi + elif lower_i >= -2147483648: + lower_v = lower_i + lo = _search_left_int32(ends, lower_v) if lower_inclusive else _search_right_int32(ends, lower_v) + if upper is not None: + upper_i 
= int(upper) + if upper_i < -2147483648: + hi = 0 + elif upper_i <= 2147483647: + upper_v = upper_i + hi = _search_right_int32(starts, upper_v) if upper_inclusive else _search_left_int32(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_int64_impl( + np.ndarray[np.int64_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.int64_t lower_v + cdef np.int64_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 9223372036854775807: + lo = hi + elif lower_i >= -9223372036854775808: + lower_v = lower_i + lo = _search_left_int64(values, lower_v) if lower_inclusive else _search_right_int64(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -9223372036854775808: + hi = 0 + elif upper_i <= 9223372036854775807: + upper_v = upper_i + hi = _search_right_int64(values, upper_v) if upper_inclusive else _search_left_int64(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_int64_impl( + np.ndarray[np.int64_t, ndim=1] starts, + np.ndarray[np.int64_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.int64_t lower_v + cdef np.int64_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 9223372036854775807: + lo = hi + elif lower_i >= -9223372036854775808: + lower_v = lower_i + lo = _search_left_int64(ends, lower_v) if lower_inclusive else _search_right_int64(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -9223372036854775808: + hi = 0 + elif upper_i <= 9223372036854775807: + upper_v = upper_i + hi = _search_right_int64(starts, upper_v) if upper_inclusive else _search_left_int64(starts, upper_v) + 
return int(lo), int(hi) + + +cdef inline tuple _search_bounds_uint8_impl( + np.ndarray[np.uint8_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.uint8_t lower_v + cdef np.uint8_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 255: + lo = hi + elif lower_i >= 0: + lower_v = lower_i + lo = _search_left_uint8(values, lower_v) if lower_inclusive else _search_right_uint8(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < 0: + hi = 0 + elif upper_i <= 255: + upper_v = upper_i + hi = _search_right_uint8(values, upper_v) if upper_inclusive else _search_left_uint8(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_uint8_impl( + np.ndarray[np.uint8_t, ndim=1] starts, + np.ndarray[np.uint8_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.uint8_t lower_v + cdef np.uint8_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 255: + lo = hi + elif lower_i >= 0: + lower_v = lower_i + lo = _search_left_uint8(ends, lower_v) if lower_inclusive else _search_right_uint8(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < 0: + hi = 0 + elif upper_i <= 255: + upper_v = upper_i + hi = _search_right_uint8(starts, upper_v) if upper_inclusive else _search_left_uint8(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_uint16_impl( + np.ndarray[np.uint16_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.uint16_t lower_v + cdef 
np.uint16_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 65535: + lo = hi + elif lower_i >= 0: + lower_v = lower_i + lo = _search_left_uint16(values, lower_v) if lower_inclusive else _search_right_uint16(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < 0: + hi = 0 + elif upper_i <= 65535: + upper_v = upper_i + hi = _search_right_uint16(values, upper_v) if upper_inclusive else _search_left_uint16(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_uint16_impl( + np.ndarray[np.uint16_t, ndim=1] starts, + np.ndarray[np.uint16_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.uint16_t lower_v + cdef np.uint16_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 65535: + lo = hi + elif lower_i >= 0: + lower_v = lower_i + lo = _search_left_uint16(ends, lower_v) if lower_inclusive else _search_right_uint16(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < 0: + hi = 0 + elif upper_i <= 65535: + upper_v = upper_i + hi = _search_right_uint16(starts, upper_v) if upper_inclusive else _search_left_uint16(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_uint32_impl( + np.ndarray[np.uint32_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.uint32_t lower_v + cdef np.uint32_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 4294967295: + lo = hi + elif lower_i >= 0: + lower_v = lower_i + lo = _search_left_uint32(values, lower_v) if lower_inclusive else _search_right_uint32(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < 0: + hi = 
0 + elif upper_i <= 4294967295: + upper_v = upper_i + hi = _search_right_uint32(values, upper_v) if upper_inclusive else _search_left_uint32(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_uint32_impl( + np.ndarray[np.uint32_t, ndim=1] starts, + np.ndarray[np.uint32_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.uint32_t lower_v + cdef np.uint32_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 4294967295: + lo = hi + elif lower_i >= 0: + lower_v = lower_i + lo = _search_left_uint32(ends, lower_v) if lower_inclusive else _search_right_uint32(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < 0: + hi = 0 + elif upper_i <= 4294967295: + upper_v = upper_i + hi = _search_right_uint32(starts, upper_v) if upper_inclusive else _search_left_uint32(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_uint64_impl( + np.ndarray[np.uint64_t, ndim=1] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.uint64_t lower_v + cdef np.uint64_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 18446744073709551615: + lo = hi + elif lower_i >= 0: + lower_v = lower_i + lo = _search_left_uint64(values, lower_v) if lower_inclusive else _search_right_uint64(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < 0: + hi = 0 + elif upper_i <= 18446744073709551615: + upper_v = upper_i + hi = _search_right_uint64(values, upper_v) if upper_inclusive else _search_left_uint64(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_uint64_impl( + np.ndarray[np.uint64_t, ndim=1] 
starts, + np.ndarray[np.uint64_t, ndim=1] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef object lower_i + cdef object upper_i + cdef np.uint64_t lower_v + cdef np.uint64_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 18446744073709551615: + lo = hi + elif lower_i >= 0: + lower_v = lower_i + lo = _search_left_uint64(ends, lower_v) if lower_inclusive else _search_right_uint64(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < 0: + hi = 0 + elif upper_i <= 18446744073709551615: + upper_v = upper_i + hi = _search_right_uint64(starts, upper_v) if upper_inclusive else _search_left_uint64(starts, upper_v) + return int(lo), int(hi) + + +def index_search_bounds(np.ndarray values, object lower, bint lower_inclusive, object upper, bint upper_inclusive): + cdef np.dtype dtype = values.dtype + if dtype == np.dtype(np.float32): + return _search_bounds_float32_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.float64): + return _search_bounds_float64_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.int8): + return _search_bounds_int8_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.int16): + return _search_bounds_int16_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.int32): + return _search_bounds_int32_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.int64): + return _search_bounds_int64_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.uint8): + return _search_bounds_uint8_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.uint16): + return _search_bounds_uint16_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.uint32): + 
return _search_bounds_uint32_impl(values, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.uint64): + return _search_bounds_uint64_impl(values, lower, lower_inclusive, upper, upper_inclusive) + raise TypeError("unsupported dtype for index_search_bounds") + + +def index_search_boundary_bounds( + np.ndarray starts, + np.ndarray ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef np.dtype dtype = starts.dtype + if dtype != ends.dtype: + raise TypeError("starts and ends must have the same dtype") + if dtype == np.dtype(np.float32): + return _search_boundary_bounds_float32_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.float64): + return _search_boundary_bounds_float64_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.int8): + return _search_boundary_bounds_int8_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.int16): + return _search_boundary_bounds_int16_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.int32): + return _search_boundary_bounds_int32_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.int64): + return _search_boundary_bounds_int64_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.uint8): + return _search_boundary_bounds_uint8_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.uint16): + return _search_boundary_bounds_uint16_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.uint32): + return _search_boundary_bounds_uint32_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + if dtype == np.dtype(np.uint64): + return _search_boundary_bounds_uint64_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive) + raise 
TypeError("unsupported dtype for index_search_boundary_bounds") + + +cdef tuple _collect_chunk_positions_float32( + np.ndarray[np.int64_t, ndim=1] offsets, + np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, + object positions_sidecar, + object l2_sidecar, + np.ndarray l2_row, + np.ndarray[np.float32_t, ndim=1] span_values, + np.ndarray local_positions, + int64_t chunk_len, + int32_t nav_segment_len, + int32_t nsegments_per_chunk, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef np.ndarray[np.float32_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.float32_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_float32_impl( + starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive + ) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_float32_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: 
hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_float64( + np.ndarray[np.int64_t, ndim=1] offsets, + np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, + object positions_sidecar, + object l2_sidecar, + np.ndarray l2_row, + np.ndarray[np.float64_t, ndim=1] span_values, + np.ndarray local_positions, + int64_t chunk_len, + int32_t nav_segment_len, + int32_t nsegments_per_chunk, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef np.ndarray[np.float64_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.float64_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_float64_impl( + starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive + ) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - 
local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_float64_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_int8( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.int8_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.int8_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.int8_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_int8_impl(starts[:segment_count], ends[:segment_count], 
lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_int8_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_int16( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.int16_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.int16_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.int16_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - 
offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_int16_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_int16_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_int32( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.int32_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.int32_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.int32_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi 
+ cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_int32_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_int32_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_int64( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.int64_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.int64_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.int64_t, ndim=1] ends = 
l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_int64_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_int64_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_uint8( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.uint8_t, ndim=1] span_values, np.ndarray local_positions, + 
int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.uint8_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.uint8_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_uint8_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_uint8_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_uint16( + 
np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.uint16_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.uint16_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.uint16_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_uint16_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_uint16_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + 
positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_uint32( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.uint32_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.uint32_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.uint32_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_uint32_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = 
_search_bounds_uint32_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_uint64( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.uint64_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.uint64_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.uint64_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_uint64_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = 
seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_uint64_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +def index_collect_reduced_chunk_nav_positions( + np.ndarray[np.int64_t, ndim=1] offsets, + np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, + object positions_sidecar, + object l2_sidecar, + np.ndarray l2_row, + np.ndarray span_values, + np.ndarray local_positions, + int64_t chunk_len, + int32_t nav_segment_len, + int32_t nsegments_per_chunk, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef np.dtype dtype = span_values.dtype + if dtype == np.dtype(np.float32): + return _collect_chunk_positions_float32( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.float64): + return _collect_chunk_positions_float64( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.int8): + return _collect_chunk_positions_int8( + offsets, candidate_chunk_ids, values_sidecar, 
positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.int16): + return _collect_chunk_positions_int16( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.int32): + return _collect_chunk_positions_int32( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.int64): + return _collect_chunk_positions_int64( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.uint8): + return _collect_chunk_positions_uint8( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.uint16): + return _collect_chunk_positions_uint16( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.uint32): + return _collect_chunk_positions_uint32( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.uint64): + return 
_collect_chunk_positions_uint64( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + raise TypeError("unsupported dtype for index_collect_reduced_chunk_nav_positions") diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 2c8959ba..c985f225 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -93,6 +93,66 @@ def test_random_field_point_query_matches_scan(kind): np.testing.assert_array_equal(indexed, data[(data["id"] >= 123_456) & (data["id"] < 123_457)]) +@pytest.mark.parametrize( + "dtype", + [ + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float32, + np.float64, + ], +) +def test_medium_numeric_dtype_query_matches_scan(dtype): + values = np.arange(2_000, dtype=dtype) + if np.issubdtype(dtype, np.floating): + values = values / dtype(10) + + arr = blosc2.asarray(values, chunks=(500,), blocks=(100,)) + arr.create_index(kind="medium") + + query_value = values[137].item() + indexed = arr[arr == query_value].compute()[:] + expected = values[values == query_value] + + np.testing.assert_array_equal(indexed, expected) + + +@pytest.mark.parametrize("dtype", [np.int32, np.uint32, np.float32, np.float64]) +def test_light_numeric_dtype_query_matches_scan(dtype): + values = np.arange(2_000, dtype=dtype) + if np.issubdtype(dtype, np.floating): + values = values / dtype(10) + + arr = blosc2.asarray(values, chunks=(500,), blocks=(100,)) + arr.create_index(kind="light") + + lower = values[137].item() + upper = values[163].item() + indexed = arr[(arr >= lower) & (arr < upper)].compute()[:] + expected = values[(values >= lower) & (values < upper)] + + np.testing.assert_array_equal(indexed, expected) + + +def test_numeric_unsupported_dtype_fallback_matches_scan(): + values = (np.arange(2_000, 
dtype=np.float16) / np.float16(10)).astype(np.float16) + + arr = blosc2.asarray(values, chunks=(500,), blocks=(100,)) + arr.create_index(kind="medium") + + query_value = values[137].item() + indexed = arr[arr == query_value].compute()[:] + expected = values[values == query_value] + + np.testing.assert_array_equal(indexed, expected) + + def test_light_lossy_integer_values_match_scan(): rng = np.random.default_rng(2) dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) From 94cb5e61ea2ae34e3552d59450f75651afc9f52c Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sat, 4 Apr 2026 11:43:28 +0200 Subject: [PATCH 25/68] Fix stale in-memory index store reuse --- src/blosc2/indexing.py | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 7688e4d9..a665a176 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -13,6 +13,7 @@ import os import re import tempfile +import weakref from dataclasses import dataclass from pathlib import Path @@ -37,6 +38,8 @@ } _IN_MEMORY_INDEXES: dict[int, dict] = {} +_IN_MEMORY_INDEX_FINALIZERS: dict[int, weakref.finalize] = {} +_PERSISTENT_INDEXES: dict[tuple[str, str | int], dict] = {} _DATA_CACHE: dict[tuple[int, str | None, str, str], np.ndarray] = {} _SIDECAR_HANDLE_CACHE: dict[tuple[int, str | None, str, str], object] = {} BLOCK_GATHER_POSITIONS_THRESHOLD = 32 @@ -51,6 +54,11 @@ def _sanitize_token(token: str) -> str: return re.sub(r"[^0-9A-Za-z_.-]+", "_", token) +def _cleanup_in_memory_store(key: int) -> None: + _IN_MEMORY_INDEXES.pop(key, None) + _IN_MEMORY_INDEX_FINALIZERS.pop(key, None) + + @dataclass(slots=True) class IndexPlan: usable: bool @@ -177,12 +185,11 @@ def _is_persistent_array(array: blosc2.NDArray) -> bool: def _load_store(array: blosc2.NDArray) -> dict: - key = _array_key(array) - cached = _IN_MEMORY_INDEXES.get(key) - if cached is not None: - return cached - if _is_persistent_array(array): + key 
= _array_key(array) + cached = _PERSISTENT_INDEXES.get(key) + if cached is not None: + return cached try: store = array.schunk.vlmeta[INDEXES_VLMETA_KEY] except KeyError: @@ -191,19 +198,29 @@ def _load_store(array: blosc2.NDArray) -> dict: store = _default_index_store() store.setdefault("version", INDEX_FORMAT_VERSION) store.setdefault("indexes", {}) - else: - store = _default_index_store() + _PERSISTENT_INDEXES[key] = store + return store + key = id(array) + cached = _IN_MEMORY_INDEXES.get(key) + if cached is not None: + return cached + store = _default_index_store() _IN_MEMORY_INDEXES[key] = store + _IN_MEMORY_INDEX_FINALIZERS[key] = weakref.finalize(array, _cleanup_in_memory_store, key) return store def _save_store(array: blosc2.NDArray, store: dict) -> None: store.setdefault("version", INDEX_FORMAT_VERSION) store.setdefault("indexes", {}) - _IN_MEMORY_INDEXES[_array_key(array)] = store if _is_persistent_array(array): + _PERSISTENT_INDEXES[_array_key(array)] = store array.schunk.vlmeta[INDEXES_VLMETA_KEY] = store + else: + key = id(array) + _IN_MEMORY_INDEXES[key] = store + _IN_MEMORY_INDEX_FINALIZERS.setdefault(key, weakref.finalize(array, _cleanup_in_memory_store, key)) def _supported_index_dtype(dtype: np.dtype) -> bool: From d1c36367b21947dcfc8640456fe2710b9bcd5915 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sun, 5 Apr 2026 07:45:47 +0200 Subject: [PATCH 26/68] Replace full OOC temp runs with Blosc2 scratch arrays --- bench/ndarray/index_query_bench.py | 3 +- src/blosc2/indexing.py | 440 +++++++++++++++++++++++------ 2 files changed, 359 insertions(+), 84 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 7b0cf496..f645a80c 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -304,7 +304,8 @@ def benchmark_size( with _with_full_query_mode(full_query_mode): explanation = idx_expr.explain() cold_time, index_len = benchmark_index_once(idx_arr, idx_cond) - 
logical_index_bytes, disk_index_bytes = index_sizes(idx_arr.indexes[0]) + descriptor = idx_arr.indexes[0] + logical_index_bytes, disk_index_bytes = index_sizes(descriptor) rows.append( { diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index a665a176..05006d47 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -117,6 +117,13 @@ class SortedRun: length: int +@dataclass(slots=True) +class TempRunTracker: + current_disk_bytes: int = 0 + peak_disk_bytes: int = 0 + total_written_bytes: int = 0 + + @dataclass(slots=True) class OrderedIndexPlan: usable: bool @@ -458,6 +465,23 @@ def _compute_sorted_boundaries(values: np.ndarray, dtype: np.dtype, segment_len: return boundaries +def _compute_sorted_boundaries_from_sidecar( + path: str, dtype: np.dtype, length: int, segment_len: int +) -> np.ndarray: + nsegments = math.ceil(length / segment_len) + boundaries = np.empty(nsegments, dtype=_boundary_dtype(dtype)) + sidecar = blosc2.open(path, mmap_mode="r") + start_value = np.empty(1, dtype=dtype) + end_value = np.empty(1, dtype=dtype) + for idx in range(nsegments): + start = idx * segment_len + stop = min(start + segment_len, length) + _read_ndarray_linear_span(sidecar, start, start_value) + _read_ndarray_linear_span(sidecar, stop - 1, end_value) + boundaries[idx] = (start_value[0], end_value[0]) + return boundaries + + def _store_array_sidecar( array: blosc2.NDArray, token: str, @@ -561,7 +585,7 @@ def _sidecar_storage_geometry( ) -> tuple[int, int]: if path is None: return fallback_chunk_len, fallback_block_len - sidecar = blosc2.open(path) + sidecar = blosc2.open(path, mmap_mode="r") return int(sidecar.chunks[0]), int(sidecar.blocks[0]) @@ -584,10 +608,93 @@ def _rebuild_full_navigation_sidecars( full["l2_path"] = l2_sidecar["path"] full["sidecar_chunk_len"] = int(chunk_len) full["sidecar_block_len"] = int(block_len) + + +def _rebuild_full_navigation_sidecars_from_path( + array: blosc2.NDArray, + token: str, + kind: str, + full: dict, + 
values_path: str, + dtype: np.dtype, + length: int, + persistent: bool, +) -> None: + chunk_len, block_len = _sidecar_storage_geometry(values_path, int(array.chunks[0]), int(array.blocks[0])) + l1 = _compute_sorted_boundaries_from_sidecar(values_path, dtype, length, chunk_len) + l2 = _compute_sorted_boundaries_from_sidecar(values_path, dtype, length, block_len) + l1_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l1", l1, persistent) + l2_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l2", l2, persistent) + full["l1_path"] = l1_sidecar["path"] + full["l2_path"] = l2_sidecar["path"] + full["sidecar_chunk_len"] = int(chunk_len) + full["sidecar_block_len"] = int(block_len) full["l1_dtype"] = l1_sidecar["dtype"] full["l2_dtype"] = l2_sidecar["dtype"] +def _stream_copy_sidecar_array( + source_path: Path | str, + dest_path: Path | str, + length: int, + dtype: np.dtype, + chunks: tuple[int, ...], + blocks: tuple[int, ...], +) -> None: + source = blosc2.open(str(source_path), mmap_mode="r") + blosc2.remove_urlpath(str(dest_path)) + dest = blosc2.empty( + (length,), dtype=dtype, chunks=chunks, blocks=blocks, urlpath=str(dest_path), mode="w" + ) + chunk_len = int(dest.chunks[0]) + for start in range(0, length, chunk_len): + stop = min(start + chunk_len, length) + span = np.empty(stop - start, dtype=dtype) + _read_ndarray_linear_span(source, start, span) + dest[start:stop] = span + del source, dest + + +def _stream_copy_temp_run_to_full_sidecars( + array: blosc2.NDArray, + token: str, + kind: str, + full: dict, + run: SortedRun, + dtype: np.dtype, + persistent: bool, + tracker: TempRunTracker | None = None, +) -> None: + if not persistent: + raise ValueError("temp-run streaming only supports persistent runs") + + values_path = _sidecar_path(array, token, kind, "full.values") + positions_path = _sidecar_path(array, token, kind, "full.positions") + _remove_sidecar_path(values_path) + _remove_sidecar_path(positions_path) + 
_stream_copy_sidecar_array( + run.values_path, values_path, run.length, dtype, (int(array.chunks[0]),), (int(array.blocks[0]),) + ) + _stream_copy_sidecar_array( + run.positions_path, + positions_path, + run.length, + np.dtype(np.int64), + (int(array.chunks[0]),), + (int(array.blocks[0]),), + ) + _tracker_register_delete(tracker, run.values_path, run.positions_path) + run.values_path.unlink(missing_ok=True) + run.positions_path.unlink(missing_ok=True) + full["values_path"] = values_path + full["positions_path"] = positions_path + full["runs"] = [] + full["next_run_id"] = 0 + _rebuild_full_navigation_sidecars_from_path( + array, token, kind, full, values_path, dtype, run.length, persistent + ) + + def _build_full_descriptor( array: blosc2.NDArray, token: str, @@ -1183,30 +1290,151 @@ def _pair_searchsorted_right(values: np.ndarray, positions: np.ndarray, value, p return int(np.searchsorted(records, needle, side="right")) +def _temp_run_storage_geometry( + length: int, dtype: np.dtype, buffer_items: int +) -> tuple[tuple[int], tuple[int]]: + chunk_items = max(1, min(length, buffer_items)) + target_block_bytes = 256 * 1024 + block_items = max(1, min(chunk_items, target_block_bytes // max(1, dtype.itemsize))) + return (chunk_items,), (block_items,) + + +def _path_disk_bytes(path: Path | str) -> int: + path = Path(path) + if not path.exists(): + return 0 + if path.is_file(): + return path.stat().st_size + return sum(entry.stat().st_size for entry in path.rglob("*") if entry.is_file()) + + +def _tracker_register_create(tracker: TempRunTracker | None, *paths: Path) -> None: + if tracker is None: + return + delta = sum(_path_disk_bytes(path) for path in paths) + tracker.current_disk_bytes += delta + tracker.total_written_bytes += delta + tracker.peak_disk_bytes = max(tracker.peak_disk_bytes, tracker.current_disk_bytes) + + +def _tracker_register_delete(tracker: TempRunTracker | None, *paths: Path) -> None: + if tracker is None: + return + delta = sum(_path_disk_bytes(path) 
for path in paths) + tracker.current_disk_bytes = max(0, tracker.current_disk_bytes - delta) + + +def _create_blosc2_temp_array(path: Path, length: int, dtype: np.dtype, buffer_items: int): + chunks, blocks = _temp_run_storage_geometry(length, dtype, buffer_items) + cparams = blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=1) + return blosc2.empty( + (length,), + dtype=dtype, + chunks=chunks, + blocks=blocks, + urlpath=str(path), + mode="w", + cparams=cparams, + ) + + +def _read_ndarray_linear_span(array: blosc2.NDArray, start: int, out: np.ndarray) -> None: + if len(out) == 0: + return + chunk_len = int(array.chunks[0]) + cursor = int(start) + out_cursor = 0 + while out_cursor < len(out): + chunk_id = cursor // chunk_len + local_start = cursor % chunk_len + take = min(len(out) - out_cursor, chunk_len - local_start) + array.get_1d_span_numpy( + out[out_cursor : out_cursor + take], int(chunk_id), int(local_start), int(take) + ) + cursor += take + out_cursor += take + + +def _materialize_sorted_run( + values: np.ndarray, + positions: np.ndarray, + length: int, + value_dtype: np.dtype, + workdir: Path, + prefix: str, + tracker: TempRunTracker | None = None, +) -> SortedRun: + values_path = workdir / f"{prefix}.values.b2nd" + positions_path = workdir / f"{prefix}.positions.b2nd" + run_values = _create_blosc2_temp_array(values_path, length, value_dtype, FULL_OOC_MERGE_BUFFER_ITEMS) + run_positions = _create_blosc2_temp_array( + positions_path, length, np.dtype(np.int64), FULL_OOC_MERGE_BUFFER_ITEMS + ) + run_values[:] = values + run_positions[:] = positions + del run_values, run_positions + _tracker_register_create(tracker, values_path, positions_path) + return SortedRun(values_path, positions_path, length) + + +def _copy_sidecar_to_temp_run( + path: str, + length: int, + dtype: np.dtype, + workdir: Path, + prefix: str, + tracker: TempRunTracker | None = None, +) -> Path: + out_path = workdir / f"{prefix}.b2nd" + sidecar = blosc2.open(path, mmap_mode="r") + output = 
_create_blosc2_temp_array(out_path, length, dtype, FULL_OOC_MERGE_BUFFER_ITEMS) + chunk_len = int(sidecar.chunks[0]) + for chunk_id, start in enumerate(range(0, length, chunk_len)): + stop = min(start + chunk_len, length) + span = np.empty(stop - start, dtype=dtype) + sidecar.get_1d_span_numpy(span, chunk_id, 0, stop - start) + output[start:stop] = span + del output + _tracker_register_create(tracker, out_path) + return out_path + + def _refill_run_buffer( - values_mm: np.ndarray, positions_mm: np.ndarray, cursor: int, buffer_items: int + values_src, positions_src, cursor: int, buffer_items: int ) -> tuple[np.ndarray, np.ndarray, int]: - if cursor >= len(values_mm): - return np.empty(0, dtype=values_mm.dtype), np.empty(0, dtype=positions_mm.dtype), cursor - stop = min(cursor + buffer_items, len(values_mm)) - return np.asarray(values_mm[cursor:stop]), np.asarray(positions_mm[cursor:stop]), stop + if cursor >= len(values_src): + values_dtype = values_src.dtype if hasattr(values_src, "dtype") else np.float64 + positions_dtype = positions_src.dtype if hasattr(positions_src, "dtype") else np.int64 + return np.empty(0, dtype=values_dtype), np.empty(0, dtype=positions_dtype), cursor + stop = min(cursor + buffer_items, len(values_src)) + if isinstance(values_src, np.ndarray): + return np.asarray(values_src[cursor:stop]), np.asarray(positions_src[cursor:stop]), stop + values = np.empty(stop - cursor, dtype=np.dtype(values_src.dtype)) + positions = np.empty(stop - cursor, dtype=np.dtype(positions_src.dtype)) + _read_ndarray_linear_span(values_src, cursor, values) + _read_ndarray_linear_span(positions_src, cursor, positions) + return values, positions, stop def _merge_run_pair( - left: SortedRun, right: SortedRun, workdir: Path, dtype: np.dtype, merge_id: int, buffer_items: int + left: SortedRun, + right: SortedRun, + workdir: Path, + dtype: np.dtype, + merge_id: int, + buffer_items: int, + tracker: TempRunTracker | None = None, ) -> SortedRun: - left_values_mm = 
np.load(left.values_path, mmap_mode="r") - left_positions_mm = np.load(left.positions_path, mmap_mode="r") - right_values_mm = np.load(right.values_path, mmap_mode="r") - right_positions_mm = np.load(right.positions_path, mmap_mode="r") - - out_values_path = workdir / f"full_merge_values_{merge_id}.npy" - out_positions_path = workdir / f"full_merge_positions_{merge_id}.npy" - out_values = np.lib.format.open_memmap( - out_values_path, mode="w+", dtype=dtype, shape=(left.length + right.length,) - ) - out_positions = np.lib.format.open_memmap( - out_positions_path, mode="w+", dtype=np.int64, shape=(left.length + right.length,) + left_values_mm = blosc2.open(str(left.values_path), mmap_mode="r") + left_positions_mm = blosc2.open(str(left.positions_path), mmap_mode="r") + right_values_mm = blosc2.open(str(right.values_path), mmap_mode="r") + right_positions_mm = blosc2.open(str(right.positions_path), mmap_mode="r") + + out_values_path = workdir / f"full_merge_values_{merge_id}.b2nd" + out_positions_path = workdir / f"full_merge_positions_{merge_id}.b2nd" + out_values = _create_blosc2_temp_array(out_values_path, left.length + right.length, dtype, buffer_items) + out_positions = _create_blosc2_temp_array( + out_positions_path, left.length + right.length, np.dtype(np.int64), buffer_items ) left_cursor = 0 @@ -1272,9 +1500,12 @@ def _merge_run_pair( right_values = right_values[right_cut:] right_positions = right_positions[right_cut:] - out_values.flush() - out_positions.flush() - del left_values_mm, left_positions_mm, right_values_mm, right_positions_mm, out_values, out_positions + del out_values, out_positions + _tracker_register_create(tracker, out_values_path, out_positions_path) + del left_values_mm, left_positions_mm, right_values_mm, right_positions_mm + _tracker_register_delete( + tracker, left.values_path, left.positions_path, right.values_path, right.positions_path + ) left.values_path.unlink(missing_ok=True) left.positions_path.unlink(missing_ok=True) 
right.values_path.unlink(missing_ok=True) @@ -1292,6 +1523,7 @@ def _build_full_descriptor_ooc( workdir: Path, ) -> dict: size = int(array.shape[0]) + tracker = TempRunTracker() if size == 0: sorted_values = np.empty(0, dtype=dtype) positions = np.empty(0, dtype=np.int64) @@ -1318,19 +1550,17 @@ def _build_full_descriptor_ooc( order = np.lexsort((positions, values)) sorted_values = values[order] sorted_positions = positions[order] - - values_path = workdir / f"full_run_values_{run_id}.npy" - positions_path = workdir / f"full_run_positions_{run_id}.npy" - run_values = np.lib.format.open_memmap(values_path, mode="w+", dtype=dtype, shape=(stop - start,)) - run_positions = np.lib.format.open_memmap( - positions_path, mode="w+", dtype=np.int64, shape=(stop - start,) + runs.append( + _materialize_sorted_run( + sorted_values, + sorted_positions, + stop - start, + dtype, + workdir, + f"full_run_{run_id}", + tracker, + ) ) - run_values[:] = sorted_values - run_positions[:] = sorted_positions - run_values.flush() - run_positions.flush() - del run_values, run_positions - runs.append(SortedRun(values_path, positions_path, stop - start)) merge_id = 0 while len(runs) > 1: @@ -1341,24 +1571,48 @@ def _build_full_descriptor_ooc( continue next_runs.append( _merge_run_pair( - runs[idx], runs[idx + 1], workdir, dtype, merge_id, FULL_OOC_MERGE_BUFFER_ITEMS + runs[idx], + runs[idx + 1], + workdir, + dtype, + merge_id, + FULL_OOC_MERGE_BUFFER_ITEMS, + tracker, ) ) merge_id += 1 runs = next_runs final_run = runs[0] - sorted_values = np.load(final_run.values_path, mmap_mode="r") - positions = np.load(final_run.positions_path, mmap_mode="r") - values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) full = { - "values_path": values_sidecar["path"], - "positions_path": positions_sidecar["path"], + "values_path": None, + "positions_path": 
None, "runs": [], "next_run_id": 0, + "temp_backend": "blosc2", + "temp_peak_disk_bytes": tracker.peak_disk_bytes, + "temp_total_written_bytes": tracker.total_written_bytes, } - _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) + if persistent: + _stream_copy_temp_run_to_full_sidecars( + array, token, kind, full, final_run, dtype, persistent, tracker + ) + else: + sorted_values = blosc2.open(str(final_run.values_path), mmap_mode="r")[:] + positions = blosc2.open(str(final_run.positions_path), mmap_mode="r")[:] + values_sidecar = _store_array_sidecar( + array, token, kind, "full", "values", sorted_values, persistent + ) + positions_sidecar = _store_array_sidecar( + array, token, kind, "full", "positions", positions, persistent + ) + full["values_path"] = values_sidecar["path"] + full["positions_path"] = positions_sidecar["path"] + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) + del sorted_values, positions + _tracker_register_delete(tracker, final_run.values_path, final_run.positions_path) + final_run.values_path.unlink(missing_ok=True) + final_run.positions_path.unlink(missing_ok=True) return full @@ -1779,6 +2033,56 @@ def _replace_full_descriptor( _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) +def _replace_full_descriptor_from_paths( + array: blosc2.NDArray, + descriptor: dict, + values_path: Path, + positions_path: Path, + length: int, +) -> None: + kind = descriptor["kind"] + token = descriptor["token"] + full = descriptor["full"] + persistent = descriptor["persistent"] + if not persistent: + raise ValueError("path-based full replacement requires persistent indexes") + for run in full.get("runs", ()): + _remove_sidecar_path(run.get("values_path")) + _remove_sidecar_path(run.get("positions_path")) + _remove_sidecar_path(full.get("l1_path")) + _remove_sidecar_path(full.get("l2_path")) + _clear_cached_data(array, token) + final_values_path = 
_sidecar_path(array, token, kind, "full.values") + final_positions_path = _sidecar_path(array, token, kind, "full.positions") + _remove_sidecar_path(final_values_path) + _remove_sidecar_path(final_positions_path) + _stream_copy_sidecar_array( + values_path, + final_values_path, + length, + np.dtype(descriptor["dtype"]), + (int(array.chunks[0]),), + (int(array.blocks[0]),), + ) + _stream_copy_sidecar_array( + positions_path, + final_positions_path, + length, + np.dtype(np.int64), + (int(array.chunks[0]),), + (int(array.blocks[0]),), + ) + values_path.unlink(missing_ok=True) + positions_path.unlink(missing_ok=True) + full["values_path"] = final_values_path + full["positions_path"] = final_positions_path + full["runs"] = [] + full["next_run_id"] = 0 + _rebuild_full_navigation_sidecars_from_path( + array, token, kind, full, final_values_path, np.dtype(descriptor["dtype"]), length, persistent + ) + + def _store_full_run_descriptor( array: blosc2.NDArray, descriptor: dict, @@ -1891,41 +2195,6 @@ def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | N ) -def _copy_sidecar_to_temp_run(path: str, length: int, dtype: np.dtype, workdir: Path, prefix: str) -> Path: - sidecar = blosc2.open(path) - out_path = workdir / f"{prefix}.npy" - output = np.lib.format.open_memmap(out_path, mode="w+", dtype=dtype, shape=(length,)) - chunk_len = int(sidecar.chunks[0]) - for chunk_id, start in enumerate(range(0, length, chunk_len)): - stop = min(start + chunk_len, length) - span = np.empty(stop - start, dtype=dtype) - sidecar.get_1d_span_numpy(span, chunk_id, 0, stop - start) - output[start:stop] = span - output.flush() - del output - return out_path - - -def _materialize_sorted_run( - values: np.ndarray, - positions: np.ndarray, - length: int, - value_dtype: np.dtype, - workdir: Path, - prefix: str, -) -> SortedRun: - values_path = workdir / f"{prefix}.values.npy" - positions_path = workdir / f"{prefix}.positions.npy" - run_values = 
np.lib.format.open_memmap(values_path, mode="w+", dtype=value_dtype, shape=(length,)) - run_positions = np.lib.format.open_memmap(positions_path, mode="w+", dtype=np.int64, shape=(length,)) - run_values[:] = values - run_positions[:] = positions - run_values.flush() - run_positions.flush() - del run_values, run_positions - return SortedRun(values_path, positions_path, length) - - def _full_compaction_runs(array: blosc2.NDArray, descriptor: dict, workdir: Path) -> list[SortedRun]: full = descriptor["full"] dtype = np.dtype(descriptor["dtype"]) @@ -2001,12 +2270,17 @@ def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | N merge_id += 1 runs = next_runs final_run = runs[0] - sorted_values = np.load(final_run.values_path, mmap_mode="r") - positions = np.load(final_run.positions_path, mmap_mode="r") - _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) - del sorted_values, positions - final_run.values_path.unlink(missing_ok=True) - final_run.positions_path.unlink(missing_ok=True) + if descriptor["persistent"]: + _replace_full_descriptor_from_paths( + array, descriptor, final_run.values_path, final_run.positions_path, final_run.length + ) + else: + sorted_values = blosc2.open(str(final_run.values_path), mmap_mode="r")[:] + positions = blosc2.open(str(final_run.positions_path), mmap_mode="r")[:] + _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) + del sorted_values, positions + final_run.values_path.unlink(missing_ok=True) + final_run.positions_path.unlink(missing_ok=True) _clear_full_merge_cache(array, descriptor["token"]) _save_store(array, store) From 426dce9bdaf47d44bdee2114730d74ded904fb52 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sun, 5 Apr 2026 08:49:24 +0200 Subject: [PATCH 27/68] Accelerate indexed queries with threaded chunk batching Add chunk-batch threading to the OOC query path for light and the Python fallback path for medium, then 
extend threading to the shared downstream execution layer used by ultralight and light. Keep scan-equivalent row order by processing contiguous chunk batches and merging batch results strictly in scheduled chunk order. --- bench/ndarray/index_query_bench.py | 3 +- src/blosc2/indexing.py | 379 +++++++++++++++++++++-------- tests/ndarray/test_indexing.py | 54 ++++ 3 files changed, 332 insertions(+), 104 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index f645a80c..6aeb0992 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -233,11 +233,12 @@ def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int, in_me for descriptor in arr.indexes: if descriptor.get("version") != blosc2_indexing.INDEX_FORMAT_VERSION: continue + expected_ooc = descriptor.get("ooc", False) if kind == "ultralight" else (not bool(in_mem)) if ( descriptor.get("field") == "id" and descriptor.get("kind") == kind and int(descriptor.get("optlevel", -1)) == int(optlevel) - and bool(descriptor.get("ooc", False)) is (not bool(in_mem)) + and bool(descriptor.get("ooc", False)) is bool(expected_ooc) and not descriptor.get("stale", False) ): return descriptor diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 05006d47..cc83fd48 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -14,6 +14,7 @@ import re import tempfile import weakref +from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass from pathlib import Path @@ -48,6 +49,7 @@ FULL_SELECTIVE_OOC_MAX_SPANS = 128 FULL_RUN_BOUNDED_FALLBACK_RUNS = 8 FULL_RUN_BOUNDED_FALLBACK_ITEMS = 1_000_000 +INDEX_QUERY_MIN_CHUNKS_PER_THREAD = 8 def _sanitize_token(token: str) -> str: @@ -3288,6 +3290,126 @@ def _chunk_nav_candidate_runs( return [(segment_lo, segment_hi)], segment_hi - segment_lo +def _index_query_thread_count(task_count: int) -> int: + if task_count < INDEX_QUERY_MIN_CHUNKS_PER_THREAD: + 
return 1 + configured_threads = int(getattr(blosc2, "nthreads", 1) or 1) + return max(1, min(configured_threads, task_count // INDEX_QUERY_MIN_CHUNKS_PER_THREAD)) + + +def _chunk_batches(chunk_ids: np.ndarray, thread_count: int) -> list[np.ndarray]: + if thread_count <= 1 or len(chunk_ids) == 0: + return [chunk_ids] + batch_size = max(1, math.ceil(len(chunk_ids) / thread_count)) + return [chunk_ids[start : start + batch_size] for start in range(0, len(chunk_ids), batch_size)] + + +def _downstream_query_thread_count(task_count: int, plan: IndexPlan) -> int: + if plan.lookup_path == "chunk-nav-ooc": + return 1 + return _index_query_thread_count(task_count) + + +def _merge_position_batches(position_batches: list[np.ndarray]) -> np.ndarray: + if not position_batches: + return np.empty(0, dtype=np.int64) + return np.concatenate(position_batches) if len(position_batches) > 1 else position_batches[0] + + +def _run_position_batches(chunk_ids: np.ndarray, thread_count: int, process_batch) -> tuple[np.ndarray, int]: + if thread_count <= 1: + return process_batch(chunk_ids) + batches = _chunk_batches(chunk_ids, thread_count) + position_batches = [] + total_candidate_segments = 0 + with ThreadPoolExecutor(max_workers=thread_count) as executor: + for positions_part, batch_candidate_segments in executor.map(process_batch, batches): + total_candidate_segments += batch_candidate_segments + if len(positions_part) > 0: + position_batches.append(positions_part) + return _merge_position_batches(position_batches), total_candidate_segments + + +def _light_batch_result_dtype(where_x) -> np.dtype: + return _where_output_dtype(where_x) + + +def _light_worker_source(where_x): + if _supports_block_reads(where_x) and getattr(where_x, "urlpath", None) is not None: + return blosc2.open(str(where_x.urlpath), mmap_mode="r") + return where_x + + +def _light_match_from_span(span: np.ndarray, plan: IndexPlan) -> np.ndarray: + if plan.target is not None and plan.target.get("source") == "expression": 
+ field_values = _values_from_numpy_target(span, plan.target) + else: + field_values = span if plan.field is None else span[plan.field] + match = np.ones(len(field_values), dtype=bool) + if plan.lower is not None: + match &= field_values >= plan.lower if plan.lower_inclusive else field_values > plan.lower + if plan.upper is not None: + match &= field_values <= plan.upper if plan.upper_inclusive else field_values < plan.upper + return match + + +def _process_light_chunk_batch( + chunk_ids: np.ndarray, where_x, plan: IndexPlan, total_len: int, chunk_len: int +) -> np.ndarray: + parts = [] + local_where_x = _light_worker_source(where_x) + for chunk_id in chunk_ids: + bucket_mask = plan.bucket_masks[int(chunk_id)] + chunk_start = int(chunk_id) * plan.chunk_len + chunk_stop = min(chunk_start + plan.chunk_len, total_len) + for run_start, run_stop in _contiguous_true_runs(np.asarray(bucket_mask, dtype=bool)): + start = chunk_start + run_start * plan.bucket_len + stop = min(chunk_start + run_stop * plan.bucket_len, chunk_stop) + if start >= stop: + continue + if _supports_block_reads(local_where_x): + span = np.empty(stop - start, dtype=local_where_x.dtype) + base_chunk_id = start // chunk_len + local_start = start - base_chunk_id * chunk_len + local_where_x.get_1d_span_numpy(span, base_chunk_id, local_start, stop - start) + else: + span = local_where_x[start:stop] + match = _light_match_from_span(span, plan) + if np.any(match): + parts.append(np.require(span[match], requirements="C")) + if not parts: + return np.empty(0, dtype=_light_batch_result_dtype(where_x)) + return np.concatenate(parts) if len(parts) > 1 else parts[0] + + +def _merge_result_batches(parts: list[np.ndarray], dtype: np.dtype) -> np.ndarray: + parts = [part for part in parts if len(part) > 0] + if not parts: + return np.empty(0, dtype=dtype) + return np.concatenate(parts) if len(parts) > 1 else parts[0] + + +def _reduced_positions_from_cython_batches( + candidate_chunk_ids: np.ndarray, thread_count: 
int, process_batch +) -> tuple[np.ndarray, int]: + return _run_position_batches(candidate_chunk_ids, thread_count, process_batch) + + +def _reduced_positions_from_python_batches( + candidate_chunk_ids: np.ndarray, thread_count: int, process_batch +) -> tuple[list[np.ndarray], int]: + if thread_count <= 1: + return process_batch(candidate_chunk_ids) + parts = [] + total_candidate_segments = 0 + batches = _chunk_batches(candidate_chunk_ids, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) as executor: + for batch_parts, batch_candidate_segments in executor.map(process_batch, batches): + total_candidate_segments += batch_candidate_segments + parts.extend(batch_parts) + return parts, total_candidate_segments + + def _sorted_boundary_search_bounds(boundaries: np.ndarray, plan: ExactPredicatePlan) -> tuple[int, int]: if len(boundaries) == 0: return 0, 0 @@ -3356,31 +3478,56 @@ def _bucket_masks_from_light_chunk_nav_ooc( value_lossy_bits = int(light.get("value_lossy_bits", 0)) search_plan = _light_search_plan(plan, dtype, value_lossy_bits) total_candidate_segments = 0 - l2_row = np.empty(nsegments_per_chunk, dtype=_boundary_dtype(dtype)) - span_values = np.empty(chunk_len, dtype=dtype) - bucket_ids = np.empty(chunk_len, dtype=bucket_dtype) - - for chunk_id in np.flatnonzero(candidate_chunks): - chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id]) - segment_count = _segment_row_count(chunk_items, nav_segment_len) - l2_sidecar.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk) - segment_runs, candidate_segments = _chunk_nav_candidate_runs(l2_row, segment_count, plan) - total_candidate_segments += candidate_segments - if not segment_runs: - continue + candidate_chunk_ids = np.flatnonzero(candidate_chunks).astype(np.intp, copy=False) - for seg_start_idx, seg_stop_idx in segment_runs: - local_start = seg_start_idx * nav_segment_len - local_stop = min(seg_stop_idx * nav_segment_len, chunk_items) - span_items = local_stop - local_start - 
values_view = span_values[:span_items] - values_sidecar.get_1d_span_numpy(values_view, int(chunk_id), local_start, span_items) - lo, hi = _search_bounds(values_view, search_plan) - if lo >= hi: + def process_batch(chunk_ids: np.ndarray) -> tuple[list[tuple[int, np.ndarray]], int]: + if len(chunk_ids) == 0: + return [], 0 + batch_values = blosc2.open(light["values_path"], mmap_mode="r") + batch_buckets = blosc2.open(light["bucket_positions_path"], mmap_mode="r") + batch_l2 = blosc2.open(light["l2_path"], mmap_mode="r") + batch_results = [] + batch_candidate_segments = 0 + l2_row = np.empty(nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + span_values = np.empty(chunk_len, dtype=dtype) + bucket_ids = np.empty(chunk_len, dtype=bucket_dtype) + for chunk_id in chunk_ids: + chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id]) + segment_count = _segment_row_count(chunk_items, nav_segment_len) + batch_l2.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk) + segment_runs, candidate_segments = _chunk_nav_candidate_runs(l2_row, segment_count, plan) + batch_candidate_segments += candidate_segments + if not segment_runs: continue - bucket_view = bucket_ids[: hi - lo] - bucket_sidecar.get_1d_span_numpy(bucket_view, int(chunk_id), local_start + lo, hi - lo) - bucket_masks[int(chunk_id), bucket_view.astype(np.intp, copy=False)] = True + matched_buckets = np.zeros(int(light["bucket_count"]), dtype=bool) + for seg_start_idx, seg_stop_idx in segment_runs: + local_start = seg_start_idx * nav_segment_len + local_stop = min(seg_stop_idx * nav_segment_len, chunk_items) + span_items = local_stop - local_start + values_view = span_values[:span_items] + batch_values.get_1d_span_numpy(values_view, int(chunk_id), local_start, span_items) + lo, hi = _search_bounds(values_view, search_plan) + if lo >= hi: + continue + bucket_view = bucket_ids[: hi - lo] + batch_buckets.get_1d_span_numpy(bucket_view, int(chunk_id), local_start + lo, hi - lo) + 
matched_buckets[bucket_view.astype(np.intp, copy=False)] = True + if np.any(matched_buckets): + batch_results.append((int(chunk_id), matched_buckets)) + return batch_results, batch_candidate_segments + + thread_count = _index_query_thread_count(len(candidate_chunk_ids)) + if thread_count <= 1: + batch_results, total_candidate_segments = process_batch(candidate_chunk_ids) + for chunk_id, matched_buckets in batch_results: + bucket_masks[chunk_id] = matched_buckets + else: + batches = _chunk_batches(candidate_chunk_ids, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) as executor: + for batch_results, batch_candidate_segments in executor.map(process_batch, batches): + total_candidate_segments += batch_candidate_segments + for chunk_id, matched_buckets in batch_results: + bucket_masks[chunk_id] = matched_buckets return bucket_masks, int(np.count_nonzero(candidate_chunks)), total_candidate_segments @@ -3395,27 +3542,32 @@ def _exact_positions_from_reduced_chunk_nav_ooc( if not np.any(candidate_chunks): return np.empty(0, dtype=np.int64), 0, 0 - values_sidecar, positions_sidecar, l2_sidecar = _load_reduced_sidecar_handles(array, descriptor) dtype = np.dtype(descriptor["dtype"]) chunk_len = int(reduced["chunk_len"]) nav_segment_len = int(reduced["nav_segment_len"]) nsegments_per_chunk = int(reduced["nsegments_per_chunk"]) local_position_dtype = np.dtype(reduced.get("position_dtype", np.uint32)) candidate_chunk_ids = np.flatnonzero(candidate_chunks).astype(np.intp, copy=False) - l2_row = np.empty(nsegments_per_chunk, dtype=_boundary_dtype(dtype)) - span_values = np.empty(chunk_len, dtype=dtype) - local_positions = np.empty(chunk_len, dtype=local_position_dtype) - - try: - positions, total_candidate_segments = indexing_ext.index_collect_reduced_chunk_nav_positions( + l2_boundary_dtype = _boundary_dtype(dtype) + + def process_cython_batch(chunk_ids: np.ndarray) -> tuple[np.ndarray, int]: + if len(chunk_ids) == 0: + return np.empty(0, dtype=np.int64), 0 + 
batch_values = blosc2.open(reduced["values_path"], mmap_mode="r") + batch_positions = blosc2.open(reduced["positions_path"], mmap_mode="r") + batch_l2 = blosc2.open(reduced["l2_path"], mmap_mode="r") + batch_l2_row = np.empty(nsegments_per_chunk, dtype=l2_boundary_dtype) + batch_span_values = np.empty(chunk_len, dtype=dtype) + batch_local_positions = np.empty(chunk_len, dtype=local_position_dtype) + return indexing_ext.index_collect_reduced_chunk_nav_positions( offsets, - candidate_chunk_ids, - values_sidecar, - positions_sidecar, - l2_sidecar, - l2_row, - span_values, - local_positions, + chunk_ids, + batch_values, + batch_positions, + batch_l2, + batch_l2_row, + batch_span_values, + batch_local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, @@ -3424,36 +3576,55 @@ def _exact_positions_from_reduced_chunk_nav_ooc( plan.upper, plan.upper_inclusive, ) + + try: + thread_count = _index_query_thread_count(len(candidate_chunk_ids)) + positions, total_candidate_segments = _reduced_positions_from_cython_batches( + candidate_chunk_ids, thread_count, process_cython_batch + ) if len(positions) == 0: return np.empty(0, dtype=np.int64), int(candidate_chunk_ids.size), total_candidate_segments return np.sort(positions, kind="stable"), int(candidate_chunk_ids.size), total_candidate_segments except TypeError: pass - parts = [] - total_candidate_segments = 0 - - for chunk_id in candidate_chunk_ids: - chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id]) - segment_count = _segment_row_count(chunk_items, nav_segment_len) - l2_sidecar.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk) - segment_runs, candidate_segments = _chunk_nav_candidate_runs(l2_row, segment_count, plan) - total_candidate_segments += candidate_segments - if not segment_runs: - continue - - for seg_start_idx, seg_stop_idx in segment_runs: - local_start = seg_start_idx * nav_segment_len - local_stop = min(seg_stop_idx * nav_segment_len, chunk_items) - span_items = local_stop - 
local_start - values_view = span_values[:span_items] - values_sidecar.get_1d_span_numpy(values_view, int(chunk_id), local_start, span_items) - lo, hi = _search_bounds(values_view, plan) - if lo >= hi: + def process_batch(chunk_ids: np.ndarray) -> tuple[list[np.ndarray], int]: + if len(chunk_ids) == 0: + return [], 0 + batch_values = blosc2.open(reduced["values_path"], mmap_mode="r") + batch_positions = blosc2.open(reduced["positions_path"], mmap_mode="r") + batch_l2 = blosc2.open(reduced["l2_path"], mmap_mode="r") + batch_parts = [] + batch_candidate_segments = 0 + l2_row = np.empty(nsegments_per_chunk, dtype=l2_boundary_dtype) + span_values = np.empty(chunk_len, dtype=dtype) + local_positions = np.empty(chunk_len, dtype=local_position_dtype) + for chunk_id in chunk_ids: + chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id]) + segment_count = _segment_row_count(chunk_items, nav_segment_len) + batch_l2.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk) + segment_runs, candidate_segments = _chunk_nav_candidate_runs(l2_row, segment_count, plan) + batch_candidate_segments += candidate_segments + if not segment_runs: continue - positions_view = local_positions[: hi - lo] - positions_sidecar.get_1d_span_numpy(positions_view, int(chunk_id), local_start + lo, hi - lo) - parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + for seg_start_idx, seg_stop_idx in segment_runs: + local_start = seg_start_idx * nav_segment_len + local_stop = min(seg_stop_idx * nav_segment_len, chunk_items) + span_items = local_stop - local_start + values_view = span_values[:span_items] + batch_values.get_1d_span_numpy(values_view, int(chunk_id), local_start, span_items) + lo, hi = _search_bounds(values_view, plan) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + batch_positions.get_1d_span_numpy(positions_view, int(chunk_id), local_start + lo, hi - lo) + batch_parts.append(chunk_id * chunk_len + 
positions_view.astype(np.int64, copy=False)) + return batch_parts, batch_candidate_segments + + thread_count = _index_query_thread_count(len(candidate_chunk_ids)) + parts, total_candidate_segments = _reduced_positions_from_python_batches( + candidate_chunk_ids, thread_count, process_batch + ) if not parts: return np.empty(0, dtype=np.int64), int(candidate_chunk_ids.size), total_candidate_segments @@ -3806,19 +3977,34 @@ def evaluate_segment_query( if plan.base is None or plan.candidate_units is None or plan.segment_len is None: raise ValueError("segment evaluation requires a segment-based plan") - parts = [] - chunk_operands = {} - for unit in np.flatnonzero(plan.candidate_units): - start = int(unit) * plan.segment_len - stop = min(start + plan.segment_len, plan.base.shape[0]) - cslice = (slice(start, stop, 1),) - get_chunk_operands(operands, cslice, chunk_operands, plan.base.shape) - result, _ = _get_result(expression, chunk_operands, ne_args, where) - if len(result) > 0: - parts.append(np.require(result, requirements="C")) + candidate_units = np.flatnonzero(plan.candidate_units).astype(np.intp, copy=False) + + def process_batch(units: np.ndarray) -> np.ndarray: + chunk_operands = {} + parts = [] + for unit in units: + start = int(unit) * plan.segment_len + stop = min(start + plan.segment_len, plan.base.shape[0]) + cslice = (slice(start, stop, 1),) + get_chunk_operands(operands, cslice, chunk_operands, plan.base.shape) + result, _ = _get_result(expression, chunk_operands, ne_args, where) + if len(result) > 0: + parts.append(np.require(result, requirements="C")) + if not parts: + return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) + return np.concatenate(parts) if len(parts) > 1 else parts[0] + + thread_count = _downstream_query_thread_count(len(candidate_units), plan) + if thread_count <= 1: + parts = [process_batch(candidate_units)] + else: + batches = _chunk_batches(candidate_units, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) 
as executor: + parts = list(executor.map(process_batch, batches)) + parts = [part for part in parts if len(part) > 0] if parts: - return np.concatenate(parts) + return np.concatenate(parts) if len(parts) > 1 else parts[0] return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) @@ -3830,42 +4016,29 @@ def evaluate_light_query( if plan.base is None or plan.bucket_masks is None or plan.chunk_len is None or plan.bucket_len is None: raise ValueError("light evaluation requires bucket masks and chunk geometry") - parts = [] total_len = int(plan.base.shape[0]) chunk_len = int(plan.base.chunks[0]) where_x = where["_where_x"] - for chunk_id, bucket_mask in enumerate(plan.bucket_masks): - if not np.any(bucket_mask): - continue - chunk_start = chunk_id * plan.chunk_len - chunk_stop = min(chunk_start + plan.chunk_len, total_len) - for run_start, run_stop in _contiguous_true_runs(np.asarray(bucket_mask, dtype=bool)): - start = chunk_start + run_start * plan.bucket_len - stop = min(chunk_start + run_stop * plan.bucket_len, chunk_stop) - if start >= stop: - continue - if _supports_block_reads(where_x): - span = np.empty(stop - start, dtype=where_x.dtype) - base_chunk_id = start // chunk_len - local_start = start - base_chunk_id * chunk_len - where_x.get_1d_span_numpy(span, base_chunk_id, local_start, stop - start) - else: - span = where_x[start:stop] - if plan.target is not None and plan.target.get("source") == "expression": - field_values = _values_from_numpy_target(span, plan.target) - else: - field_values = span if plan.field is None else span[plan.field] - match = np.ones(len(field_values), dtype=bool) - if plan.lower is not None: - match &= field_values >= plan.lower if plan.lower_inclusive else field_values > plan.lower - if plan.upper is not None: - match &= field_values <= plan.upper if plan.upper_inclusive else field_values < plan.upper - if np.any(match): - parts.append(np.require(span[match], requirements="C")) + candidate_chunk_ids = 
np.flatnonzero(np.any(plan.bucket_masks, axis=1)).astype(np.intp, copy=False) - if parts: - return np.concatenate(parts) - return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) + thread_count = _downstream_query_thread_count(len(candidate_chunk_ids), plan) + if thread_count <= 1: + parts = [_process_light_chunk_batch(candidate_chunk_ids, where_x, plan, total_len, chunk_len)] + else: + batches = _chunk_batches(candidate_chunk_ids, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) as executor: + parts = list( + executor.map( + _process_light_chunk_batch, + batches, + [where_x] * len(batches), + [plan] * len(batches), + [total_len] * len(batches), + [chunk_len] * len(batches), + ) + ) + + return _merge_result_batches(parts, _where_output_dtype(where["_where_x"])) def _gather_positions(where_x, positions: np.ndarray) -> np.ndarray: diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index c985f225..86301fa3 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -195,6 +195,60 @@ def test_light_lossy_float_values_match_scan(): np.testing.assert_array_equal(indexed, data[(data["x"] >= -12.5) & (data["x"] < 17.25)]) +def test_ultralight_threaded_downstream_order_matches_scan(monkeypatch): + dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + data = np.zeros(240_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + data["payload"] = np.arange(data.shape[0], dtype=np.int32) + + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) + arr.create_index(field="id", kind="ultralight") + + indexing = __import__("blosc2.indexing", fromlist=["INDEX_QUERY_MIN_CHUNKS_PER_THREAD"]) + monkeypatch.setattr(indexing, "INDEX_QUERY_MIN_CHUNKS_PER_THREAD", 1) + monkeypatch.setattr(blosc2, "nthreads", 4) + + expr = blosc2.lazyexpr("(id >= 60_000) & (id < 180_000)", arr.fields).where(arr) + explanation = expr.explain() + + assert explanation["will_use_index"] is True + 
assert explanation["level"] == "chunk" + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = data[(data["id"] >= 60_000) & (data["id"] < 180_000)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + +def test_light_threaded_downstream_order_matches_scan(monkeypatch): + dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + data = np.zeros(240_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + data["payload"] = np.arange(data.shape[0], dtype=np.int32) + + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) + arr.create_index(field="id", kind="light", in_mem=True) + + indexing = __import__("blosc2.indexing", fromlist=["INDEX_QUERY_MIN_CHUNKS_PER_THREAD"]) + monkeypatch.setattr(indexing, "INDEX_QUERY_MIN_CHUNKS_PER_THREAD", 1) + monkeypatch.setattr(blosc2, "nthreads", 4) + + expr = blosc2.lazyexpr("(id >= 60_000) & (id < 180_000)", arr.fields).where(arr) + explanation = expr.explain() + + assert explanation["will_use_index"] is True + assert explanation["lookup_path"] == "chunk-nav" + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = data[(data["id"] >= 60_000) & (data["id"] < 180_000)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_persistent_index_survives_reopen(tmp_path, kind): path = tmp_path / "indexed_array.b2nd" From d3e9cc8eb17caecc851529003a37cad6abfa62e4 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sun, 5 Apr 2026 09:35:23 +0200 Subject: [PATCH 28/68] Add configurable chunk/block geometry to index query bench --- bench/ndarray/index_query_bench.py | 132 +++++++++++++++++++++++------ 1 file changed, 104 insertions(+), 28 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 6aeb0992..c381f6c1 100644 --- 
a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -21,8 +21,6 @@ from blosc2 import indexing as blosc2_indexing SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) -CHUNK_LEN = 100_000 -BLOCK_LEN = 20_000 DEFAULT_REPEATS = 3 KINDS = ("ultralight", "light", "medium", "full") DISTS = ("sorted", "block-shuffled", "random") @@ -35,6 +33,10 @@ def dtype_token(dtype: np.dtype) -> str: return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") +def source_dtype(id_dtype: np.dtype) -> np.dtype: + return np.dtype([("id", np.dtype(id_dtype)), ("payload", np.float32)]) + + def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: dtype = np.dtype(dtype) if dtype == np.dtype(np.bool_): @@ -59,19 +61,19 @@ def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: raise ValueError(f"unsupported dtype for benchmark: {dtype}") -def fill_ids(ids: np.ndarray, ordered_ids: np.ndarray, dist: str, rng: np.random.Generator) -> None: +def fill_ids(ids: np.ndarray, ordered_ids: np.ndarray, dist: str, rng: np.random.Generator, block_len: int) -> None: size = ids.shape[0] if dist == "sorted": ids[:] = ordered_ids return if dist == "block-shuffled": - nblocks = (size + BLOCK_LEN - 1) // BLOCK_LEN + nblocks = (size + block_len - 1) // block_len order = rng.permutation(nblocks) dest = 0 for src_block in order: - src_start = int(src_block) * BLOCK_LEN - src_stop = min(src_start + BLOCK_LEN, size) + src_start = int(src_block) * block_len + src_stop = min(src_start + block_len, size) block_size = src_stop - src_start ids[dest : dest + block_size] = ordered_ids[src_start:src_stop] dest += block_size @@ -85,30 +87,68 @@ def fill_ids(ids: np.ndarray, ordered_ids: np.ndarray, dist: str, rng: np.random raise ValueError(f"unsupported distribution {dist!r}") -def make_source_data(size: int, dist: str, id_dtype: np.dtype) -> np.ndarray: - dtype = np.dtype([("id", id_dtype), ("payload", np.float32)]) +def _geometry_value_token(value: int | None) -> 
str: + return "auto" if value is None else f"{value}" + + +def geometry_token(chunks: int | None, blocks: int | None) -> str: + return f"chunks-{_geometry_value_token(chunks)}.blocks-{_geometry_value_token(blocks)}" + + +def format_geometry_value(value: int | None) -> str: + return "auto" if value is None else f"{value:,}" + + +def resolve_geometry(shape: tuple[int, ...], dtype: np.dtype, chunks: int | None, blocks: int | None) -> tuple[int, int]: + chunk_spec = None if chunks is None else (chunks,) + block_spec = None if blocks is None else (blocks,) + resolved_chunks, resolved_blocks = blosc2.compute_chunks_blocks(shape, chunk_spec, block_spec, dtype=dtype) + return int(resolved_chunks[0]), int(resolved_blocks[0]) + + +def make_source_data(size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None) -> np.ndarray: + dtype = source_dtype(id_dtype) data = np.zeros(size, dtype=dtype) - fill_ids(data["id"], make_ordered_ids(size, id_dtype), dist, np.random.default_rng(RNG_SEED)) + _, block_len = resolve_geometry((size,), dtype, chunks, blocks) + fill_ids(data["id"], make_ordered_ids(size, id_dtype), dist, np.random.default_rng(RNG_SEED), block_len) return data -def build_array(data: np.ndarray) -> blosc2.NDArray: - return blosc2.asarray(data, chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) +def build_array(data: np.ndarray, chunks: int | None, blocks: int | None) -> blosc2.NDArray: + kwargs = {} + if chunks is not None: + kwargs["chunks"] = (chunks,) + if blocks is not None: + kwargs["blocks"] = (blocks,) + return blosc2.asarray(data, **kwargs) -def build_persistent_array(data: np.ndarray, path: Path) -> blosc2.NDArray: - return blosc2.asarray(data, urlpath=path, mode="w", chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) +def build_persistent_array(data: np.ndarray, path: Path, chunks: int | None, blocks: int | None) -> blosc2.NDArray: + kwargs = {"urlpath": path, "mode": "w"} + if chunks is not None: + kwargs["chunks"] = (chunks,) + if blocks is not None: + 
kwargs["blocks"] = (blocks,) + return blosc2.asarray(data, **kwargs) -def base_array_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype) -> Path: - return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.b2nd" +def base_array_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None) -> Path: + return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.b2nd" def indexed_array_path( - size_dir: Path, size: int, dist: str, kind: str, optlevel: int, id_dtype: np.dtype, in_mem: bool + size_dir: Path, + size: int, + dist: str, + kind: str, + optlevel: int, + id_dtype: np.dtype, + in_mem: bool, + chunks: int | None, + blocks: int | None, ) -> Path: mode = "mem" if in_mem else "ooc" - return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{kind}.opt{optlevel}.{mode}.b2nd" + return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.{kind}.opt{optlevel}.{mode}.b2nd" def benchmark_scan_once(expr) -> tuple[float, int]: @@ -179,13 +219,13 @@ def index_sizes(descriptor: dict) -> tuple[int, int]: return logical, disk -def _source_data_factory(size: int, dist: str, id_dtype: np.dtype): +def _source_data_factory(size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None): data = None def get_data() -> np.ndarray: nonlocal data if data is None: - data = make_source_data(size, dist, id_dtype) + data = make_source_data(size, dist, id_dtype, chunks, blocks) return data return get_data @@ -245,15 +285,15 @@ def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int, in_me return None -def _open_or_build_persistent_array(path: Path, get_data) -> blosc2.NDArray: +def _open_or_build_persistent_array(path: Path, get_data, chunks: int | None, blocks: int | None) -> blosc2.NDArray: if path.exists(): return blosc2.open(path, mode="a") blosc2.remove_urlpath(path) - return build_persistent_array(get_data(), path) + 
return build_persistent_array(get_data(), path, chunks, blocks) def _open_or_build_indexed_array( - path: Path, get_data, kind: str, optlevel: int, in_mem: bool + path: Path, get_data, kind: str, optlevel: int, in_mem: bool, chunks: int | None, blocks: int | None ) -> tuple[blosc2.NDArray, float]: if path.exists(): arr = blosc2.open(path, mode="a") @@ -263,7 +303,7 @@ def _open_or_build_indexed_array( arr.drop_index(field="id") blosc2.remove_urlpath(path) - arr = build_persistent_array(get_data(), path) + arr = build_persistent_array(get_data(), path, chunks, blocks) build_start = time.perf_counter() arr.create_index(field="id", kind=kind, optlevel=optlevel, in_mem=in_mem) return arr, time.perf_counter() - build_start @@ -278,10 +318,12 @@ def benchmark_size( id_dtype: np.dtype, in_mem: bool, full_query_mode: str, + chunks: int | None, + blocks: int | None, ) -> list[dict]: - get_data = _source_data_factory(size, dist, id_dtype) + get_data = _source_data_factory(size, dist, id_dtype, chunks, blocks) get_ordered_ids = _ordered_ids_factory(size, id_dtype) - arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist, id_dtype), get_data) + arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist, id_dtype, chunks, blocks), get_data, chunks, blocks) lo, hi = _query_bounds(get_ordered_ids(), query_width) condition_str = _condition_expr(lo, hi, id_dtype) condition = blosc2.lazyexpr(condition_str, arr.fields) @@ -294,11 +336,13 @@ def benchmark_size( rows = [] for kind in KINDS: idx_arr, build_time = _open_or_build_indexed_array( - indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype, in_mem), + indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype, in_mem, chunks, blocks), get_data, kind, optlevel, in_mem, + chunks, + blocks, ) idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) idx_expr = idx_cond.where(idx_arr) @@ -378,6 +422,13 @@ def parse_human_size(value: str) -> int: return size +def 
parse_human_size_or_auto(value: str) -> int | None: + value = value.strip() + if value.lower() == "auto": + return None + return parse_human_size(value) + + def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Benchmark python-blosc2 index kinds.") parser.add_argument( @@ -391,6 +442,18 @@ def parse_args() -> argparse.Namespace: default=1_000, help="Width of the range predicate. Supports suffixes like 1k, 1K, 1M, 1G. Default: 1000.", ) + parser.add_argument( + "--chunks", + type=parse_human_size_or_auto, + default=None, + help="Chunk size for the base array. Supports suffixes like 10k, 1M, and 'auto'. Default: auto.", + ) + parser.add_argument( + "--blocks", + type=parse_human_size_or_auto, + default=None, + help="Block size for the base array. Supports suffixes like 10k, 1M, and 'auto'. Default: auto.", + ) parser.add_argument( "--repeats", type=int, @@ -460,6 +523,8 @@ def main() -> None: id_dtype, args.in_mem, args.full_query_mode, + args.chunks, + args.blocks, ) else: args.outdir.mkdir(parents=True, exist_ok=True) @@ -474,6 +539,8 @@ def main() -> None: id_dtype, args.in_mem, args.full_query_mode, + args.chunks, + args.blocks, ) @@ -488,18 +555,27 @@ def run_benchmarks( id_dtype: np.dtype, in_mem: bool, full_query_mode: str, + chunks: int | None, + blocks: int | None, ) -> None: all_results = [] + array_dtype = source_dtype(id_dtype) + resolved_geometries = {resolve_geometry((size,), array_dtype, chunks, blocks) for size in sizes} + if len(resolved_geometries) == 1: + resolved_chunk_len, resolved_block_len = next(iter(resolved_geometries)) + geometry_label = f"chunks={resolved_chunk_len:,}, blocks={resolved_block_len:,}" + else: + geometry_label = "chunks=varies, blocks=varies" print("Structured range-query benchmark across index kinds") print( - f"chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, dist={dist_label}, " + f"{geometry_label}, repeats={repeats}, dist={dist_label}, " f"query_width={query_width:,}, 
optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}, " f"full_query_mode={full_query_mode}" ) for dist in dists: for size in sizes: size_results = benchmark_size( - size, size_dir, dist, query_width, optlevel, id_dtype, in_mem, full_query_mode + size, size_dir, dist, query_width, optlevel, id_dtype, in_mem, full_query_mode, chunks, blocks ) all_results.extend(size_results) From 9498a1a1fa999da482304921d65244fe42c37bbe Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sun, 5 Apr 2026 10:02:03 +0200 Subject: [PATCH 29/68] New geometry for blocks in medium indexes --- src/blosc2/indexing.py | 18 +++++++++--------- tests/ndarray/test_indexing.py | 7 +++++-- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index cc83fd48..fa2ab687 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -770,7 +770,7 @@ def _build_reduced_descriptor( persistent: bool, ) -> dict: chunk_len = int(array.chunks[0]) - nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), optlevel) + nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), chunk_len, optlevel) sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload( values, chunk_len, nav_segment_len ) @@ -815,18 +815,18 @@ def _sidecar_block_len(sidecar: dict, fallback_block_len: int) -> int: def _medium_nav_segment_divisor(optlevel: int) -> int: if optlevel <= 1: return 1 - if optlevel == 2: + if optlevel <= 3: return 2 - if optlevel == 3: - return 4 if optlevel <= 6: - return 8 - return 16 + return 4 + return 8 -def _medium_nav_segment_len(block_len: int, optlevel: int) -> tuple[int, int]: +def _medium_nav_segment_len(block_len: int, chunk_len: int, optlevel: int) -> tuple[int, int]: divisor = min(block_len, _medium_nav_segment_divisor(int(optlevel))) - return max(1, block_len // divisor), divisor + max_segments_per_chunk = 2048 + chunk_floor = max(1, math.ceil(int(chunk_len) / 
max_segments_per_chunk)) + return max(1, block_len // divisor, chunk_floor), divisor def _build_chunk_sorted_payload( @@ -995,7 +995,7 @@ def _build_reduced_descriptor_ooc( workdir: Path, ) -> dict: chunk_len = int(array.chunks[0]) - nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), optlevel) + nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), chunk_len, optlevel) sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload_ooc( array, target, dtype, workdir, f"{kind}_reduced", chunk_len, nav_segment_len ) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 86301fa3..e8c6a2f5 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -4,6 +4,7 @@ # # SPDX-License-Identifier: BSD-3-Clause ####################################################################### +import math import numpy as np import pytest @@ -319,13 +320,15 @@ def test_chunk_local_index_descriptor_and_lookup_path(tmp_path, kind): assert meta["layout"] == "chunk-local-v1" assert meta["chunk_len"] == arr.chunks[0] - expected_nav_len = arr.blocks[0] if kind == "light" else arr.blocks[0] // 8 + expected_nav_len = ( + arr.blocks[0] if kind == "light" else max(arr.blocks[0] // 4, math.ceil(arr.chunks[0] / 2048)) + ) assert meta["nav_segment_len"] == expected_nav_len assert meta["l1_path"] is not None assert meta["l2_path"] is not None if kind == "medium": - assert meta["nav_segment_divisor"] == 8 + assert meta["nav_segment_divisor"] == 4 reopened = blosc2.open(path, mode="a") expr = (reopened == 123_456).where(reopened) From 53d7651f87218cf3299043dd2ee6e39576c93ce4 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Sun, 5 Apr 2026 10:04:19 +0200 Subject: [PATCH 30/68] Release OOC temp memmaps before Windows cleanup --- src/blosc2/indexing.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index fa2ab687..525093af 100644 
--- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -1018,6 +1018,7 @@ def _build_reduced_descriptor_ooc( ) reduced["position_dtype"] = positions.dtype.str reduced["nav_segment_divisor"] = nav_segment_divisor + del sorted_values, positions return reduced @@ -1228,6 +1229,7 @@ def _build_light_descriptor_ooc( light["bucket_len"] = bucket_len light["value_lossy_bits"] = value_lossy_bits light["bucket_dtype"] = bucket_positions.dtype.str + del sorted_values, positions, bucket_positions return light From ba9cdc14d54d048def126c73643ab6dcedb4ac8d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 6 Apr 2026 12:23:51 +0200 Subject: [PATCH 31/68] Remove memmap staging from light/medium index builds --- src/blosc2/indexing.py | 545 +++++++++++++++++++++++++-------- tests/ndarray/test_indexing.py | 33 ++ 2 files changed, 442 insertions(+), 136 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 525093af..b8903f3f 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -516,6 +516,32 @@ def _store_array_sidecar( return {"path": path, "dtype": data.dtype.descr if data.dtype.fields else data.dtype.str} +def _create_persistent_sidecar_handle( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + name: str, + length: int, + dtype: np.dtype, + *, + chunks: tuple[int, ...] | None = None, + blocks: tuple[int, ...] 
| None = None, +) -> tuple[blosc2.NDArray | None, dict]: + path = _sidecar_path(array, token, kind, f"{category}.{name}") + blosc2.remove_urlpath(path) + kwargs = {"urlpath": path, "mode": "w"} + if chunks is not None: + kwargs["chunks"] = chunks + if blocks is not None: + kwargs["blocks"] = blocks + if length == 0: + blosc2.asarray(np.empty(0, dtype=dtype), **kwargs) + return None, {"path": path, "dtype": dtype.descr if dtype.fields else dtype.str} + handle = blosc2.empty((length,), dtype=dtype, **kwargs) + return handle, {"path": path, "dtype": dtype.descr if dtype.fields else dtype.str} + + def _load_array_sidecar( array: blosc2.NDArray, token: str, category: str, name: str, path: str | None ) -> np.ndarray: @@ -796,11 +822,6 @@ def _build_reduced_descriptor( return reduced -def _open_temp_memmap(workdir: Path, name: str, dtype: np.dtype, shape: tuple[int, ...]) -> np.memmap: - path = workdir / f"{name}.npy" - return np.lib.format.open_memmap(path, mode="w+", dtype=dtype, shape=shape) - - def _segment_row_count(chunk_len: int, nav_segment_len: int) -> int: return max(1, math.ceil(chunk_len / nav_segment_len)) @@ -872,25 +893,29 @@ def _build_chunk_sorted_payload( return sorted_values, positions, offsets, l2, position_dtype -def _build_chunk_sorted_payload_ooc( +def _build_chunk_sorted_payload_direct( array: blosc2.NDArray, target: dict, dtype: np.dtype, - workdir: Path, - prefix: str, chunk_len: int, nav_segment_len: int, -) -> tuple[np.memmap, np.memmap, np.ndarray, np.ndarray, np.dtype]: + *, + payload_dtype: np.dtype | None = None, + aux_dtype: np.dtype | None = None, + value_transform=None, + aux_transform=None, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: size = int(array.shape[0]) nchunks = math.ceil(size / chunk_len) - position_dtype = _position_dtype(chunk_len - 1) + payload_dtype = np.dtype(dtype if payload_dtype is None else payload_dtype) + aux_dtype = np.dtype(_position_dtype(chunk_len - 1) if aux_dtype is None else aux_dtype) offsets 
= np.empty(nchunks + 1, dtype=np.int64) offsets[0] = 0 - sorted_values = _open_temp_memmap(workdir, f"{prefix}_values", dtype, (size,)) - positions = _open_temp_memmap(workdir, f"{prefix}_positions", position_dtype, (size,)) - l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + payload = np.empty(size, dtype=payload_dtype) + aux = np.empty(size, dtype=aux_dtype) + l1 = np.empty(nchunks, dtype=_boundary_dtype(payload_dtype)) nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) - l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(payload_dtype)) cursor = 0 for chunk_id in range(nchunks): @@ -900,23 +925,28 @@ def _build_chunk_sorted_payload_ooc( order = np.argsort(chunk, kind="stable") chunk_size = stop - start next_cursor = cursor + chunk_size - chunk_sorted = chunk[order] - sorted_values[cursor:next_cursor] = chunk_sorted - positions[cursor:next_cursor] = order.astype(position_dtype, copy=False) + chunk_payload = chunk[order] + if value_transform is not None: + chunk_payload = value_transform(chunk_payload) + chunk_aux = order.astype(_position_dtype(chunk_len - 1), copy=False) + if aux_transform is not None: + chunk_aux = aux_transform(chunk_aux) + payload[cursor:next_cursor] = chunk_payload + aux[cursor:next_cursor] = chunk_aux offsets[chunk_id + 1] = next_cursor - l1[chunk_id] = (chunk_sorted[0], chunk_sorted[-1]) - - row_start = chunk_id * nsegments_per_chunk - segment_count = _segment_row_count(chunk_size, nav_segment_len) - for segment_id in range(segment_count): - seg_start = cursor + segment_id * nav_segment_len - seg_stop = min(seg_start + nav_segment_len, next_cursor) - l2[row_start + segment_id] = (sorted_values[seg_start], sorted_values[seg_stop - 1]) - for segment_id in range(segment_count, nsegments_per_chunk): - l2[row_start + segment_id] = l2[row_start + segment_count - 1] + if chunk_size > 0: + l1[chunk_id] = (chunk_payload[0], 
chunk_payload[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_payload[seg_start], chunk_payload[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] cursor = next_cursor - return sorted_values, positions, offsets, l2, position_dtype + return payload, aux, offsets, l1, l2 def _chunk_index_payload_storage( @@ -984,6 +1014,85 @@ def _chunk_index_payload_storage( } +def _prepare_chunk_index_payload_sidecars( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + payload_name: str, + payload_dtype: np.dtype, + aux_name: str, + aux_dtype: np.dtype, + size: int, + chunk_len: int, + nav_segment_len: int, +) -> tuple[blosc2.NDArray | None, dict, blosc2.NDArray | None, dict]: + payload_handle, payload_sidecar = _create_persistent_sidecar_handle( + array, + token, + kind, + category, + payload_name, + size, + payload_dtype, + chunks=(chunk_len,), + blocks=(nav_segment_len,), + ) + aux_handle, aux_sidecar = _create_persistent_sidecar_handle( + array, + token, + kind, + category, + aux_name, + size, + aux_dtype, + chunks=(chunk_len,), + blocks=(nav_segment_len,), + ) + return payload_handle, payload_sidecar, aux_handle, aux_sidecar + + +def _finalize_chunk_index_payload_storage( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + aux_name: str, + offsets: np.ndarray, + l1: np.ndarray, + l2: np.ndarray, + payload_sidecar: dict, + aux_sidecar: dict, + chunk_len: int, + nav_segment_len: int, +) -> dict: + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + offsets_sidecar = _store_array_sidecar(array, token, kind, category, "offsets", offsets, True) + l1_sidecar = _store_array_sidecar(array, 
token, kind, f"{category}_nav", "l1", l1, True) + l2_sidecar = _store_array_sidecar( + array, + token, + kind, + f"{category}_nav", + "l2", + l2, + True, + chunks=(nsegments_per_chunk,), + blocks=(min(nsegments_per_chunk, max(1, nsegments_per_chunk)),), + ) + return { + "layout": "chunk-local-v1", + "chunk_len": chunk_len, + "nav_segment_len": nav_segment_len, + "nsegments_per_chunk": nsegments_per_chunk, + "values_path": payload_sidecar["path"], + f"{aux_name}_path": aux_sidecar["path"], + "offsets_path": offsets_sidecar["path"], + "l1_path": l1_sidecar["path"], + "l2_path": l2_sidecar["path"], + } + + def _build_reduced_descriptor_ooc( array: blosc2.NDArray, target: dict, @@ -992,14 +1101,94 @@ def _build_reduced_descriptor_ooc( dtype: np.dtype, optlevel: int, persistent: bool, - workdir: Path, ) -> dict: + if persistent: + size = int(array.shape[0]) + chunk_len = int(array.chunks[0]) + nav_segment_len, nav_segment_divisor = _medium_nav_segment_len( + int(array.blocks[0]), chunk_len, optlevel + ) + nchunks = math.ceil(size / chunk_len) + position_dtype = _position_dtype(chunk_len - 1) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + values_handle = positions_handle = None + values_sidecar = positions_sidecar = None + try: + values_handle, values_sidecar, positions_handle, positions_sidecar = ( + _prepare_chunk_index_payload_sidecars( + array, + token, + kind, + "reduced", + "values", + dtype, + "positions", + position_dtype, + size, + chunk_len, + nav_segment_len, + ) + ) + cursor = 0 + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk = _slice_values_for_target(array, target, start, stop) + order = np.argsort(chunk, kind="stable") + chunk_size = stop - start + next_cursor = cursor + 
chunk_size + chunk_sorted = chunk[order] + local_positions = order.astype(position_dtype, copy=False) + if values_handle is not None: + values_handle[cursor:next_cursor] = chunk_sorted + if positions_handle is not None: + positions_handle[cursor:next_cursor] = local_positions + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (chunk_sorted[0], chunk_sorted[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_sorted[seg_start], chunk_sorted[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + del values_handle, positions_handle + reduced = _finalize_chunk_index_payload_storage( + array, + token, + kind, + "reduced", + "positions", + offsets, + l1, + l2, + values_sidecar, + positions_sidecar, + chunk_len, + nav_segment_len, + ) + except Exception: + if values_sidecar is not None: + _remove_sidecar_path(values_sidecar["path"]) + if positions_sidecar is not None: + _remove_sidecar_path(positions_sidecar["path"]) + raise + reduced["position_dtype"] = position_dtype.str + reduced["nav_segment_divisor"] = nav_segment_divisor + return reduced + chunk_len = int(array.chunks[0]) nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), chunk_len, optlevel) - sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload_ooc( - array, target, dtype, workdir, f"{kind}_reduced", chunk_len, nav_segment_len + sorted_values, positions, offsets, l1, l2 = _build_chunk_sorted_payload_direct( + array, target, dtype, chunk_len, nav_segment_len ) - l1 = _compute_sorted_boundaries(np.asarray(sorted_values), dtype, chunk_len) reduced = _chunk_index_payload_storage( array, 
token, @@ -1018,7 +1207,6 @@ def _build_reduced_descriptor_ooc( ) reduced["position_dtype"] = positions.dtype.str reduced["nav_segment_divisor"] = nav_segment_divisor - del sorted_values, positions return reduced @@ -1139,6 +1327,60 @@ def _quantize_light_value_scalar(value, dtype: np.dtype, bits: int): return np.asarray(value, dtype=dtype)[()] +def _build_light_chunk_payloads( + array: blosc2.NDArray, + target: dict, + dtype: np.dtype, + chunk_len: int, + nav_segment_len: int, + value_lossy_bits: int, + bucket_len: int, + bucket_dtype: np.dtype, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + size = int(array.shape[0]) + nchunks = math.ceil(size / chunk_len) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty(size, dtype=dtype) + bucket_positions = np.empty(size, dtype=bucket_dtype) + l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + position_dtype = _position_dtype(chunk_len - 1) + cursor = 0 + + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk = _slice_values_for_target(array, target, start, stop) + order = np.argsort(chunk, kind="stable") + chunk_size = stop - start + next_cursor = cursor + chunk_size + chunk_sorted = chunk[order] + stored_chunk_sorted = chunk_sorted + if value_lossy_bits > 0: + stored_chunk_sorted = _quantize_light_values_array(chunk_sorted, value_lossy_bits) + local_positions = order.astype(position_dtype, copy=False) + sorted_values[cursor:next_cursor] = stored_chunk_sorted + bucket_positions[cursor:next_cursor] = (local_positions // bucket_len).astype( + bucket_dtype, copy=False + ) + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (stored_chunk_sorted[0], stored_chunk_sorted[-1]) + row_start = chunk_id * nsegments_per_chunk + 
segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_sorted[seg_start], chunk_sorted[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return sorted_values, bucket_positions, offsets, l1, l2 + + def _build_light_descriptor( array: blosc2.NDArray, token: str, @@ -1191,24 +1433,66 @@ def _build_light_descriptor_ooc( dtype: np.dtype, optlevel: int, persistent: bool, - workdir: Path, ) -> dict: chunk_len = int(array.chunks[0]) nav_segment_len = int(array.blocks[0]) bucket_len = max(1, math.ceil(nav_segment_len / 64)) bucket_count = math.ceil(chunk_len / bucket_len) value_lossy_bits = _light_value_lossy_bits(dtype, optlevel) - sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload_ooc( - array, target, dtype, workdir, f"{kind}_light", chunk_len, nav_segment_len - ) - if value_lossy_bits > 0: - sorted_values[:] = _quantize_light_values_array(np.asarray(sorted_values), value_lossy_bits) bucket_dtype = _position_dtype(bucket_count - 1) - bucket_positions = _open_temp_memmap( - workdir, f"{kind}_light_bucket_positions", bucket_dtype, positions.shape + sorted_values, bucket_positions, offsets, l1, l2 = _build_light_chunk_payloads( + array, target, dtype, chunk_len, nav_segment_len, value_lossy_bits, bucket_len, bucket_dtype ) - bucket_positions[:] = (np.asarray(positions) // bucket_len).astype(bucket_dtype, copy=False) - l1 = _compute_sorted_boundaries(np.asarray(sorted_values), dtype, chunk_len) + if persistent: + values_handle = bucket_handle = None + values_sidecar = bucket_sidecar = None + try: + values_handle, values_sidecar, bucket_handle, bucket_sidecar = ( + _prepare_chunk_index_payload_sidecars( + array, + token, + kind, + "light", + "values", 
+ dtype, + "bucket_positions", + bucket_dtype, + len(sorted_values), + chunk_len, + nav_segment_len, + ) + ) + if values_handle is not None: + values_handle[:] = sorted_values + if bucket_handle is not None: + bucket_handle[:] = bucket_positions + del values_handle, bucket_handle + light = _finalize_chunk_index_payload_storage( + array, + token, + kind, + "light", + "bucket_positions", + offsets, + l1, + l2, + values_sidecar, + bucket_sidecar, + chunk_len, + nav_segment_len, + ) + except Exception: + if values_sidecar is not None: + _remove_sidecar_path(values_sidecar["path"]) + if bucket_sidecar is not None: + _remove_sidecar_path(bucket_sidecar["path"]) + raise + light["bucket_count"] = bucket_count + light["bucket_len"] = bucket_len + light["value_lossy_bits"] = value_lossy_bits + light["bucket_dtype"] = bucket_dtype.str + return light + light = _chunk_index_payload_storage( array, token, @@ -1229,7 +1513,6 @@ def _build_light_descriptor_ooc( light["bucket_len"] = bucket_len light["value_lossy_bits"] = value_lossy_bits light["bucket_dtype"] = bucket_positions.dtype.str - del sorted_values, positions, bucket_positions return light @@ -1684,42 +1967,39 @@ def create_index( use_ooc = _resolve_ooc_mode(kind, in_mem) if use_ooc and kind in {"light", "medium", "full"}: - with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: - workdir = Path(tmpdir) - levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent) - light = ( - _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, workdir) - if kind == "light" - else None - ) - reduced = ( - _build_reduced_descriptor_ooc( - array, target, token, kind, dtype, optlevel, persistent, workdir + levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent) + light = ( + _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent) + if kind == "light" + else None + ) + reduced = ( + 
_build_reduced_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent) + if kind == "medium" + else None + ) + full = None + if kind == "full": + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: + full = _build_full_descriptor_ooc( + array, target, token, kind, dtype, persistent, Path(tmpdir) ) - if kind == "medium" - else None - ) - full = ( - _build_full_descriptor_ooc(array, target, token, kind, dtype, persistent, workdir) - if kind == "full" - else None - ) - descriptor = _build_descriptor( - array, - target, - token, - kind, - optlevel, - granularity, - persistent, - True, - name, - dtype, - levels, - light, - reduced, - full, - ) + descriptor = _build_descriptor( + array, + target, + token, + kind, + optlevel, + granularity, + persistent, + True, + name, + dtype, + levels, + light, + reduced, + full, + ) else: values = _values_for_target(array, target) levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent) @@ -1788,42 +2068,39 @@ def create_expr_index( token = _target_token(target) if use_ooc and kind in {"light", "medium", "full"}: - with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: - workdir = Path(tmpdir) - levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent) - light = ( - _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, workdir) - if kind == "light" - else None - ) - reduced = ( - _build_reduced_descriptor_ooc( - array, target, token, kind, dtype, optlevel, persistent, workdir + levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent) + light = ( + _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent) + if kind == "light" + else None + ) + reduced = ( + _build_reduced_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent) + if kind == "medium" + else None + ) + full = None + if kind == "full": + with 
tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: + full = _build_full_descriptor_ooc( + array, target, token, kind, dtype, persistent, Path(tmpdir) ) - if kind == "medium" - else None - ) - full = ( - _build_full_descriptor_ooc(array, target, token, kind, dtype, persistent, workdir) - if kind == "full" - else None - ) - descriptor = _build_descriptor( - array, - target, - token, - kind, - optlevel, - granularity, - persistent, - True, - name, - dtype, - levels, - light, - reduced, - full, - ) + descriptor = _build_descriptor( + array, + target, + token, + kind, + optlevel, + granularity, + persistent, + True, + name, + dtype, + levels, + light, + reduced, + full, + ) else: values = _values_for_target(array, target) levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent) @@ -1957,17 +2234,15 @@ def _replace_reduced_descriptor_tail( for key in ("values_path", "positions_path", "offsets_path", "l1_path", "l2_path"): _remove_sidecar_path(reduced.get(key)) if descriptor.get("ooc", False): - with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: - rebuilt = _build_reduced_descriptor_ooc( - array, - target, - descriptor["token"], - descriptor["kind"], - np.dtype(descriptor["dtype"]), - descriptor["optlevel"], - persistent, - Path(tmpdir), - ) + rebuilt = _build_reduced_descriptor_ooc( + array, + target, + descriptor["token"], + descriptor["kind"], + np.dtype(descriptor["dtype"]), + descriptor["optlevel"], + persistent, + ) else: rebuilt = _build_reduced_descriptor( array, @@ -1989,17 +2264,15 @@ def _replace_light_descriptor_tail( for key in ("values_path", "bucket_positions_path", "offsets_path", "l1_path", "l2_path"): _remove_sidecar_path(light.get(key)) if descriptor.get("ooc", False): - with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: - rebuilt = _build_light_descriptor_ooc( - array, - target, - descriptor["token"], - descriptor["kind"], - np.dtype(descriptor["dtype"]), - 
descriptor["optlevel"], - persistent, - Path(tmpdir), - ) + rebuilt = _build_light_descriptor_ooc( + array, + target, + descriptor["token"], + descriptor["kind"], + np.dtype(descriptor["dtype"]), + descriptor["optlevel"], + persistent, + ) else: rebuilt = _build_light_descriptor( array, diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index e8c6a2f5..6019784f 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -307,6 +307,39 @@ def test_default_ooc_persistent_index_matches_scan_and_rebuilds(tmp_path, kind): assert rebuilt["ooc"] is True +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_persistent_chunk_local_ooc_builds_do_not_use_temp_memmap(tmp_path, kind): + path = tmp_path / f"persistent_no_memmap_{kind}.b2nd" + data = np.arange(120_000, dtype=np.int64) + indexing = __import__("blosc2.indexing", fromlist=["_segment_row_count"]) + assert not hasattr(indexing, "_open_temp_memmap") + + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(12_000,), blocks=(2_000,)) + descriptor = arr.create_index(kind=kind) + + assert descriptor["ooc"] is True + meta = descriptor["light"] if kind == "light" else descriptor["reduced"] + assert meta["values_path"] is not None + + reopened = blosc2.open(path, mode="a") + expr = ((reopened >= 55_000) & (reopened < 55_010)).where(reopened) + np.testing.assert_array_equal(expr.compute()[:], data[(data >= 55_000) & (data < 55_010)]) + + +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_in_memory_chunk_local_ooc_builds_do_not_use_temp_memmap(kind): + data = np.arange(120_000, dtype=np.int64) + indexing = __import__("blosc2.indexing", fromlist=["_segment_row_count"]) + assert not hasattr(indexing, "_open_temp_memmap") + + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(2_000,)) + descriptor = arr.create_index(kind=kind) + + assert descriptor["ooc"] is True + expr = ((arr >= 55_000) & (arr < 55_010)).where(arr) + 
np.testing.assert_array_equal(expr.compute()[:], data[(data >= 55_000) & (data < 55_010)]) + + @pytest.mark.parametrize("kind", ["light", "medium"]) def test_chunk_local_index_descriptor_and_lookup_path(tmp_path, kind): path = tmp_path / f"chunk_local_{kind}.b2nd" From a9a7f156d736b95528a4c568b6a80c874cd16450 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 6 Apr 2026 14:09:50 +0200 Subject: [PATCH 32/68] Rework chunk-local index builds around intra-chunk sorting - add native intra-chunk sort and linear merge in indexing_ext - keep safe NumPy fallbacks for unsupported dtypes - simplify build path to a single intra-chunk implementation - use BLOSC2_INDEX_BUILD_THREADS to control build parallelism - document that BLOSC2_INDEX_BUILD_THREADS=1 disables parallel sorting --- doc/reference/ndarray.rst | 4 + src/blosc2/indexing.py | 220 ++++++++++- src/blosc2/indexing_ext.pyx | 694 +++++++++++++++++++++++++++++++++ src/blosc2/ndarray.py | 10 + tests/ndarray/test_indexing.py | 59 +++ 5 files changed, 981 insertions(+), 6 deletions(-) diff --git a/doc/reference/ndarray.rst b/doc/reference/ndarray.rst index 36a21408..846c54c0 100644 --- a/doc/reference/ndarray.rst +++ b/doc/reference/ndarray.rst @@ -40,6 +40,10 @@ In addition, all the functions from the :ref:`Lazy Functions ` s mutations, and ``compact_index`` to consolidate append-heavy ``full`` indexes explicitly. + Chunk-local index creation uses parallel intra-chunk sorting by default. + Set ``BLOSC2_INDEX_BUILD_THREADS`` to control the number of build threads. + Set ``BLOSC2_INDEX_BUILD_THREADS=1`` to disable parallel sorting. + .. automethod:: create_index .. automethod:: create_csindex .. 
automethod:: create_expr_index diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index b8903f3f..a356415a 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -826,6 +826,27 @@ def _segment_row_count(chunk_len: int, nav_segment_len: int) -> int: return max(1, math.ceil(chunk_len / nav_segment_len)) +def _chunk_offsets(size: int, chunk_len: int) -> np.ndarray: + nchunks = math.ceil(size / chunk_len) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + if nchunks == 0: + return offsets + offsets[1:] = np.minimum(np.arange(1, nchunks + 1, dtype=np.int64) * chunk_len, size) + return offsets + + +def _index_build_threads() -> int: + forced = os.getenv("BLOSC2_INDEX_BUILD_THREADS") + if forced is not None: + try: + forced_threads = int(forced) + except ValueError: + forced_threads = 1 + return max(1, forced_threads) + return max(1, int(getattr(blosc2, "nthreads", 1) or 1)) + + def _sidecar_block_len(sidecar: dict, fallback_block_len: int) -> int: path = sidecar.get("path") if path is None: @@ -949,6 +970,140 @@ def _build_chunk_sorted_payload_direct( return payload, aux, offsets, l1, l2 +def _intra_chunk_run_ranges(chunk_size: int, thread_count: int) -> list[tuple[int, int]]: + if chunk_size <= 0: + return [] + run_count = max(1, min(thread_count, chunk_size)) + boundaries = np.linspace(0, chunk_size, run_count + 1, dtype=np.int64) + return [(int(boundaries[idx]), int(boundaries[idx + 1])) for idx in range(run_count)] + + +def _sort_chunk_run( + chunk: np.ndarray, run_start: int, run_stop: int, position_dtype: np.dtype +) -> tuple[np.ndarray, np.ndarray]: + run = chunk[run_start:run_stop] + try: + return indexing_ext.intra_chunk_sort_run(run, run_start, position_dtype) + except TypeError: + order = np.argsort(run, kind="stable") + return run[order], (order + run_start).astype(position_dtype, copy=False) + + +def _merge_sorted_run_pair( + left_values: np.ndarray, + left_positions: np.ndarray, + right_values: np.ndarray, + 
right_positions: np.ndarray, + dtype: np.dtype, + position_dtype: np.dtype, +) -> tuple[np.ndarray, np.ndarray]: + try: + merged_values, merged_positions = indexing_ext.intra_chunk_merge_sorted_slices( + left_values, left_positions, right_values, right_positions, position_dtype + ) + except TypeError: + merged_values, merged_positions = _merge_sorted_slices( + left_values, left_positions, right_values, right_positions, dtype + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +def _sort_chunk_intra_chunk( + chunk: np.ndarray, + position_dtype: np.dtype, + *, + thread_count: int | None = None, +) -> tuple[np.ndarray, np.ndarray]: + chunk_size = chunk.shape[0] + if chunk_size == 0: + return np.empty(0, dtype=chunk.dtype), np.empty(0, dtype=position_dtype) + if thread_count is None: + thread_count = _index_build_threads() + thread_count = max(1, min(int(thread_count), chunk_size)) + if thread_count <= 1: + order = np.argsort(chunk, kind="stable") + return chunk[order], order.astype(position_dtype, copy=False) + + def sort_run(run_range: tuple[int, int]) -> tuple[np.ndarray, np.ndarray]: + return _sort_chunk_run(chunk, run_range[0], run_range[1], position_dtype) + + run_ranges = _intra_chunk_run_ranges(chunk_size, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) as executor: + runs = list(executor.map(sort_run, run_ranges)) + + while len(runs) > 1: + pair_specs = [(runs[idx], runs[idx + 1]) for idx in range(0, len(runs) - 1, 2)] + + def merge_pair( + pair_spec: tuple[tuple[np.ndarray, np.ndarray], tuple[np.ndarray, np.ndarray]], + ) -> tuple[np.ndarray, np.ndarray]: + (left_values, left_positions), (right_values, right_positions) = pair_spec + return _merge_sorted_run_pair( + left_values, left_positions, right_values, right_positions, chunk.dtype, position_dtype + ) + + if pair_specs: + merge_workers = min(thread_count, len(pair_specs)) + if merge_workers <= 1: + merged_runs = [merge_pair(pair_spec) for pair_spec in 
pair_specs] + else: + with ThreadPoolExecutor(max_workers=merge_workers) as executor: + merged_runs = list(executor.map(merge_pair, pair_specs)) + else: + merged_runs = [] + if len(runs) % 2 == 1: + merged_runs.append(runs[-1]) + runs = merged_runs + + return runs[0] + + +def _build_reduced_chunk_payloads_intra_chunk( + array: blosc2.NDArray, + target: dict, + dtype: np.dtype, + chunk_len: int, + nav_segment_len: int, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + size = int(array.shape[0]) + nchunks = math.ceil(size / chunk_len) + position_dtype = _position_dtype(chunk_len - 1) + sorted_values = np.empty(size, dtype=dtype) + positions = np.empty(size, dtype=position_dtype) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + cursor = 0 + thread_count = _index_build_threads() + + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk_sorted, local_positions = _sort_chunk_intra_chunk( + _slice_values_for_target(array, target, start, stop), position_dtype, thread_count=thread_count + ) + chunk_size = stop - start + next_cursor = cursor + chunk_size + sorted_values[cursor:next_cursor] = chunk_sorted + positions[cursor:next_cursor] = local_positions + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (chunk_sorted[0], chunk_sorted[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_sorted[seg_start], chunk_sorted[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + 
segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return sorted_values, positions, offsets, l1, l2 + + def _chunk_index_payload_storage( array: blosc2.NDArray, token: str, @@ -1137,12 +1292,11 @@ def _build_reduced_descriptor_ooc( for chunk_id in range(nchunks): start = chunk_id * chunk_len stop = min(start + chunk_len, size) - chunk = _slice_values_for_target(array, target, start, stop) - order = np.argsort(chunk, kind="stable") chunk_size = stop - start next_cursor = cursor + chunk_size - chunk_sorted = chunk[order] - local_positions = order.astype(position_dtype, copy=False) + chunk_sorted, local_positions = _sort_chunk_intra_chunk( + _slice_values_for_target(array, target, start, stop), position_dtype + ) if values_handle is not None: values_handle[cursor:next_cursor] = chunk_sorted if positions_handle is not None: @@ -1186,7 +1340,7 @@ def _build_reduced_descriptor_ooc( chunk_len = int(array.chunks[0]) nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), chunk_len, optlevel) - sorted_values, positions, offsets, l1, l2 = _build_chunk_sorted_payload_direct( + sorted_values, positions, offsets, l1, l2 = _build_reduced_chunk_payloads_intra_chunk( array, target, dtype, chunk_len, nav_segment_len ) reduced = _chunk_index_payload_storage( @@ -1381,6 +1535,60 @@ def _build_light_chunk_payloads( return sorted_values, bucket_positions, offsets, l1, l2 +def _build_light_chunk_payloads_intra_chunk( + array: blosc2.NDArray, + target: dict, + dtype: np.dtype, + chunk_len: int, + nav_segment_len: int, + value_lossy_bits: int, + bucket_len: int, + bucket_dtype: np.dtype, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + size = int(array.shape[0]) + nchunks = math.ceil(size / chunk_len) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty(size, dtype=dtype) + bucket_positions = np.empty(size, dtype=bucket_dtype) + l1 = np.empty(nchunks, 
dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + position_dtype = _position_dtype(chunk_len - 1) + cursor = 0 + thread_count = _index_build_threads() + + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk_sorted, local_positions = _sort_chunk_intra_chunk( + _slice_values_for_target(array, target, start, stop), position_dtype, thread_count=thread_count + ) + chunk_size = stop - start + next_cursor = cursor + chunk_size + stored_chunk_sorted = chunk_sorted + if value_lossy_bits > 0: + stored_chunk_sorted = _quantize_light_values_array(chunk_sorted, value_lossy_bits) + sorted_values[cursor:next_cursor] = stored_chunk_sorted + bucket_positions[cursor:next_cursor] = (local_positions // bucket_len).astype( + bucket_dtype, copy=False + ) + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (stored_chunk_sorted[0], stored_chunk_sorted[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_sorted[seg_start], chunk_sorted[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return sorted_values, bucket_positions, offsets, l1, l2 + + def _build_light_descriptor( array: blosc2.NDArray, token: str, @@ -1440,7 +1648,7 @@ def _build_light_descriptor_ooc( bucket_count = math.ceil(chunk_len / bucket_len) value_lossy_bits = _light_value_lossy_bits(dtype, optlevel) bucket_dtype = _position_dtype(bucket_count - 1) - sorted_values, bucket_positions, offsets, l1, l2 = _build_light_chunk_payloads( + sorted_values, 
bucket_positions, offsets, l1, l2 = _build_light_chunk_payloads_intra_chunk( array, target, dtype, chunk_len, nav_segment_len, value_lossy_bits, bucket_len, bucket_dtype ) if persistent: diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx index 087c7fa0..a2cc90ef 100644 --- a/src/blosc2/indexing_ext.pyx +++ b/src/blosc2/indexing_ext.pyx @@ -11,6 +11,700 @@ cimport numpy as np from libc.stdint cimport int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t +ctypedef fused sort_float_t: + np.float32_t + np.float64_t + + +ctypedef fused sort_ordered_t: + np.int8_t + np.int16_t + np.int32_t + np.int64_t + np.uint8_t + np.uint16_t + np.uint32_t + np.uint64_t + + +cdef inline bint _le_float_pair( + sort_float_t left_value, + uint64_t left_position, + sort_float_t right_value, + uint64_t right_position, +) noexcept nogil: + cdef bint left_nan = left_value != left_value + cdef bint right_nan = right_value != right_value + if left_nan: + if right_nan: + return left_position <= right_position + return False + if right_nan: + return True + if left_value < right_value: + return True + if left_value > right_value: + return False + return left_position <= right_position + + +cdef inline bint _le_ordered_pair( + sort_ordered_t left_value, + uint64_t left_position, + sort_ordered_t right_value, + uint64_t right_position, +) noexcept nogil: + if left_value < right_value: + return True + if left_value > right_value: + return False + return left_position <= right_position + + +cdef void _stable_mergesort_float( + sort_float_t[:] values, + uint64_t[:] positions, + sort_float_t[:] tmp_values, + uint64_t[:] tmp_positions, +) noexcept nogil: + cdef Py_ssize_t n = values.shape[0] + cdef Py_ssize_t width = 1 + cdef Py_ssize_t start + cdef Py_ssize_t mid + cdef Py_ssize_t stop + cdef Py_ssize_t left + cdef Py_ssize_t right + cdef Py_ssize_t out + cdef sort_float_t[:] src_values = values + cdef uint64_t[:] src_positions = positions + cdef sort_float_t[:] 
dst_values = tmp_values + cdef uint64_t[:] dst_positions = tmp_positions + cdef sort_float_t[:] swap_values + cdef uint64_t[:] swap_positions + cdef bint in_original = True + while width < n: + start = 0 + while start < n: + mid = start + width + if mid > n: + mid = n + stop = start + 2 * width + if stop > n: + stop = n + left = start + right = mid + out = start + while left < mid and right < stop: + if _le_float_pair( + src_values[left], src_positions[left], src_values[right], src_positions[right] + ): + dst_values[out] = src_values[left] + dst_positions[out] = src_positions[left] + left += 1 + else: + dst_values[out] = src_values[right] + dst_positions[out] = src_positions[right] + right += 1 + out += 1 + while left < mid: + dst_values[out] = src_values[left] + dst_positions[out] = src_positions[left] + left += 1 + out += 1 + while right < stop: + dst_values[out] = src_values[right] + dst_positions[out] = src_positions[right] + right += 1 + out += 1 + start = stop + swap_values = src_values + src_values = dst_values + dst_values = swap_values + swap_positions = src_positions + src_positions = dst_positions + dst_positions = swap_positions + in_original = not in_original + width <<= 1 + if not in_original: + for start in range(n): + values[start] = src_values[start] + positions[start] = src_positions[start] + + +cdef void _stable_mergesort_ordered( + sort_ordered_t[:] values, + uint64_t[:] positions, + sort_ordered_t[:] tmp_values, + uint64_t[:] tmp_positions, +) noexcept nogil: + cdef Py_ssize_t n = values.shape[0] + cdef Py_ssize_t width = 1 + cdef Py_ssize_t start + cdef Py_ssize_t mid + cdef Py_ssize_t stop + cdef Py_ssize_t left + cdef Py_ssize_t right + cdef Py_ssize_t out + cdef sort_ordered_t[:] src_values = values + cdef uint64_t[:] src_positions = positions + cdef sort_ordered_t[:] dst_values = tmp_values + cdef uint64_t[:] dst_positions = tmp_positions + cdef sort_ordered_t[:] swap_values + cdef uint64_t[:] swap_positions + cdef bint in_original = True 
+ while width < n: + start = 0 + while start < n: + mid = start + width + if mid > n: + mid = n + stop = start + 2 * width + if stop > n: + stop = n + left = start + right = mid + out = start + while left < mid and right < stop: + if _le_ordered_pair( + src_values[left], src_positions[left], src_values[right], src_positions[right] + ): + dst_values[out] = src_values[left] + dst_positions[out] = src_positions[left] + left += 1 + else: + dst_values[out] = src_values[right] + dst_positions[out] = src_positions[right] + right += 1 + out += 1 + while left < mid: + dst_values[out] = src_values[left] + dst_positions[out] = src_positions[left] + left += 1 + out += 1 + while right < stop: + dst_values[out] = src_values[right] + dst_positions[out] = src_positions[right] + right += 1 + out += 1 + start = stop + swap_values = src_values + src_values = dst_values + dst_values = swap_values + swap_positions = src_positions + src_positions = dst_positions + dst_positions = swap_positions + in_original = not in_original + width <<= 1 + if not in_original: + for start in range(n): + values[start] = src_values[start] + positions[start] = src_positions[start] + + +cdef tuple _intra_chunk_sort_run_float32(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.float32_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.float32_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.float32_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.float32_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_float(sorted_values_mv, 
positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_float64(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.float64_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.float64_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.float64_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.float64_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_float(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_int8(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.int8_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.int8_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.int8_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.int8_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, 
positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_int16(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.int16_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.int16_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.int16_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.int16_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_int32(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.int32_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.int32_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.int32_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.int32_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_int64(np.ndarray 
values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.int64_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.int64_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.int64_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.int64_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_uint8(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.uint8_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.uint8_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.uint8_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.uint8_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_uint16(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.uint16_t, ndim=1] 
sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.uint16_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.uint16_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.uint16_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_uint32(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.uint32_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.uint32_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.uint32_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.uint32_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_sort_run_uint64(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.uint64_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] 
positions = np.empty(sorted_values.shape[0], dtype=np.uint64) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_values = np.empty_like(sorted_values) + cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions) + cdef np.uint64_t[:] sorted_values_mv = sorted_values + cdef np.uint64_t[:] positions_mv = positions + cdef np.uint64_t[:] tmp_values_mv = tmp_values + cdef np.uint64_t[:] tmp_positions_mv = tmp_positions + cdef Py_ssize_t idx + with nogil: + for idx in range(sorted_values.shape[0]): + positions[idx] = (run_start + idx) + _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv) + return sorted_values, positions.astype(position_dtype, copy=False) + + +def intra_chunk_sort_run(np.ndarray values, Py_ssize_t run_start, object position_dtype): + cdef np.dtype dtype = values.dtype + cdef np.dtype pos_dtype = np.dtype(position_dtype) + if dtype == np.dtype(np.float32): + return _intra_chunk_sort_run_float32(values, run_start, pos_dtype) + if dtype == np.dtype(np.float64): + return _intra_chunk_sort_run_float64(values, run_start, pos_dtype) + if dtype == np.dtype(np.int8): + return _intra_chunk_sort_run_int8(values, run_start, pos_dtype) + if dtype == np.dtype(np.int16): + return _intra_chunk_sort_run_int16(values, run_start, pos_dtype) + if dtype == np.dtype(np.int32): + return _intra_chunk_sort_run_int32(values, run_start, pos_dtype) + if dtype == np.dtype(np.int64): + return _intra_chunk_sort_run_int64(values, run_start, pos_dtype) + if dtype == np.dtype(np.uint8) or dtype == np.dtype(np.bool_): + sorted_values, positions = _intra_chunk_sort_run_uint8(values.view(np.uint8), run_start, pos_dtype) + if dtype == np.dtype(np.bool_): + return sorted_values.view(np.bool_), positions + return sorted_values, positions + if dtype == np.dtype(np.uint16): + return _intra_chunk_sort_run_uint16(values, run_start, pos_dtype) + if dtype == np.dtype(np.uint32): + return _intra_chunk_sort_run_uint32(values, run_start, pos_dtype) + if 
dtype == np.dtype(np.uint64): + return _intra_chunk_sort_run_uint64(values, run_start, pos_dtype) + if dtype.kind in {"m", "M"}: + sorted_values, positions = _intra_chunk_sort_run_int64(values.view(np.int64), run_start, pos_dtype) + return sorted_values.view(dtype), positions + raise TypeError("unsupported dtype for intra_chunk_sort_run") + + +cdef void _linear_merge_float( + sort_float_t[:] left_values, + uint64_t[:] left_positions, + sort_float_t[:] right_values, + uint64_t[:] right_positions, + sort_float_t[:] out_values, + uint64_t[:] out_positions, +) noexcept nogil: + cdef Py_ssize_t left = 0 + cdef Py_ssize_t right = 0 + cdef Py_ssize_t out = 0 + cdef Py_ssize_t left_n = left_values.shape[0] + cdef Py_ssize_t right_n = right_values.shape[0] + while left < left_n and right < right_n: + if _le_float_pair(left_values[left], left_positions[left], right_values[right], right_positions[right]): + out_values[out] = left_values[left] + out_positions[out] = left_positions[left] + left += 1 + else: + out_values[out] = right_values[right] + out_positions[out] = right_positions[right] + right += 1 + out += 1 + while left < left_n: + out_values[out] = left_values[left] + out_positions[out] = left_positions[left] + left += 1 + out += 1 + while right < right_n: + out_values[out] = right_values[right] + out_positions[out] = right_positions[right] + right += 1 + out += 1 + + +cdef void _linear_merge_ordered( + sort_ordered_t[:] left_values, + uint64_t[:] left_positions, + sort_ordered_t[:] right_values, + uint64_t[:] right_positions, + sort_ordered_t[:] out_values, + uint64_t[:] out_positions, +) noexcept nogil: + cdef Py_ssize_t left = 0 + cdef Py_ssize_t right = 0 + cdef Py_ssize_t out = 0 + cdef Py_ssize_t left_n = left_values.shape[0] + cdef Py_ssize_t right_n = right_values.shape[0] + while left < left_n and right < right_n: + if _le_ordered_pair( + left_values[left], left_positions[left], right_values[right], right_positions[right] + ): + out_values[out] = 
left_values[left] + out_positions[out] = left_positions[left] + left += 1 + else: + out_values[out] = right_values[right] + out_positions[out] = right_positions[right] + right += 1 + out += 1 + while left < left_n: + out_values[out] = left_values[left] + out_positions[out] = left_positions[left] + left += 1 + out += 1 + while right < right_n: + out_values[out] = right_values[right] + out_positions[out] = right_positions[right] + right += 1 + out += 1 + + +cdef tuple _intra_chunk_merge_float32( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.float32_t, ndim=1] merged_values = np.empty(total, dtype=np.float32) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.float32_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.float32_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.float32_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_float( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_float64( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.float64_t, ndim=1] merged_values = np.empty(total, dtype=np.float64) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.float64_t[:] left_values_mv = left_values + cdef 
np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.float64_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.float64_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_float( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_int8( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.int8_t, ndim=1] merged_values = np.empty(total, dtype=np.int8) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.int8_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.int8_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.int8_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_ordered( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_int16( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.int16_t, ndim=1] merged_values = np.empty(total, dtype=np.int16) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = 
np.empty(total, dtype=np.uint64) + cdef np.int16_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.int16_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.int16_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_ordered( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_int32( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.int32_t, ndim=1] merged_values = np.empty(total, dtype=np.int32) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.int32_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.int32_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.int32_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_ordered( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_int64( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.int64_t, ndim=1] merged_values 
= np.empty(total, dtype=np.int64) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.int64_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.int64_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.int64_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_ordered( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_uint8( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.uint8_t, ndim=1] merged_values = np.empty(total, dtype=np.uint8) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.uint8_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.uint8_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.uint8_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_ordered( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_uint16( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = 
left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.uint16_t, ndim=1] merged_values = np.empty(total, dtype=np.uint16) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.uint16_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.uint16_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.uint16_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_ordered( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_uint32( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.uint32_t, ndim=1] merged_values = np.empty(total, dtype=np.uint32) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.uint32_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.uint32_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.uint32_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_ordered( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +cdef tuple _intra_chunk_merge_uint64( + np.ndarray left_values, np.ndarray left_positions, 
np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype +): + cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0] + cdef np.ndarray[np.uint64_t, ndim=1] merged_values = np.empty(total, dtype=np.uint64) + cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64) + cdef np.uint64_t[:] left_values_mv = left_values + cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64) + cdef np.uint64_t[:] right_values_mv = right_values + cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64) + cdef np.uint64_t[:] merged_values_mv = merged_values + cdef np.uint64_t[:] merged_positions_mv = merged_positions + with nogil: + _linear_merge_ordered( + left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +def intra_chunk_merge_sorted_slices( + np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, object position_dtype +): + cdef np.dtype dtype = left_values.dtype + cdef np.dtype pos_dtype = np.dtype(position_dtype) + if dtype != right_values.dtype: + raise TypeError("left_values and right_values must have the same dtype") + if dtype == np.dtype(np.float32): + return _intra_chunk_merge_float32(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.float64): + return _intra_chunk_merge_float64(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.int8): + return _intra_chunk_merge_int8(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.int16): + return _intra_chunk_merge_int16(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.int32): + return _intra_chunk_merge_int32(left_values, left_positions, 
right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.int64): + return _intra_chunk_merge_int64(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.uint8): + return _intra_chunk_merge_uint8(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.uint16): + return _intra_chunk_merge_uint16(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.uint32): + return _intra_chunk_merge_uint32(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.uint64): + return _intra_chunk_merge_uint64(left_values, left_positions, right_values, right_positions, pos_dtype) + if dtype == np.dtype(np.bool_): + merged_values, merged_positions = _intra_chunk_merge_uint8( + left_values.view(np.uint8), left_positions, right_values.view(np.uint8), right_positions, pos_dtype + ) + return merged_values.view(np.bool_), merged_positions + if dtype.kind in {"m", "M"}: + merged_values, merged_positions = _intra_chunk_merge_int64( + left_values.view(np.int64), left_positions, right_values.view(np.int64), right_positions, pos_dtype + ) + return merged_values.view(dtype), merged_positions + raise TypeError("unsupported dtype for intra_chunk_merge_sorted_slices") + + cdef inline Py_ssize_t _search_left_float32(np.float32_t[:] values, np.float32_t target) noexcept nogil: cdef Py_ssize_t lo = 0 cdef Py_ssize_t hi = values.shape[0] diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 12c01481..367cfd0f 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4760,6 +4760,11 @@ def create_index( The current indexing model supports one active index target per field. Append operations keep compatible indexes current, while general mutation and resize operations mark indexes as stale until rebuild. + + Chunk-local index creation uses parallel intra-chunk sorting by default. 
+ Set the ``BLOSC2_INDEX_BUILD_THREADS`` environment variable to control + the number of build threads. Setting + ``BLOSC2_INDEX_BUILD_THREADS=1`` disables parallel sorting. """ from . import indexing @@ -4824,6 +4829,11 @@ def create_expr_index( Expression indexes are matched by normalized expression identity. The current implementation supports one active index target per normalized expression key. + + Chunk-local index creation uses parallel intra-chunk sorting by default. + Set the ``BLOSC2_INDEX_BUILD_THREADS`` environment variable to control + the number of build threads. Setting + ``BLOSC2_INDEX_BUILD_THREADS=1`` disables parallel sorting. """ from . import indexing diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 6019784f..a1d2d025 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -392,6 +392,65 @@ def test_in_mem_override_disables_ooc_builder(kind): assert descriptor["ooc"] is False +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_chunk_local_ooc_intra_chunk_build_uses_thread_pool_when_strategy_forced(monkeypatch, kind): + data = np.arange(48_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(48_000,), blocks=(1_500,)) + indexing = __import__("blosc2.indexing", fromlist=["ThreadPoolExecutor"]) + observed_workers = [] + + class FakeExecutor: + def __init__(self, *, max_workers): + observed_workers.append(max_workers) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def map(self, fn, iterable): + return [fn(item) for item in iterable] + + monkeypatch.setenv("BLOSC2_INDEX_BUILD_THREADS", "2") + monkeypatch.setattr(indexing, "ThreadPoolExecutor", FakeExecutor) + + descriptor = arr.create_index(kind=kind) + + assert descriptor["ooc"] is True + assert observed_workers + assert observed_workers[0] == 2 + + +def test_intra_chunk_sort_run_matches_numpy_stable_order(): + indexing_ext = __import__("blosc2.indexing_ext", 
fromlist=["intra_chunk_sort_run"]) + values = np.array([4.0, np.nan, 2.0, 2.0, np.nan, 1.0, 4.0], dtype=np.float64) + + sorted_values, positions = indexing_ext.intra_chunk_sort_run(values, 0, np.dtype(np.uint16)) + + order = np.argsort(values, kind="stable") + np.testing.assert_array_equal(sorted_values, values[order]) + np.testing.assert_array_equal(positions, order.astype(np.uint16, copy=False)) + + +def test_intra_chunk_merge_sorted_slices_matches_lexsort_merge(): + indexing_ext = __import__("blosc2.indexing_ext", fromlist=["intra_chunk_merge_sorted_slices"]) + left_values = np.array([1.0, 2.0, 2.0, np.nan], dtype=np.float64) + left_positions = np.array([0, 2, 3, 6], dtype=np.uint16) + right_values = np.array([1.0, 2.0, 3.0, np.nan], dtype=np.float64) + right_positions = np.array([1, 4, 5, 7], dtype=np.uint16) + + merged_values, merged_positions = indexing_ext.intra_chunk_merge_sorted_slices( + left_values, left_positions, right_values, right_positions, np.dtype(np.uint16) + ) + + all_values = np.concatenate((left_values, right_values)) + all_positions = np.concatenate((left_positions, right_positions)) + order = np.lexsort((all_positions, all_values)) + np.testing.assert_array_equal(merged_values, all_values[order]) + np.testing.assert_array_equal(merged_positions, all_positions[order]) + + def test_mutation_marks_index_stale_and_rebuild_restores_it(): data = np.arange(50_000, dtype=np.int64) arr = blosc2.asarray(data, chunks=(5_000,), blocks=(1_000,)) From 5666fdbb1e0229e04328655bb779cf9ce0c5c838 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 6 Apr 2026 17:44:01 +0200 Subject: [PATCH 33/68] Stream benchmark array generation and rename random to permuted - build persistent benchmark arrays chunk by chunk - avoid materializing the full base array in memory - generate permuted ids directly without temp disk scratch - compute query bounds analytically instead of building ordered arrays - stream cold benchmark rows as each index kind finishes - simplify 
cold output to a single aligned table - keep warm timings in the final summary table - rename the low-memory distribution from random to permuted --- bench/ndarray/index_query_bench.py | 396 ++++++++++++++++++++--------- 1 file changed, 270 insertions(+), 126 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index c381f6c1..026c5ad6 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -8,6 +8,7 @@ from __future__ import annotations import argparse +import math import os import re import statistics @@ -23,11 +24,44 @@ SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) DEFAULT_REPEATS = 3 KINDS = ("ultralight", "light", "medium", "full") -DISTS = ("sorted", "block-shuffled", "random") +DISTS = ("sorted", "block-shuffled", "permuted") RNG_SEED = 0 DEFAULT_OPLEVEL = 5 FULL_QUERY_MODES = ("auto", "selective-ooc", "whole-load") +COLD_COLUMNS = [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ("index_pct", lambda result: f"{result['index_pct']:.4f}%"), + ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), +] + +WARM_COLUMNS = [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: 
f"{result['scan_ms']:.3f}"), + ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), + ( + "speedup", + lambda result: f"{result['warm_speedup']:.2f}x" if result["warm_speedup"] is not None else "-", + ), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ("index_pct", lambda result: f"{result['index_pct']:.4f}%"), + ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), +] + def dtype_token(dtype: np.dtype) -> str: return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") @@ -61,6 +95,65 @@ def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: raise ValueError(f"unsupported dtype for benchmark: {dtype}") +def ordered_id_slice(size: int, start: int, stop: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if stop <= start: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + values = np.zeros(stop - start, dtype=dtype) + true_start = max(start, size // 2) + if true_start < stop: + values[true_start - start :] = True + return values + + positions = np.arange(start, stop, dtype=np.int64) + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_at(size: int, index: int, dtype: np.dtype) -> object: + return ordered_id_slice(size, index, index + 1, dtype)[0].item() + + +def 
ordered_ids_from_positions(positions: np.ndarray, size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if positions.size == 0: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + return (positions >= (size // 2)).astype(dtype, copy=False) + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + def fill_ids(ids: np.ndarray, ordered_ids: np.ndarray, dist: str, rng: np.random.Generator, block_len: int) -> None: size = ids.shape[0] if dist == "sorted": @@ -106,30 +199,73 @@ def resolve_geometry(shape: tuple[int, ...], dtype: np.dtype, chunks: int | None return int(resolved_chunks[0]), int(resolved_blocks[0]) -def make_source_data(size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None) -> np.ndarray: - dtype = source_dtype(id_dtype) - data = np.zeros(size, dtype=dtype) - _, block_len = resolve_geometry((size,), dtype, chunks, blocks) - fill_ids(data["id"], make_ordered_ids(size, id_dtype), dist, np.random.default_rng(RNG_SEED), block_len) - return data - - -def build_array(data: np.ndarray, chunks: int | None, blocks: int | None) -> blosc2.NDArray: - kwargs = {} - if chunks is not None: - kwargs["chunks"] = (chunks,) - if blocks is not None: - kwargs["blocks"] = (blocks,) - return blosc2.asarray(data, **kwargs) +def _block_order(size: int, block_len: int) -> np.ndarray: + nblocks = (size + block_len - 1) // block_len + return 
np.random.default_rng(RNG_SEED).permutation(nblocks) -def build_persistent_array(data: np.ndarray, path: Path, chunks: int | None, blocks: int | None) -> blosc2.NDArray: +def _fill_block_shuffled_ids( + ids: np.ndarray, size: int, start: int, stop: int, block_len: int, order: np.ndarray +) -> None: + cursor = start + out_cursor = 0 + while cursor < stop: + dest_block = cursor // block_len + block_offset = cursor % block_len + src_block = int(order[dest_block]) + src_start = src_block * block_len + block_offset + take = min(stop - cursor, block_len - block_offset, size - src_start) + ids[out_cursor : out_cursor + take] = ordered_id_slice(size, src_start, src_start + take, ids.dtype) + cursor += take + out_cursor += take + + +def _permuted_position_params(size: int) -> tuple[int, int]: + if size <= 1: + return 1, 0 + rng = np.random.default_rng(RNG_SEED) + step = int(rng.integers(1, size)) + while math.gcd(step, size) != 1: + step += 1 + if step >= size: + step = 1 + offset = int(rng.integers(0, size)) + return step, offset + + +def _fill_permuted_ids(ids: np.ndarray, size: int, start: int, stop: int, step: int, offset: int) -> None: + positions = np.arange(start, stop, dtype=np.int64) + shuffled_positions = (positions * step + offset) % size + ids[:] = ordered_ids_from_positions(shuffled_positions, size, ids.dtype) + + +def build_persistent_array( + size: int, dist: str, id_dtype: np.dtype, path: Path, chunks: int | None, blocks: int | None +) -> blosc2.NDArray: + dtype = source_dtype(id_dtype) kwargs = {"urlpath": path, "mode": "w"} if chunks is not None: kwargs["chunks"] = (chunks,) if blocks is not None: kwargs["blocks"] = (blocks,) - return blosc2.asarray(data, **kwargs) + arr = blosc2.zeros((size,), dtype=dtype, **kwargs) + chunk_len = int(arr.chunks[0]) + block_len = int(arr.blocks[0]) + block_order = _block_order(size, block_len) if dist == "block-shuffled" else None + permuted_step, permuted_offset = _permuted_position_params(size) if dist == "permuted" else 
(1, 0) + for start in range(0, size, chunk_len): + stop = min(start + chunk_len, size) + chunk = np.zeros(stop - start, dtype=dtype) + if dist == "sorted": + chunk["id"] = ordered_id_slice(size, start, stop, id_dtype) + elif dist == "block-shuffled": + _fill_block_shuffled_ids(chunk["id"], size, start, stop, block_len, block_order) + elif dist == "permuted": + _fill_permuted_ids(chunk["id"], size, start, stop, permuted_step, permuted_offset) + else: + raise ValueError(f"unsupported distribution {dist!r}") + arr[start:stop] = chunk + return arr def base_array_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None) -> Path: @@ -219,37 +355,13 @@ def index_sizes(descriptor: dict) -> tuple[int, int]: return logical, disk -def _source_data_factory(size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None): - data = None - - def get_data() -> np.ndarray: - nonlocal data - if data is None: - data = make_source_data(size, dist, id_dtype, chunks, blocks) - return data - - return get_data - - -def _ordered_ids_factory(size: int, id_dtype: np.dtype): - ordered_ids = None - - def get_ordered_ids() -> np.ndarray: - nonlocal ordered_ids - if ordered_ids is None: - ordered_ids = make_ordered_ids(size, id_dtype) - return ordered_ids - - return get_ordered_ids - - -def _query_bounds(ordered_ids: np.ndarray, query_width: int) -> tuple[object, object]: - if ordered_ids.size == 0: +def _query_bounds(size: int, query_width: int, dtype: np.dtype) -> tuple[object, object]: + if size <= 0: raise ValueError("benchmark arrays must not be empty") - lo_idx = ordered_ids.size // 2 - hi_idx = min(ordered_ids.size - 1, lo_idx + max(query_width - 1, 0)) - return ordered_ids[lo_idx].item(), ordered_ids[hi_idx].item() + lo_idx = size // 2 + hi_idx = min(size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_id_at(size, lo_idx, dtype), ordered_id_at(size, hi_idx, dtype) def _literal(value: object, dtype: np.dtype) -> str: @@ 
-285,15 +397,25 @@ def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int, in_me return None -def _open_or_build_persistent_array(path: Path, get_data, chunks: int | None, blocks: int | None) -> blosc2.NDArray: +def _open_or_build_persistent_array( + path: Path, size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None +) -> blosc2.NDArray: if path.exists(): return blosc2.open(path, mode="a") blosc2.remove_urlpath(path) - return build_persistent_array(get_data(), path, chunks, blocks) + return build_persistent_array(size, dist, id_dtype, path, chunks, blocks) def _open_or_build_indexed_array( - path: Path, get_data, kind: str, optlevel: int, in_mem: bool, chunks: int | None, blocks: int | None + path: Path, + size: int, + dist: str, + id_dtype: np.dtype, + kind: str, + optlevel: int, + in_mem: bool, + chunks: int | None, + blocks: int | None, ) -> tuple[blosc2.NDArray, float]: if path.exists(): arr = blosc2.open(path, mode="a") @@ -303,7 +425,7 @@ def _open_or_build_indexed_array( arr.drop_index(field="id") blosc2.remove_urlpath(path) - arr = build_persistent_array(get_data(), path, chunks, blocks) + arr = build_persistent_array(size, dist, id_dtype, path, chunks, blocks) build_start = time.perf_counter() arr.create_index(field="id", kind=kind, optlevel=optlevel, in_mem=in_mem) return arr, time.perf_counter() - build_start @@ -320,11 +442,12 @@ def benchmark_size( full_query_mode: str, chunks: int | None, blocks: int | None, + cold_row_callback=None, ) -> list[dict]: - get_data = _source_data_factory(size, dist, id_dtype, chunks, blocks) - get_ordered_ids = _ordered_ids_factory(size, id_dtype) - arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist, id_dtype, chunks, blocks), get_data, chunks, blocks) - lo, hi = _query_bounds(get_ordered_ids(), query_width) + arr = _open_or_build_persistent_array( + base_array_path(size_dir, size, dist, id_dtype, chunks, blocks), size, dist, id_dtype, chunks, blocks + ) 
+ lo, hi = _query_bounds(size, query_width, id_dtype) condition_str = _condition_expr(lo, hi, id_dtype) condition = blosc2.lazyexpr(condition_str, arr.fields) expr = condition.where(arr) @@ -337,7 +460,9 @@ def benchmark_size( for kind in KINDS: idx_arr, build_time = _open_or_build_indexed_array( indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype, in_mem, chunks, blocks), - get_data, + size, + dist, + id_dtype, kind, optlevel, in_mem, @@ -352,33 +477,34 @@ def benchmark_size( descriptor = idx_arr.indexes[0] logical_index_bytes, disk_index_bytes = index_sizes(descriptor) - rows.append( - { - "size": size, - "dist": dist, - "kind": kind, - "optlevel": optlevel, - "in_mem": in_mem, - "query_rows": index_len, - "build_s": build_time, - "create_idx_ms": build_time * 1_000, - "scan_ms": scan_ms, - "cold_ms": cold_time * 1_000, - "cold_speedup": scan_ms / (cold_time * 1_000), - "warm_ms": None, - "warm_speedup": None, - "candidate_units": explanation["candidate_units"], - "total_units": explanation["total_units"], - "lookup_path": explanation.get("lookup_path"), - "full_query_mode": full_query_mode, - "logical_index_bytes": logical_index_bytes, - "disk_index_bytes": disk_index_bytes, - "index_pct": logical_index_bytes / base_bytes * 100, - "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, - "_arr": idx_arr, - "_cond": idx_cond, - } - ) + row = { + "size": size, + "dist": dist, + "kind": kind, + "optlevel": optlevel, + "in_mem": in_mem, + "query_rows": index_len, + "build_s": build_time, + "create_idx_ms": build_time * 1_000, + "scan_ms": scan_ms, + "cold_ms": cold_time * 1_000, + "cold_speedup": scan_ms / (cold_time * 1_000), + "warm_ms": None, + "warm_speedup": None, + "candidate_units": explanation["candidate_units"], + "total_units": explanation["total_units"], + "lookup_path": explanation.get("lookup_path"), + "full_query_mode": full_query_mode, + "logical_index_bytes": logical_index_bytes, + "disk_index_bytes": disk_index_bytes, + 
"index_pct": logical_index_bytes / base_bytes * 100, + "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, + "_arr": idx_arr, + "_cond": idx_cond, + } + rows.append(row) + if cold_row_callback is not None: + cold_row_callback(row) return rows @@ -559,6 +685,11 @@ def run_benchmarks( blocks: int | None, ) -> None: all_results = [] + cold_widths = progress_widths(COLD_COLUMNS, sizes, dists, id_dtype) + + def stream_cold_row(result: dict) -> None: + print_table_row(result, COLD_COLUMNS, cold_widths) + array_dtype = source_dtype(id_dtype) resolved_geometries = {resolve_geometry((size,), array_dtype, chunks, blocks) for size in sizes} if len(resolved_geometries) == 1: @@ -572,58 +703,30 @@ def run_benchmarks( f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}, " f"full_query_mode={full_query_mode}" ) + print() + print("Cold Query Table") + print_table_header(COLD_COLUMNS, cold_widths) for dist in dists: for size in sizes: size_results = benchmark_size( - size, size_dir, dist, query_width, optlevel, id_dtype, in_mem, full_query_mode, chunks, blocks + size, + size_dir, + dist, + query_width, + optlevel, + id_dtype, + in_mem, + full_query_mode, + chunks, + blocks, + stream_cold_row, ) all_results.extend(size_results) - - print() - print("Cold Query Table") - print_table( - all_results, - [ - ("rows", lambda result: f"{result['size']:,}"), - ("dist", lambda result: result["dist"]), - ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), - ("kind", lambda result: result["kind"]), - ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), - ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), - ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), - ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), - ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), - ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), - ("index_pct", lambda result: 
f"{result['index_pct']:.4f}%"), - ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), - ], - ) if repeats > 0: measure_warm_queries(all_results, repeats) print() print("Warm Query Table") - print_table( - all_results, - [ - ("rows", lambda result: f"{result['size']:,}"), - ("dist", lambda result: result["dist"]), - ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), - ("kind", lambda result: result["kind"]), - ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), - ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), - ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), - ( - "speedup", - lambda result: f"{result['warm_speedup']:.2f}x" - if result["warm_speedup"] is not None - else "-", - ), - ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), - ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), - ("index_pct", lambda result: f"{result['index_pct']:.4f}%"), - ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), - ], - ) + print_table(all_results, WARM_COLUMNS) def _format_row(cells: list[str], widths: list[int]) -> str: @@ -647,5 +750,46 @@ def print_table(results: list[dict], columns: list[tuple[str, callable]]) -> Non print(_format_row(row, widths)) +def print_table_header(columns: list[tuple[str, callable]], widths: list[int] | None = None) -> None: + headers = [header for header, _ in columns] + if widths is None: + widths = [len(header) for header in headers] + print(_format_row(headers, widths)) + print(_format_row(["-" * width for width in widths], widths)) + + +def print_table_row(result: dict, columns: list[tuple[str, callable]], widths: list[int] | None = None) -> None: + cells = [formatter(result) for _, formatter in columns] + if widths is None: + widths = [max(len(header), len(cell)) for (header, _), cell in zip(columns, cells, strict=True)] + print(_format_row(cells, widths)) + + +def progress_widths( + 
columns: list[tuple[str, callable]], sizes: tuple[int, ...], dists: tuple[str, ...], id_dtype: np.dtype +) -> list[int]: + max_size = max(sizes) + max_index_bytes = max_size * max(np.dtype(id_dtype).itemsize + 8, 16) + max_cells = { + "rows": f"{max_size:,}", + "dist": max(dists, key=len), + "builder": "ooc", + "kind": max(KINDS, key=len), + "create_idx_ms": "999999.999", + "scan_ms": "9999.999", + "cold_ms": "9999.999", + "warm_ms": "9999.999", + "speedup": "9999.99x", + "logical_bytes": f"{max_index_bytes:,}", + "disk_bytes": f"{max_index_bytes:,}", + "index_pct": "100.0000%", + "index_pct_disk": "100.0000%", + } + widths = [] + for header, _ in columns: + widths.append(max(len(header), len(max_cells.get(header, "")))) + return widths + + if __name__ == "__main__": main() From 5f24620a4b3bea20e736dfcbcdd383bd0798c463 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 6 Apr 2026 17:51:30 +0200 Subject: [PATCH 34/68] New --kind option for selecting the kind of the index --- bench/ndarray/index_query_bench.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 026c5ad6..8e16ac2b 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -24,6 +24,7 @@ SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) DEFAULT_REPEATS = 3 KINDS = ("ultralight", "light", "medium", "full") +DEFAULT_KIND = "light" DISTS = ("sorted", "block-shuffled", "permuted") RNG_SEED = 0 DEFAULT_OPLEVEL = 5 @@ -442,6 +443,7 @@ def benchmark_size( full_query_mode: str, chunks: int | None, blocks: int | None, + kinds: tuple[str, ...], cold_row_callback=None, ) -> list[dict]: arr = _open_or_build_persistent_array( @@ -457,7 +459,7 @@ def benchmark_size( scan_ms = benchmark_scan_once(expr)[0] * 1_000 rows = [] - for kind in KINDS: + for kind in kinds: idx_arr, build_time = _open_or_build_indexed_array( indexed_array_path(size_dir, size, dist, 
kind, optlevel, id_dtype, in_mem, chunks, blocks), size, @@ -608,6 +610,12 @@ def parse_args() -> argparse.Namespace: default="sorted", help="Distribution for the indexed field. Use 'all' to benchmark every distribution.", ) + parser.add_argument( + "--kind", + choices=(*KINDS, "all"), + default=DEFAULT_KIND, + help=f"Index kind to benchmark. Use 'all' to benchmark every kind. Default: {DEFAULT_KIND}.", + ) parser.add_argument( "--in-mem", action=argparse.BooleanOptionalAction, @@ -635,12 +643,14 @@ def main() -> None: raise SystemExit(f"--dtype only supports bool, integer, and floating-point dtypes; got {id_dtype}") sizes = (args.size,) if args.size is not None else SIZES dists = DISTS if args.dist == "all" else (args.dist,) + kinds = KINDS if args.kind == "all" else (args.kind,) if args.outdir is None: with tempfile.TemporaryDirectory() as tmpdir: run_benchmarks( sizes, dists, + kinds, Path(tmpdir), args.dist, args.query_width, @@ -657,6 +667,7 @@ def main() -> None: run_benchmarks( sizes, dists, + kinds, args.outdir, args.dist, args.query_width, @@ -673,6 +684,7 @@ def main() -> None: def run_benchmarks( sizes: tuple[int, ...], dists: tuple[str, ...], + kinds: tuple[str, ...], size_dir: Path, dist_label: str, query_width: int, @@ -685,7 +697,7 @@ def run_benchmarks( blocks: int | None, ) -> None: all_results = [] - cold_widths = progress_widths(COLD_COLUMNS, sizes, dists, id_dtype) + cold_widths = progress_widths(COLD_COLUMNS, sizes, dists, kinds, id_dtype) def stream_cold_row(result: dict) -> None: print_table_row(result, COLD_COLUMNS, cold_widths) @@ -719,6 +731,7 @@ def stream_cold_row(result: dict) -> None: full_query_mode, chunks, blocks, + kinds, stream_cold_row, ) all_results.extend(size_results) @@ -766,7 +779,11 @@ def print_table_row(result: dict, columns: list[tuple[str, callable]], widths: l def progress_widths( - columns: list[tuple[str, callable]], sizes: tuple[int, ...], dists: tuple[str, ...], id_dtype: np.dtype + columns: list[tuple[str, 
callable]], + sizes: tuple[int, ...], + dists: tuple[str, ...], + kinds: tuple[str, ...], + id_dtype: np.dtype, ) -> list[int]: max_size = max(sizes) max_index_bytes = max_size * max(np.dtype(id_dtype).itemsize + 8, 16) @@ -774,7 +791,7 @@ def progress_widths( "rows": f"{max_size:,}", "dist": max(dists, key=len), "builder": "ooc", - "kind": max(KINDS, key=len), + "kind": max(kinds, key=len), "create_idx_ms": "999999.999", "scan_ms": "9999.999", "cold_ms": "9999.999", From 8946c85bce5cef1e2520344fb7e302b5c5305213 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Mon, 6 Apr 2026 18:02:45 +0200 Subject: [PATCH 35/68] Better table formatting --- bench/ndarray/index_query_bench.py | 31 +++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 8e16ac2b..712d143b 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -697,10 +697,6 @@ def run_benchmarks( blocks: int | None, ) -> None: all_results = [] - cold_widths = progress_widths(COLD_COLUMNS, sizes, dists, kinds, id_dtype) - - def stream_cold_row(result: dict) -> None: - print_table_row(result, COLD_COLUMNS, cold_widths) array_dtype = source_dtype(id_dtype) resolved_geometries = {resolve_geometry((size,), array_dtype, chunks, blocks) for size in sizes} @@ -715,9 +711,6 @@ def stream_cold_row(result: dict) -> None: f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}, " f"full_query_mode={full_query_mode}" ) - print() - print("Cold Query Table") - print_table_header(COLD_COLUMNS, cold_widths) for dist in dists: for size in sizes: size_results = benchmark_size( @@ -732,14 +725,24 @@ def stream_cold_row(result: dict) -> None: chunks, blocks, kinds, - stream_cold_row, ) all_results.extend(size_results) + cold_widths = table_widths(all_results, COLD_COLUMNS) + print() + print("Cold Query Table") + print_table(all_results, COLD_COLUMNS, 
cold_widths) if repeats > 0: measure_warm_queries(all_results, repeats) + warm_widths = table_widths(all_results, WARM_COLUMNS) + shared_width_by_header = {} + for (header, _), width in zip(COLD_COLUMNS, cold_widths, strict=True): + shared_width_by_header[header] = width + for (header, _), width in zip(WARM_COLUMNS, warm_widths, strict=True): + shared_width_by_header[header] = max(shared_width_by_header.get(header, 0), width) + warm_widths = [shared_width_by_header[header] for header, _ in WARM_COLUMNS] print() print("Warm Query Table") - print_table(all_results, WARM_COLUMNS) + print_table(all_results, WARM_COLUMNS, warm_widths) def _format_row(cells: list[str], widths: list[int]) -> str: @@ -755,8 +758,9 @@ def _table_rows(results: list[dict], columns: list[tuple[str, callable]]) -> tup return headers, rows, widths -def print_table(results: list[dict], columns: list[tuple[str, callable]]) -> None: - headers, rows, widths = _table_rows(results, columns) +def print_table(results: list[dict], columns: list[tuple[str, callable]], widths: list[int] | None = None) -> None: + headers, rows, computed_widths = _table_rows(results, columns) + widths = computed_widths if widths is None else widths print(_format_row(headers, widths)) print(_format_row(["-" * width for width in widths], widths)) for row in rows: @@ -808,5 +812,10 @@ def progress_widths( return widths +def table_widths(results: list[dict], columns: list[tuple[str, callable]]) -> list[int]: + _, _, widths = _table_rows(results, columns) + return widths + + if __name__ == "__main__": main() From 7c8765f096115ceb6958e74e840da5404224e573 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 7 Apr 2026 09:27:28 +0200 Subject: [PATCH 36/68] Honor cparams in create_index() --- bench/ndarray/index_query_bench.py | 32 ++- src/blosc2/indexing.py | 302 ++++++++++++++++++++++------- src/blosc2/ndarray.py | 18 +- tests/ndarray/test_indexing.py | 50 ++++- 4 files changed, 324 insertions(+), 78 deletions(-) diff --git 
a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 712d143b..b808d79b 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -283,9 +283,15 @@ def indexed_array_path( in_mem: bool, chunks: int | None, blocks: int | None, + nthreads: int | None, ) -> Path: mode = "mem" if in_mem else "ooc" - return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.{kind}.opt{optlevel}.{mode}.b2nd" + thread_token = "threads-auto" if nthreads is None else f"threads-{nthreads}" + return ( + size_dir + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.{thread_token}" + f".{kind}.opt{optlevel}.{mode}.b2nd" + ) def benchmark_scan_once(expr) -> tuple[float, int]: @@ -417,6 +423,7 @@ def _open_or_build_indexed_array( in_mem: bool, chunks: int | None, blocks: int | None, + nthreads: int | None, ) -> tuple[blosc2.NDArray, float]: if path.exists(): arr = blosc2.open(path, mode="a") @@ -428,7 +435,10 @@ def _open_or_build_indexed_array( arr = build_persistent_array(size, dist, id_dtype, path, chunks, blocks) build_start = time.perf_counter() - arr.create_index(field="id", kind=kind, optlevel=optlevel, in_mem=in_mem) + kwargs = {"field": "id", "kind": kind, "optlevel": optlevel, "in_mem": in_mem} + if nthreads is not None: + kwargs["cparams"] = {"nthreads": nthreads} + arr.create_index(**kwargs) return arr, time.perf_counter() - build_start @@ -443,6 +453,7 @@ def benchmark_size( full_query_mode: str, chunks: int | None, blocks: int | None, + nthreads: int | None, kinds: tuple[str, ...], cold_row_callback=None, ) -> list[dict]: @@ -461,7 +472,7 @@ def benchmark_size( rows = [] for kind in kinds: idx_arr, build_time = _open_or_build_indexed_array( - indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype, in_mem, chunks, blocks), + indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype, in_mem, chunks, blocks, nthreads), size, dist, id_dtype, 
@@ -470,6 +481,7 @@ def benchmark_size( in_mem, chunks, blocks, + nthreads, ) idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) idx_expr = idx_cond.where(idx_arr) @@ -628,6 +640,12 @@ def parse_args() -> argparse.Namespace: default="auto", help="How full exact queries should run during the benchmark: auto, selective-ooc, or whole-load.", ) + parser.add_argument( + "--nthreads", + type=int, + default=None, + help="Number of threads to use for index creation. Default: use blosc2.nthreads.", + ) return parser.parse_args() @@ -641,6 +659,8 @@ def main() -> None: raise SystemExit(f"unsupported dtype {args.dtype!r}") from exc if id_dtype.kind not in {"b", "i", "u", "f"}: raise SystemExit(f"--dtype only supports bool, integer, and floating-point dtypes; got {id_dtype}") + if args.nthreads is not None and args.nthreads <= 0: + raise SystemExit("--nthreads must be a positive integer") sizes = (args.size,) if args.size is not None else SIZES dists = DISTS if args.dist == "all" else (args.dist,) kinds = KINDS if args.kind == "all" else (args.kind,) @@ -661,6 +681,7 @@ def main() -> None: args.full_query_mode, args.chunks, args.blocks, + args.nthreads, ) else: args.outdir.mkdir(parents=True, exist_ok=True) @@ -678,6 +699,7 @@ def main() -> None: args.full_query_mode, args.chunks, args.blocks, + args.nthreads, ) @@ -695,6 +717,7 @@ def run_benchmarks( full_query_mode: str, chunks: int | None, blocks: int | None, + nthreads: int | None, ) -> None: all_results = [] @@ -709,7 +732,7 @@ def run_benchmarks( print( f"{geometry_label}, repeats={repeats}, dist={dist_label}, " f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}, " - f"full_query_mode={full_query_mode}" + f"full_query_mode={full_query_mode}, index_nthreads={'auto' if nthreads is None else nthreads}" ) for dist in dists: for size in sizes: @@ -724,6 +747,7 @@ def run_benchmarks( full_query_mode, chunks, blocks, + nthreads, kinds, ) all_results.extend(size_results) diff --git 
a/src/blosc2/indexing.py b/src/blosc2/indexing.py index a356415a..7b648796 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -8,6 +8,7 @@ from __future__ import annotations import ast +import enum import hashlib import math import os @@ -15,7 +16,7 @@ import tempfile import weakref from concurrent.futures import ThreadPoolExecutor -from dataclasses import dataclass +from dataclasses import asdict, dataclass from pathlib import Path import numpy as np @@ -175,6 +176,8 @@ def _copy_nested_dict(value: dict | None) -> dict | None: def _copy_descriptor(descriptor: dict) -> dict: copied = descriptor.copy() + if descriptor.get("cparams") is not None: + copied["cparams"] = descriptor["cparams"].copy() copied["levels"] = _copy_nested_dict(descriptor.get("levels")) if descriptor.get("target") is not None: copied["target"] = descriptor["target"].copy() @@ -495,6 +498,7 @@ def _store_array_sidecar( *, chunks: tuple[int, ...] | None = None, blocks: tuple[int, ...] | None = None, + cparams: dict | None = None, ) -> dict: cache_key = _data_cache_key(array, token, category, name) if persistent: @@ -505,6 +509,8 @@ def _store_array_sidecar( kwargs["chunks"] = chunks if blocks is not None: kwargs["blocks"] = blocks + if cparams is not None: + kwargs["cparams"] = cparams blosc2.asarray(data, **kwargs) if isinstance(data, np.memmap): _DATA_CACHE.pop(cache_key, None) @@ -527,6 +533,7 @@ def _create_persistent_sidecar_handle( *, chunks: tuple[int, ...] | None = None, blocks: tuple[int, ...] 
| None = None, + cparams: dict | None = None, ) -> tuple[blosc2.NDArray | None, dict]: path = _sidecar_path(array, token, kind, f"{category}.{name}") blosc2.remove_urlpath(path) @@ -535,6 +542,8 @@ def _create_persistent_sidecar_handle( kwargs["chunks"] = chunks if blocks is not None: kwargs["blocks"] = blocks + if cparams is not None: + kwargs["cparams"] = cparams if length == 0: blosc2.asarray(np.empty(0, dtype=dtype), **kwargs) return None, {"path": path, "dtype": dtype.descr if dtype.fields else dtype.str} @@ -542,6 +551,34 @@ def _create_persistent_sidecar_handle( return handle, {"path": path, "dtype": dtype.descr if dtype.fields else dtype.str} +def _normalize_index_cparams(cparams) -> blosc2.CParams | None: + if cparams is None: + return None + if isinstance(cparams, blosc2.CParams): + return cparams + return blosc2.CParams(**cparams) + + +def _plain_index_cparams(cparams: dict | blosc2.CParams | None) -> dict | None: + if cparams is None: + return None + + def _plain_value(value): + if isinstance(value, enum.Enum): + return value.value + if isinstance(value, dict): + return {key: _plain_value(item) for key, item in value.items()} + if isinstance(value, list | tuple): + return type(value)(_plain_value(item) for item in value) + return value + + if isinstance(cparams, blosc2.CParams): + cparams = asdict(cparams) + else: + cparams = cparams.copy() + return {key: _plain_value(value) for key, value in cparams.items()} + + def _load_array_sidecar( array: blosc2.NDArray, token: str, category: str, name: str, path: str | None ) -> np.ndarray: @@ -564,12 +601,15 @@ def _build_levels_descriptor( dtype: np.dtype, values: np.ndarray, persistent: bool, + cparams: dict | None = None, ) -> dict: levels = {} for level in SEGMENT_LEVELS_BY_KIND[kind]: segment_len = _segment_len(array, level) summaries = _compute_segment_summaries(values, dtype, segment_len) - sidecar = _store_array_sidecar(array, token, kind, "summary", level, summaries, persistent) + sidecar = 
_store_array_sidecar( + array, token, kind, "summary", level, summaries, persistent, cparams=cparams + ) levels[level] = { "segment_len": segment_len, "nsegments": len(summaries), @@ -586,6 +626,7 @@ def _build_levels_descriptor_ooc( kind: str, dtype: np.dtype, persistent: bool, + cparams: dict | None = None, ) -> dict: levels = {} size = int(array.shape[0]) @@ -598,7 +639,9 @@ def _build_levels_descriptor_ooc( start = idx * segment_len stop = min(start + segment_len, size) summaries[idx] = _segment_summary(_slice_values_for_target(array, target, start, stop), dtype) - sidecar = _store_array_sidecar(array, token, kind, "summary", level, summaries, persistent) + sidecar = _store_array_sidecar( + array, token, kind, "summary", level, summaries, persistent, cparams=cparams + ) levels[level] = { "segment_len": segment_len, "nsegments": len(summaries), @@ -624,14 +667,15 @@ def _rebuild_full_navigation_sidecars( full: dict, sorted_values: np.ndarray, persistent: bool, + cparams: dict | None = None, ) -> None: chunk_len, block_len = _sidecar_storage_geometry( full.get("values_path"), int(array.chunks[0]), int(array.blocks[0]) ) l1 = _compute_sorted_boundaries(sorted_values, np.dtype(sorted_values.dtype), chunk_len) l2 = _compute_sorted_boundaries(sorted_values, np.dtype(sorted_values.dtype), block_len) - l1_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l1", l1, persistent) - l2_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l2", l2, persistent) + l1_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l1", l1, persistent, cparams=cparams) + l2_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l2", l2, persistent, cparams=cparams) full["l1_path"] = l1_sidecar["path"] full["l2_path"] = l2_sidecar["path"] full["sidecar_chunk_len"] = int(chunk_len) @@ -647,12 +691,13 @@ def _rebuild_full_navigation_sidecars_from_path( dtype: np.dtype, length: int, persistent: bool, + cparams: dict | None = None, ) -> None: 
chunk_len, block_len = _sidecar_storage_geometry(values_path, int(array.chunks[0]), int(array.blocks[0])) l1 = _compute_sorted_boundaries_from_sidecar(values_path, dtype, length, chunk_len) l2 = _compute_sorted_boundaries_from_sidecar(values_path, dtype, length, block_len) - l1_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l1", l1, persistent) - l2_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l2", l2, persistent) + l1_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l1", l1, persistent, cparams=cparams) + l2_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l2", l2, persistent, cparams=cparams) full["l1_path"] = l1_sidecar["path"] full["l2_path"] = l2_sidecar["path"] full["sidecar_chunk_len"] = int(chunk_len) @@ -668,12 +713,14 @@ def _stream_copy_sidecar_array( dtype: np.dtype, chunks: tuple[int, ...], blocks: tuple[int, ...], + cparams: dict | None = None, ) -> None: source = blosc2.open(str(source_path), mmap_mode="r") blosc2.remove_urlpath(str(dest_path)) - dest = blosc2.empty( - (length,), dtype=dtype, chunks=chunks, blocks=blocks, urlpath=str(dest_path), mode="w" - ) + kwargs = {"chunks": chunks, "blocks": blocks, "urlpath": str(dest_path), "mode": "w"} + if cparams is not None: + kwargs["cparams"] = cparams + dest = blosc2.empty((length,), dtype=dtype, **kwargs) chunk_len = int(dest.chunks[0]) for start in range(0, length, chunk_len): stop = min(start + chunk_len, length) @@ -692,6 +739,7 @@ def _stream_copy_temp_run_to_full_sidecars( dtype: np.dtype, persistent: bool, tracker: TempRunTracker | None = None, + cparams: dict | None = None, ) -> None: if not persistent: raise ValueError("temp-run streaming only supports persistent runs") @@ -701,7 +749,13 @@ def _stream_copy_temp_run_to_full_sidecars( _remove_sidecar_path(values_path) _remove_sidecar_path(positions_path) _stream_copy_sidecar_array( - run.values_path, values_path, run.length, dtype, (int(array.chunks[0]),), (int(array.blocks[0]),) 
+ run.values_path, + values_path, + run.length, + dtype, + (int(array.chunks[0]),), + (int(array.blocks[0]),), + cparams, ) _stream_copy_sidecar_array( run.positions_path, @@ -710,6 +764,7 @@ def _stream_copy_temp_run_to_full_sidecars( np.dtype(np.int64), (int(array.chunks[0]),), (int(array.blocks[0]),), + cparams, ) _tracker_register_delete(tracker, run.values_path, run.positions_path) run.values_path.unlink(missing_ok=True) @@ -719,7 +774,7 @@ def _stream_copy_temp_run_to_full_sidecars( full["runs"] = [] full["next_run_id"] = 0 _rebuild_full_navigation_sidecars_from_path( - array, token, kind, full, values_path, dtype, run.length, persistent + array, token, kind, full, values_path, dtype, run.length, persistent, cparams ) @@ -729,19 +784,24 @@ def _build_full_descriptor( kind: str, values: np.ndarray, persistent: bool, + cparams: dict | None = None, ) -> dict: order = np.argsort(values, kind="stable") positions = order.astype(np.int64, copy=False) sorted_values = values[order] - values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) + values_sidecar = _store_array_sidecar( + array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams + ) + positions_sidecar = _store_array_sidecar( + array, token, kind, "full", "positions", positions, persistent, cparams=cparams + ) full = { "values_path": values_sidecar["path"], "positions_path": positions_sidecar["path"], "runs": [], "next_run_id": 0, } - _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent, cparams) return full @@ -794,11 +854,12 @@ def _build_reduced_descriptor( values: np.ndarray, optlevel: int, persistent: bool, + cparams: dict | None = None, ) -> dict: chunk_len = int(array.chunks[0]) nav_segment_len, 
nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), chunk_len, optlevel) sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload( - values, chunk_len, nav_segment_len + values, chunk_len, nav_segment_len, cparams ) l1 = _compute_sorted_boundaries(sorted_values, np.dtype(values.dtype), chunk_len) reduced = _chunk_index_payload_storage( @@ -816,6 +877,7 @@ def _build_reduced_descriptor( persistent, chunk_len, nav_segment_len, + cparams, ) reduced["position_dtype"] = positions.dtype.str reduced["nav_segment_divisor"] = nav_segment_divisor @@ -836,7 +898,7 @@ def _chunk_offsets(size: int, chunk_len: int) -> np.ndarray: return offsets -def _index_build_threads() -> int: +def _index_build_threads(cparams: dict | blosc2.CParams | None = None) -> int: forced = os.getenv("BLOSC2_INDEX_BUILD_THREADS") if forced is not None: try: @@ -844,6 +906,16 @@ def _index_build_threads() -> int: except ValueError: forced_threads = 1 return max(1, forced_threads) + if cparams is not None: + nthreads = cparams.nthreads if isinstance(cparams, blosc2.CParams) else cparams.get("nthreads") + else: + nthreads = None + if nthreads is not None: + try: + cparams_threads = int(nthreads) + except (TypeError, ValueError): + cparams_threads = 1 + return max(1, cparams_threads) return max(1, int(getattr(blosc2, "nthreads", 1) or 1)) @@ -875,6 +947,7 @@ def _build_chunk_sorted_payload( values: np.ndarray, chunk_len: int, nav_segment_len: int, + cparams: dict | None = None, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.dtype]: size = values.shape[0] nchunks = math.ceil(size / chunk_len) @@ -888,16 +961,18 @@ def _build_chunk_sorted_payload( l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(values.dtype)) cursor = 0 + thread_count = _index_build_threads(cparams) for chunk_id in range(nchunks): start = chunk_id * chunk_len stop = min(start + chunk_len, size) chunk = values[start:stop] - order = np.argsort(chunk, kind="stable") chunk_size = 
stop - start next_cursor = cursor + chunk_size - chunk_sorted = chunk[order] + chunk_sorted, chunk_positions = _sort_chunk_intra_chunk( + chunk, position_dtype, thread_count=thread_count + ) sorted_values[cursor:next_cursor] = chunk_sorted - positions[cursor:next_cursor] = order.astype(position_dtype, copy=False) + positions[cursor:next_cursor] = chunk_positions offsets[chunk_id + 1] = next_cursor l1[chunk_id] = (chunk_sorted[0], chunk_sorted[-1]) @@ -1064,6 +1139,7 @@ def _build_reduced_chunk_payloads_intra_chunk( dtype: np.dtype, chunk_len: int, nav_segment_len: int, + cparams: dict | None = None, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: size = int(array.shape[0]) nchunks = math.ceil(size / chunk_len) @@ -1076,7 +1152,7 @@ def _build_reduced_chunk_payloads_intra_chunk( nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) cursor = 0 - thread_count = _index_build_threads() + thread_count = _index_build_threads(cparams) for chunk_id in range(nchunks): start = chunk_id * chunk_len @@ -1119,6 +1195,7 @@ def _chunk_index_payload_storage( persistent: bool, chunk_len: int, nav_segment_len: int, + cparams: dict | None = None, ) -> dict: nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) payload_sidecar = _store_array_sidecar( @@ -1131,6 +1208,7 @@ def _chunk_index_payload_storage( persistent, chunks=(chunk_len,), blocks=(nav_segment_len,), + cparams=cparams, ) aux_sidecar = _store_array_sidecar( array, @@ -1142,9 +1220,14 @@ def _chunk_index_payload_storage( persistent, chunks=(chunk_len,), blocks=(nav_segment_len,), + cparams=cparams, + ) + offsets_sidecar = _store_array_sidecar( + array, token, kind, category, "offsets", offsets, persistent, cparams=cparams + ) + l1_sidecar = _store_array_sidecar( + array, token, kind, f"{category}_nav", "l1", l1, persistent, cparams=cparams ) - offsets_sidecar = _store_array_sidecar(array, token, 
kind, category, "offsets", offsets, persistent) - l1_sidecar = _store_array_sidecar(array, token, kind, f"{category}_nav", "l1", l1, persistent) l2_sidecar = _store_array_sidecar( array, token, @@ -1155,6 +1238,7 @@ def _chunk_index_payload_storage( persistent, chunks=(nsegments_per_chunk,), blocks=(min(nsegments_per_chunk, max(1, nsegments_per_chunk)),), + cparams=cparams, ) return { "layout": "chunk-local-v1", @@ -1181,6 +1265,7 @@ def _prepare_chunk_index_payload_sidecars( size: int, chunk_len: int, nav_segment_len: int, + cparams: dict | None = None, ) -> tuple[blosc2.NDArray | None, dict, blosc2.NDArray | None, dict]: payload_handle, payload_sidecar = _create_persistent_sidecar_handle( array, @@ -1192,6 +1277,7 @@ def _prepare_chunk_index_payload_sidecars( payload_dtype, chunks=(chunk_len,), blocks=(nav_segment_len,), + cparams=cparams, ) aux_handle, aux_sidecar = _create_persistent_sidecar_handle( array, @@ -1203,6 +1289,7 @@ def _prepare_chunk_index_payload_sidecars( aux_dtype, chunks=(chunk_len,), blocks=(nav_segment_len,), + cparams=cparams, ) return payload_handle, payload_sidecar, aux_handle, aux_sidecar @@ -1220,10 +1307,13 @@ def _finalize_chunk_index_payload_storage( aux_sidecar: dict, chunk_len: int, nav_segment_len: int, + cparams: dict | None = None, ) -> dict: nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) - offsets_sidecar = _store_array_sidecar(array, token, kind, category, "offsets", offsets, True) - l1_sidecar = _store_array_sidecar(array, token, kind, f"{category}_nav", "l1", l1, True) + offsets_sidecar = _store_array_sidecar( + array, token, kind, category, "offsets", offsets, True, cparams=cparams + ) + l1_sidecar = _store_array_sidecar(array, token, kind, f"{category}_nav", "l1", l1, True, cparams=cparams) l2_sidecar = _store_array_sidecar( array, token, @@ -1234,6 +1324,7 @@ def _finalize_chunk_index_payload_storage( True, chunks=(nsegments_per_chunk,), blocks=(min(nsegments_per_chunk, max(1, nsegments_per_chunk)),), 
+ cparams=cparams, ) return { "layout": "chunk-local-v1", @@ -1256,6 +1347,7 @@ def _build_reduced_descriptor_ooc( dtype: np.dtype, optlevel: int, persistent: bool, + cparams: dict | None = None, ) -> dict: if persistent: size = int(array.shape[0]) @@ -1286,6 +1378,7 @@ def _build_reduced_descriptor_ooc( size, chunk_len, nav_segment_len, + cparams, ) ) cursor = 0 @@ -1327,6 +1420,7 @@ def _build_reduced_descriptor_ooc( positions_sidecar, chunk_len, nav_segment_len, + cparams, ) except Exception: if values_sidecar is not None: @@ -1341,7 +1435,7 @@ def _build_reduced_descriptor_ooc( chunk_len = int(array.chunks[0]) nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), chunk_len, optlevel) sorted_values, positions, offsets, l1, l2 = _build_reduced_chunk_payloads_intra_chunk( - array, target, dtype, chunk_len, nav_segment_len + array, target, dtype, chunk_len, nav_segment_len, cparams ) reduced = _chunk_index_payload_storage( array, @@ -1358,6 +1452,7 @@ def _build_reduced_descriptor_ooc( persistent, chunk_len, nav_segment_len, + cparams, ) reduced["position_dtype"] = positions.dtype.str reduced["nav_segment_divisor"] = nav_segment_divisor @@ -1544,6 +1639,7 @@ def _build_light_chunk_payloads_intra_chunk( value_lossy_bits: int, bucket_len: int, bucket_dtype: np.dtype, + cparams: dict | None = None, ) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: size = int(array.shape[0]) nchunks = math.ceil(size / chunk_len) @@ -1556,7 +1652,7 @@ def _build_light_chunk_payloads_intra_chunk( l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) position_dtype = _position_dtype(chunk_len - 1) cursor = 0 - thread_count = _index_build_threads() + thread_count = _index_build_threads(cparams) for chunk_id in range(nchunks): start = chunk_id * chunk_len @@ -1596,6 +1692,7 @@ def _build_light_descriptor( values: np.ndarray, optlevel: int, persistent: bool, + cparams: dict | None = None, ) -> dict: chunk_len = 
int(array.chunks[0]) nav_segment_len = int(array.blocks[0]) @@ -1603,7 +1700,7 @@ def _build_light_descriptor( bucket_count = math.ceil(chunk_len / bucket_len) value_lossy_bits = _light_value_lossy_bits(values.dtype, optlevel) sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload( - values, chunk_len, nav_segment_len + values, chunk_len, nav_segment_len, cparams ) if value_lossy_bits > 0: sorted_values = _quantize_light_values_array(sorted_values, value_lossy_bits) @@ -1625,6 +1722,7 @@ def _build_light_descriptor( persistent, chunk_len, nav_segment_len, + cparams, ) light["bucket_count"] = bucket_count light["bucket_len"] = bucket_len @@ -1641,6 +1739,7 @@ def _build_light_descriptor_ooc( dtype: np.dtype, optlevel: int, persistent: bool, + cparams: dict | None = None, ) -> dict: chunk_len = int(array.chunks[0]) nav_segment_len = int(array.blocks[0]) @@ -1649,7 +1748,15 @@ def _build_light_descriptor_ooc( value_lossy_bits = _light_value_lossy_bits(dtype, optlevel) bucket_dtype = _position_dtype(bucket_count - 1) sorted_values, bucket_positions, offsets, l1, l2 = _build_light_chunk_payloads_intra_chunk( - array, target, dtype, chunk_len, nav_segment_len, value_lossy_bits, bucket_len, bucket_dtype + array, + target, + dtype, + chunk_len, + nav_segment_len, + value_lossy_bits, + bucket_len, + bucket_dtype, + cparams, ) if persistent: values_handle = bucket_handle = None @@ -1668,6 +1775,7 @@ def _build_light_descriptor_ooc( len(sorted_values), chunk_len, nav_segment_len, + cparams, ) ) if values_handle is not None: @@ -1688,6 +1796,7 @@ def _build_light_descriptor_ooc( bucket_sidecar, chunk_len, nav_segment_len, + cparams, ) except Exception: if values_sidecar is not None: @@ -1716,6 +1825,7 @@ def _build_light_descriptor_ooc( persistent, chunk_len, nav_segment_len, + cparams, ) light["bucket_count"] = bucket_count light["bucket_len"] = bucket_len @@ -1819,9 +1929,12 @@ def _tracker_register_delete(tracker: TempRunTracker | None, *paths: Path) -> No 
tracker.current_disk_bytes = max(0, tracker.current_disk_bytes - delta) -def _create_blosc2_temp_array(path: Path, length: int, dtype: np.dtype, buffer_items: int): +def _create_blosc2_temp_array( + path: Path, length: int, dtype: np.dtype, buffer_items: int, cparams: dict | None = None +): chunks, blocks = _temp_run_storage_geometry(length, dtype, buffer_items) - cparams = blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=1) + if cparams is None: + cparams = blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=1) return blosc2.empty( (length,), dtype=dtype, @@ -1858,12 +1971,15 @@ def _materialize_sorted_run( workdir: Path, prefix: str, tracker: TempRunTracker | None = None, + cparams: dict | None = None, ) -> SortedRun: values_path = workdir / f"{prefix}.values.b2nd" positions_path = workdir / f"{prefix}.positions.b2nd" - run_values = _create_blosc2_temp_array(values_path, length, value_dtype, FULL_OOC_MERGE_BUFFER_ITEMS) + run_values = _create_blosc2_temp_array( + values_path, length, value_dtype, FULL_OOC_MERGE_BUFFER_ITEMS, cparams + ) run_positions = _create_blosc2_temp_array( - positions_path, length, np.dtype(np.int64), FULL_OOC_MERGE_BUFFER_ITEMS + positions_path, length, np.dtype(np.int64), FULL_OOC_MERGE_BUFFER_ITEMS, cparams ) run_values[:] = values run_positions[:] = positions @@ -1879,10 +1995,11 @@ def _copy_sidecar_to_temp_run( workdir: Path, prefix: str, tracker: TempRunTracker | None = None, + cparams: dict | None = None, ) -> Path: out_path = workdir / f"{prefix}.b2nd" sidecar = blosc2.open(path, mmap_mode="r") - output = _create_blosc2_temp_array(out_path, length, dtype, FULL_OOC_MERGE_BUFFER_ITEMS) + output = _create_blosc2_temp_array(out_path, length, dtype, FULL_OOC_MERGE_BUFFER_ITEMS, cparams) chunk_len = int(sidecar.chunks[0]) for chunk_id, start in enumerate(range(0, length, chunk_len)): stop = min(start + chunk_len, length) @@ -1919,6 +2036,7 @@ def _merge_run_pair( merge_id: int, buffer_items: int, tracker: TempRunTracker | None = None, + 
cparams: dict | None = None, ) -> SortedRun: left_values_mm = blosc2.open(str(left.values_path), mmap_mode="r") left_positions_mm = blosc2.open(str(left.positions_path), mmap_mode="r") @@ -1927,9 +2045,11 @@ def _merge_run_pair( out_values_path = workdir / f"full_merge_values_{merge_id}.b2nd" out_positions_path = workdir / f"full_merge_positions_{merge_id}.b2nd" - out_values = _create_blosc2_temp_array(out_values_path, left.length + right.length, dtype, buffer_items) + out_values = _create_blosc2_temp_array( + out_values_path, left.length + right.length, dtype, buffer_items, cparams + ) out_positions = _create_blosc2_temp_array( - out_positions_path, left.length + right.length, np.dtype(np.int64), buffer_items + out_positions_path, left.length + right.length, np.dtype(np.int64), buffer_items, cparams ) left_cursor = 0 @@ -2016,6 +2136,7 @@ def _build_full_descriptor_ooc( dtype: np.dtype, persistent: bool, workdir: Path, + cparams: dict | None = None, ) -> dict: size = int(array.shape[0]) tracker = TempRunTracker() @@ -2023,10 +2144,10 @@ def _build_full_descriptor_ooc( sorted_values = np.empty(0, dtype=dtype) positions = np.empty(0, dtype=np.int64) values_sidecar = _store_array_sidecar( - array, token, kind, "full", "values", sorted_values, persistent + array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams ) positions_sidecar = _store_array_sidecar( - array, token, kind, "full", "positions", positions, persistent + array, token, kind, "full", "positions", positions, persistent, cparams=cparams ) full = { "values_path": values_sidecar["path"], @@ -2034,7 +2155,7 @@ def _build_full_descriptor_ooc( "runs": [], "next_run_id": 0, } - _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent, cparams) return full run_items = max(int(array.chunks[0]), min(size, FULL_OOC_RUN_ITEMS)) runs = [] @@ -2054,6 +2175,7 @@ def 
_build_full_descriptor_ooc( workdir, f"full_run_{run_id}", tracker, + cparams, ) ) @@ -2073,6 +2195,7 @@ def _build_full_descriptor_ooc( merge_id, FULL_OOC_MERGE_BUFFER_ITEMS, tracker, + cparams, ) ) merge_id += 1 @@ -2090,20 +2213,20 @@ def _build_full_descriptor_ooc( } if persistent: _stream_copy_temp_run_to_full_sidecars( - array, token, kind, full, final_run, dtype, persistent, tracker + array, token, kind, full, final_run, dtype, persistent, tracker, cparams ) else: sorted_values = blosc2.open(str(final_run.values_path), mmap_mode="r")[:] positions = blosc2.open(str(final_run.positions_path), mmap_mode="r")[:] values_sidecar = _store_array_sidecar( - array, token, kind, "full", "values", sorted_values, persistent + array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams ) positions_sidecar = _store_array_sidecar( - array, token, kind, "full", "positions", positions, persistent + array, token, kind, "full", "positions", positions, persistent, cparams=cparams ) full["values_path"] = values_sidecar["path"] full["positions_path"] = positions_sidecar["path"] - _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent, cparams) del sorted_values, positions _tracker_register_delete(tracker, final_run.values_path, final_run.positions_path) final_run.values_path.unlink(missing_ok=True) @@ -2126,6 +2249,7 @@ def _build_descriptor( light: dict | None, reduced: dict | None, full: dict | None, + cparams: dict | None = None, ) -> dict: return { "name": name @@ -2148,6 +2272,7 @@ def _build_descriptor( "light": light, "reduced": reduced, "full": full, + "cparams": _plain_index_cparams(cparams), } @@ -2162,6 +2287,7 @@ def create_index( name: str | None = None, **kwargs, ) -> dict: + cparams = _normalize_index_cparams(kwargs.pop("cparams", None)) del kwargs dtype = _validate_index_target(array, field) target = 
_field_target_descriptor(field) @@ -2175,14 +2301,14 @@ def create_index( use_ooc = _resolve_ooc_mode(kind, in_mem) if use_ooc and kind in {"light", "medium", "full"}: - levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent) + levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent, cparams) light = ( - _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent) + _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, cparams) if kind == "light" else None ) reduced = ( - _build_reduced_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent) + _build_reduced_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, cparams) if kind == "medium" else None ) @@ -2190,7 +2316,7 @@ def create_index( if kind == "full": with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: full = _build_full_descriptor_ooc( - array, target, token, kind, dtype, persistent, Path(tmpdir) + array, target, token, kind, dtype, persistent, Path(tmpdir), cparams ) descriptor = _build_descriptor( array, @@ -2207,21 +2333,26 @@ def create_index( light, reduced, full, + cparams, ) else: values = _values_for_target(array, target) - levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent) + levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent, cparams) light = ( - _build_light_descriptor(array, token, kind, values, optlevel, persistent) + _build_light_descriptor(array, token, kind, values, optlevel, persistent, cparams) if kind == "light" else None ) reduced = ( - _build_reduced_descriptor(array, token, kind, values, optlevel, persistent) + _build_reduced_descriptor(array, token, kind, values, optlevel, persistent, cparams) if kind == "medium" else None ) - full = _build_full_descriptor(array, token, kind, values, persistent) if kind == "full" else None + full = ( + 
_build_full_descriptor(array, token, kind, values, persistent, cparams) + if kind == "full" + else None + ) descriptor = _build_descriptor( array, target, @@ -2237,6 +2368,7 @@ def create_index( light, reduced, full, + cparams, ) store = _load_store(array) @@ -2258,6 +2390,7 @@ def create_expr_index( name: str | None = None, **kwargs, ) -> dict: + cparams = _normalize_index_cparams(kwargs.pop("cparams", None)) del kwargs if operands is None: operands = array.fields if array.dtype.fields is not None else {"value": array} @@ -2276,14 +2409,14 @@ def create_expr_index( token = _target_token(target) if use_ooc and kind in {"light", "medium", "full"}: - levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent) + levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent, cparams) light = ( - _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent) + _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, cparams) if kind == "light" else None ) reduced = ( - _build_reduced_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent) + _build_reduced_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, cparams) if kind == "medium" else None ) @@ -2291,7 +2424,7 @@ def create_expr_index( if kind == "full": with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: full = _build_full_descriptor_ooc( - array, target, token, kind, dtype, persistent, Path(tmpdir) + array, target, token, kind, dtype, persistent, Path(tmpdir), cparams ) descriptor = _build_descriptor( array, @@ -2308,21 +2441,26 @@ def create_expr_index( light, reduced, full, + cparams, ) else: values = _values_for_target(array, target) - levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent) + levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent, cparams) light = ( - 
_build_light_descriptor(array, token, kind, values, optlevel, persistent) + _build_light_descriptor(array, token, kind, values, optlevel, persistent, cparams) if kind == "light" else None ) reduced = ( - _build_reduced_descriptor(array, token, kind, values, optlevel, persistent) + _build_reduced_descriptor(array, token, kind, values, optlevel, persistent, cparams) if kind == "medium" else None ) - full = _build_full_descriptor(array, token, kind, values, persistent) if kind == "full" else None + full = ( + _build_full_descriptor(array, token, kind, values, persistent, cparams) + if kind == "full" + else None + ) descriptor = _build_descriptor( array, target, @@ -2338,6 +2476,7 @@ def create_expr_index( light, reduced, full, + cparams, ) store = _load_store(array) @@ -2400,13 +2539,16 @@ def _replace_levels_descriptor(array: blosc2.NDArray, descriptor: dict, kind: st size = int(array.shape[0]) target = descriptor["target"] token = descriptor["token"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) for level, level_info in descriptor["levels"].items(): segment_len = int(level_info["segment_len"]) start = 0 summaries = _compute_segment_summaries( _slice_values_for_target(array, target, start, size), np.dtype(descriptor["dtype"]), segment_len ) - sidecar = _store_array_sidecar(array, token, kind, "summary", level, summaries, persistent) + sidecar = _store_array_sidecar( + array, token, kind, "summary", level, summaries, persistent, cparams=cparams + ) level_info["path"] = sidecar["path"] level_info["dtype"] = sidecar["dtype"] level_info["nsegments"] = len(summaries) @@ -2419,6 +2561,7 @@ def _replace_levels_descriptor_tail( token = descriptor["token"] dtype = np.dtype(descriptor["dtype"]) new_size = int(array.shape[0]) + cparams = _normalize_index_cparams(descriptor.get("cparams")) for level, level_info in descriptor["levels"].items(): segment_len = int(level_info["segment_len"]) start_segment = old_size // segment_len @@ -2427,7 +2570,9 @@ def 
_replace_levels_descriptor_tail( tail_values = _slice_values_for_target(array, target, tail_start, new_size) tail_summaries = _compute_segment_summaries(tail_values, dtype, segment_len) summaries = np.concatenate((prefix, tail_summaries)) if len(prefix) else tail_summaries - sidecar = _store_array_sidecar(array, token, kind, "summary", level, summaries, persistent) + sidecar = _store_array_sidecar( + array, token, kind, "summary", level, summaries, persistent, cparams=cparams + ) level_info["path"] = sidecar["path"] level_info["dtype"] = sidecar["dtype"] level_info["nsegments"] = len(summaries) @@ -2439,6 +2584,7 @@ def _replace_reduced_descriptor_tail( del old_size target = descriptor["target"] reduced = descriptor["reduced"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) for key in ("values_path", "positions_path", "offsets_path", "l1_path", "l2_path"): _remove_sidecar_path(reduced.get(key)) if descriptor.get("ooc", False): @@ -2450,6 +2596,7 @@ def _replace_reduced_descriptor_tail( np.dtype(descriptor["dtype"]), descriptor["optlevel"], persistent, + cparams, ) else: rebuilt = _build_reduced_descriptor( @@ -2459,6 +2606,7 @@ def _replace_reduced_descriptor_tail( _values_for_target(array, target), descriptor["optlevel"], persistent, + cparams, ) descriptor["reduced"] = rebuilt @@ -2469,6 +2617,7 @@ def _replace_light_descriptor_tail( del old_size target = descriptor["target"] light = descriptor["light"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) for key in ("values_path", "bucket_positions_path", "offsets_path", "l1_path", "l2_path"): _remove_sidecar_path(light.get(key)) if descriptor.get("ooc", False): @@ -2480,6 +2629,7 @@ def _replace_light_descriptor_tail( np.dtype(descriptor["dtype"]), descriptor["optlevel"], persistent, + cparams, ) else: rebuilt = _build_light_descriptor( @@ -2489,6 +2639,7 @@ def _replace_light_descriptor_tail( _values_for_target(array, target), descriptor["optlevel"], persistent, + cparams, ) 
descriptor["light"] = rebuilt @@ -2503,19 +2654,24 @@ def _replace_full_descriptor( kind = descriptor["kind"] token = descriptor["token"] full = descriptor["full"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) for run in full.get("runs", ()): _remove_sidecar_path(run.get("values_path")) _remove_sidecar_path(run.get("positions_path")) _remove_sidecar_path(full.get("l1_path")) _remove_sidecar_path(full.get("l2_path")) _clear_cached_data(array, token) - values_sidecar = _store_array_sidecar(array, token, kind, "full", "values", sorted_values, persistent) - positions_sidecar = _store_array_sidecar(array, token, kind, "full", "positions", positions, persistent) + values_sidecar = _store_array_sidecar( + array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams + ) + positions_sidecar = _store_array_sidecar( + array, token, kind, "full", "positions", positions, persistent, cparams=cparams + ) full["values_path"] = values_sidecar["path"] full["positions_path"] = positions_sidecar["path"] full["runs"] = [] full["next_run_id"] = 0 - _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent) + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent, cparams) def _replace_full_descriptor_from_paths( @@ -2529,6 +2685,7 @@ def _replace_full_descriptor_from_paths( token = descriptor["token"] full = descriptor["full"] persistent = descriptor["persistent"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) if not persistent: raise ValueError("path-based full replacement requires persistent indexes") for run in full.get("runs", ()): @@ -2548,6 +2705,7 @@ def _replace_full_descriptor_from_paths( np.dtype(descriptor["dtype"]), (int(array.chunks[0]),), (int(array.blocks[0]),), + cparams, ) _stream_copy_sidecar_array( positions_path, @@ -2556,6 +2714,7 @@ def _replace_full_descriptor_from_paths( np.dtype(np.int64), (int(array.chunks[0]),), (int(array.blocks[0]),), + cparams, 
) values_path.unlink(missing_ok=True) positions_path.unlink(missing_ok=True) @@ -2564,7 +2723,15 @@ def _replace_full_descriptor_from_paths( full["runs"] = [] full["next_run_id"] = 0 _rebuild_full_navigation_sidecars_from_path( - array, token, kind, full, final_values_path, np.dtype(descriptor["dtype"]), length, persistent + array, + token, + kind, + full, + final_values_path, + np.dtype(descriptor["dtype"]), + length, + persistent, + cparams, ) @@ -2578,11 +2745,12 @@ def _store_full_run_descriptor( kind = descriptor["kind"] token = descriptor["token"] persistent = descriptor["persistent"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) values_sidecar = _store_array_sidecar( - array, token, kind, "full_run", f"{run_id}.values", sorted_values, persistent + array, token, kind, "full_run", f"{run_id}.values", sorted_values, persistent, cparams=cparams ) positions_sidecar = _store_array_sidecar( - array, token, kind, "full_run", f"{run_id}.positions", positions, persistent + array, token, kind, "full_run", f"{run_id}.positions", positions, persistent, cparams=cparams ) return { "id": run_id, diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 367cfd0f..4154f5a9 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4762,9 +4762,12 @@ def create_index( mutation and resize operations mark indexes as stale until rebuild. Chunk-local index creation uses parallel intra-chunk sorting by default. - Set the ``BLOSC2_INDEX_BUILD_THREADS`` environment variable to control - the number of build threads. Setting - ``BLOSC2_INDEX_BUILD_THREADS=1`` disables parallel sorting. + Pass ``cparams`` in ``kwargs`` to control the compression settings used + for the index sidecars. When provided, ``cparams["nthreads"]`` is used + as the default build-thread count. Set the + ``BLOSC2_INDEX_BUILD_THREADS`` environment variable to override that + selection explicitly. Setting ``BLOSC2_INDEX_BUILD_THREADS=1`` + disables parallel sorting. """ from . 
import indexing @@ -4831,9 +4834,12 @@ def create_expr_index( expression key. Chunk-local index creation uses parallel intra-chunk sorting by default. - Set the ``BLOSC2_INDEX_BUILD_THREADS`` environment variable to control - the number of build threads. Setting - ``BLOSC2_INDEX_BUILD_THREADS=1`` disables parallel sorting. + Pass ``cparams`` in ``kwargs`` to control the compression settings used + for the index sidecars. When provided, ``cparams["nthreads"]`` is used + as the default build-thread count. Set the + ``BLOSC2_INDEX_BUILD_THREADS`` environment variable to override that + selection explicitly. Setting ``BLOSC2_INDEX_BUILD_THREADS=1`` + disables parallel sorting. """ from . import indexing diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index a1d2d025..db211ff5 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -393,7 +393,7 @@ def test_in_mem_override_disables_ooc_builder(kind): @pytest.mark.parametrize("kind", ["light", "medium"]) -def test_chunk_local_ooc_intra_chunk_build_uses_thread_pool_when_strategy_forced(monkeypatch, kind): +def test_chunk_local_ooc_intra_chunk_build_uses_thread_pool_when_threads_forced(monkeypatch, kind): data = np.arange(48_000, dtype=np.int64) arr = blosc2.asarray(data, chunks=(48_000,), blocks=(1_500,)) indexing = __import__("blosc2.indexing", fromlist=["ThreadPoolExecutor"]) @@ -422,6 +422,54 @@ def map(self, fn, iterable): assert observed_workers[0] == 2 +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_in_memory_chunk_local_build_uses_cparams_nthreads(monkeypatch, kind): + data = np.arange(48_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(48_000,), blocks=(1_500,)) + indexing = __import__("blosc2.indexing", fromlist=["ThreadPoolExecutor"]) + observed_workers = [] + + class FakeExecutor: + def __init__(self, *, max_workers): + observed_workers.append(max_workers) + + def __enter__(self): + return self + + def __exit__(self, exc_type, 
exc, tb): + return False + + def map(self, fn, iterable): + return [fn(item) for item in iterable] + + monkeypatch.setattr(indexing, "ThreadPoolExecutor", FakeExecutor) + + descriptor = arr.create_index(kind=kind, in_mem=True, cparams=blosc2.CParams(nthreads=2)) + + assert descriptor["ooc"] is False + assert observed_workers + assert observed_workers[0] == 2 + + +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_persistent_chunk_local_sidecars_use_cparams(tmp_path, kind): + path = tmp_path / f"persistent_cparams_{kind}.b2nd" + data = np.arange(48_000, dtype=np.int64) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(12_000,), blocks=(2_000,)) + cparams = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=2, nthreads=3) + + descriptor = arr.create_index(kind=kind, cparams=cparams) + meta = descriptor["light"] if kind == "light" else descriptor["reduced"] + aux_key = "bucket_positions_path" if kind == "light" else "positions_path" + + values_sidecar = blosc2.open(meta["values_path"]) + aux_sidecar = blosc2.open(meta[aux_key]) + + for sidecar in (values_sidecar, aux_sidecar): + assert sidecar.cparams.codec == blosc2.Codec.LZ4 + assert sidecar.cparams.clevel == 2 + + def test_intra_chunk_sort_run_matches_numpy_stable_order(): indexing_ext = __import__("blosc2.indexing_ext", fromlist=["intra_chunk_sort_run"]) values = np.array([4.0, np.nan, 2.0, 2.0, np.nan, 1.0, 4.0], dtype=np.float64) From 65e6fcb545d74956aa357a79c0208ebcbdeaf415 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 7 Apr 2026 11:41:44 +0200 Subject: [PATCH 37/68] Document index build kwargs and compression controls --- bench/ndarray/index_query_bench.py | 53 +++++++++++++++++++++++++++--- src/blosc2/ndarray.py | 36 +++++++++++++------- 2 files changed, 73 insertions(+), 16 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index b808d79b..68e43493 100644 --- a/bench/ndarray/index_query_bench.py +++ 
b/bench/ndarray/index_query_bench.py @@ -283,13 +283,17 @@ def indexed_array_path( in_mem: bool, chunks: int | None, blocks: int | None, + codec: blosc2.Codec | None, + clevel: int | None, nthreads: int | None, ) -> Path: mode = "mem" if in_mem else "ooc" + codec_token = "codec-auto" if codec is None else f"codec-{codec.name}" + clevel_token = "clevel-auto" if clevel is None else f"clevel-{clevel}" thread_token = "threads-auto" if nthreads is None else f"threads-{nthreads}" return ( size_dir - / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.{thread_token}" + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.{codec_token}.{clevel_token}.{thread_token}" f".{kind}.opt{optlevel}.{mode}.b2nd" ) @@ -423,6 +427,8 @@ def _open_or_build_indexed_array( in_mem: bool, chunks: int | None, blocks: int | None, + codec: blosc2.Codec | None, + clevel: int | None, nthreads: int | None, ) -> tuple[blosc2.NDArray, float]: if path.exists(): @@ -436,8 +442,15 @@ def _open_or_build_indexed_array( arr = build_persistent_array(size, dist, id_dtype, path, chunks, blocks) build_start = time.perf_counter() kwargs = {"field": "id", "kind": kind, "optlevel": optlevel, "in_mem": in_mem} + cparams = {} + if codec is not None: + cparams["codec"] = codec + if clevel is not None: + cparams["clevel"] = clevel if nthreads is not None: - kwargs["cparams"] = {"nthreads": nthreads} + cparams["nthreads"] = nthreads + if cparams: + kwargs["cparams"] = cparams arr.create_index(**kwargs) return arr, time.perf_counter() - build_start @@ -453,6 +466,8 @@ def benchmark_size( full_query_mode: str, chunks: int | None, blocks: int | None, + codec: blosc2.Codec | None, + clevel: int | None, nthreads: int | None, kinds: tuple[str, ...], cold_row_callback=None, @@ -472,7 +487,9 @@ def benchmark_size( rows = [] for kind in kinds: idx_arr, build_time = _open_or_build_indexed_array( - indexed_array_path(size_dir, size, dist, kind, optlevel, id_dtype, in_mem, 
chunks, blocks, nthreads), + indexed_array_path( + size_dir, size, dist, kind, optlevel, id_dtype, in_mem, chunks, blocks, codec, clevel, nthreads + ), size, dist, id_dtype, @@ -481,6 +498,8 @@ def benchmark_size( in_mem, chunks, blocks, + codec, + clevel, nthreads, ) idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) @@ -640,6 +659,19 @@ def parse_args() -> argparse.Namespace: default="auto", help="How full exact queries should run during the benchmark: auto, selective-ooc, or whole-load.", ) + parser.add_argument( + "--codec", + type=str, + default=None, + choices=[codec.name for codec in blosc2.Codec], + help="Codec to use for index sidecars. Default: library default.", + ) + parser.add_argument( + "--clevel", + type=int, + default=None, + help="Compression level to use for index sidecars. Default: library default.", + ) parser.add_argument( "--nthreads", type=int, @@ -659,6 +691,9 @@ def main() -> None: raise SystemExit(f"unsupported dtype {args.dtype!r}") from exc if id_dtype.kind not in {"b", "i", "u", "f"}: raise SystemExit(f"--dtype only supports bool, integer, and floating-point dtypes; got {id_dtype}") + codec = None if args.codec is None else blosc2.Codec[args.codec] + if args.clevel is not None and args.clevel < 0: + raise SystemExit("--clevel must be >= 0") if args.nthreads is not None and args.nthreads <= 0: raise SystemExit("--nthreads must be a positive integer") sizes = (args.size,) if args.size is not None else SIZES @@ -681,6 +716,8 @@ def main() -> None: args.full_query_mode, args.chunks, args.blocks, + codec, + args.clevel, args.nthreads, ) else: @@ -699,6 +736,8 @@ def main() -> None: args.full_query_mode, args.chunks, args.blocks, + codec, + args.clevel, args.nthreads, ) @@ -717,6 +756,8 @@ def run_benchmarks( full_query_mode: str, chunks: int | None, blocks: int | None, + codec: blosc2.Codec | None, + clevel: int | None, nthreads: int | None, ) -> None: all_results = [] @@ -732,7 +773,9 @@ def run_benchmarks( print( 
f"{geometry_label}, repeats={repeats}, dist={dist_label}, " f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}, " - f"full_query_mode={full_query_mode}, index_nthreads={'auto' if nthreads is None else nthreads}" + f"full_query_mode={full_query_mode}, index_codec={'auto' if codec is None else codec.name}, " + f"index_clevel={'auto' if clevel is None else clevel}, " + f"index_nthreads={'auto' if nthreads is None else nthreads}" ) for dist in dists: for size in sizes: @@ -747,6 +790,8 @@ def run_benchmarks( full_query_mode, chunks, blocks, + codec, + clevel, nthreads, kinds, ) diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 4154f5a9..95cf1055 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4754,6 +4754,14 @@ def create_index( Optional logical label stored in the descriptor. Index identity is still driven by the target field, so creating another index on the same field replaces the previous one. + kwargs : dict, optional + Keyword arguments forwarded to the index builder. At the moment the + supported option is ``cparams``. Pass ``cparams`` to control the + compression settings used for index sidecars, including + ``codec``, ``clevel``, and ``nthreads``. If provided, + ``cparams["nthreads"]`` becomes the default build-thread count for + intra-chunk sorting unless ``BLOSC2_INDEX_BUILD_THREADS`` overrides + it. Notes ----- @@ -4762,12 +4770,10 @@ def create_index( mutation and resize operations mark indexes as stale until rebuild. Chunk-local index creation uses parallel intra-chunk sorting by default. - Pass ``cparams`` in ``kwargs`` to control the compression settings used - for the index sidecars. When provided, ``cparams["nthreads"]`` is used - as the default build-thread count. Set the - ``BLOSC2_INDEX_BUILD_THREADS`` environment variable to override that - selection explicitly. Setting ``BLOSC2_INDEX_BUILD_THREADS=1`` - disables parallel sorting. 
+ Set ``BLOSC2_INDEX_BUILD_THREADS=1`` to disable parallel sorting. If + ``cparams`` is provided in ``kwargs``, its ``nthreads`` value becomes + the default build-thread count unless + ``BLOSC2_INDEX_BUILD_THREADS`` overrides it. """ from . import indexing @@ -4826,6 +4832,14 @@ def create_expr_index( additional temporary arrays for sorting and block payloads, so the default remains ``False`` and uses the out-of-core builders for ``light``, ``medium``, and ``full``. + kwargs : dict, optional + Keyword arguments forwarded to the index builder. At the moment the + supported option is ``cparams``. Pass ``cparams`` to control the + compression settings used for index sidecars, including + ``codec``, ``clevel``, and ``nthreads``. If provided, + ``cparams["nthreads"]`` becomes the default build-thread count for + intra-chunk sorting unless ``BLOSC2_INDEX_BUILD_THREADS`` overrides + it. Notes ----- @@ -4834,12 +4848,10 @@ def create_expr_index( expression key. Chunk-local index creation uses parallel intra-chunk sorting by default. - Pass ``cparams`` in ``kwargs`` to control the compression settings used - for the index sidecars. When provided, ``cparams["nthreads"]`` is used - as the default build-thread count. Set the - ``BLOSC2_INDEX_BUILD_THREADS`` environment variable to override that - selection explicitly. Setting ``BLOSC2_INDEX_BUILD_THREADS=1`` - disables parallel sorting. + Set ``BLOSC2_INDEX_BUILD_THREADS=1`` to disable parallel sorting. If + ``cparams`` is provided in ``kwargs``, its ``nthreads`` value becomes + the default build-thread count unless + ``BLOSC2_INDEX_BUILD_THREADS`` overrides it. """ from . 
import indexing From e489527ebb635116f080d93e7e30a51683e62acf Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 7 Apr 2026 12:01:55 +0200 Subject: [PATCH 38/68] Reduce chunk size on macos to make index sorting times reasonable --- src/blosc2/core.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/blosc2/core.py b/src/blosc2/core.py index e3d9d4ed..872e5a88 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1439,7 +1439,10 @@ def get_chunksize(blocksize, l3_minimum=4 * 2**20, l3_maximum=2**26, reduc_facto if isinstance(l2_cache_size, int) and l2_cache_size > chunksize: # Apple Silicon has a large L2 cache, and memory bandwidth is high, # so we can use a larger chunksize based on L2 cache size. - chunksize = l2_cache_size * 4 + # chunksize = l2_cache_size * 4 + # But experiments show that using such a large chunksize + # can make indexes too large. Going back to using just L2. + chunksize = l2_cache_size # Ensure a minimum size if chunksize < l3_minimum: From b218dbe2418f96a0b785b71195cc1fcd2bfa4f01 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 7 Apr 2026 12:28:42 +0200 Subject: [PATCH 39/68] Make more common defaults --- bench/ndarray/index_query_bench.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 68e43493..1bc75f84 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -598,8 +598,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--query-width", type=parse_human_size, - default=1_000, - help="Width of the range predicate. Supports suffixes like 1k, 1K, 1M, 1G. Default: 1000.", + default=1, + help="Width of the range predicate. Supports suffixes like 1k, 1K, 1M, 1G. 
Default: 1.", ) parser.add_argument( "--chunks", @@ -638,7 +638,7 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--dist", choices=(*DISTS, "all"), - default="sorted", + default="permuted", help="Distribution for the indexed field. Use 'all' to benchmark every distribution.", ) parser.add_argument( From d90b6846931c9bd12a4536c7fe849d82b1c8f982 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 7 Apr 2026 13:02:39 +0200 Subject: [PATCH 40/68] Some API cleanup --- src/blosc2/indexing.py | 14 -------------- src/blosc2/ndarray.py | 12 ++++-------- 2 files changed, 4 insertions(+), 22 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 7b648796..ed4ef246 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -2240,7 +2240,6 @@ def _build_descriptor( token: str, kind: str, optlevel: int, - granularity: str, persistent: bool, ooc: bool, name: str | None, @@ -2260,7 +2259,6 @@ def _build_descriptor( "kind": kind, "version": INDEX_FORMAT_VERSION, "optlevel": optlevel, - "granularity": granularity, "persistent": persistent, "ooc": ooc, "stale": False, @@ -2281,7 +2279,6 @@ def create_index( field: str | None = None, kind: str = "light", optlevel: int = 5, - granularity: str = "chunk", persistent: bool | None = None, in_mem: bool = False, name: str | None = None, @@ -2294,8 +2291,6 @@ def create_index( token = _target_token(target) if kind not in SEGMENT_LEVELS_BY_KIND: raise NotImplementedError(f"unsupported index kind {kind!r}") - if granularity != "chunk": - raise NotImplementedError("only chunk-based array indexes are implemented for now") if persistent is None: persistent = _is_persistent_array(array) use_ooc = _resolve_ooc_mode(kind, in_mem) @@ -2324,7 +2319,6 @@ def create_index( token, kind, optlevel, - granularity, persistent, True, name, @@ -2359,7 +2353,6 @@ def create_index( token, kind, optlevel, - granularity, persistent, False, name, @@ -2384,7 +2377,6 @@ def create_expr_index( operands: dict | 
None = None, kind: str = "light", optlevel: int = 5, - granularity: str = "chunk", persistent: bool | None = None, in_mem: bool = False, name: str | None = None, @@ -2401,8 +2393,6 @@ def create_expr_index( ) if kind not in SEGMENT_LEVELS_BY_KIND: raise NotImplementedError(f"unsupported index kind {kind!r}") - if granularity != "chunk": - raise NotImplementedError("only chunk-based array indexes are implemented for now") if persistent is None: persistent = _is_persistent_array(array) use_ooc = _resolve_ooc_mode(kind, in_mem) @@ -2432,7 +2422,6 @@ def create_expr_index( token, kind, optlevel, - granularity, persistent, True, name, @@ -2467,7 +2456,6 @@ def create_expr_index( token, kind, optlevel, - granularity, persistent, False, name, @@ -2831,7 +2819,6 @@ def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | N operands=operands, kind=descriptor["kind"], optlevel=descriptor["optlevel"], - granularity=descriptor["granularity"], persistent=descriptor["persistent"], in_mem=not descriptor.get("ooc", False), name=descriptor["name"], @@ -2841,7 +2828,6 @@ def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | N field=descriptor["field"], kind=descriptor["kind"], optlevel=descriptor["optlevel"], - granularity=descriptor["granularity"], persistent=descriptor["persistent"], in_mem=not descriptor.get("ooc", False), name=descriptor["name"], diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 95cf1055..1ff10b7b 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4722,7 +4722,6 @@ def create_index( field: str | None = None, kind: str = "light", optlevel: int = 5, - granularity: str = "chunk", persistent: bool | None = None, in_mem: bool = False, name: str | None = None, @@ -4733,7 +4732,9 @@ def create_index( Parameters ---------- field : str or None, optional - Field to index for structured dtypes. Use ``None`` to index the array values. + Field to index for structured dtypes. 
Use ``None`` to index the + array values for plain 1-D arrays. Structured arrays require an + explicit field name. kind : {"ultralight", "light", "medium", "full"}, optional Index tier to build. Use ``light`` or ``medium`` for faster/lighter filter-oriented indexes, and ``full`` when exact ordered access via @@ -4741,8 +4742,6 @@ def create_index( should reuse the index directly. optlevel : int, optional Optimization level for index payload construction. - granularity : str, optional - Current implementation only supports ``"chunk"``. persistent : bool or None, optional Whether index sidecars should be persisted. If ``None``, this follows whether the base array is persistent. in_mem : bool, optional @@ -4782,7 +4781,6 @@ def create_index( field=field, kind=kind, optlevel=optlevel, - granularity=granularity, persistent=persistent, in_mem=in_mem, name=name, @@ -4807,7 +4805,6 @@ def create_expr_index( operands: dict | None = None, kind: str = "light", optlevel: int = 3, - granularity: str = "chunk", persistent: bool | None = None, in_mem: bool = False, name: str | None = None, @@ -4826,7 +4823,7 @@ def create_expr_index( Operand mapping used for normalization and evaluation. When omitted, structured arrays default to ``self.fields`` and plain arrays use ``{"value": self}``. - kind, optlevel, granularity, persistent, in_mem, name + kind, optlevel, persistent, in_mem, name Same meaning as in :meth:`create_index`. 
Setting ``in_mem=True`` materializes the derived expression stream in RAM and can allocate additional temporary arrays for sorting and block payloads, so the @@ -4861,7 +4858,6 @@ def create_expr_index( operands=operands, kind=kind, optlevel=optlevel, - granularity=granularity, persistent=persistent, in_mem=in_mem, name=name, From 99fc8d0d13d2d829580a0b65fc8f5abf779197b9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Tue, 7 Apr 2026 13:33:36 +0200 Subject: [PATCH 41/68] Document will_use_index and add tests --- doc/reference/lazyarray.rst | 2 ++ src/blosc2/lazyexpr.py | 1 + tests/ndarray/test_indexing.py | 19 +++++++++++++++++++ 3 files changed, 22 insertions(+) diff --git a/doc/reference/lazyarray.rst b/doc/reference/lazyarray.rst index 1080a62b..d3a21a1e 100644 --- a/doc/reference/lazyarray.rst +++ b/doc/reference/lazyarray.rst @@ -33,10 +33,12 @@ See the `LazyExpr`_ and `LazyUDF`_ sections for more information. .. autosummary:: __getitem__ + will_use_index Methods --------------- .. automethod:: __getitem__ + .. automethod:: will_use_index Attributes ---------- diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index fd6a158d..c8e7c85f 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -3723,6 +3723,7 @@ def sort(self, order: str | list[str] | None = None) -> blosc2.LazyArray: return lazy_expr def will_use_index(self) -> bool: + """Return whether the current lazy query can use an index.""" from . 
import indexing return indexing.will_use_index(self) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index db211ff5..992d4d54 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -54,6 +54,25 @@ def test_structured_field_index_matches_scan(kind): np.testing.assert_array_equal(indexed, data[(data["id"] >= 48_000) & (data["id"] < 51_000)]) +def test_module_level_will_use_index_matches_lazyexpr_method(): + import blosc2.indexing as indexing + + indexed = blosc2.asarray(np.arange(100_000, dtype=np.int64), chunks=(10_000,), blocks=(2_000,)) + indexed.create_index(kind="medium") + indexed_expr = ((indexed >= 48_000) & (indexed < 51_000)).where(indexed) + + plain = blosc2.asarray(np.arange(100_000, dtype=np.int64), chunks=(10_000,), blocks=(2_000,)) + plain_expr = ((plain >= 48_000) & (plain < 51_000)).where(plain) + + assert indexing.will_use_index(indexed_expr) is True + assert indexed_expr.will_use_index() is True + assert indexing.will_use_index(indexed_expr) == indexed_expr.will_use_index() + + assert indexing.will_use_index(plain_expr) is False + assert plain_expr.will_use_index() is False + assert indexing.will_use_index(plain_expr) == plain_expr.will_use_index() + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_random_field_index_matches_scan(kind): rng = np.random.default_rng(0) From f4ecadf11438dddbc87582de899f355ea2d0d7df Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 06:58:24 +0200 Subject: [PATCH 42/68] Fix Windows file-locking error in rebuild_index test On Windows, C-Blosc2 cannot update vlmeta while another file handle holds the same path open. Tests that created `arr` with a urlpath and then opened the same file as `reopened` triggered a RuntimeError in blosc2_vlmeta_update when write operations (e.g. rebuild_index) were attempted through the second handle. 
Add `del arr` before every `reopened = blosc2.open(path, mode="a")` call in tests/ndarray/test_indexing.py (9 sites), following the pattern already established in test_open.py, test_mmap.py and test_schunk.py. --- tests/ndarray/test_indexing.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 992d4d54..b256c35a 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -283,6 +283,7 @@ def test_persistent_index_survives_reopen(tmp_path, kind): else: assert descriptor["full"]["values_path"] is not None + del arr reopened = blosc2.open(path, mode="a") assert len(reopened.indexes) == 1 if kind == "light": @@ -311,6 +312,7 @@ def test_default_ooc_persistent_index_matches_scan_and_rebuilds(tmp_path, kind): assert descriptor["ooc"] is True + del arr reopened = blosc2.open(path, mode="a") assert reopened.indexes[0]["ooc"] is True @@ -340,6 +342,7 @@ def test_persistent_chunk_local_ooc_builds_do_not_use_temp_memmap(tmp_path, kind meta = descriptor["light"] if kind == "light" else descriptor["reduced"] assert meta["values_path"] is not None + del arr reopened = blosc2.open(path, mode="a") expr = ((reopened >= 55_000) & (reopened < 55_010)).where(reopened) np.testing.assert_array_equal(expr.compute()[:], data[(data >= 55_000) & (data < 55_010)]) @@ -382,6 +385,7 @@ def test_chunk_local_index_descriptor_and_lookup_path(tmp_path, kind): if kind == "medium": assert meta["nav_segment_divisor"] == 4 + del arr reopened = blosc2.open(path, mode="a") expr = (reopened == 123_456).where(reopened) explanation = expr.explain() @@ -694,6 +698,7 @@ def test_persistent_full_index_runs_survive_reopen(tmp_path): arr.append(batch1) arr.append(batch2) + del arr reopened = blosc2.open(path, mode="a") assert len(reopened.indexes[0]["full"]["runs"]) == 2 @@ -711,6 +716,7 @@ def test_persistent_compact_full_exact_query_avoids_whole_sidecar_load(monkeypat arr = blosc2.asarray(data, urlpath=path, 
mode="w", chunks=(12_000,), blocks=(2_000,)) arr.create_csindex() + del arr reopened = blosc2.open(path, mode="a") indexing = __import__("blosc2.indexing", fromlist=["_load_array_sidecar"]) original_load = indexing._load_array_sidecar @@ -796,6 +802,7 @@ def test_persistent_expression_index_survives_reopen(tmp_path): arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(8_000,), blocks=(2_000,)) descriptor = arr.create_expr_index("abs(x)", kind="medium") + del arr reopened = blosc2.open(path, mode="a") assert reopened.indexes[0]["target"]["source"] == "expression" assert reopened.indexes[0]["target"]["expression_key"] == "abs(x)" @@ -879,6 +886,7 @@ def test_compact_full_index_clears_runs_and_preserves_results(tmp_path): assert compacted["full"]["l1_path"] is not None assert compacted["full"]["l2_path"] is not None + del arr reopened = blosc2.open(path, mode="a") assert reopened.indexes[0]["full"]["runs"] == [] for values_path, positions_path in run_paths: @@ -929,6 +937,7 @@ def test_persistent_large_run_full_query_uses_bounded_fallback(monkeypatch, tmp_ batch = np.array([(100 + run, 10 + run)], dtype=dtype) arr.append(batch) + del arr reopened = blosc2.open(path, mode="a") indexing = __import__("blosc2.indexing", fromlist=["_load_full_arrays"]) From ce11d9940c8aa03d401fd9b37208e23bb8f707f0 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 08:02:49 +0200 Subject: [PATCH 43/68] Fixing windows/mmap issues (I) --- src/blosc2/indexing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index ed4ef246..98384b4c 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -2800,9 +2800,10 @@ def append_to_indexes(array: blosc2.NDArray, old_size: int, appended_values: np. 
def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> None: store = _load_store(array) token = _resolve_index_token(store, field, name) + descriptor = store["indexes"][token] + _clear_cached_data(array, descriptor["token"]) descriptor = store["indexes"].pop(token) _save_store(array, store) - _clear_cached_data(array, descriptor["token"]) _drop_descriptor_sidecars(descriptor) From 72da1c87096c83d12d4a7a94caab86b307c0a2ab Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 08:49:03 +0200 Subject: [PATCH 44/68] Use latest miniexpr sources --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9397a6db..d9bae4c6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,7 +72,7 @@ endif() FetchContent_Declare(miniexpr GIT_REPOSITORY https://github.com/Blosc/miniexpr.git - GIT_TAG feadbc633a887bafd84b2fbc370ef2962d01b7ee + GIT_TAG 0ecf0ed5f8c3995c97bd5208a29a120f3c2425a3 # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../miniexpr ) FetchContent_MakeAvailable(miniexpr) From 90f586d5935620309c0b18ed50ce8490fb31db0e Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 08:56:27 +0200 Subject: [PATCH 45/68] Clamp indexing Python threads to 1 on wasm32 --- src/blosc2/indexing.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 98384b4c..f08759a1 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -53,6 +53,13 @@ INDEX_QUERY_MIN_CHUNKS_PER_THREAD = 8 +def _python_executor_threads(requested_threads: int) -> int: + # wasm32 builds do not support spawning Python worker threads reliably. 
+ if blosc2.IS_WASM: + return 1 + return max(1, int(requested_threads)) + + def _sanitize_token(token: str) -> str: return re.sub(r"[^0-9A-Za-z_.-]+", "_", token) @@ -899,13 +906,15 @@ def _chunk_offsets(size: int, chunk_len: int) -> np.ndarray: def _index_build_threads(cparams: dict | blosc2.CParams | None = None) -> int: + if blosc2.IS_WASM: + return 1 forced = os.getenv("BLOSC2_INDEX_BUILD_THREADS") if forced is not None: try: forced_threads = int(forced) except ValueError: forced_threads = 1 - return max(1, forced_threads) + return _python_executor_threads(forced_threads) if cparams is not None: nthreads = cparams.nthreads if isinstance(cparams, blosc2.CParams) else cparams.get("nthreads") else: @@ -915,8 +924,8 @@ def _index_build_threads(cparams: dict | blosc2.CParams | None = None) -> int: cparams_threads = int(nthreads) except (TypeError, ValueError): cparams_threads = 1 - return max(1, cparams_threads) - return max(1, int(getattr(blosc2, "nthreads", 1) or 1)) + return _python_executor_threads(cparams_threads) + return _python_executor_threads(int(getattr(blosc2, "nthreads", 1) or 1)) def _sidecar_block_len(sidecar: dict, fallback_block_len: int) -> int: @@ -3929,10 +3938,12 @@ def _chunk_nav_candidate_runs( def _index_query_thread_count(task_count: int) -> int: + if blosc2.IS_WASM: + return 1 if task_count < INDEX_QUERY_MIN_CHUNKS_PER_THREAD: return 1 configured_threads = int(getattr(blosc2, "nthreads", 1) or 1) - return max(1, min(configured_threads, task_count // INDEX_QUERY_MIN_CHUNKS_PER_THREAD)) + return _python_executor_threads(min(configured_threads, task_count // INDEX_QUERY_MIN_CHUNKS_PER_THREAD)) def _chunk_batches(chunk_ids: np.ndarray, thread_count: int) -> list[np.ndarray]: From 893343974df19e0ea8b9bf4f3fab2ecb26ccdc91 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 09:05:20 +0200 Subject: [PATCH 46/68] Avoid mmap on main array in light index worker on Windows _light_worker_source() opened the main array file with 
mmap_mode="r" to create per-thread handles for light index evaluation. On Windows, memory-mapped files hold file locks that prevent subsequent fopen(rb+) calls, causing rebuild_index() and drop_index() to fail with RuntimeError when updating vlmeta on the same file. Skip mmap only on Windows (sys.platform == "win32") so Linux/macOS retain the mmap performance benefit for large-array broad queries. --- src/blosc2/indexing.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index f08759a1..ff8f6fe5 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -13,6 +13,7 @@ import math import os import re +import sys import tempfile import weakref from concurrent.futures import ThreadPoolExecutor @@ -3985,7 +3986,10 @@ def _light_batch_result_dtype(where_x) -> np.dtype: def _light_worker_source(where_x): if _supports_block_reads(where_x) and getattr(where_x, "urlpath", None) is not None: - return blosc2.open(str(where_x.urlpath), mmap_mode="r") + # On Windows, mmap holds a file lock that prevents later vlmeta updates + # (e.g. during rebuild_index / drop_index), so use regular file I/O. 
+ mmap = None if sys.platform == "win32" else "r" + return blosc2.open(str(where_x.urlpath), mmap_mode=mmap) return where_x From 9656f28663d16ca875b7e05bfc425ad73807747d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 09:20:13 +0200 Subject: [PATCH 47/68] Use latest miniexpr sources --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d9bae4c6..2852e9cd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -72,7 +72,7 @@ endif() FetchContent_Declare(miniexpr GIT_REPOSITORY https://github.com/Blosc/miniexpr.git - GIT_TAG 0ecf0ed5f8c3995c97bd5208a29a120f3c2425a3 + GIT_TAG 43ed59829d791f1aac868087d6ef27e26c8fc1e5 # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../miniexpr ) FetchContent_MakeAvailable(miniexpr) From 1a021a8e92f2fb97a428f0e1135d266b12e8f500 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 09:20:44 +0200 Subject: [PATCH 48/68] Reduce Cython indexing helper overhead --- src/blosc2/indexing_ext.pyx | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx index a2cc90ef..a98e7e16 100644 --- a/src/blosc2/indexing_ext.pyx +++ b/src/blosc2/indexing_ext.pyx @@ -4,9 +4,11 @@ # # SPDX-License-Identifier: BSD-3-Clause ####################################################################### +# cython: wraparound=False import numpy as np cimport numpy as np +import cython from libc.stdint cimport int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t @@ -61,6 +63,8 @@ cdef inline bint _le_ordered_pair( return left_position <= right_position +@cython.boundscheck(False) +@cython.wraparound(False) cdef void _stable_mergesort_float( sort_float_t[:] values, uint64_t[:] positions, @@ -131,6 +135,8 @@ cdef void _stable_mergesort_float( positions[start] = src_positions[start] +@cython.boundscheck(False) +@cython.wraparound(False) cdef void _stable_mergesort_ordered( 
sort_ordered_t[:] values, uint64_t[:] positions, @@ -1475,7 +1481,7 @@ cdef inline tuple _search_boundary_bounds_uint32_impl( cdef inline tuple _search_bounds_uint64_impl( - np.ndarray[np.uint64_t, ndim=1] values, + np.uint64_t[:] values, object lower, bint lower_inclusive, object upper, @@ -1505,8 +1511,8 @@ cdef inline tuple _search_bounds_uint64_impl( cdef inline tuple _search_boundary_bounds_uint64_impl( - np.ndarray[np.uint64_t, ndim=1] starts, - np.ndarray[np.uint64_t, ndim=1] ends, + np.uint64_t[:] starts, + np.uint64_t[:] ends, object lower, bint lower_inclusive, object upper, From f9a0616c146bc2d9596ddc8e42f5931f696957dc Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 09:35:21 +0200 Subject: [PATCH 49/68] Disable mmap for all index I/O on Windows On Windows, memory-mapped files hold file locks that prevent later fopen(rb+) calls on the same path. This causes vlmeta updates and sidecar file recreation to fail with RuntimeError during rebuild_index, drop_index, and create_index. Introduce _INDEX_MMAP_MODE (None on Windows, "r" elsewhere) and use it at all 22 blosc2.open() sites in indexing.py. Linux and macOS keep the mmap read performance benefit; Windows falls back to regular file I/O. --- src/blosc2/indexing.py | 52 ++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index ff8f6fe5..145ecae1 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -30,6 +30,11 @@ INDEX_FORMAT_VERSION = 1 SELF_TARGET_NAME = "__self__" +# On Windows, mmap holds file locks that prevent later writes (vlmeta updates, +# sidecar recreation during rebuild_index, etc.). Disable mmap for all index +# I/O on that platform. 
+_INDEX_MMAP_MODE = None if sys.platform == "win32" else "r" + FLAG_ALL_NAN = np.uint8(1 << 0) FLAG_HAS_NAN = np.uint8(1 << 1) @@ -483,7 +488,7 @@ def _compute_sorted_boundaries_from_sidecar( ) -> np.ndarray: nsegments = math.ceil(length / segment_len) boundaries = np.empty(nsegments, dtype=_boundary_dtype(dtype)) - sidecar = blosc2.open(path, mmap_mode="r") + sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) start_value = np.empty(1, dtype=dtype) end_value = np.empty(1, dtype=dtype) for idx in range(nsegments): @@ -664,7 +669,7 @@ def _sidecar_storage_geometry( ) -> tuple[int, int]: if path is None: return fallback_chunk_len, fallback_block_len - sidecar = blosc2.open(path, mmap_mode="r") + sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) return int(sidecar.chunks[0]), int(sidecar.blocks[0]) @@ -723,7 +728,7 @@ def _stream_copy_sidecar_array( blocks: tuple[int, ...], cparams: dict | None = None, ) -> None: - source = blosc2.open(str(source_path), mmap_mode="r") + source = blosc2.open(str(source_path), mmap_mode=_INDEX_MMAP_MODE) blosc2.remove_urlpath(str(dest_path)) kwargs = {"chunks": chunks, "blocks": blocks, "urlpath": str(dest_path), "mode": "w"} if cparams is not None: @@ -2008,7 +2013,7 @@ def _copy_sidecar_to_temp_run( cparams: dict | None = None, ) -> Path: out_path = workdir / f"{prefix}.b2nd" - sidecar = blosc2.open(path, mmap_mode="r") + sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) output = _create_blosc2_temp_array(out_path, length, dtype, FULL_OOC_MERGE_BUFFER_ITEMS, cparams) chunk_len = int(sidecar.chunks[0]) for chunk_id, start in enumerate(range(0, length, chunk_len)): @@ -2048,10 +2053,10 @@ def _merge_run_pair( tracker: TempRunTracker | None = None, cparams: dict | None = None, ) -> SortedRun: - left_values_mm = blosc2.open(str(left.values_path), mmap_mode="r") - left_positions_mm = blosc2.open(str(left.positions_path), mmap_mode="r") - right_values_mm = blosc2.open(str(right.values_path), mmap_mode="r") - 
right_positions_mm = blosc2.open(str(right.positions_path), mmap_mode="r") + left_values_mm = blosc2.open(str(left.values_path), mmap_mode=_INDEX_MMAP_MODE) + left_positions_mm = blosc2.open(str(left.positions_path), mmap_mode=_INDEX_MMAP_MODE) + right_values_mm = blosc2.open(str(right.values_path), mmap_mode=_INDEX_MMAP_MODE) + right_positions_mm = blosc2.open(str(right.positions_path), mmap_mode=_INDEX_MMAP_MODE) out_values_path = workdir / f"full_merge_values_{merge_id}.b2nd" out_positions_path = workdir / f"full_merge_positions_{merge_id}.b2nd" @@ -2226,8 +2231,8 @@ def _build_full_descriptor_ooc( array, token, kind, full, final_run, dtype, persistent, tracker, cparams ) else: - sorted_values = blosc2.open(str(final_run.values_path), mmap_mode="r")[:] - positions = blosc2.open(str(final_run.positions_path), mmap_mode="r")[:] + sorted_values = blosc2.open(str(final_run.values_path), mmap_mode=_INDEX_MMAP_MODE)[:] + positions = blosc2.open(str(final_run.positions_path), mmap_mode=_INDEX_MMAP_MODE)[:] values_sidecar = _store_array_sidecar( array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams ) @@ -2925,8 +2930,8 @@ def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | N array, descriptor, final_run.values_path, final_run.positions_path, final_run.length ) else: - sorted_values = blosc2.open(str(final_run.values_path), mmap_mode="r")[:] - positions = blosc2.open(str(final_run.positions_path), mmap_mode="r")[:] + sorted_values = blosc2.open(str(final_run.values_path), mmap_mode=_INDEX_MMAP_MODE)[:] + positions = blosc2.open(str(final_run.positions_path), mmap_mode=_INDEX_MMAP_MODE)[:] _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) del sorted_values, positions final_run.values_path.unlink(missing_ok=True) @@ -3986,10 +3991,7 @@ def _light_batch_result_dtype(where_x) -> np.dtype: def _light_worker_source(where_x): if _supports_block_reads(where_x) and getattr(where_x, 
"urlpath", None) is not None: - # On Windows, mmap holds a file lock that prevents later vlmeta updates - # (e.g. during rebuild_index / drop_index), so use regular file I/O. - mmap = None if sys.platform == "win32" else "r" - return blosc2.open(str(where_x.urlpath), mmap_mode=mmap) + return blosc2.open(str(where_x.urlpath), mmap_mode=_INDEX_MMAP_MODE) return where_x @@ -4136,9 +4138,9 @@ def _bucket_masks_from_light_chunk_nav_ooc( def process_batch(chunk_ids: np.ndarray) -> tuple[list[tuple[int, np.ndarray]], int]: if len(chunk_ids) == 0: return [], 0 - batch_values = blosc2.open(light["values_path"], mmap_mode="r") - batch_buckets = blosc2.open(light["bucket_positions_path"], mmap_mode="r") - batch_l2 = blosc2.open(light["l2_path"], mmap_mode="r") + batch_values = blosc2.open(light["values_path"], mmap_mode=_INDEX_MMAP_MODE) + batch_buckets = blosc2.open(light["bucket_positions_path"], mmap_mode=_INDEX_MMAP_MODE) + batch_l2 = blosc2.open(light["l2_path"], mmap_mode=_INDEX_MMAP_MODE) batch_results = [] batch_candidate_segments = 0 l2_row = np.empty(nsegments_per_chunk, dtype=_boundary_dtype(dtype)) @@ -4206,9 +4208,9 @@ def _exact_positions_from_reduced_chunk_nav_ooc( def process_cython_batch(chunk_ids: np.ndarray) -> tuple[np.ndarray, int]: if len(chunk_ids) == 0: return np.empty(0, dtype=np.int64), 0 - batch_values = blosc2.open(reduced["values_path"], mmap_mode="r") - batch_positions = blosc2.open(reduced["positions_path"], mmap_mode="r") - batch_l2 = blosc2.open(reduced["l2_path"], mmap_mode="r") + batch_values = blosc2.open(reduced["values_path"], mmap_mode=_INDEX_MMAP_MODE) + batch_positions = blosc2.open(reduced["positions_path"], mmap_mode=_INDEX_MMAP_MODE) + batch_l2 = blosc2.open(reduced["l2_path"], mmap_mode=_INDEX_MMAP_MODE) batch_l2_row = np.empty(nsegments_per_chunk, dtype=l2_boundary_dtype) batch_span_values = np.empty(chunk_len, dtype=dtype) batch_local_positions = np.empty(chunk_len, dtype=local_position_dtype) @@ -4244,9 +4246,9 @@ def 
process_cython_batch(chunk_ids: np.ndarray) -> tuple[np.ndarray, int]: def process_batch(chunk_ids: np.ndarray) -> tuple[list[np.ndarray], int]: if len(chunk_ids) == 0: return [], 0 - batch_values = blosc2.open(reduced["values_path"], mmap_mode="r") - batch_positions = blosc2.open(reduced["positions_path"], mmap_mode="r") - batch_l2 = blosc2.open(reduced["l2_path"], mmap_mode="r") + batch_values = blosc2.open(reduced["values_path"], mmap_mode=_INDEX_MMAP_MODE) + batch_positions = blosc2.open(reduced["positions_path"], mmap_mode=_INDEX_MMAP_MODE) + batch_l2 = blosc2.open(reduced["l2_path"], mmap_mode=_INDEX_MMAP_MODE) batch_parts = [] batch_candidate_segments = 0 l2_row = np.empty(nsegments_per_chunk, dtype=l2_boundary_dtype) From 480472676eff48b7f815b6824326b7617a8c7a82 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 18:18:14 +0200 Subject: [PATCH 50/68] Use latest miniexpr sources --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2852e9cd..2dcf6a1c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,7 +129,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG b32256fc1287b6e24c22f09ac202265c7054e2bc + GIT_TAG b9617d145ed46cd77afbbc56fbe5474e3c3269d3 # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) From ebde7c1bcf194a4d6cda2bdb3af9bbca2fb7731d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 18:45:16 +0200 Subject: [PATCH 51/68] Clamp wasm NDArray thread defaults and skip executor-only indexing tests --- src/blosc2/blosc2_ext.pyx | 14 ++++++++++---- src/blosc2/storage.py | 2 +- tests/ndarray/test_indexing.py | 4 ++++ 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index c36c51c5..a263c0fb 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -1042,7 +1042,9 
@@ cdef create_cparams_from_kwargs(blosc2_cparams *cparams, kwargs): cparams.clevel = kwargs.get('clevel', blosc2.cparams_dflts['clevel']) cparams.use_dict = kwargs.get('use_dict', blosc2.cparams_dflts['use_dict']) cparams.typesize = typesize = kwargs.get('typesize', blosc2.cparams_dflts['typesize']) - cparams.nthreads = kwargs.get('nthreads', blosc2.nthreads) + cparams.nthreads = kwargs.get('nthreads', 1 if blosc2.IS_WASM else blosc2.nthreads) + if blosc2.IS_WASM: + cparams.nthreads = 1 cparams.blocksize = kwargs.get('blocksize', blosc2.cparams_dflts['blocksize']) splitmode = kwargs.get('splitmode', blosc2.cparams_dflts['splitmode']) cparams.splitmode = splitmode.value @@ -1123,7 +1125,9 @@ def compress2(src, **kwargs): cdef create_dparams_from_kwargs(blosc2_dparams *dparams, kwargs, blosc2_cparams* cparams=NULL): memcpy(dparams, &BLOSC2_DPARAMS_DEFAULTS, sizeof(BLOSC2_DPARAMS_DEFAULTS)) - dparams.nthreads = kwargs.get('nthreads', blosc2.nthreads) + dparams.nthreads = kwargs.get('nthreads', 1 if blosc2.IS_WASM else blosc2.nthreads) + if blosc2.IS_WASM: + dparams.nthreads = 1 dparams.schunk = NULL dparams.postfilter = NULL dparams.postparams = NULL @@ -2832,7 +2836,9 @@ def open(urlpath, mode, offset, **kwargs): if cparams is not None: res.schunk.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) else: - res.schunk.cparams = dataclasses.replace(res.schunk.cparams, nthreads=blosc2.nthreads) + res.schunk.cparams = dataclasses.replace( + res.schunk.cparams, nthreads=(1 if blosc2.IS_WASM else blosc2.nthreads) + ) if dparams is not None: res.schunk.dparams = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) res.schunk.mode = mode @@ -2842,7 +2848,7 @@ def open(urlpath, mode, offset, **kwargs): if cparams is not None: res.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) else: - res.cparams = dataclasses.replace(res.cparams, nthreads=blosc2.nthreads) + res.cparams = 
dataclasses.replace(res.cparams, nthreads=(1 if blosc2.IS_WASM else blosc2.nthreads)) if dparams is not None: res.dparams = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 0015aea9..5863b145 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -13,7 +13,7 @@ def default_nthreads(): - return blosc2.nthreads + return 1 if blosc2.IS_WASM else blosc2.nthreads def default_filters(): diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index b256c35a..9277096e 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -417,6 +417,8 @@ def test_in_mem_override_disables_ooc_builder(kind): @pytest.mark.parametrize("kind", ["light", "medium"]) def test_chunk_local_ooc_intra_chunk_build_uses_thread_pool_when_threads_forced(monkeypatch, kind): + if blosc2.IS_WASM: + pytest.skip("wasm32 does not use Python thread pools for index building") data = np.arange(48_000, dtype=np.int64) arr = blosc2.asarray(data, chunks=(48_000,), blocks=(1_500,)) indexing = __import__("blosc2.indexing", fromlist=["ThreadPoolExecutor"]) @@ -447,6 +449,8 @@ def map(self, fn, iterable): @pytest.mark.parametrize("kind", ["light", "medium"]) def test_in_memory_chunk_local_build_uses_cparams_nthreads(monkeypatch, kind): + if blosc2.IS_WASM: + pytest.skip("wasm32 does not use Python thread pools for index building") data = np.arange(48_000, dtype=np.int64) arr = blosc2.asarray(data, chunks=(48_000,), blocks=(1_500,)) indexing = __import__("blosc2.indexing", fromlist=["ThreadPoolExecutor"]) From 7db48be508734ce30853db590464591fe11062f7 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 19:08:05 +0200 Subject: [PATCH 52/68] Use latest miniexpr sources --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2dcf6a1c..c1dd0f17 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt 
@@ -72,7 +72,7 @@ endif() FetchContent_Declare(miniexpr GIT_REPOSITORY https://github.com/Blosc/miniexpr.git - GIT_TAG 43ed59829d791f1aac868087d6ef27e26c8fc1e5 + GIT_TAG f2faef741c4c507bf6a03167c72ce7f92c6f0ae8 # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../miniexpr ) FetchContent_MakeAvailable(miniexpr) From 692c0e38a194b12ddaa6a4f6a802b37ce769aee9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Wed, 8 Apr 2026 20:17:40 +0200 Subject: [PATCH 53/68] Enable broader Cython optimizations in indexing_ext helpers --- src/blosc2/indexing_ext.pyx | 56 ++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx index a98e7e16..2be99330 100644 --- a/src/blosc2/indexing_ext.pyx +++ b/src/blosc2/indexing_ext.pyx @@ -4,7 +4,7 @@ # # SPDX-License-Identifier: BSD-3-Clause ####################################################################### -# cython: wraparound=False +# cython: boundscheck=False, wraparound=False, initializedcheck=False import numpy as np cimport numpy as np @@ -972,7 +972,7 @@ cdef inline Py_ssize_t _search_right_uint64(np.uint64_t[:] values, np.uint64_t t cdef inline tuple _search_bounds_float32_impl( - np.ndarray[np.float32_t, ndim=1] values, + np.float32_t[:] values, object lower, bint lower_inclusive, object upper, @@ -992,8 +992,8 @@ cdef inline tuple _search_bounds_float32_impl( cdef inline tuple _search_boundary_bounds_float32_impl( - np.ndarray[np.float32_t, ndim=1] starts, - np.ndarray[np.float32_t, ndim=1] ends, + np.float32_t[:] starts, + np.float32_t[:] ends, object lower, bint lower_inclusive, object upper, @@ -1013,7 +1013,7 @@ cdef inline tuple _search_boundary_bounds_float32_impl( cdef inline tuple _search_bounds_float64_impl( - np.ndarray[np.float64_t, ndim=1] values, + np.float64_t[:] values, object lower, bint lower_inclusive, object upper, @@ -1033,8 +1033,8 @@ cdef inline tuple _search_bounds_float64_impl( cdef inline tuple 
_search_boundary_bounds_float64_impl( - np.ndarray[np.float64_t, ndim=1] starts, - np.ndarray[np.float64_t, ndim=1] ends, + np.float64_t[:] starts, + np.float64_t[:] ends, object lower, bint lower_inclusive, object upper, @@ -1054,7 +1054,7 @@ cdef inline tuple _search_boundary_bounds_float64_impl( cdef inline tuple _search_bounds_int8_impl( - np.ndarray[np.int8_t, ndim=1] values, + np.int8_t[:] values, object lower, bint lower_inclusive, object upper, @@ -1084,8 +1084,8 @@ cdef inline tuple _search_bounds_int8_impl( cdef inline tuple _search_boundary_bounds_int8_impl( - np.ndarray[np.int8_t, ndim=1] starts, - np.ndarray[np.int8_t, ndim=1] ends, + np.int8_t[:] starts, + np.int8_t[:] ends, object lower, bint lower_inclusive, object upper, @@ -1115,7 +1115,7 @@ cdef inline tuple _search_boundary_bounds_int8_impl( cdef inline tuple _search_bounds_int16_impl( - np.ndarray[np.int16_t, ndim=1] values, + np.int16_t[:] values, object lower, bint lower_inclusive, object upper, @@ -1145,8 +1145,8 @@ cdef inline tuple _search_bounds_int16_impl( cdef inline tuple _search_boundary_bounds_int16_impl( - np.ndarray[np.int16_t, ndim=1] starts, - np.ndarray[np.int16_t, ndim=1] ends, + np.int16_t[:] starts, + np.int16_t[:] ends, object lower, bint lower_inclusive, object upper, @@ -1176,7 +1176,7 @@ cdef inline tuple _search_boundary_bounds_int16_impl( cdef inline tuple _search_bounds_int32_impl( - np.ndarray[np.int32_t, ndim=1] values, + np.int32_t[:] values, object lower, bint lower_inclusive, object upper, @@ -1206,8 +1206,8 @@ cdef inline tuple _search_bounds_int32_impl( cdef inline tuple _search_boundary_bounds_int32_impl( - np.ndarray[np.int32_t, ndim=1] starts, - np.ndarray[np.int32_t, ndim=1] ends, + np.int32_t[:] starts, + np.int32_t[:] ends, object lower, bint lower_inclusive, object upper, @@ -1237,7 +1237,7 @@ cdef inline tuple _search_boundary_bounds_int32_impl( cdef inline tuple _search_bounds_int64_impl( - np.ndarray[np.int64_t, ndim=1] values, + np.int64_t[:] values, 
object lower, bint lower_inclusive, object upper, @@ -1267,8 +1267,8 @@ cdef inline tuple _search_bounds_int64_impl( cdef inline tuple _search_boundary_bounds_int64_impl( - np.ndarray[np.int64_t, ndim=1] starts, - np.ndarray[np.int64_t, ndim=1] ends, + np.int64_t[:] starts, + np.int64_t[:] ends, object lower, bint lower_inclusive, object upper, @@ -1298,7 +1298,7 @@ cdef inline tuple _search_boundary_bounds_int64_impl( cdef inline tuple _search_bounds_uint8_impl( - np.ndarray[np.uint8_t, ndim=1] values, + np.uint8_t[:] values, object lower, bint lower_inclusive, object upper, @@ -1328,8 +1328,8 @@ cdef inline tuple _search_bounds_uint8_impl( cdef inline tuple _search_boundary_bounds_uint8_impl( - np.ndarray[np.uint8_t, ndim=1] starts, - np.ndarray[np.uint8_t, ndim=1] ends, + np.uint8_t[:] starts, + np.uint8_t[:] ends, object lower, bint lower_inclusive, object upper, @@ -1359,7 +1359,7 @@ cdef inline tuple _search_boundary_bounds_uint8_impl( cdef inline tuple _search_bounds_uint16_impl( - np.ndarray[np.uint16_t, ndim=1] values, + np.uint16_t[:] values, object lower, bint lower_inclusive, object upper, @@ -1389,8 +1389,8 @@ cdef inline tuple _search_bounds_uint16_impl( cdef inline tuple _search_boundary_bounds_uint16_impl( - np.ndarray[np.uint16_t, ndim=1] starts, - np.ndarray[np.uint16_t, ndim=1] ends, + np.uint16_t[:] starts, + np.uint16_t[:] ends, object lower, bint lower_inclusive, object upper, @@ -1420,7 +1420,7 @@ cdef inline tuple _search_boundary_bounds_uint16_impl( cdef inline tuple _search_bounds_uint32_impl( - np.ndarray[np.uint32_t, ndim=1] values, + np.uint32_t[:] values, object lower, bint lower_inclusive, object upper, @@ -1450,8 +1450,8 @@ cdef inline tuple _search_bounds_uint32_impl( cdef inline tuple _search_boundary_bounds_uint32_impl( - np.ndarray[np.uint32_t, ndim=1] starts, - np.ndarray[np.uint32_t, ndim=1] ends, + np.uint32_t[:] starts, + np.uint32_t[:] ends, object lower, bint lower_inclusive, object upper, From 
3e06322839cba0c0cd1eb303dd46d8146c99e7df Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 05:36:43 +0200 Subject: [PATCH 54/68] New Index class to unify access for array in index sidecar --- AGENTS.md | 48 +++ .../tutorials/14.indexing-arrays.ipynb | 382 ++++++++++-------- doc/reference/classes.rst | 2 + doc/reference/index.rst | 18 +- doc/reference/ndarray.rst | 20 - doc/reference/ufuncs.rst | 8 +- src/blosc2/__init__.py | 2 + src/blosc2/indexing.py | 184 ++++++++- src/blosc2/ndarray.py | 18 +- tests/ndarray/test_indexing.py | 57 +++ 10 files changed, 523 insertions(+), 216 deletions(-) create mode 100644 AGENTS.md diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..47c6aaf8 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,48 @@ +# Repository Guidelines + +## Project Structure & Module Organization +The Python package lives in `src/blosc2/`, including the C/Cython extension sources +(`blosc2_ext.*`) and core modules such as `core.py`, `ndarray.py`, and `schunk.py`. +Tests are under `tests/`, with additional doctests enabled for select modules per +`pytest.ini`. Documentation sources are in `doc/` and build output lands in `html/`. +Examples are in `examples/`, and performance/benchmark scripts live in `bench/`. + +## Build, Test, and Development Commands +- `pip install .` builds the bundled C-Blosc2 and installs the package. +- `pip install -e .` installs in editable mode for local development. +- `CMAKE_PREFIX_PATH=/usr/local USE_SYSTEM_BLOSC2=1 pip install -e .` builds + against a separately installed C-Blosc2. +- `pytest` runs the default test suite (excludes `heavy` and `network` markers). +- `pytest -m "heavy"` runs long-running tests. +- `pytest -m "network"` runs tests requiring network access. +- `cd doc && rm -rf ../html _build && python -m sphinx . ../html` builds docs. + +## Coding Style & Naming Conventions +Use Ruff for formatting and linting (line length 109). 
Enable pre-commit hooks: +`python -m pip install pre-commit` then `pre-commit install`. Follow Python +conventions: 4-space indentation, `snake_case` for functions/variables, and +`PascalCase` for classes. Pytest discovery expects `tests/test_*.py` and +`test_*` functions. Do not use leading underscores in module-level helper +function names when those helpers are imported from other modules; reserve +leading underscores for file-local implementation details. Avoid leading +underscores in core module filenames under `src/blosc2/`; prefer non-underscored +module names unless there is a strong reason to keep a module private. + +For documentation and tutorial query examples, prefer the shortest idiom that +matches the intended result type. Use `expr[:]` or `arr[mask][:]` when showing +values, use `expr.compute()` when materializing an `NDArray`, and use +`expr.compute(_use_index=False)` when demonstrating scan-vs-index behavior. +Avoid `expr.compute()[:]` unless a NumPy array is specifically required. + +## Testing Guidelines +Pytest is required; warnings are treated as errors. The default configuration +adds `--doctest-modules`, so keep doctest examples in `blosc2/core.py`, +`blosc2/ndarray.py`, and `blosc2/schunk.py` accurate. Use markers `heavy` and +`network` for slow or network-dependent tests. + +## Commit & Pull Request Guidelines +Recent commit messages are short, imperative sentences (e.g., “Add …”, “Fix …”) +without ticket prefixes. For pull requests: branch from `main`, add tests for +behavior changes, update docs for API changes, ensure the test suite passes, +and avoid introducing new compiler warnings. Link issues when applicable and +include clear reproduction steps for bug fixes. 
diff --git a/doc/getting_started/tutorials/14.indexing-arrays.ipynb b/doc/getting_started/tutorials/14.indexing-arrays.ipynb index a7515c19..ced5f784 100644 --- a/doc/getting_started/tutorials/14.indexing-arrays.ipynb +++ b/doc/getting_started/tutorials/14.indexing-arrays.ipynb @@ -7,13 +7,13 @@ "source": [ "# Indexing Arrays\n", "\n", - "Blosc2 can attach indexes to 1-D `NDArray` objects and to fields inside 1-D structured arrays. These indexes accelerate selective queries, and `full` indexes can also drive ordered access directly through `sort(order=...)`, `indices(order=...)`, and `itersorted(...)`.\n", + "Blosc2 can attach indexes to 1-D `NDArray` objects and to fields inside 1-D structured arrays. These indexes accelerate selective masks, and `full` indexes can also drive ordered access directly through `sort(order=...)`, `indices(order=...)`, and `itersorted(...)`.\n", "\n", "This tutorial covers:\n", "\n", "- how to create field and expression indexes,\n", - "- how to tell whether a query is using an index,\n", - "- what sort of acceleration different index kinds can deliver on a selective query,\n", + "- how to tell whether a mask is using an index,\n", + "- what sort of acceleration different index kinds can deliver on a selective mask,\n", "- how index persistence works,\n", "- when to rebuild indexes,\n", "- and a recommended workflow for keeping append-heavy `full` indexes compact.\n" @@ -29,15 +29,13 @@ }, { "cell_type": "code", - "execution_count": 1, "id": "8c510216bc394cf9", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:11:20.790474Z", - "start_time": "2026-04-03T12:11:20.514656Z" + "end_time": "2026-04-09T03:17:03.228163Z", + "start_time": "2026-04-09T03:17:03.209583Z" } }, - "outputs": [], "source": [ "import statistics\n", "import time\n", @@ -48,6 +46,18 @@ "import blosc2\n", "\n", "\n", + "def format_bytes(nbytes):\n", + " units = (\"B\", \"KiB\", \"MiB\", \"GiB\", \"TiB\")\n", + " value = float(nbytes)\n", + " for unit in 
units:\n", + " if value < 1024.0 or unit == units[-1]:\n", + " if unit == \"B\":\n", + " return f\"{int(value)} {unit}\"\n", + " return f\"{value:.2f} {unit}\"\n", + " value /= 1024.0\n", + " return f\"{value:.2f} {units[-1]}\"\n", + "\n", + "\n", "def show_index_summary(label, descriptor):\n", " print(\n", " f\"{label}: kind={descriptor['kind']}, persistent={descriptor['persistent']}, \"\n", @@ -81,7 +91,9 @@ "]\n", "for path in paths:\n", " blosc2.remove_urlpath(path)" - ] + ], + "outputs": [], + "execution_count": 11 }, { "cell_type": "markdown", @@ -94,155 +106,166 @@ "\n", "- `ultralight`: compact summaries only,\n", "- `light`: summary levels plus lightweight per-block payloads,\n", - "- `medium`: richer exact-filter payloads,\n", - "- `full`: globally sorted payloads for exact filtering and ordered reuse.\n", + "- `medium`: richer payloads for exact masks,\n", + "- `full`: globally sorted payloads for exact masks and ordered reuse.\n", "\n", "There is one active index per target field or expression. If you create another index on the same target, it replaces the previous one. The easiest way to compare kinds is to build them on separate arrays.\n", "\n", - "This example uses one million rows so the timing differences are visible without turning the tutorial into a long benchmark." + "The next cell times index creation and reports the compressed storage footprint of each index relative to the compressed base array." 
] }, { "cell_type": "code", - "execution_count": 5, "id": "d1a5a37585a045ca", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:12:14.968055Z", - "start_time": "2026-04-03T12:12:02.218201Z" + "end_time": "2026-04-09T03:17:19.644470Z", + "start_time": "2026-04-09T03:17:05.289092Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "ultralight: kind=ultralight, persistent=False, ooc=False, stale=False\n", - "light: kind=light, persistent=False, ooc=True, stale=False\n", - "medium: kind=medium, persistent=False, ooc=True, stale=False\n", - "full: kind=full, persistent=False, ooc=True, stale=False\n" - ] - } - ], "source": [ "N_ROWS = 10_000_000\n", - "QUERY_TEXT = \"(id >= -25.0) & (id < 25.0)\"\n", + "MASK_TEXT = \"(id >= -25.0) & (id < 25.0)\"\n", "\n", "rng = np.random.default_rng(0)\n", "dtype = np.dtype([(\"id\", np.float64), (\"payload\", np.int32)])\n", - "data = np.zeros(N_ROWS, dtype=dtype)\n", + "data = blosc2.zeros(N_ROWS, dtype=dtype)[:]\n", "# Build a predictable id column, then shuffle it so the source data is not already ordered.\n", - "data[\"id\"] = np.arange(data.shape[0], dtype=np.float64) - data.shape[0] / 2\n", + "data[\"id\"] = blosc2.arange(-data.shape[0] // 2, data.shape[0] // 2, dtype=np.float64)\n", "rng.shuffle(data[\"id\"])\n", - "data[\"payload\"] = np.arange(data.shape[0], dtype=np.int32)\n", - "\n", - "chunks = (250_000,)\n", - "blocks = (50_000,)\n", + "data[\"payload\"] = blosc2.arange(data.shape[0], dtype=np.int32)\n", "\n", "indexed_arrays = {}\n", + "build_rows = []\n", + "base_cbytes = None\n", "for kind in (\"ultralight\", \"light\", \"medium\", \"full\"):\n", - " arr = blosc2.asarray(data.copy(), chunks=chunks, blocks=blocks)\n", - " descriptor = arr.create_index(field=\"id\", kind=kind)\n", + " arr = blosc2.asarray(data)\n", + " if base_cbytes is None:\n", + " base_cbytes = arr.cbytes\n", + " t0 = time.perf_counter()\n", + " arr.create_index(field=\"id\", kind=kind)\n", + " build_ms = 
(time.perf_counter() - t0) * 1e3\n", + " index_obj = arr.index(\"id\")\n", " indexed_arrays[kind] = arr\n", - " show_index_summary(kind, descriptor)" - ] + " build_rows.append((kind, build_ms, index_obj.cbytes, index_obj.cbytes / base_cbytes))\n", + "\n", + "print(f\"Compressed base array size: {format_bytes(base_cbytes)}\")\n", + "print(f\"{'kind':<12} {'build_ms':>10} {'index_size':>12} {'overhead':>10}\")\n", + "for kind, build_ms, index_cbytes, overhead in build_rows:\n", + " print(f\"{kind:<12} {build_ms:10.3f} {format_bytes(index_cbytes):>12} {overhead:>9.2f}x\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compressed base array size: 30.74 MiB\n", + "kind build_ms index_size overhead\n", + "ultralight 54.423 142 B 0.00x\n", + "light 748.307 26.04 MiB 0.85x\n", + "medium 2453.256 34.99 MiB 1.14x\n", + "full 8727.665 28.44 MiB 0.93x\n" + ] + } + ], + "execution_count": 12 }, { "cell_type": "markdown", "id": "bc1cc9b122fe4052", "metadata": {}, "source": [ - "## Using an index for filtering\n", + "## Using an index for masks\n", "\n", - "Range predicates are planned automatically when you use `where(...)`. You can inspect the plan with `explain()` and compare the indexed result with a scan by passing `_use_index=False` to `compute()`." + "Range predicates are planned automatically when you use `where(...)`. If you just want the matching values, `expr[:]` is the shortest form. In the comparisons below we use `compute()` so the result stays as an `NDArray`, and we force a scan by passing `_use_index=False`." 
] }, { "cell_type": "code", - "execution_count": 6, "id": "f1b3aaec965b42d6", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:12:20.542680Z", - "start_time": "2026-04-03T12:12:20.096087Z" + "end_time": "2026-04-09T02:55:34.424905Z", + "start_time": "2026-04-09T02:55:34.206800Z" } }, + "source": [ + "medium_arr = indexed_arrays[\"medium\"]\n", + "expr = blosc2.lazyexpr(MASK_TEXT, medium_arr.fields).where(medium_arr)\n", + "\n", + "print(explain_subset(expr))\n", + "\n", + "indexed = expr.compute()\n", + "scanned = expr.compute(_use_index=False)\n", + "np.testing.assert_array_equal(indexed, scanned)\n", + "print(f\"Matched rows: {len(indexed)}\")" + ], "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "{'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'medium', 'level': 'exact', 'lookup_path': None, 'full_runs': 0}\n", + "{'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'medium', 'level': 'exact', 'lookup_path': 'chunk-nav', 'full_runs': 0}\n", "Matched rows: 50\n" ] } ], - "source": [ - "medium_arr = indexed_arrays[\"medium\"]\n", - "expr = blosc2.lazyexpr(QUERY_TEXT, medium_arr.fields).where(medium_arr)\n", - "\n", - "print(explain_subset(expr))\n", - "\n", - "indexed = expr.compute()[:]\n", - "scanned = expr.compute(_use_index=False)[:]\n", - "np.testing.assert_array_equal(indexed, scanned)\n", - "print(f\"Matched rows: {len(indexed)}\")" - ] + "execution_count": 3 }, { "cell_type": "markdown", "id": "1db4bd16a95a48dd", "metadata": {}, "source": [ - "### Timing the query with and without indexes\n", + "### Timing the mask with and without indexes\n", "\n", - "The next cell measures the same selective predicate on all four index kinds and compares it with a forced full scan. On this exact workload, `medium` and `full` usually show the clearest benefit because they carry richer exact-filter payloads." 
+ "The next cell measures the same selective mask on all four index kinds and compares it with a forced full scan. On this exact workload, `medium` and `full` usually show the clearest benefit because they carry richer payloads for exact masks." ] }, { "cell_type": "code", - "execution_count": 7, "id": "c9e932b7561b4ff4", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:12:32.747545Z", - "start_time": "2026-04-03T12:12:24.632866Z" + "end_time": "2026-04-09T02:55:38.336928Z", + "start_time": "2026-04-09T02:55:34.432852Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Selective filter over 10,000,000 rows\n", - "kind scan_ms index_ms speedup\n", - "ultralight 363.919 363.733 1.00x\n", - "light 363.916 22.456 16.21x\n", - "medium 366.537 24.952 14.69x\n", - "full 365.223 23.544 15.51x\n" - ] - } - ], "source": [ "timing_rows = []\n", "expected = None\n", "for kind, arr in indexed_arrays.items():\n", - " expr = blosc2.lazyexpr(QUERY_TEXT, arr.fields).where(arr)\n", - " result = expr.compute()[:]\n", + " expr = blosc2.lazyexpr(MASK_TEXT, arr.fields).where(arr)\n", + " result = expr.compute()\n", " if expected is None:\n", " expected = result\n", " else:\n", " np.testing.assert_array_equal(result, expected)\n", "\n", - " scan_ms = median_ms(lambda expr=expr: expr.compute(_use_index=False)[:], repeats=3)\n", - " index_ms = median_ms(lambda expr=expr: expr.compute()[:], repeats=3)\n", + " scan_ms = median_ms(lambda expr=expr: expr.compute(_use_index=False), repeats=3)\n", + " index_ms = median_ms(lambda expr=expr: expr.compute(), repeats=3)\n", " timing_rows.append((kind, scan_ms, index_ms, scan_ms / index_ms))\n", "\n", - "print(f\"Selective filter over {N_ROWS:,} rows\")\n", + "print(f\"Selective mask over {N_ROWS:,} rows\")\n", "print(f\"{'kind':<12} {'scan_ms':>10} {'index_ms':>10} {'speedup':>10}\")\n", "for kind, scan_ms, index_ms, speedup in timing_rows:\n", " print(f\"{kind:<12} {scan_ms:10.3f} {index_ms:10.3f} 
{speedup:10.2f}x\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selective mask over 10,000,000 rows\n", + "kind scan_ms index_ms speedup\n", + "ultralight 161.539 174.253 0.93x\n", + "light 197.265 23.717 8.32x\n", + "medium 157.424 23.681 6.65x\n", + "full 157.527 25.280 6.23x\n" + ] + } + ], + "execution_count": 4 }, { "cell_type": "markdown", @@ -256,38 +279,38 @@ }, { "cell_type": "code", - "execution_count": 8, "id": "9ffcb0d8d06a4daa", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:13:02.734653Z", - "start_time": "2026-04-03T12:13:02.675861Z" + "end_time": "2026-04-09T02:55:38.375308Z", + "start_time": "2026-04-09T02:55:38.346312Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sorted positions: [7 5 3 1 6 4 2 0]\n", - "Sorted rows:\n", - "[(1, 2) (1, 4) (1, 6) (1, 8) (2, 3) (2, 5) (2, 7) (2, 9)]\n" - ] - } - ], "source": [ "ordered_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int64)])\n", "ordered_data = np.array(\n", " [(2, 9), (1, 8), (2, 7), (1, 6), (2, 5), (1, 4), (2, 3), (1, 2)],\n", " dtype=ordered_dtype,\n", ")\n", - "ordered_arr = blosc2.asarray(ordered_data, chunks=(4,), blocks=(2,))\n", + "ordered_arr = blosc2.asarray(ordered_data)\n", "ordered_arr.create_csindex(\"id\")\n", "\n", "print(\"Sorted positions:\", ordered_arr.indices(order=[\"id\", \"payload\"])[:])\n", "print(\"Sorted rows:\")\n", "print(ordered_arr.sort(order=[\"id\", \"payload\"])[:])" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sorted positions: [7 5 3 1 6 4 2 0]\n", + "Sorted rows:\n", + "[(1, 2) (1, 4) (1, 6) (1, 8) (2, 3) (2, 5) (2, 7) (2, 9)]\n" + ] + } + ], + "execution_count": 5 }, { "cell_type": "markdown", @@ -296,19 +319,28 @@ "source": [ "## Expression indexes\n", "\n", - "You can also index a deterministic scalar expression stream. 
Expression indexes are matched by normalized expression identity, so the same expression can be reused for filtering and ordered access." + "You can also index a deterministic scalar expression stream. Expression indexes are matched by normalized expression identity, so the same expression can be reused for masks and ordered access." ] }, { "cell_type": "code", - "execution_count": 9, "id": "7d337ce2f9fb4f32", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:13:10.701850Z", - "start_time": "2026-04-03T12:13:10.650458Z" + "end_time": "2026-04-09T02:55:38.402494Z", + "start_time": "2026-04-09T02:55:38.376197Z" } }, + "source": [ + "expr_dtype = np.dtype([(\"x\", np.int64), (\"payload\", np.int32)])\n", + "expr_data = np.array([(-8, 0), (5, 1), (-2, 2), (11, 3), (3, 4), (-3, 5), (2, 6), (-5, 7)], dtype=expr_dtype)\n", + "expr_arr = blosc2.asarray(expr_data)\n", + "expr_arr.create_expr_index(\"abs(x)\", kind=\"full\", name=\"abs_x\")\n", + "\n", + "ordered_expr = blosc2.lazyexpr(\"(abs(x) >= 2) & (abs(x) < 8)\", expr_arr.fields).where(expr_arr)\n", + "print(explain_subset(ordered_expr))\n", + "print(\"Expression-order positions:\", ordered_expr.indices(order=\"abs(x)\")[:])" + ], "outputs": [ { "name": "stdout", @@ -319,16 +351,7 @@ ] } ], - "source": [ - "expr_dtype = np.dtype([(\"x\", np.int64), (\"payload\", np.int32)])\n", - "expr_data = np.array([(-8, 0), (5, 1), (-2, 2), (11, 3), (3, 4), (-3, 5), (2, 6), (-5, 7)], dtype=expr_dtype)\n", - "expr_arr = blosc2.asarray(expr_data, chunks=(4,), blocks=(2,))\n", - "expr_arr.create_expr_index(\"abs(x)\", kind=\"full\", name=\"abs_x\")\n", - "\n", - "ordered_expr = blosc2.lazyexpr(\"(abs(x) >= 2) & (abs(x) < 8)\", expr_arr.fields).where(expr_arr)\n", - "print(explain_subset(ordered_expr))\n", - "print(\"Expression-order positions:\", ordered_expr.indices(order=\"abs(x)\").compute()[:])" - ] + "execution_count": 6 }, { "cell_type": "markdown", @@ -348,14 +371,22 @@ }, { "cell_type": "code", - "execution_count": 
10, "id": "0be5f512928f48db", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:13:16.392311Z", - "start_time": "2026-04-03T12:13:13.976166Z" + "end_time": "2026-04-09T02:55:40.518254Z", + "start_time": "2026-04-09T02:55:38.403529Z" } }, + "source": [ + "persistent_arr = blosc2.asarray(data, urlpath=paths[0], mode=\"w\")\n", + "persistent_descriptor = persistent_arr.create_index(field=\"id\", kind=\"medium\")\n", + "show_index_summary(\"persistent medium\", persistent_descriptor)\n", + "\n", + "reopened = blosc2.open(paths[0], mode=\"a\")\n", + "print(f\"Reopened index count: {len(reopened.indexes)}\")\n", + "print(f\"Persisted sidecar path: {reopened.indexes[0]['reduced']['values_path']}\")" + ], "outputs": [ { "name": "stdout", @@ -367,15 +398,7 @@ ] } ], - "source": [ - "persistent_arr = blosc2.asarray(data, urlpath=paths[0], mode=\"w\", chunks=chunks, blocks=blocks)\n", - "persistent_descriptor = persistent_arr.create_index(field=\"id\", kind=\"medium\")\n", - "show_index_summary(\"persistent medium\", persistent_descriptor)\n", - "\n", - "reopened = blosc2.open(paths[0], mode=\"a\")\n", - "print(f\"Reopened index count: {len(reopened.indexes)}\")\n", - "print(f\"Persisted sidecar path: {reopened.indexes[0]['reduced']['values_path']}\")" - ] + "execution_count": 7 }, { "cell_type": "markdown", @@ -389,14 +412,22 @@ }, { "cell_type": "code", - "execution_count": 11, "id": "11f0cd1b910b409a", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:13:18.234684Z", - "start_time": "2026-04-03T12:13:18.166991Z" + "end_time": "2026-04-09T02:55:40.559125Z", + "start_time": "2026-04-09T02:55:40.527487Z" } }, + "source": [ + "mutable_arr = blosc2.arange(20, dtype=np.int64)\n", + "mutable_arr.create_index(kind=\"full\")\n", + "mutable_arr[:3] = -1\n", + "\n", + "print(\"Stale after direct mutation:\", mutable_arr.indexes[0][\"stale\"])\n", + "mutable_arr.rebuild_index()\n", + "print(\"Stale after rebuild:\", mutable_arr.indexes[0][\"stale\"])" + ], 
"outputs": [ { "name": "stdout", @@ -407,15 +438,7 @@ ] } ], - "source": [ - "mutable_arr = blosc2.asarray(np.arange(20, dtype=np.int64), chunks=(10,), blocks=(5,))\n", - "mutable_arr.create_index(kind=\"full\")\n", - "mutable_arr[:3] = -1\n", - "\n", - "print(\"Stale after direct mutation:\", mutable_arr.indexes[0][\"stale\"])\n", - "mutable_arr.rebuild_index()\n", - "print(\"Stale after rebuild:\", mutable_arr.indexes[0][\"stale\"])" - ] + "execution_count": 8 }, { "cell_type": "markdown", @@ -430,74 +453,74 @@ "\n", "1. create a persistent `full` index once,\n", "2. append freely during ingestion,\n", - "3. let queries keep working while runs accumulate,\n", + "3. let masks keep working while runs accumulate,\n", "4. call `compact_index()` after ingestion windows or before latency-sensitive read phases.\n", "\n", - "The next example uses a larger append-heavy array and times the same selective query before and after compaction. The exact query path reports whether it is using a compact lookup layout or a run-aware fallback. After compaction, `full[\"runs\"]` becomes empty again." + "The next example uses a larger append-heavy array and times the same selective mask before and after compaction. The exact mask path reports whether it is using a compact lookup layout or a run-aware fallback. After compaction, `full[\"runs\"]` becomes empty again." 
] }, { "cell_type": "code", - "execution_count": 12, "id": "2e1a47a9cf7246e6", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:13:30.781045Z", - "start_time": "2026-04-03T12:13:28.516376Z" + "end_time": "2026-04-09T02:55:41.338819Z", + "start_time": "2026-04-09T02:55:40.559982Z" } }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'run-bounded-ooc', 'full_runs': 40}\n", - "Pending runs: 40\n", - "Median query time before compaction: 3.250 ms\n", - "After compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'compact-selective-ooc', 'full_runs': 0}\n", - "Pending runs: 0\n", - "Median query time after compaction: 0.939 ms\n", - "Speedup after compaction: 3.46x\n" - ] - } - ], "source": [ "append_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int32)])\n", "base_rows = 200_000\n", "append_batch = 500\n", "num_runs = 40\n", "\n", - "append_data = np.zeros(base_rows, dtype=append_dtype)\n", - "append_data[\"id\"] = np.arange(base_rows, dtype=np.int64)\n", - "append_data[\"payload\"] = np.arange(base_rows, dtype=np.int32)\n", + "append_data = blosc2.zeros(base_rows, dtype=append_dtype)[:]\n", + "append_data[\"id\"] = blosc2.arange(base_rows, dtype=np.int64)\n", + "append_data[\"payload\"] = blosc2.arange(base_rows, dtype=np.int32)\n", "\n", - "append_arr = blosc2.asarray(append_data, urlpath=paths[1], mode=\"w\", chunks=(20_000,), blocks=(4_000,))\n", + "append_arr = blosc2.asarray(append_data, urlpath=paths[1], mode=\"w\")\n", "append_arr.create_index(field=\"id\", kind=\"full\")\n", "\n", "for run in range(num_runs):\n", " start = 300_000 + run * append_batch\n", - " batch = np.zeros(append_batch, dtype=append_dtype)\n", - " batch[\"id\"] = np.arange(start, start + append_batch, dtype=np.int64)\n", - 
" batch[\"payload\"] = np.arange(append_batch, dtype=np.int32)\n", + " batch = blosc2.zeros(append_batch, dtype=append_dtype)[:]\n", + " batch[\"id\"] = blosc2.arange(start, start + append_batch, dtype=np.int64)\n", + " batch[\"payload\"] = blosc2.arange(append_batch, dtype=np.int32)\n", " append_arr.append(batch)\n", "\n", - "append_query = \"(id >= 310_000) & (id < 310_020)\"\n", - "append_expr = blosc2.lazyexpr(append_query, append_arr.fields).where(append_arr)\n", + "mask_str = \"(id >= 310_000) & (id < 310_020)\"\n", + "append_expr = blosc2.lazyexpr(mask_str, append_arr.fields).where(append_arr)\n", "before_info = explain_subset(append_expr)\n", - "before_ms = median_ms(lambda: append_expr.compute()[:], repeats=5)\n", + "before_ms = median_ms(lambda: append_expr.compute(), repeats=5)\n", "print(\"Before compaction:\", before_info)\n", "print(\"Pending runs:\", len(append_arr.indexes[0][\"full\"][\"runs\"]))\n", - "print(f\"Median query time before compaction: {before_ms:.3f} ms\")\n", + "print(f\"Median mask time before compaction: {before_ms:.3f} ms\")\n", "\n", "append_arr.compact_index(\"id\")\n", - "append_expr = blosc2.lazyexpr(append_query, append_arr.fields).where(append_arr)\n", + "append_expr = blosc2.lazyexpr(mask_str, append_arr.fields).where(append_arr)\n", "after_info = explain_subset(append_expr)\n", - "after_ms = median_ms(lambda: append_expr.compute()[:], repeats=5)\n", + "after_ms = median_ms(lambda: append_expr.compute(), repeats=5)\n", "print(\"After compaction:\", after_info)\n", "print(\"Pending runs:\", len(append_arr.indexes[0][\"full\"][\"runs\"]))\n", - "print(f\"Median query time after compaction: {after_ms:.3f} ms\")\n", + "print(f\"Median mask time after compaction: {after_ms:.3f} ms\")\n", "print(f\"Speedup after compaction: {before_ms / after_ms:.2f}x\")" - ] + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes 
selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'run-bounded-ooc', 'full_runs': 40}\n", + "Pending runs: 40\n", + "Median mask time before compaction: 3.514 ms\n", + "After compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'compact-selective-ooc', 'full_runs': 0}\n", + "Pending runs: 0\n", + "Median mask time after compaction: 0.638 ms\n", + "Speedup after compaction: 5.51x\n" + ] + } + ], + "execution_count": 9 }, { "cell_type": "markdown", @@ -506,7 +529,7 @@ "source": [ "## Practical guidance\n", "\n", - "- Use `medium` when your main goal is faster selective filtering.\n", + "- Use `medium` when your main goal is faster selective masks.\n", "- Use `full` when you also want ordered reuse through `sort(order=...)`, `indices(order=...)`, or `itersorted(...)`.\n", "- Persist the base array if you want indexes to survive reopen automatically.\n", "- After unsupported mutations, use `rebuild_index()`.\n", @@ -516,27 +539,32 @@ }, { "cell_type": "code", - "execution_count": 13, "id": "9833102355db4ec0", "metadata": { "ExecuteTime": { - "end_time": "2026-04-03T12:13:36.744016Z", - "start_time": "2026-04-03T12:13:36.726709Z" + "end_time": "2026-04-09T02:55:41.360620Z", + "start_time": "2026-04-09T02:55:41.350276Z" } }, - "outputs": [], "source": [ "for path in paths:\n", " blosc2.remove_urlpath(path)" - ] + ], + "outputs": [], + "execution_count": 10 }, { "cell_type": "code", - "execution_count": null, "id": "17489b2c3d2ac57", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T02:55:41.380135Z", + "start_time": "2026-04-09T02:55:41.368555Z" + } + }, + "source": [], "outputs": [], - "source": [] + "execution_count": 10 } ], "metadata": { diff --git a/doc/reference/classes.rst b/doc/reference/classes.rst index 0ce750e3..65385d87 100644 --- a/doc/reference/classes.rst +++ b/doc/reference/classes.rst @@ -8,6 +8,7 @@ Main Classes .. 
autosummary:: NDArray + Index NDField LazyArray C2Array @@ -28,6 +29,7 @@ Main Classes :maxdepth: 1 ndarray + index ndfield lazyarray c2array diff --git a/doc/reference/index.rst b/doc/reference/index.rst index cf4ff7fa..78f9a276 100644 --- a/doc/reference/index.rst +++ b/doc/reference/index.rst @@ -1,14 +1,8 @@ -API Reference -============= +Index +===== -.. toctree:: - :maxdepth: 2 +.. currentmodule:: blosc2 - classes - save_load - msgpack_serialization - storage - array_operations - utilities - low_level - misc +.. autoclass:: Index + :members: + :member-order: groupwise diff --git a/doc/reference/ndarray.rst b/doc/reference/ndarray.rst index 846c54c0..c70ea255 100644 --- a/doc/reference/ndarray.rst +++ b/doc/reference/ndarray.rst @@ -31,26 +31,6 @@ In addition, all the functions from the :ref:`Lazy Functions ` s .. automethod:: __getitem__ .. automethod:: __setitem__ - Index Methods - ------------- - - The following methods are part of the public NDArray indexing lifecycle. - Use ``create_index`` / ``create_expr_index`` to build indexes, - ``rebuild_index`` when a stale index must be refreshed after unsupported - mutations, and ``compact_index`` to consolidate append-heavy ``full`` - indexes explicitly. - - Chunk-local index creation uses parallel intra-chunk sorting by default. - Set ``BLOSC2_INDEX_BUILD_THREADS`` to control the number of build threads. - Set ``BLOSC2_INDEX_BUILD_THREADS=1`` to disable parallel sorting. - - .. automethod:: create_index - .. automethod:: create_csindex - .. automethod:: create_expr_index - .. automethod:: drop_index - .. automethod:: rebuild_index - .. automethod:: compact_index - Constructors ------------ .. 
_NDArrayConstructors: diff --git a/doc/reference/ufuncs.rst b/doc/reference/ufuncs.rst index 3ae5397a..5fda0d87 100644 --- a/doc/reference/ufuncs.rst +++ b/doc/reference/ufuncs.rst @@ -25,8 +25,8 @@ Note: The functions ``real``, ``imag``, ``contains``, ``where`` are not technica asin asinh atan - atan2 atanh + atan2 bitwise_and bitwise_invert bitwise_left_shift @@ -53,9 +53,9 @@ Note: The functions ``real``, ``imag``, ``contains``, ``where`` are not technica less less_equal log + log10 log1p log2 - log10 logaddexp logical_and logical_not @@ -100,8 +100,8 @@ Note: The functions ``real``, ``imag``, ``contains``, ``where`` are not technica .. autofunction:: blosc2.asin .. autofunction:: blosc2.asinh .. autofunction:: blosc2.atan -.. autofunction:: blosc2.atan2 .. autofunction:: blosc2.atanh +.. autofunction:: blosc2.atan2 .. autofunction:: blosc2.bitwise_and .. autofunction:: blosc2.bitwise_invert .. autofunction:: blosc2.bitwise_left_shift @@ -128,9 +128,9 @@ Note: The functions ``real``, ``imag``, ``contains``, ``where`` are not technica .. autofunction:: blosc2.less .. autofunction:: blosc2.less_equal .. autofunction:: blosc2.log +.. autofunction:: blosc2.log10 .. autofunction:: blosc2.log1p .. autofunction:: blosc2.log2 -.. autofunction:: blosc2.log10 .. autofunction:: blosc2.logaddexp .. autofunction:: blosc2.logical_and .. autofunction:: blosc2.logical_not diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 8ba00c5e..ddee6390 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -557,6 +557,7 @@ def _raise(exc): can_cast, ) from .proxy import Proxy, ProxySource, ProxyNDSource, ProxyNDField, SimpleProxy, jit, as_simpleproxy +from .indexing import Index from .schunk import SChunk, open from . 
import linalg @@ -730,6 +731,7 @@ def _raise(exc): "DictStore", "EmbedStore", "Filter", + "Index", "LazyArray", "DSLKernel", "DSLSyntaxError", diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 145ecae1..541db79e 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -16,6 +16,7 @@ import sys import tempfile import weakref +from collections.abc import Mapping from concurrent.futures import ThreadPoolExecutor from dataclasses import asdict, dataclass from pathlib import Path @@ -153,6 +154,14 @@ class OrderedIndexPlan: secondary_refinement: bool = False +@dataclass(frozen=True, slots=True) +class IndexComponent: + label: str + category: str + name: str + path: str | None + + def _default_index_store() -> dict: return {"version": INDEX_FORMAT_VERSION, "indexes": {}} @@ -205,6 +214,17 @@ def _copy_descriptor(descriptor: dict) -> dict: return copied +def _descriptor_for_token(array: blosc2.NDArray, token: str) -> dict: + descriptor = _load_store(array)["indexes"].get(token) + if descriptor is None: + raise KeyError("index not found") + return descriptor + + +def _copy_descriptor_for_token(array: blosc2.NDArray, token: str) -> dict: + return _copy_descriptor(_descriptor_for_token(array, token)) + + def _is_persistent_array(array: blosc2.NDArray) -> bool: return array.urlpath is not None @@ -2508,6 +2528,160 @@ def _resolve_index_token(store: dict, field: str | None, name: str | None) -> st return token +def iter_index_components(array: blosc2.NDArray, descriptor: dict): + for level in descriptor["levels"]: + level_info = descriptor["levels"][level] + yield IndexComponent(f"summary.{level}", "summary", level, level_info.get("path")) + + light = descriptor.get("light") + if light is not None: + yield IndexComponent("light.values", "light", "values", light.get("values_path")) + yield IndexComponent( + "light.bucket_positions", "light", "bucket_positions", light.get("bucket_positions_path") + ) + yield IndexComponent("light.offsets", "light", 
"offsets", light.get("offsets_path")) + yield IndexComponent("light_nav.l1", "light_nav", "l1", light.get("l1_path")) + yield IndexComponent("light_nav.l2", "light_nav", "l2", light.get("l2_path")) + + reduced = descriptor.get("reduced") + if reduced is not None: + yield IndexComponent("reduced.values", "reduced", "values", reduced.get("values_path")) + yield IndexComponent("reduced.positions", "reduced", "positions", reduced.get("positions_path")) + yield IndexComponent("reduced.offsets", "reduced", "offsets", reduced.get("offsets_path")) + yield IndexComponent("reduced_nav.l1", "reduced_nav", "l1", reduced.get("l1_path")) + yield IndexComponent("reduced_nav.l2", "reduced_nav", "l2", reduced.get("l2_path")) + + full = descriptor.get("full") + if full is not None: + yield IndexComponent("full.values", "full", "values", full.get("values_path")) + yield IndexComponent("full.positions", "full", "positions", full.get("positions_path")) + yield IndexComponent("full_nav.l1", "full_nav", "l1", full.get("l1_path")) + yield IndexComponent("full_nav.l2", "full_nav", "l2", full.get("l2_path")) + for run in full.get("runs", ()): + run_id = int(run["id"]) + yield IndexComponent( + f"full_run.{run_id}.values", + "full_run", + f"{run_id}.values", + run.get("values_path"), + ) + yield IndexComponent( + f"full_run.{run_id}.positions", + "full_run", + f"{run_id}.positions", + run.get("positions_path"), + ) + + +def _component_nbytes(array: blosc2.NDArray, descriptor: dict, component: IndexComponent) -> int: + if component.path is not None: + return int(blosc2.open(component.path, mmap_mode=_INDEX_MMAP_MODE).nbytes) + token = descriptor["token"] + return int(_load_array_sidecar(array, token, component.category, component.name, component.path).nbytes) + + +def _component_cbytes(array: blosc2.NDArray, descriptor: dict, component: IndexComponent) -> int: + if component.path is not None: + return int(blosc2.open(component.path, mmap_mode=_INDEX_MMAP_MODE).cbytes) + token = 
descriptor["token"] + sidecar = _load_array_sidecar(array, token, component.category, component.name, component.path) + kwargs = {} + cparams = descriptor.get("cparams") + if cparams is not None: + kwargs["cparams"] = cparams + return int(blosc2.asarray(sidecar, **kwargs).cbytes) + + +class Index(Mapping): + def __init__(self, array: blosc2.NDArray, token: str): + self._array = array + self._token = token + + def _descriptor(self) -> dict: + return _descriptor_for_token(self._array, self._token) + + @property + def descriptor(self) -> dict: + return _copy_descriptor_for_token(self._array, self._token) + + @property + def kind(self) -> str: + return self._descriptor()["kind"] + + @property + def field(self) -> str | None: + return self._descriptor()["field"] + + @property + def name(self) -> str | None: + return self._descriptor()["name"] + + @property + def target(self) -> dict: + return self.descriptor["target"] + + @property + def persistent(self) -> bool: + return bool(self._descriptor()["persistent"]) + + @property + def stale(self) -> bool: + return bool(self._descriptor()["stale"]) + + @property + def nbytes(self) -> int: + descriptor = self._descriptor() + return sum( + _component_nbytes(self._array, descriptor, component) + for component in iter_index_components(self._array, descriptor) + ) + + @property + def cbytes(self) -> int: + descriptor = self._descriptor() + return sum( + _component_cbytes(self._array, descriptor, component) + for component in iter_index_components(self._array, descriptor) + ) + + @property + def cratio(self) -> float: + cbytes = self.cbytes + if cbytes == 0: + return math.inf + return self.nbytes / cbytes + + def drop(self) -> None: + drop_index(self._array, field=self.field, name=self.name) + + def rebuild(self) -> Index: + rebuild_index(self._array, field=self.field, name=self.name) + return self + + def compact(self) -> Index: + compact_index(self._array, field=self.field, name=self.name) + return self + + def __getitem__(self, 
key): + return self.descriptor[key] + + def __iter__(self): + return iter(self.descriptor) + + def __len__(self) -> int: + return len(self.descriptor) + + def __repr__(self) -> str: + try: + descriptor = self._descriptor() + except KeyError: + return "Index()" + return ( + f"Index(kind={descriptor['kind']!r}, field={descriptor['field']!r}, " + f"name={descriptor['name']!r}, stale={descriptor['stale']!r})" + ) + + def _remove_sidecar_path(path: str | None) -> None: if path: blosc2.remove_urlpath(path) @@ -2942,9 +3116,15 @@ def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | N return _copy_descriptor(descriptor) -def get_indexes(array: blosc2.NDArray) -> list[dict]: +def get_indexes(array: blosc2.NDArray) -> list[Index]: store = _load_store(array) - return [_copy_descriptor(store["indexes"][key]) for key in sorted(store["indexes"])] + return [Index(array, key) for key in sorted(store["indexes"])] + + +def get_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> Index: + store = _load_store(array) + token = _resolve_index_token(store, field, name) + return Index(array, token) def mark_indexes_stale(array: blosc2.NDArray) -> None: diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 1ff10b7b..72b29da2 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -4926,8 +4926,24 @@ def compact_index(self, field: str | None = None, name: str | None = None) -> di return indexing.compact_index(self, field=field, name=name) + def index(self, field: str | None = None, name: str | None = None) -> blosc2.indexing.Index: + """Return a live view over one index. + + Parameters + ---------- + field : str or None, optional + Structured field identifying the target index. Use ``None`` for the + value index on a plain 1-D array. + name : str or None, optional + Optional logical index label. When omitted and the array has a + single index, that index is selected automatically. + """ + from . 
import indexing + + return indexing.get_index(self, field=field, name=name) + @property - def indexes(self) -> list[dict]: + def indexes(self) -> list[blosc2.indexing.Index]: from . import indexing return indexing.get_indexes(self) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 9277096e..727c83e8 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -73,6 +73,63 @@ def test_module_level_will_use_index_matches_lazyexpr_method(): assert indexing.will_use_index(plain_expr) == plain_expr.will_use_index() +def test_index_accessor_exposes_live_view_and_sizes(): + import blosc2.indexing as indexing + + arr = blosc2.asarray(np.arange(1_000, dtype=np.int64), chunks=(250,), blocks=(50,)) + arr.create_index(kind="medium") + + idx = arr.index() + assert isinstance(idx, indexing.Index) + assert idx.kind == "medium" + assert idx.field is None + assert idx.name == "__self__" + assert idx.target == {"source": "field", "field": None} + assert idx.persistent is False + assert idx.stale is False + assert idx["kind"] == "medium" + assert idx["target"]["field"] is None + assert idx.nbytes > 0 + assert idx.cbytes > 0 + assert idx.cratio == pytest.approx(idx.nbytes / idx.cbytes) + + arr[:3] = -1 + assert idx.stale is True + + rebuilt = idx.rebuild() + assert rebuilt is idx + assert idx.stale is False + + idx.drop() + assert arr.indexes == [] + with pytest.raises(KeyError): + _ = idx.kind + + +def test_index_accessor_compact_updates_live_view(tmp_path): + path = tmp_path / "index_accessor_compact.b2nd" + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + idx = arr.index("a") + assert idx.kind == "full" + assert idx.persistent is True + assert idx.nbytes > 0 + assert idx.cbytes > 0 + + arr.append(np.array([(0, 100), (3, 101)], dtype=dtype)) + assert 
len(idx["full"]["runs"]) == 1 + + compacted = idx.compact() + assert compacted is idx + assert idx["full"]["runs"] == [] + + reopened = blosc2.open(path, mode="a") + assert reopened.index("a")["full"]["runs"] == [] + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def test_random_field_index_matches_scan(kind): rng = np.random.default_rng(0) From d91d4f480037ed26a99d4437ab129fd26ad145ce Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 05:44:31 +0200 Subject: [PATCH 55/68] Raise on direct assignment to NDArray.fields entries --- src/blosc2/ndarray.py | 51 ++++++++++++++++++++++++++++++----- tests/ndarray/test_ndarray.py | 15 +++++++++++ 2 files changed, 60 insertions(+), 6 deletions(-) diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index 72b29da2..a6085057 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -13,6 +13,7 @@ import tempfile from abc import abstractmethod from collections import OrderedDict, namedtuple +from collections.abc import Mapping from functools import reduce from itertools import product from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, runtime_checkable @@ -143,6 +144,41 @@ def __getitem__(self, key: Any) -> Any: ... +class FieldsAccessor(Mapping): + """Read-only mapping of structured field views.""" + + def __init__(self, field_views: dict[str, Any]): + self._field_views = field_views + + def __getitem__(self, key: str) -> Any: + return self._field_views[key] + + def __iter__(self) -> Iterator[str]: + return iter(self._field_views) + + def __len__(self) -> int: + return len(self._field_views) + + def __setitem__(self, key: str, value: object) -> None: + raise TypeError(f'assign through the field view, e.g. 
array.fields["{key}"][:] = values') + + def copy(self) -> dict[str, Any]: + return dict(self._field_views) + + def __or__(self, other: object) -> dict[str, Any]: + if not isinstance(other, Mapping): + return NotImplemented + return self.copy() | dict(other) + + def __ror__(self, other: object) -> dict[str, Any]: + if not isinstance(other, Mapping): + return NotImplemented + return dict(other) | self.copy() + + def __repr__(self) -> str: + return repr(self._field_views) + + def is_documented_by(original): def wrapper(target): target.__doc__ = original.__doc__ @@ -3695,10 +3731,11 @@ def __init__(self, **kwargs): base = kwargs.pop("_base", None) super().__init__(kwargs["_array"], base=base) # Accessor to fields - self._fields = {} + field_views = {} if self.dtype.fields: for field in self.dtype.fields: - self._fields[field] = NDField(self, field) + field_views[field] = NDField(self, field) + self._fields = FieldsAccessor(field_views) @property def cparams(self) -> blosc2.CParams: @@ -3747,14 +3784,14 @@ def vlmeta(self) -> dict: return self.schunk.vlmeta @property - def fields(self) -> dict: + def fields(self) -> Mapping[str, NDField]: """ - Dictionary with the fields of the structured array. + Read-only mapping with the fields of the structured array. Returns ------- - fields: dict - A dictionary with the fields of the structured array. + fields: Mapping + A read-only mapping with the fields of the structured array. 
See Also -------- @@ -3770,6 +3807,8 @@ def fields(self) -> dict: >>> sa = blosc2.zeros(shape, dtype=dtype) >>> # Check that fields are equal >>> assert sa.fields['a'] == sa.fields['b'] + >>> # Assign through the field view + >>> sa.fields['a'][:] = 1 """ return self._fields diff --git a/tests/ndarray/test_ndarray.py b/tests/ndarray/test_ndarray.py index b557bb65..820c881f 100644 --- a/tests/ndarray/test_ndarray.py +++ b/tests/ndarray/test_ndarray.py @@ -115,6 +115,21 @@ def test_ndarray_info_has_human_sizes(): assert "cbytes" in text +def test_fields_assignment_requires_field_view_slice(): + dtype = np.dtype([("id", np.float64), ("payload", np.int32)]) + array = blosc2.zeros(4, dtype=dtype) + + with pytest.raises( + TypeError, match=r'assign through the field view, e\.g\. array\.fields\["id"\]\[:\] = values' + ): + array.fields["id"] = np.arange(4, dtype=np.float64) + + np.testing.assert_array_equal(array[:], np.zeros(4, dtype=dtype)) + + array.fields["id"][:] = np.arange(4, dtype=np.float64) + np.testing.assert_array_equal(array.fields["id"][:], np.arange(4, dtype=np.float64)) + + @pytest.mark.parametrize( ("shape", "newshape", "chunks", "blocks"), [ From 4da954721e2b9c1db496e261206eea934386c258 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 06:27:33 +0200 Subject: [PATCH 56/68] Speed up exact row gathering for scattered index hits --- bench/ndarray/index_query_bench.py | 25 +++- .../tutorials/14.indexing-arrays.ipynb | 141 +++++++----------- src/blosc2/indexing.py | 48 ++++-- tests/ndarray/test_indexing.py | 35 +++++ 4 files changed, 141 insertions(+), 108 deletions(-) diff --git a/bench/ndarray/index_query_bench.py b/bench/ndarray/index_query_bench.py index 1bc75f84..0ed6dc98 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/ndarray/index_query_bench.py @@ -25,10 +25,11 @@ DEFAULT_REPEATS = 3 KINDS = ("ultralight", "light", "medium", "full") DEFAULT_KIND = "light" -DISTS = ("sorted", "block-shuffled", "permuted") +DISTS = 
("sorted", "block-shuffled", "permuted", "random") RNG_SEED = 0 DEFAULT_OPLEVEL = 5 FULL_QUERY_MODES = ("auto", "selective-ooc", "whole-load") +DATASET_LAYOUT_VERSION = "payload-ramp-v1" COLD_COLUMNS = [ ("rows", lambda result: f"{result['size']:,}"), @@ -72,6 +73,11 @@ def source_dtype(id_dtype: np.dtype) -> np.dtype: return np.dtype([("id", np.dtype(id_dtype)), ("payload", np.float32)]) +def payload_slice(start: int, stop: int) -> np.ndarray: + """Deterministic nontrivial payload values for structured benchmark rows.""" + return np.arange(start, stop, dtype=np.float32) + + def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: dtype = np.dtype(dtype) if dtype == np.dtype(np.bool_): @@ -240,6 +246,12 @@ def _fill_permuted_ids(ids: np.ndarray, size: int, start: int, stop: int, step: ids[:] = ordered_ids_from_positions(shuffled_positions, size, ids.dtype) +def _randomized_ids(size: int, dtype: np.dtype) -> np.ndarray: + ids = make_ordered_ids(size, dtype) + np.random.default_rng(RNG_SEED).shuffle(ids) + return ids + + def build_persistent_array( size: int, dist: str, id_dtype: np.dtype, path: Path, chunks: int | None, blocks: int | None ) -> blosc2.NDArray: @@ -254,6 +266,7 @@ def build_persistent_array( block_len = int(arr.blocks[0]) block_order = _block_order(size, block_len) if dist == "block-shuffled" else None permuted_step, permuted_offset = _permuted_position_params(size) if dist == "permuted" else (1, 0) + random_ids = _randomized_ids(size, id_dtype) if dist == "random" else None for start in range(0, size, chunk_len): stop = min(start + chunk_len, size) chunk = np.zeros(stop - start, dtype=dtype) @@ -263,14 +276,20 @@ def build_persistent_array( _fill_block_shuffled_ids(chunk["id"], size, start, stop, block_len, block_order) elif dist == "permuted": _fill_permuted_ids(chunk["id"], size, start, stop, permuted_step, permuted_offset) + elif dist == "random": + chunk["id"] = random_ids[start:stop] else: raise ValueError(f"unsupported distribution 
{dist!r}") + chunk["payload"] = payload_slice(start, stop) arr[start:stop] = chunk return arr def base_array_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None) -> Path: - return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.b2nd" + return ( + size_dir + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.{geometry_token(chunks, blocks)}.b2nd" + ) def indexed_array_path( @@ -293,7 +312,7 @@ def indexed_array_path( thread_token = "threads-auto" if nthreads is None else f"threads-{nthreads}" return ( size_dir - / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{geometry_token(chunks, blocks)}.{codec_token}.{clevel_token}.{thread_token}" + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.{geometry_token(chunks, blocks)}.{codec_token}.{clevel_token}.{thread_token}" f".{kind}.opt{optlevel}.{mode}.b2nd" ) diff --git a/doc/getting_started/tutorials/14.indexing-arrays.ipynb b/doc/getting_started/tutorials/14.indexing-arrays.ipynb index ced5f784..01f06507 100644 --- a/doc/getting_started/tutorials/14.indexing-arrays.ipynb +++ b/doc/getting_started/tutorials/14.indexing-arrays.ipynb @@ -32,8 +32,8 @@ "id": "8c510216bc394cf9", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T03:17:03.228163Z", - "start_time": "2026-04-09T03:17:03.209583Z" + "end_time": "2026-04-09T04:14:37.432863Z", + "start_time": "2026-04-09T04:14:37.110770Z" } }, "source": [ @@ -93,7 +93,7 @@ " blosc2.remove_urlpath(path)" ], "outputs": [], - "execution_count": 11 + "execution_count": 1 }, { "cell_type": "markdown", @@ -119,8 +119,8 @@ "id": "d1a5a37585a045ca", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T03:17:19.644470Z", - "start_time": "2026-04-09T03:17:05.289092Z" + "end_time": "2026-04-09T04:14:51.428855Z", + "start_time": "2026-04-09T04:14:37.433671Z" } }, "source": [ @@ -129,19 +129,15 @@ "\n", "rng = np.random.default_rng(0)\n", "dtype = 
np.dtype([(\"id\", np.float64), (\"payload\", np.int32)])\n", - "data = blosc2.zeros(N_ROWS, dtype=dtype)[:]\n", - "# Build a predictable id column, then shuffle it so the source data is not already ordered.\n", - "data[\"id\"] = blosc2.arange(-data.shape[0] // 2, data.shape[0] // 2, dtype=np.float64)\n", - "rng.shuffle(data[\"id\"])\n", - "data[\"payload\"] = blosc2.arange(data.shape[0], dtype=np.int32)\n", + "ids = np.arange(-N_ROWS // 2, N_ROWS // 2, dtype=np.float64)\n", + "rng.shuffle(ids)\n", + "data = blosc2.fromiter(((id_, i) for i, id_ in enumerate(ids)), shape=(N_ROWS,), dtype=dtype)\n", "\n", "indexed_arrays = {}\n", "build_rows = []\n", - "base_cbytes = None\n", + "base_cbytes = data.cbytes\n", "for kind in (\"ultralight\", \"light\", \"medium\", \"full\"):\n", - " arr = blosc2.asarray(data)\n", - " if base_cbytes is None:\n", - " base_cbytes = arr.cbytes\n", + " arr = data.copy()\n", " t0 = time.perf_counter()\n", " arr.create_index(field=\"id\", kind=kind)\n", " build_ms = (time.perf_counter() - t0) * 1e3\n", @@ -161,14 +157,14 @@ "text": [ "Compressed base array size: 30.74 MiB\n", "kind build_ms index_size overhead\n", - "ultralight 54.423 142 B 0.00x\n", - "light 748.307 26.04 MiB 0.85x\n", - "medium 2453.256 34.99 MiB 1.14x\n", - "full 8727.665 28.44 MiB 0.93x\n" + "ultralight 45.783 142 B 0.00x\n", + "light 674.304 26.04 MiB 0.85x\n", + "medium 2195.323 34.99 MiB 1.14x\n", + "full 8483.835 28.44 MiB 0.93x\n" ] } ], - "execution_count": 12 + "execution_count": 2 }, { "cell_type": "markdown", @@ -185,8 +181,8 @@ "id": "f1b3aaec965b42d6", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T02:55:34.424905Z", - "start_time": "2026-04-09T02:55:34.206800Z" + "end_time": "2026-04-09T04:14:51.546053Z", + "start_time": "2026-04-09T04:14:51.449229Z" } }, "source": [ @@ -227,8 +223,8 @@ "id": "c9e932b7561b4ff4", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T02:55:38.336928Z", - "start_time": "2026-04-09T02:55:34.432852Z" + "end_time": 
"2026-04-09T04:14:53.105689Z", + "start_time": "2026-04-09T04:14:51.548648Z" } }, "source": [ @@ -258,10 +254,10 @@ "text": [ "Selective mask over 10,000,000 rows\n", "kind scan_ms index_ms speedup\n", - "ultralight 161.539 174.253 0.93x\n", - "light 197.265 23.717 8.32x\n", - "medium 157.424 23.681 6.65x\n", - "full 157.527 25.280 6.23x\n" + "ultralight 70.429 67.914 1.04x\n", + "light 68.560 5.011 13.68x\n", + "medium 68.481 4.430 15.46x\n", + "full 68.408 4.263 16.05x\n" ] } ], @@ -282,8 +278,8 @@ "id": "9ffcb0d8d06a4daa", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T02:55:38.375308Z", - "start_time": "2026-04-09T02:55:38.346312Z" + "end_time": "2026-04-09T04:14:53.160261Z", + "start_time": "2026-04-09T04:14:53.118529Z" } }, "source": [ @@ -327,8 +323,8 @@ "id": "7d337ce2f9fb4f32", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T02:55:38.402494Z", - "start_time": "2026-04-09T02:55:38.376197Z" + "end_time": "2026-04-09T04:14:53.206514Z", + "start_time": "2026-04-09T04:14:53.171092Z" } }, "source": [ @@ -374,8 +370,8 @@ "id": "0be5f512928f48db", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T02:55:40.518254Z", - "start_time": "2026-04-09T02:55:38.403529Z" + "end_time": "2026-04-09T04:14:55.722443Z", + "start_time": "2026-04-09T04:14:53.207978Z" } }, "source": [ @@ -392,9 +388,19 @@ "name": "stdout", "output_type": "stream", "text": [ - "persistent medium: kind=medium, persistent=True, ooc=True, stale=False\n", - "Reopened index count: 1\n", - "Persisted sidecar path: indexing_tutorial_medium.__index__.id.medium.reduced.values.b2nd\n" + "persistent medium: kind=medium, persistent=False, ooc=True, stale=False\n" + ] + }, + { + "ename": "FileNotFoundError", + "evalue": "No such file or directory: indexing_tutorial_medium.b2nd", + "output_type": "error", + "traceback": [ + "\u001B[31m---------------------------------------------------------------------------\u001B[39m", + "\u001B[31mFileNotFoundError\u001B[39m Traceback (most recent call 
last)", + "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[7]\u001B[39m\u001B[32m, line 5\u001B[39m\n\u001B[32m 2\u001B[39m persistent_descriptor = persistent_arr.create_index(field=\u001B[33m\"\u001B[39m\u001B[33mid\u001B[39m\u001B[33m\"\u001B[39m, kind=\u001B[33m\"\u001B[39m\u001B[33mmedium\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 3\u001B[39m show_index_summary(\u001B[33m\"\u001B[39m\u001B[33mpersistent medium\u001B[39m\u001B[33m\"\u001B[39m, persistent_descriptor)\n\u001B[32m----> \u001B[39m\u001B[32m5\u001B[39m reopened = \u001B[43mblosc2\u001B[49m\u001B[43m.\u001B[49m\u001B[43mopen\u001B[49m\u001B[43m(\u001B[49m\u001B[43mpaths\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmode\u001B[49m\u001B[43m=\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43ma\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[32m 6\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mReopened index count: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mlen\u001B[39m(reopened.indexes)\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 7\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mPersisted sidecar path: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mreopened.indexes[\u001B[32m0\u001B[39m][\u001B[33m'\u001B[39m\u001B[33mreduced\u001B[39m\u001B[33m'\u001B[39m][\u001B[33m'\u001B[39m\u001B[33mvalues_path\u001B[39m\u001B[33m'\u001B[39m]\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n", + "\u001B[36mFile \u001B[39m\u001B[32m~/blosc/python-blosc2/src/blosc2/schunk.py:1779\u001B[39m, in \u001B[36mopen\u001B[39m\u001B[34m(urlpath, mode, offset, **kwargs)\u001B[39m\n\u001B[32m 1776\u001B[39m \u001B[38;5;28;01mreturn\u001B[39;00m special\n\u001B[32m 1778\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m 
os.path.exists(urlpath):\n\u001B[32m-> \u001B[39m\u001B[32m1779\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mFileNotFoundError\u001B[39;00m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mNo such file or directory: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00murlpath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 1781\u001B[39m _set_default_dparams(kwargs)\n\u001B[32m 1782\u001B[39m res = blosc2_ext.open(urlpath, mode, offset, **kwargs)\n", + "\u001B[31mFileNotFoundError\u001B[39m: No such file or directory: indexing_tutorial_medium.b2nd" ] } ], @@ -413,12 +419,7 @@ { "cell_type": "code", "id": "11f0cd1b910b409a", - "metadata": { - "ExecuteTime": { - "end_time": "2026-04-09T02:55:40.559125Z", - "start_time": "2026-04-09T02:55:40.527487Z" - } - }, + "metadata": {}, "source": [ "mutable_arr = blosc2.arange(20, dtype=np.int64)\n", "mutable_arr.create_index(kind=\"full\")\n", @@ -428,17 +429,8 @@ "mutable_arr.rebuild_index()\n", "print(\"Stale after rebuild:\", mutable_arr.indexes[0][\"stale\"])" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Stale after direct mutation: True\n", - "Stale after rebuild: False\n" - ] - } - ], - "execution_count": 8 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -462,12 +454,7 @@ { "cell_type": "code", "id": "2e1a47a9cf7246e6", - "metadata": { - "ExecuteTime": { - "end_time": "2026-04-09T02:55:41.338819Z", - "start_time": "2026-04-09T02:55:40.559982Z" - } - }, + "metadata": {}, "source": [ "append_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int32)])\n", "base_rows = 200_000\n", @@ -505,22 +492,8 @@ "print(f\"Median mask time after compaction: {after_ms:.3f} ms\")\n", "print(f\"Speedup after compaction: {before_ms / after_ms:.2f}x\")" ], - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Before compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 
'full', 'level': 'exact', 'lookup_path': 'run-bounded-ooc', 'full_runs': 40}\n", - "Pending runs: 40\n", - "Median mask time before compaction: 3.514 ms\n", - "After compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'compact-selective-ooc', 'full_runs': 0}\n", - "Pending runs: 0\n", - "Median mask time after compaction: 0.638 ms\n", - "Speedup after compaction: 5.51x\n" - ] - } - ], - "execution_count": 9 + "outputs": [], + "execution_count": null }, { "cell_type": "markdown", @@ -540,31 +513,21 @@ { "cell_type": "code", "id": "9833102355db4ec0", - "metadata": { - "ExecuteTime": { - "end_time": "2026-04-09T02:55:41.360620Z", - "start_time": "2026-04-09T02:55:41.350276Z" - } - }, + "metadata": {}, "source": [ "for path in paths:\n", " blosc2.remove_urlpath(path)" ], "outputs": [], - "execution_count": 10 + "execution_count": null }, { "cell_type": "code", "id": "17489b2c3d2ac57", - "metadata": { - "ExecuteTime": { - "end_time": "2026-04-09T02:55:41.380135Z", - "start_time": "2026-04-09T02:55:41.368555Z" - } - }, + "metadata": {}, "source": [], "outputs": [], - "execution_count": 10 + "execution_count": null } ], "metadata": { diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 541db79e..46a3fd0b 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -51,7 +51,6 @@ _PERSISTENT_INDEXES: dict[tuple[str, str | int], dict] = {} _DATA_CACHE: dict[tuple[int, str | None, str, str], np.ndarray] = {} _SIDECAR_HANDLE_CACHE: dict[tuple[int, str | None, str, str], object] = {} -BLOCK_GATHER_POSITIONS_THRESHOLD = 32 FULL_OOC_RUN_ITEMS = 2_000_000 FULL_OOC_MERGE_BUFFER_ITEMS = 500_000 FULL_SELECTIVE_OOC_MAX_SPANS = 128 @@ -4934,21 +4933,37 @@ def _gather_positions_by_block( chunk_id = int(chunk_ids[chunk_start_idx]) chunk_origin = chunk_id * chunk_len local_positions = chunk_positions - chunk_origin - block_ids = local_positions // block_len - unique_blocks = 
np.unique(block_ids) - if len(unique_blocks) != 1 or np.any(np.diff(local_positions) < 0): - chunk_stop = min(chunk_origin + chunk_len, total_len) - chunk_values = where_x[chunk_origin:chunk_stop] - output[chunk_start_idx:chunk_stop_idx] = chunk_values[local_positions] - chunk_start_idx = chunk_stop_idx - continue + if np.any(np.diff(local_positions) < 0): + order = np.argsort(local_positions, kind="stable") + sorted_local_positions = local_positions[order] + else: + order = None + sorted_local_positions = local_positions + + sorted_output = ( + output[chunk_start_idx:chunk_stop_idx] + if order is None + else np.empty(len(chunk_positions), dtype=output.dtype) + ) + block_ids = sorted_local_positions // block_len + block_breaks = np.nonzero(np.diff(block_ids) != 0)[0] + 1 + block_start_idx = 0 + for block_stop_idx in (*block_breaks, len(sorted_local_positions)): + block_positions = sorted_local_positions[block_start_idx:block_stop_idx] + span_start = int(block_positions[0]) + span_stop = int(block_positions[-1]) + 1 + span_items = span_stop - span_start + span_values = np.empty(span_items, dtype=output.dtype) + where_x.get_1d_span_numpy(span_values, chunk_id, span_start, span_items) + sorted_output[block_start_idx:block_stop_idx] = span_values[block_positions - span_start] + block_start_idx = block_stop_idx - span_start = int(local_positions[0]) - span_stop = int(local_positions[-1]) + 1 - span_items = span_stop - span_start - span_values = np.empty(span_items, dtype=_where_output_dtype(where_x)) - where_x.get_1d_span_numpy(span_values, chunk_id, span_start, span_items) - output[chunk_start_idx:chunk_stop_idx] = span_values[local_positions - span_start] + if order is None: + output[chunk_start_idx:chunk_stop_idx] = sorted_output + else: + inverse = np.empty(len(order), dtype=np.intp) + inverse[order] = np.arange(len(order), dtype=np.intp) + output[chunk_start_idx:chunk_stop_idx] = sorted_output[inverse] chunk_start_idx = chunk_stop_idx return output @@ -4957,7 
+4972,8 @@ def evaluate_full_query(where: dict, plan: IndexPlan) -> np.ndarray: if plan.exact_positions is None: raise ValueError("full evaluation requires exact positions") if plan.base is not None: - if len(plan.exact_positions) <= BLOCK_GATHER_POSITIONS_THRESHOLD: + block_gather_threshold = int(plan.base.blocks[0]) + if len(plan.exact_positions) <= block_gather_threshold: return _gather_positions_by_block( where["_where_x"], plan.exact_positions, diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 727c83e8..2523e358 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -130,6 +130,41 @@ def test_index_accessor_compact_updates_live_view(tmp_path): assert reopened.index("a")["full"]["runs"] == [] +def test_gather_positions_by_block_avoids_whole_chunk_fallback_for_multi_block_reads(monkeypatch): + import blosc2.indexing as indexing + + class FakeSource: + def __init__(self, data, chunk_len): + self.data = np.asarray(data) + self.dtype = self.data.dtype + self.chunk_len = chunk_len + self.slice_reads = 0 + self.span_reads = [] + + def __getitem__(self, key): + self.slice_reads += 1 + return self.data[key] + + def get_1d_span_numpy(self, out, nchunk, start, nitems): + self.span_reads.append((int(nchunk), int(start), int(nitems))) + base = int(nchunk) * self.chunk_len + int(start) + out[:] = self.data[base : base + int(nitems)] + + chunk_len = 10 + block_len = 4 + data = np.arange(40, dtype=np.int64) + positions = np.array([1, 5, 7, 12, 19], dtype=np.int64) + source = FakeSource(data, chunk_len) + + monkeypatch.setattr(indexing, "_supports_block_reads", lambda _: True) + + gathered = indexing._gather_positions_by_block(source, positions, chunk_len, block_len, len(data)) + + np.testing.assert_array_equal(gathered, data[positions]) + assert source.slice_reads == 0 + assert source.span_reads == [(0, 1, 1), (0, 5, 3), (1, 2, 1), (1, 9, 1)] + + @pytest.mark.parametrize("kind", ["light", "medium", "full"]) def 
test_random_field_index_matches_scan(kind): rng = np.random.default_rng(0) From 0c5711c536cea916752f7767dcf76adc4aa27e7f Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 08:16:45 +0200 Subject: [PATCH 57/68] Comparison with DuckDB and moved benchmarks to bench/indexing --- bench/indexing/blosc2-vs-duckdb-indexes.md | 352 +++++++++++ bench/indexing/duckdb_query_bench.py | 585 ++++++++++++++++++ .../index_query_bench.py | 22 +- .../index_query_bench_tables.py | 0 bench/indexing/parquet_query_bench.py | 441 +++++++++++++ 5 files changed, 1398 insertions(+), 2 deletions(-) create mode 100644 bench/indexing/blosc2-vs-duckdb-indexes.md create mode 100644 bench/indexing/duckdb_query_bench.py rename bench/{ndarray => indexing}/index_query_bench.py (97%) rename bench/{ndarray => indexing}/index_query_bench_tables.py (100%) create mode 100644 bench/indexing/parquet_query_bench.py diff --git a/bench/indexing/blosc2-vs-duckdb-indexes.md b/bench/indexing/blosc2-vs-duckdb-indexes.md new file mode 100644 index 00000000..5025948f --- /dev/null +++ b/bench/indexing/blosc2-vs-duckdb-indexes.md @@ -0,0 +1,352 @@ +# Blosc2 vs DuckDB Indexes + +This note summarizes the benchmark comparisons we ran between Blosc2 indexes and DuckDB indexing/pruning +mechanisms on a 10M-row structured dataset. 
+ +The goal is not to claim a universal winner, but to document the current observed tradeoffs around: + +- index creation time +- lookup latency +- total storage footprint +- sensitivity to query shape + + +## Benchmark Setup + +### Dataset + +- Rows: `10,000,000` +- Schema: + - `id`: indexed field, `float64` + - `payload`: deterministic nontrivial ramp payload +- Distribution: `random` + - true random shuffle of `id` +- Query widths tested: + - `50` + - `1` + +### Blosc2 + +- Script: `index_query_bench.py` +- Index kinds: + - `ultralight` + - `light` + - `medium` + - `full` +- Default geometry in these runs: + - `chunks=1,250,000` + - `blocks=10,000` + +### DuckDB + +- Script: `duckdb_query_bench.py` +- Layouts: + - `zonemap` + - `art-index` +- Batch size used while loading: + - `1,250,000` + + +## Important Context + +There are two different DuckDB query shapes that matter a lot: + +- range form: + - `id >= lo AND id <= hi` +- single-value form: + - `id = value` + +For Blosc2, switching between a collapsed width-1 range and `==` makes almost no practical difference. + +For DuckDB, this difference is very important: + +- `art-index` was much slower with the range form +- `art-index` became much faster with the single-value `=` predicate + +So any DuckDB comparison must state which predicate shape was used. 
+ + +## Width-50 Comparison + +### DuckDB + +Command: + +```bash +python duckdb_query_bench.py \ + --size 10M \ + --outdir /tmp/duckdb-bench-smoke2 \ + --dist random \ + --query-width 50 \ + --layout all \ + --repeats 1 +``` + +Observed results: + +- `zonemap` + - build: `1180.630 ms` + - filtered lookup: `13.326 ms` + - DB size: `56,111,104` bytes +- `art-index` + - build: `2844.010 ms` + - filtered lookup: `12.419 ms` + - DB size: `478,687,232` bytes + +### Blosc2 + +Command: + +```bash +python index_query_bench.py \ + --size 10M \ + --outdir /tmp/indexes-10M \ + --kind light \ + --query-width 50 \ + --in-mem \ + --dist random +``` + +Observed `light` results: + +- build: `705.193 ms` +- cold lookup: `6.370 ms` +- warm lookup: `6.250 ms` +- base array size: about `31 MB` +- `light` index sidecars: about `27 MB` +- total footprint: about `58 MB` + +### Interpretation + +For this moderately selective random workload: + +- Blosc2 `light` is about `2x` faster than DuckDB `zonemap` +- Blosc2 `light` has a total footprint similar to DuckDB `zonemap` +- DuckDB `art-index` is only slightly faster than `zonemap` here, but much larger + +This suggests that Blosc2 `light` is more than a simple zonemap. It behaves like an active lossy lookup +structure rather than only coarse pruning metadata. 
+ + +## Width-1 Comparison: Generic Range Form + +### DuckDB + +Command: + +```bash +python duckdb_query_bench.py \ + --size 10M \ + --outdir /tmp/duckdb-bench-smoke2 \ + --dist random \ + --query-width 1 \ + --layout all \ + --repeats 3 +``` + +Observed results: + +- `zonemap` + - filtered lookup: `12.612 ms` +- `art-index` + - filtered lookup: `13.641 ms` + +### Blosc2 + +Command: + +```bash +python index_query_bench.py \ + --size 10M \ + --outdir /tmp/indexes-10M \ + --kind all \ + --query-width 1 \ + --dist random +``` + +Observed results: + +- `light` + - cold lookup: `1.463 ms` + - warm lookup: `1.286 ms` +- `medium` + - cold lookup: `1.089 ms` + - warm lookup: `0.986 ms` +- `full` + - cold lookup: `0.618 ms` + - warm lookup: `0.544 ms` + +### Interpretation + +With the generic range form, Blosc2 is much faster than DuckDB: + +- Blosc2 `light` is already about `9x` faster than DuckDB `zonemap` +- Blosc2 exact indexes (`medium`, `full`) are much faster still +- DuckDB `art-index` does not show its real point-lookup behavior in this predicate form + + +## Width-1 Comparison: Single-Value Predicate + +### DuckDB + +Command: + +```bash +python duckdb_query_bench.py \ + --size 10M \ + --outdir /tmp/duckdb-bench-smoke2 \ + --dist random \ + --query-width 1 \ + --layout all \ + --repeats 3 \ + --query-single-value +``` + +Observed results: + +- `zonemap` + - build: `1193.665 ms` + - filtered lookup: `8.646 ms` + - DB size: `56,111,104` bytes +- `art-index` + - build: `2849.869 ms` + - filtered lookup: `0.755 ms` + - DB size: `478,687,232` bytes + +### Blosc2 + +Command: + +```bash +python index_query_bench.py \ + --size 10M \ + --outdir /tmp/indexes-10M \ + --kind all \ + --query-width 1 \ + --dist random \ + --query-single-value +``` + +Observed results: + +- `light` + - build: `1225.637 ms` + - cold lookup: `1.290 ms` + - warm lookup: `2.351 ms` + - index sidecars: `27,497,393` bytes +- `medium` + - build: `5511.863 ms` + - cold lookup: `1.081 ms` + - warm lookup: 
`0.964 ms` + - index sidecars: `37,645,201` bytes +- `full` + - build: `10954.844 ms` + - cold lookup: `0.603 ms` + - warm lookup: `0.525 ms` + - index sidecars: `29,888,673` bytes + +### Interpretation + +Once DuckDB is allowed to use the more planner-friendly single-value predicate: + +- `art-index` becomes very fast +- `art-index` is now faster than Blosc2 `light` +- Blosc2 `full` still remains slightly faster than DuckDB `art-index` on this measured point-lookup case + +However, the storage costs are very different: + +- DuckDB `art-index` database size: about `478.7 MB` +- DuckDB zonemap baseline size: about `56.1 MB` +- estimated ART overhead over baseline: about `422.6 MB` +- Blosc2 `full` base + index footprint: about `31 MB + 29.9 MB = 60.9 MB` + +So for true point lookups: + +- DuckDB `art-index` is competitive on latency +- Blosc2 `full` is still faster in the measured run +- Blosc2 `full` is much smaller overall +- DuckDB `art-index` is much faster to build than Blosc2 `full` + + +## Blosc2 Light vs DuckDB Zonemap + +This is the cleanest cross-system comparison, because both are lossy pruning structures rather than exact +secondary indexes. + +Main observations: + +- storage footprint is in roughly the same ballpark + - DuckDB zonemap DB: about `56 MB` + - Blosc2 base + `light`: about `58 MB` +- Blosc2 `light` lookup speed is much better + - width `50`: about `6.25 ms` vs `13.33 ms` + - width `1`: about `1.3-1.5 ms` vs `8.6-12.6 ms` + +Conclusion: + +- DuckDB zonemap is closer in spirit to Blosc2 `light` than DuckDB ART is +- but Blosc2 `light` is a materially stronger lookup structure on these workloads + + +## Blosc2 Full vs DuckDB ART + +This is the most relevant exact-index comparison. 
+ +Main observations: + +- point-lookup latency + - DuckDB `art-index`: `0.755 ms` + - Blosc2 `full`: `0.603 ms` cold, `0.525 ms` warm +- build time + - DuckDB `art-index`: `2849.869 ms` + - Blosc2 `full`: `10954.844 ms` +- footprint + - DuckDB `art-index` DB: about `478.7 MB` + - Blosc2 `full` base + index: about `60.9 MB` + +Conclusion: + +- DuckDB ART wins on build time +- Blosc2 `full` wins on storage efficiency +- Blosc2 `full` was slightly faster on the measured point lookup +- DuckDB ART is much more sensitive to predicate shape + + +## Why `--query-single-value` Matters More in DuckDB + +Observed behavior: + +- Blosc2: + - width-1 range form and `==` are nearly equivalent in performance +- DuckDB: + - width-1 range form was much slower than `id = value` + +Practical implication: + +- Blosc2 benchmarks are fairly robust to whether a point lookup is written as `==` or as a collapsed range +- DuckDB benchmarks must distinguish those two forms explicitly, otherwise ART performance is understated + + +## Caveats + +- These results come from one hardware/software setup and one dataset shape. +- DuckDB stores table data and indexes in one DB file, so payload and index bytes cannot be separated as cleanly + as in Blosc2. +- DuckDB zonemap is built-in table pruning metadata, not a separately managed index. +- Blosc2 and DuckDB are not identical systems: + - Blosc2 benchmark operates over compressed array storage and explicit index sidecars + - DuckDB benchmark operates over a columnar SQL engine with its own optimizer behavior + + +## Current Takeaways + +1. Blosc2 `light` is very competitive against DuckDB zonemap-like pruning. +2. Blosc2 `light` offers much faster selective lookups than DuckDB zonemap at a similar total storage cost. +3. DuckDB `art-index` becomes strong only when queries are written as true equality predicates. +4. 
Blosc2 `full` compares very well against DuckDB `art-index` on point lookups: + - slightly faster in the measured run + - much smaller on disk + - slower to build +5. Query-shape sensitivity is a major difference: + - small for Blosc2 + - large for DuckDB ART diff --git a/bench/indexing/duckdb_query_bench.py b/bench/indexing/duckdb_query_bench.py new file mode 100644 index 00000000..fddad80d --- /dev/null +++ b/bench/indexing/duckdb_query_bench.py @@ -0,0 +1,585 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import math +import os +import re +import statistics +import time +from pathlib import Path + +import duckdb +import numpy as np +import pyarrow as pa + +SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) +DEFAULT_REPEATS = 3 +DISTS = ("sorted", "block-shuffled", "permuted", "random") +LAYOUTS = ("zonemap", "art-index") +RNG_SEED = 0 +DEFAULT_BATCH_SIZE = 1_250_000 +DATASET_LAYOUT_VERSION = "payload-ramp-v1" + +COLD_COLUMNS = [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("layout", lambda result: result["layout"]), + ("create_ms", lambda result: f"{result['create_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), + ("db_bytes", lambda result: f"{result['db_bytes']:,}"), + ("query_rows", lambda result: f"{result['query_rows']:,}"), +] + +WARM_COLUMNS = [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("layout", lambda result: result["layout"]), + ("create_ms", lambda result: f"{result['create_ms']:.3f}"), + ("scan_ms", lambda result: 
f"{result['scan_ms']:.3f}"), + ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), + ("speedup", lambda result: f"{result['warm_speedup']:.2f}x" if result["warm_speedup"] is not None else "-"), + ("db_bytes", lambda result: f"{result['db_bytes']:,}"), + ("query_rows", lambda result: f"{result['query_rows']:,}"), +] + + +def dtype_token(dtype: np.dtype) -> str: + return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") + + +def payload_slice(start: int, stop: int) -> np.ndarray: + return np.arange(start, stop, dtype=np.float32) + + +def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + values = np.zeros(size, dtype=dtype) + values[size // 2 :] = True + return values + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + start = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + start = max(int(info.min), -(unique_count // 2)) + positions = np.arange(size, dtype=np.int64) + values = start + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + return np.linspace(-span / 2, span / 2, num=size, endpoint=False, dtype=dtype) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_slice(size: int, start: int, stop: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if stop <= start: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + values = np.zeros(stop - start, dtype=dtype) + true_start = max(start, size // 2) + if true_start < stop: + values[true_start - start :] = True + return values + + positions = np.arange(start, stop, dtype=np.int64) + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = 
int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_at(size: int, index: int, dtype: np.dtype) -> object: + return ordered_id_slice(size, index, index + 1, dtype)[0].item() + + +def ordered_ids_from_positions(positions: np.ndarray, size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if positions.size == 0: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + return (positions >= (size // 2)).astype(dtype, copy=False) + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def _block_order(size: int, block_len: int) -> np.ndarray: + nblocks = (size + block_len - 1) // block_len + return np.random.default_rng(RNG_SEED).permutation(nblocks) + + +def _fill_block_shuffled_ids( + ids: np.ndarray, size: int, start: int, stop: int, block_len: int, order: np.ndarray +) -> None: + cursor = start + out_cursor = 0 + while cursor < stop: + dest_block = cursor // block_len + block_offset = cursor % block_len + src_block = int(order[dest_block]) + 
src_start = src_block * block_len + block_offset + take = min(stop - cursor, block_len - block_offset, size - src_start) + ids[out_cursor : out_cursor + take] = ordered_id_slice(size, src_start, src_start + take, ids.dtype) + cursor += take + out_cursor += take + + +def _permuted_position_params(size: int) -> tuple[int, int]: + if size <= 1: + return 1, 0 + rng = np.random.default_rng(RNG_SEED) + step = int(rng.integers(1, size)) + while math.gcd(step, size) != 1: + step += 1 + if step >= size: + step = 1 + offset = int(rng.integers(0, size)) + return step, offset + + +def _fill_permuted_ids(ids: np.ndarray, size: int, start: int, stop: int, step: int, offset: int) -> None: + positions = np.arange(start, stop, dtype=np.int64) + shuffled_positions = (positions * step + offset) % size + ids[:] = ordered_ids_from_positions(shuffled_positions, size, ids.dtype) + + +def _randomized_ids(size: int, dtype: np.dtype) -> np.ndarray: + ids = make_ordered_ids(size, dtype) + np.random.default_rng(RNG_SEED).shuffle(ids) + return ids + + +def duckdb_sql_type(dtype: np.dtype) -> str: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + return "BOOLEAN" + if dtype == np.dtype(np.int8): + return "TINYINT" + if dtype == np.dtype(np.int16): + return "SMALLINT" + if dtype == np.dtype(np.int32): + return "INTEGER" + if dtype == np.dtype(np.int64): + return "BIGINT" + if dtype == np.dtype(np.uint8): + return "UTINYINT" + if dtype == np.dtype(np.uint16): + return "USMALLINT" + if dtype == np.dtype(np.uint32): + return "UINTEGER" + if dtype == np.dtype(np.uint64): + return "UBIGINT" + if dtype == np.dtype(np.float32): + return "REAL" + if dtype == np.dtype(np.float64): + return "DOUBLE" + raise ValueError(f"unsupported duckdb dtype: {dtype}") + + +def duckdb_path(outdir: Path, size: int, dist: str, id_dtype: np.dtype, layout: str, batch_size: int) -> Path: + return ( + outdir + / 
f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.layout-{layout}.batch-{batch_size}.duckdb" + ) + + +def _duckdb_wal_path(path: Path) -> Path: + return path.with_name(f"{path.name}.wal") + + +def _remove_duckdb_path(path: Path) -> None: + if path.exists(): + path.unlink() + wal_path = _duckdb_wal_path(path) + if wal_path.exists(): + wal_path.unlink() + + +def _valid_duckdb_file(path: Path, layout: str) -> bool: + if not path.exists(): + return False + + con = None + try: + con = duckdb.connect(str(path), read_only=True) + has_data = bool( + con.execute( + "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'main' AND table_name = 'data'" + ).fetchone()[0] + ) + if not has_data: + return False + if layout == "art-index": + index_count = con.execute( + "SELECT COUNT(*) FROM duckdb_indexes() WHERE schema_name = 'main' AND table_name = 'data' " + "AND index_name = 'data_id_idx'" + ).fetchone()[0] + return bool(index_count) + return layout == "zonemap" + except duckdb.Error: + return False + finally: + if con is not None: + con.close() + + +def build_duckdb_file( + size: int, + dist: str, + id_dtype: np.dtype, + path: Path, + *, + layout: str, + batch_size: int, +) -> float: + path.parent.mkdir(parents=True, exist_ok=True) + _remove_duckdb_path(path) + + id_type = duckdb_sql_type(id_dtype) + block_order = _block_order(size, batch_size) if dist == "block-shuffled" else None + permuted_step, permuted_offset = _permuted_position_params(size) if dist == "permuted" else (1, 0) + random_ids = _randomized_ids(size, id_dtype) if dist == "random" else None + + start_time = time.perf_counter() + con = duckdb.connect(str(path)) + try: + con.execute("PRAGMA threads=8") + con.execute(f"CREATE TABLE data (id {id_type}, payload FLOAT)") + for start in range(0, size, batch_size): + stop = min(start + batch_size, size) + ids = np.empty(stop - start, dtype=id_dtype) + if dist == "sorted": + ids[:] = ordered_id_slice(size, start, stop, id_dtype) + 
elif dist == "block-shuffled": + _fill_block_shuffled_ids(ids, size, start, stop, batch_size, block_order) + elif dist == "permuted": + _fill_permuted_ids(ids, size, start, stop, permuted_step, permuted_offset) + elif dist == "random": + ids[:] = random_ids[start:stop] + else: + raise ValueError(f"unsupported distribution {dist!r}") + + payload = payload_slice(start, stop) + batch = pa.table({"id": ids, "payload": payload}) + con.register("batch_arrow", batch) + con.execute("INSERT INTO data SELECT * FROM batch_arrow") + con.unregister("batch_arrow") + + if layout == "art-index": + con.execute("CREATE INDEX data_id_idx ON data(id)") + elif layout != "zonemap": + raise ValueError(f"unsupported layout {layout!r}") + + con.execute("CHECKPOINT") + finally: + con.close() + return time.perf_counter() - start_time + + +def _open_or_build_duckdb_file( + size: int, + dist: str, + id_dtype: np.dtype, + path: Path, + *, + layout: str, + batch_size: int, +) -> float: + if _valid_duckdb_file(path, layout): + return 0.0 + return build_duckdb_file(size, dist, id_dtype, path, layout=layout, batch_size=batch_size) + + +def _query_bounds(size: int, query_width: int, dtype: np.dtype) -> tuple[object, object]: + lo_idx = size // 2 + hi_idx = min(size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_id_at(size, lo_idx, dtype), ordered_id_at(size, hi_idx, dtype) + + +def _literal(value: object, dtype: np.dtype) -> str: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + return "TRUE" if bool(value) else "FALSE" + if dtype.kind == "f": + return repr(float(value)) + if dtype.kind in {"i", "u"}: + return str(int(value)) + raise ValueError(f"unsupported dtype for literal formatting: {dtype}") + + +def _condition_sql(lo: object, hi: object, dtype: np.dtype, *, exact_query: bool = False) -> str: + if exact_query: + if lo != hi: + raise ValueError(f"exact queries require a single lookup value, got lo={lo!r}, hi={hi!r}") + return f"id = {_literal(lo, dtype)}" + return f"id >= 
{_literal(lo, dtype)} AND id <= {_literal(hi, dtype)}" + + +def benchmark_scan_once(path: Path, lo, hi) -> tuple[float, int]: + con = duckdb.connect(str(path), read_only=True) + try: + start = time.perf_counter() + table = con.execute("SELECT * FROM data").arrow().read_all() + ids = table["id"].to_numpy() + result_len = int(np.count_nonzero((ids >= lo) & (ids <= hi))) + elapsed = time.perf_counter() - start + return elapsed, result_len + finally: + con.close() + + +def benchmark_filtered_once(path: Path, lo, hi, dtype: np.dtype, *, exact_query: bool = False) -> tuple[float, int]: + con = duckdb.connect(str(path), read_only=True) + try: + condition_sql = _condition_sql(lo, hi, dtype, exact_query=exact_query) + start = time.perf_counter() + table = con.execute(f"SELECT * FROM data WHERE {condition_sql}").arrow().read_all() + ids = table["id"].to_numpy() + result_len = int(np.count_nonzero((ids >= lo) & (ids <= hi))) + elapsed = time.perf_counter() - start + return elapsed, result_len + finally: + con.close() + + +def benchmark_filtered_once_con( + con: duckdb.DuckDBPyConnection, lo, hi, dtype: np.dtype, *, exact_query: bool = False +) -> tuple[float, int]: + condition_sql = _condition_sql(lo, hi, dtype, exact_query=exact_query) + start = time.perf_counter() + table = con.execute(f"SELECT * FROM data WHERE {condition_sql}").arrow().read_all() + ids = table["id"].to_numpy() + result_len = int(np.count_nonzero((ids >= lo) & (ids <= hi))) + elapsed = time.perf_counter() - start + return elapsed, result_len + + +def median(values: list[float]) -> float: + return statistics.median(values) + + +def benchmark_layout( + size: int, + outdir: Path, + dist: str, + query_width: int, + id_dtype: np.dtype, + layout: str, + batch_size: int, + repeats: int, + exact_query: bool, +) -> dict: + path = duckdb_path(outdir, size, dist, id_dtype, layout, batch_size) + create_s = _open_or_build_duckdb_file(size, dist, id_dtype, path, layout=layout, batch_size=batch_size) + lo, hi = 
_query_bounds(size, query_width, id_dtype) + + scan_elapsed, scan_rows = benchmark_scan_once(path, lo, hi) + + con = duckdb.connect(str(path), read_only=True) + try: + cold_elapsed, filtered_rows = benchmark_filtered_once_con(con, lo, hi, id_dtype, exact_query=exact_query) + warm_times = [ + benchmark_filtered_once_con(con, lo, hi, id_dtype, exact_query=exact_query)[0] * 1_000 + for _ in range(repeats) + ] + finally: + con.close() + + if scan_rows != filtered_rows: + raise AssertionError(f"filtered rows mismatch: scan={scan_rows}, filtered={filtered_rows}") + + scan_ms = scan_elapsed * 1_000 + cold_ms = cold_elapsed * 1_000 + warm_ms = median(warm_times) if warm_times else None + + return { + "size": size, + "dist": dist, + "layout": layout, + "create_ms": create_s * 1_000, + "scan_ms": scan_ms, + "cold_ms": cold_ms, + "cold_speedup": scan_ms / cold_ms, + "warm_ms": warm_ms, + "warm_speedup": None if warm_ms is None else scan_ms / warm_ms, + "db_bytes": os.path.getsize(path), + "query_rows": int(filtered_rows), + "path": path, + } + + +def parse_human_int(value: str) -> int: + value = value.strip().lower().replace("_", "") + multipliers = {"k": 1_000, "m": 1_000_000} + if value[-1:] in multipliers: + return int(float(value[:-1]) * multipliers[value[-1]]) + return int(value) + + +def print_results( + results: list[dict], + *, + batch_size: int, + repeats: int, + dist: str, + query_width: int, + id_dtype: np.dtype, + exact_query: bool, +) -> None: + print("DuckDB range-query benchmark via SQL filtered reads") + print( + f"batch_size={batch_size:,}, repeats={repeats}, dist={dist}, query_width={query_width:,}, " + f"dtype={id_dtype.name}, query_single_value={exact_query}" + ) + print("Note: 'zonemap' is DuckDB's default table layout with automatic min/max pruning.") + print(" 'art-index' adds an explicit secondary index on id.") + if exact_query: + print(" Filter predicate uses `id = value`.") + else: + print(" Filter predicate uses `id >= lo AND id <= hi`.") + 
cold_widths = table_widths(results, COLD_COLUMNS) + print() + print("Cold Query Table") + print_table(results, COLD_COLUMNS, cold_widths) + if repeats > 0: + warm_widths = table_widths(results, WARM_COLUMNS) + shared_width_by_header = {} + for (header, _), width in zip(COLD_COLUMNS, cold_widths, strict=True): + shared_width_by_header[header] = width + for (header, _), width in zip(WARM_COLUMNS, warm_widths, strict=True): + shared_width_by_header[header] = max(shared_width_by_header.get(header, 0), width) + warm_widths = [shared_width_by_header[header] for header, _ in WARM_COLUMNS] + print() + print("Warm Query Table") + print_table(results, WARM_COLUMNS, warm_widths) + + +def _format_row(cells: list[str], widths: list[int]) -> str: + return " ".join(cell.ljust(width) for cell, width in zip(cells, widths, strict=True)) + + +def _table_rows(results: list[dict], columns: list[tuple[str, callable]]) -> tuple[list[str], list[list[str]], list[int]]: + headers = [header for header, _ in columns] + widths = [len(header) for header in headers] + rows = [[formatter(result) for _, formatter in columns] for result in results] + for row in rows: + widths = [max(width, len(cell)) for width, cell in zip(widths, row, strict=True)] + return headers, rows, widths + + +def print_table(results: list[dict], columns: list[tuple[str, callable]], widths: list[int] | None = None) -> None: + headers, rows, computed_widths = _table_rows(results, columns) + widths = computed_widths if widths is None else widths + print(_format_row(headers, widths)) + print(_format_row(["-" * width for width in widths], widths)) + for row in rows: + print(_format_row(row, widths)) + + +def table_widths(results: list[dict], columns: list[tuple[str, callable]]) -> list[int]: + _, _, widths = _table_rows(results, columns) + return widths + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--size", default="10M", help="Number of rows, or 'all'. 
Default: 10M.") + parser.add_argument("--outdir", type=Path, required=True, help="Directory for generated DuckDB files.") + parser.add_argument("--dist", choices=(*DISTS, "all"), default="permuted", help="Row distribution.") + parser.add_argument("--layout", choices=(*LAYOUTS, "all"), default="all", help="DuckDB layout to benchmark.") + parser.add_argument("--query-width", type=parse_human_int, default=1, help="Query width. Default: 1.") + parser.add_argument( + "--query-single-value", + action=argparse.BooleanOptionalAction, + default=False, + help="Use `id = value` instead of a range predicate. Requires query-width=1.", + ) + parser.add_argument("--dtype", default="float64", help="Indexed id dtype. Default: float64.") + parser.add_argument( + "--batch-size", + type=parse_human_int, + default=DEFAULT_BATCH_SIZE, + help="Batch size used while loading the table. Default: 1.25M.", + ) + parser.add_argument("--repeats", type=int, default=DEFAULT_REPEATS, help="Benchmark repeats. Default: 3.") + args = parser.parse_args() + + if args.query_single_value and args.query_width != 1: + raise ValueError("--query-single-value requires --query-width 1") + + id_dtype = np.dtype(args.dtype) + sizes = SIZES if args.size == "all" else (parse_human_int(args.size),) + dists = DISTS if args.dist == "all" else (args.dist,) + layouts = LAYOUTS if args.layout == "all" else (args.layout,) + + results = [] + for size in sizes: + for dist in dists: + for layout in layouts: + results.append( + benchmark_layout( + size, + args.outdir, + dist, + args.query_width, + id_dtype, + layout, + args.batch_size, + args.repeats, + args.query_single_value, + ) + ) + + print_results( + results, + batch_size=args.batch_size, + repeats=args.repeats, + dist=args.dist, + query_width=args.query_width, + id_dtype=id_dtype, + exact_query=args.query_single_value, + ) + + +if __name__ == "__main__": + main() diff --git a/bench/ndarray/index_query_bench.py b/bench/indexing/index_query_bench.py similarity index 97% 
rename from bench/ndarray/index_query_bench.py rename to bench/indexing/index_query_bench.py index 0ed6dc98..54d844f9 100644 --- a/bench/ndarray/index_query_bench.py +++ b/bench/indexing/index_query_bench.py @@ -405,8 +405,12 @@ def _literal(value: object, dtype: np.dtype) -> str: raise ValueError(f"unsupported dtype for literal formatting: {dtype}") -def _condition_expr(lo: object, hi: object, dtype: np.dtype) -> str: +def _condition_expr(lo: object, hi: object, dtype: np.dtype, *, query_single_value: bool = False) -> str: lo_literal = _literal(lo, dtype) + if query_single_value: + if lo != hi: + raise ValueError(f"single-value queries require a single lookup value, got lo={lo!r}, hi={hi!r}") + return f"id == {lo_literal}" hi_literal = _literal(hi, dtype) return f"(id >= {lo_literal}) & (id <= {hi_literal})" @@ -479,6 +483,7 @@ def benchmark_size( size_dir: Path, dist: str, query_width: int, + query_single_value: bool, optlevel: int, id_dtype: np.dtype, in_mem: bool, @@ -495,7 +500,7 @@ def benchmark_size( base_array_path(size_dir, size, dist, id_dtype, chunks, blocks), size, dist, id_dtype, chunks, blocks ) lo, hi = _query_bounds(size, query_width, id_dtype) - condition_str = _condition_expr(lo, hi, id_dtype) + condition_str = _condition_expr(lo, hi, id_dtype, query_single_value=query_single_value) condition = blosc2.lazyexpr(condition_str, arr.fields) expr = condition.where(arr) base_bytes = size * arr.dtype.itemsize @@ -620,6 +625,12 @@ def parse_args() -> argparse.Namespace: default=1, help="Width of the range predicate. Supports suffixes like 1k, 1K, 1M, 1G. Default: 1.", ) + parser.add_argument( + "--query-single-value", + action=argparse.BooleanOptionalAction, + default=False, + help="Use `id == value` instead of a range predicate. 
Requires query-width=1.", + ) parser.add_argument( "--chunks", type=parse_human_size_or_auto, @@ -704,6 +715,8 @@ def main() -> None: args = parse_args() if args.repeats < 0: raise SystemExit("--repeats must be >= 0") + if args.query_single_value and args.query_width != 1: + raise SystemExit("--query-single-value requires --query-width 1") try: id_dtype = np.dtype(args.dtype) except TypeError as exc: @@ -728,6 +741,7 @@ def main() -> None: Path(tmpdir), args.dist, args.query_width, + args.query_single_value, args.repeats, args.optlevel, id_dtype, @@ -748,6 +762,7 @@ def main() -> None: args.outdir, args.dist, args.query_width, + args.query_single_value, args.repeats, args.optlevel, id_dtype, @@ -768,6 +783,7 @@ def run_benchmarks( size_dir: Path, dist_label: str, query_width: int, + query_single_value: bool, repeats: int, optlevel: int, id_dtype: np.dtype, @@ -792,6 +808,7 @@ def run_benchmarks( print( f"{geometry_label}, repeats={repeats}, dist={dist_label}, " f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}, " + f"query_single_value={query_single_value}, " f"full_query_mode={full_query_mode}, index_codec={'auto' if codec is None else codec.name}, " f"index_clevel={'auto' if clevel is None else clevel}, " f"index_nthreads={'auto' if nthreads is None else nthreads}" @@ -803,6 +820,7 @@ def run_benchmarks( size_dir, dist, query_width, + query_single_value, optlevel, id_dtype, in_mem, diff --git a/bench/ndarray/index_query_bench_tables.py b/bench/indexing/index_query_bench_tables.py similarity index 100% rename from bench/ndarray/index_query_bench_tables.py rename to bench/indexing/index_query_bench_tables.py diff --git a/bench/indexing/parquet_query_bench.py b/bench/indexing/parquet_query_bench.py new file mode 100644 index 00000000..1db29940 --- /dev/null +++ b/bench/indexing/parquet_query_bench.py @@ -0,0 +1,441 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc 
Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import math +import os +import re +import statistics +import time +from pathlib import Path + +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq + +SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) +DEFAULT_REPEATS = 3 +DISTS = ("sorted", "block-shuffled", "permuted", "random") +LAYOUTS = ("row-group", "page-index") +RNG_SEED = 0 +DEFAULT_ROW_GROUP_SIZE = 1_250_000 +DEFAULT_MAX_ROWS_PER_PAGE = 10_000 +DEFAULT_COMPRESSION = "snappy" +DATASET_LAYOUT_VERSION = "payload-ramp-v1" + + +def dtype_token(dtype: np.dtype) -> str: + return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") + + +def payload_slice(start: int, stop: int) -> np.ndarray: + return np.arange(start, stop, dtype=np.float32) + + +def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + values = np.zeros(size, dtype=dtype) + values[size // 2 :] = True + return values + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + start = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + start = max(int(info.min), -(unique_count // 2)) + positions = np.arange(size, dtype=np.int64) + values = start + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + return np.linspace(-span / 2, span / 2, num=size, endpoint=False, dtype=dtype) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_slice(size: int, start: int, stop: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if stop <= start: + return np.empty(0, dtype=dtype) + + if dtype == 
np.dtype(np.bool_): + values = np.zeros(stop - start, dtype=dtype) + true_start = max(start, size // 2) + if true_start < stop: + values[true_start - start :] = True + return values + + positions = np.arange(start, stop, dtype=np.int64) + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_at(size: int, index: int, dtype: np.dtype) -> object: + return ordered_id_slice(size, index, index + 1, dtype)[0].item() + + +def ordered_ids_from_positions(positions: np.ndarray, size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if positions.size == 0: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + return (positions >= (size // 2)).astype(dtype, copy=False) + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def _block_order(size: int, block_len: int) -> np.ndarray: + nblocks = (size + block_len - 1) // block_len + 
return np.random.default_rng(RNG_SEED).permutation(nblocks) + + +def _fill_block_shuffled_ids( + ids: np.ndarray, size: int, start: int, stop: int, block_len: int, order: np.ndarray +) -> None: + cursor = start + out_cursor = 0 + while cursor < stop: + dest_block = cursor // block_len + block_offset = cursor % block_len + src_block = int(order[dest_block]) + src_start = src_block * block_len + block_offset + take = min(stop - cursor, block_len - block_offset, size - src_start) + ids[out_cursor : out_cursor + take] = ordered_id_slice(size, src_start, src_start + take, ids.dtype) + cursor += take + out_cursor += take + + +def _permuted_position_params(size: int) -> tuple[int, int]: + if size <= 1: + return 1, 0 + rng = np.random.default_rng(RNG_SEED) + step = int(rng.integers(1, size)) + while math.gcd(step, size) != 1: + step += 1 + if step >= size: + step = 1 + offset = int(rng.integers(0, size)) + return step, offset + + +def _fill_permuted_ids(ids: np.ndarray, size: int, start: int, stop: int, step: int, offset: int) -> None: + positions = np.arange(start, stop, dtype=np.int64) + shuffled_positions = (positions * step + offset) % size + ids[:] = ordered_ids_from_positions(shuffled_positions, size, ids.dtype) + + +def _randomized_ids(size: int, dtype: np.dtype) -> np.ndarray: + ids = make_ordered_ids(size, dtype) + np.random.default_rng(RNG_SEED).shuffle(ids) + return ids + + +def parquet_path( + outdir: Path, + size: int, + dist: str, + id_dtype: np.dtype, + layout: str, + row_group_size: int, + max_rows_per_page: int, + compression: str, +) -> Path: + return ( + outdir + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.layout-{layout}.rg-{row_group_size}.page-{max_rows_per_page}.codec-{compression}.parquet" + ) + + +def build_parquet_file( + size: int, + dist: str, + id_dtype: np.dtype, + path: Path, + *, + row_group_size: int, + max_rows_per_page: int, + compression: str, + write_page_index: bool, +) -> float: + 
path.parent.mkdir(parents=True, exist_ok=True) + if path.exists(): + path.unlink() + + schema = pa.schema([("id", pa.from_numpy_dtype(id_dtype)), ("payload", pa.float32())]) + block_order = _block_order(size, max_rows_per_page) if dist == "block-shuffled" else None + permuted_step, permuted_offset = _permuted_position_params(size) if dist == "permuted" else (1, 0) + random_ids = _randomized_ids(size, id_dtype) if dist == "random" else None + + start_time = time.perf_counter() + writer = pq.ParquetWriter( + path, + schema, + compression=compression, + write_statistics=True, + write_page_index=write_page_index, + max_rows_per_page=max_rows_per_page, + ) + try: + for start in range(0, size, row_group_size): + stop = min(start + row_group_size, size) + ids = np.empty(stop - start, dtype=id_dtype) + if dist == "sorted": + ids[:] = ordered_id_slice(size, start, stop, id_dtype) + elif dist == "block-shuffled": + _fill_block_shuffled_ids(ids, size, start, stop, max_rows_per_page, block_order) + elif dist == "permuted": + _fill_permuted_ids(ids, size, start, stop, permuted_step, permuted_offset) + elif dist == "random": + ids[:] = random_ids[start:stop] + else: + raise ValueError(f"unsupported distribution {dist!r}") + + payload = payload_slice(start, stop) + table = pa.table({"id": ids, "payload": payload}, schema=schema) + writer.write_table(table, row_group_size=row_group_size) + finally: + writer.close() + return time.perf_counter() - start_time + + +def _query_bounds(size: int, query_width: int, dtype: np.dtype) -> tuple[object, object]: + lo_idx = size // 2 + hi_idx = min(size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_id_at(size, lo_idx, dtype), ordered_id_at(size, hi_idx, dtype) + + +def benchmark_scan_once(path: Path, lo, hi) -> tuple[float, int]: + start = time.perf_counter() + table = pq.read_table(path, use_threads=True) + ids = table["id"].to_numpy() + mask = (ids >= lo) & (ids <= hi) + result_len = int(np.count_nonzero(mask)) + elapsed = 
time.perf_counter() - start + return elapsed, result_len + + +def benchmark_filtered_once(path: Path, lo, hi) -> tuple[float, int]: + start = time.perf_counter() + table = pq.read_table(path, filters=[("id", ">=", lo), ("id", "<=", hi)], use_threads=True) + ids = table["id"].to_numpy() + result_len = int(np.count_nonzero((ids >= lo) & (ids <= hi))) + elapsed = time.perf_counter() - start + return elapsed, result_len + + +def parquet_payload_bytes(path: Path) -> int: + metadata = pq.ParquetFile(path).metadata + payload = 0 + for row_group_idx in range(metadata.num_row_groups): + row_group = metadata.row_group(row_group_idx) + for column_idx in range(row_group.num_columns): + payload += int(row_group.column(column_idx).total_compressed_size) + return payload + + +def median(values: list[float]) -> float: + return statistics.median(values) + + +def benchmark_layout( + size: int, + outdir: Path, + dist: str, + query_width: int, + id_dtype: np.dtype, + layout: str, + row_group_size: int, + max_rows_per_page: int, + compression: str, + repeats: int, +) -> dict: + path = parquet_path(outdir, size, dist, id_dtype, layout, row_group_size, max_rows_per_page, compression) + write_page_index = layout == "page-index" + create_s = build_parquet_file( + size, + dist, + id_dtype, + path, + row_group_size=row_group_size, + max_rows_per_page=max_rows_per_page, + compression=compression, + write_page_index=write_page_index, + ) + lo, hi = _query_bounds(size, query_width, id_dtype) + + scan_times = [] + filtered_times = [] + scan_rows = None + filtered_rows = None + for _ in range(repeats): + scan_elapsed, scan_rows = benchmark_scan_once(path, lo, hi) + filtered_elapsed, filtered_rows = benchmark_filtered_once(path, lo, hi) + scan_times.append(scan_elapsed * 1_000) + filtered_times.append(filtered_elapsed * 1_000) + + if scan_rows != filtered_rows: + raise AssertionError(f"filtered rows mismatch: scan={scan_rows}, filtered={filtered_rows}") + + file_bytes = os.path.getsize(path) + 
payload_bytes = parquet_payload_bytes(path) + overhead_bytes = file_bytes - payload_bytes + + return { + "size": size, + "dist": dist, + "layout": layout, + "create_ms": create_s * 1_000, + "scan_ms": median(scan_times), + "filtered_ms": median(filtered_times), + "speedup": median(scan_times) / median(filtered_times), + "file_bytes": file_bytes, + "payload_bytes": payload_bytes, + "overhead_bytes": overhead_bytes, + "payload_pct": (payload_bytes / file_bytes * 100) if file_bytes else 0.0, + "overhead_pct": (overhead_bytes / file_bytes * 100) if file_bytes else 0.0, + "query_rows": int(filtered_rows), + "path": path, + } + + +def print_results( + results: list[dict], + *, + row_group_size: int, + max_rows_per_page: int, + repeats: int, + dist: str, + query_width: int, + id_dtype: np.dtype, + compression: str, +) -> None: + print("Parquet range-query benchmark via pyarrow filtered reads") + print( + f"row_group_size={row_group_size:,}, max_rows_per_page={max_rows_per_page:,}, repeats={repeats}, " + f"dist={dist}, query_width={query_width:,}, dtype={id_dtype.name}, compression={compression}" + ) + print("Note: filtered reads are measured with pyarrow.parquet.read_table(filters=...).") + print(" Pruning behavior depends on what the current PyArrow reader can exploit.") + print() + print( + f"{'rows':<10} {'dist':<8} {'layout':<11} {'create_ms':>12} {'scan_ms':>9} {'filtered_ms':>12} " + f"{'speedup':>9} {'file_bytes':>12} {'payload':>12} {'overhead':>12} {'query_rows':>11}" + ) + print( + f"{'-' * 10} {'-' * 8} {'-' * 11} {'-' * 12} {'-' * 9} {'-' * 12} {'-' * 9} {'-' * 12} {'-' * 12} {'-' * 12} {'-' * 11}" + ) + for row in results: + print( + f"{row['size']:<10,} {row['dist']:<8} {row['layout']:<11} {row['create_ms']:12.3f} " + f"{row['scan_ms']:9.3f} {row['filtered_ms']:12.3f} {row['speedup']:9.2f}x " + f"{row['file_bytes']:12,} {row['payload_bytes']:12,} {row['overhead_bytes']:12,} {row['query_rows']:11,}" + ) + + +def parse_human_int(value: str) -> int: + value = 
value.strip().lower().replace("_", "") + multipliers = {"k": 1_000, "m": 1_000_000} + if value[-1:] in multipliers: + return int(float(value[:-1]) * multipliers[value[-1]]) + return int(value) + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--size", default="10M", help="Number of rows, or 'all'. Default: 10M.") + parser.add_argument("--outdir", type=Path, required=True, help="Directory for generated Parquet files.") + parser.add_argument("--dist", choices=(*DISTS, "all"), default="permuted", help="Row distribution.") + parser.add_argument("--layout", choices=(*LAYOUTS, "all"), default="all", help="Parquet layout to benchmark.") + parser.add_argument("--query-width", type=parse_human_int, default=1, help="Query width. Default: 1.") + parser.add_argument("--dtype", default="float64", help="Indexed id dtype. Default: float64.") + parser.add_argument( + "--row-group-size", + type=parse_human_int, + default=DEFAULT_ROW_GROUP_SIZE, + help="Parquet row group size. Default: 1.25M.", + ) + parser.add_argument( + "--max-rows-per-page", + type=parse_human_int, + default=DEFAULT_MAX_ROWS_PER_PAGE, + help="Parquet max rows per page. Default: 10k.", + ) + parser.add_argument("--compression", default=DEFAULT_COMPRESSION, help="Parquet compression codec.") + parser.add_argument("--repeats", type=int, default=DEFAULT_REPEATS, help="Benchmark repeats. 
Default: 3.") + args = parser.parse_args() + + id_dtype = np.dtype(args.dtype) + sizes = SIZES if args.size == "all" else (parse_human_int(args.size),) + dists = DISTS if args.dist == "all" else (args.dist,) + layouts = LAYOUTS if args.layout == "all" else (args.layout,) + + results = [] + for size in sizes: + for dist in dists: + for layout in layouts: + results.append( + benchmark_layout( + size, + args.outdir, + dist, + args.query_width, + id_dtype, + layout, + args.row_group_size, + args.max_rows_per_page, + args.compression, + args.repeats, + ) + ) + + print_results( + results, + row_group_size=args.row_group_size, + max_rows_per_page=args.max_rows_per_page, + repeats=args.repeats, + dist=args.dist, + query_width=args.query_width, + id_dtype=id_dtype, + compression=args.compression, + ) + + +if __name__ == "__main__": + main() From fbb0d550a99c4a8ed089fb35ad941ae784767015 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 08:29:01 +0200 Subject: [PATCH 58/68] New version of indexes tutorial --- .../tutorials/14.indexing-arrays.ipynb | 135 +++++++++++------- 1 file changed, 84 insertions(+), 51 deletions(-) diff --git a/doc/getting_started/tutorials/14.indexing-arrays.ipynb b/doc/getting_started/tutorials/14.indexing-arrays.ipynb index 01f06507..a89d54ca 100644 --- a/doc/getting_started/tutorials/14.indexing-arrays.ipynb +++ b/doc/getting_started/tutorials/14.indexing-arrays.ipynb @@ -32,8 +32,8 @@ "id": "8c510216bc394cf9", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T04:14:37.432863Z", - "start_time": "2026-04-09T04:14:37.110770Z" + "end_time": "2026-04-09T06:27:37.470903Z", + "start_time": "2026-04-09T06:27:37.098590Z" } }, "source": [ @@ -119,13 +119,13 @@ "id": "d1a5a37585a045ca", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T04:14:51.428855Z", - "start_time": "2026-04-09T04:14:37.433671Z" + "end_time": "2026-04-09T06:27:52.097578Z", + "start_time": "2026-04-09T06:27:37.471828Z" } }, "source": [ "N_ROWS = 
10_000_000\n", - "MASK_TEXT = \"(id >= -25.0) & (id < 25.0)\"\n", + "MASK_TEXT = \"(id >= -5.0) & (id < 5.0)\"\n", "\n", "rng = np.random.default_rng(0)\n", "dtype = np.dtype([(\"id\", np.float64), (\"payload\", np.int32)])\n", @@ -157,10 +157,10 @@ "text": [ "Compressed base array size: 30.74 MiB\n", "kind build_ms index_size overhead\n", - "ultralight 45.783 142 B 0.00x\n", - "light 674.304 26.04 MiB 0.85x\n", - "medium 2195.323 34.99 MiB 1.14x\n", - "full 8483.835 28.44 MiB 0.93x\n" + "ultralight 45.528 142 B 0.00x\n", + "light 679.027 26.04 MiB 0.85x\n", + "medium 2342.959 34.99 MiB 1.14x\n", + "full 8925.948 28.44 MiB 0.93x\n" ] } ], @@ -181,8 +181,8 @@ "id": "f1b3aaec965b42d6", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T04:14:51.546053Z", - "start_time": "2026-04-09T04:14:51.449229Z" + "end_time": "2026-04-09T06:27:52.220533Z", + "start_time": "2026-04-09T06:27:52.120176Z" } }, "source": [ @@ -202,7 +202,7 @@ "output_type": "stream", "text": [ "{'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'medium', 'level': 'exact', 'lookup_path': 'chunk-nav', 'full_runs': 0}\n", - "Matched rows: 50\n" + "Matched rows: 10\n" ] } ], @@ -223,8 +223,8 @@ "id": "c9e932b7561b4ff4", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T04:14:53.105689Z", - "start_time": "2026-04-09T04:14:51.548648Z" + "end_time": "2026-04-09T06:27:53.696948Z", + "start_time": "2026-04-09T06:27:52.222040Z" } }, "source": [ @@ -243,9 +243,9 @@ " timing_rows.append((kind, scan_ms, index_ms, scan_ms / index_ms))\n", "\n", "print(f\"Selective mask over {N_ROWS:,} rows\")\n", - "print(f\"{'kind':<12} {'scan_ms':>10} {'index_ms':>10} {'speedup':>10}\")\n", + "print(f\"{'kind':<12} {'scan_ms':>11} {'index_ms':>10} {'speedup':>10}\")\n", "for kind, scan_ms, index_ms, speedup in timing_rows:\n", - " print(f\"{kind:<12} {scan_ms:10.3f} {index_ms:10.3f} {speedup:10.2f}x\")" + " print(f\"{kind:<12} {scan_ms:11.3f} {index_ms:10.3f} {speedup:10.2f}x\")" ], 
"outputs": [ { @@ -253,11 +253,11 @@ "output_type": "stream", "text": [ "Selective mask over 10,000,000 rows\n", - "kind scan_ms index_ms speedup\n", - "ultralight 70.429 67.914 1.04x\n", - "light 68.560 5.011 13.68x\n", - "medium 68.481 4.430 15.46x\n", - "full 68.408 4.263 16.05x\n" + "kind scan_ms index_ms speedup\n", + "ultralight 73.371 70.249 1.04x\n", + "light 65.966 1.478 44.63x\n", + "medium 65.349 1.253 52.16x\n", + "full 65.108 1.221 53.31x\n" ] } ], @@ -278,8 +278,8 @@ "id": "9ffcb0d8d06a4daa", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T04:14:53.160261Z", - "start_time": "2026-04-09T04:14:53.118529Z" + "end_time": "2026-04-09T06:27:53.735085Z", + "start_time": "2026-04-09T06:27:53.707924Z" } }, "source": [ @@ -323,8 +323,8 @@ "id": "7d337ce2f9fb4f32", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T04:14:53.206514Z", - "start_time": "2026-04-09T04:14:53.171092Z" + "end_time": "2026-04-09T06:27:53.759337Z", + "start_time": "2026-04-09T06:27:53.736407Z" } }, "source": [ @@ -370,12 +370,12 @@ "id": "0be5f512928f48db", "metadata": { "ExecuteTime": { - "end_time": "2026-04-09T04:14:55.722443Z", - "start_time": "2026-04-09T04:14:53.207978Z" + "end_time": "2026-04-09T06:27:58.801567Z", + "start_time": "2026-04-09T06:27:53.761336Z" } }, "source": [ - "persistent_arr = blosc2.asarray(data, urlpath=paths[0], mode=\"w\")\n", + "persistent_arr = data.copy(urlpath=paths[0], mode=\"w\")\n", "persistent_descriptor = persistent_arr.create_index(field=\"id\", kind=\"medium\")\n", "show_index_summary(\"persistent medium\", persistent_descriptor)\n", "\n", @@ -388,19 +388,9 @@ "name": "stdout", "output_type": "stream", "text": [ - "persistent medium: kind=medium, persistent=False, ooc=True, stale=False\n" - ] - }, - { - "ename": "FileNotFoundError", - "evalue": "No such file or directory: indexing_tutorial_medium.b2nd", - "output_type": "error", - "traceback": [ - 
"\u001B[31m---------------------------------------------------------------------------\u001B[39m", - "\u001B[31mFileNotFoundError\u001B[39m Traceback (most recent call last)", - "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[7]\u001B[39m\u001B[32m, line 5\u001B[39m\n\u001B[32m 2\u001B[39m persistent_descriptor = persistent_arr.create_index(field=\u001B[33m\"\u001B[39m\u001B[33mid\u001B[39m\u001B[33m\"\u001B[39m, kind=\u001B[33m\"\u001B[39m\u001B[33mmedium\u001B[39m\u001B[33m\"\u001B[39m)\n\u001B[32m 3\u001B[39m show_index_summary(\u001B[33m\"\u001B[39m\u001B[33mpersistent medium\u001B[39m\u001B[33m\"\u001B[39m, persistent_descriptor)\n\u001B[32m----> \u001B[39m\u001B[32m5\u001B[39m reopened = \u001B[43mblosc2\u001B[49m\u001B[43m.\u001B[49m\u001B[43mopen\u001B[49m\u001B[43m(\u001B[49m\u001B[43mpaths\u001B[49m\u001B[43m[\u001B[49m\u001B[32;43m0\u001B[39;49m\u001B[43m]\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mmode\u001B[49m\u001B[43m=\u001B[49m\u001B[33;43m\"\u001B[39;49m\u001B[33;43ma\u001B[39;49m\u001B[33;43m\"\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[32m 6\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mReopened index count: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00m\u001B[38;5;28mlen\u001B[39m(reopened.indexes)\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 7\u001B[39m \u001B[38;5;28mprint\u001B[39m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mPersisted sidecar path: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mreopened.indexes[\u001B[32m0\u001B[39m][\u001B[33m'\u001B[39m\u001B[33mreduced\u001B[39m\u001B[33m'\u001B[39m][\u001B[33m'\u001B[39m\u001B[33mvalues_path\u001B[39m\u001B[33m'\u001B[39m]\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n", - "\u001B[36mFile \u001B[39m\u001B[32m~/blosc/python-blosc2/src/blosc2/schunk.py:1779\u001B[39m, in \u001B[36mopen\u001B[39m\u001B[34m(urlpath, mode, offset, **kwargs)\u001B[39m\n\u001B[32m 1776\u001B[39m 
\u001B[38;5;28;01mreturn\u001B[39;00m special\n\u001B[32m 1778\u001B[39m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m os.path.exists(urlpath):\n\u001B[32m-> \u001B[39m\u001B[32m1779\u001B[39m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mFileNotFoundError\u001B[39;00m(\u001B[33mf\u001B[39m\u001B[33m\"\u001B[39m\u001B[33mNo such file or directory: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00murlpath\u001B[38;5;132;01m}\u001B[39;00m\u001B[33m\"\u001B[39m)\n\u001B[32m 1781\u001B[39m _set_default_dparams(kwargs)\n\u001B[32m 1782\u001B[39m res = blosc2_ext.open(urlpath, mode, offset, **kwargs)\n", - "\u001B[31mFileNotFoundError\u001B[39m: No such file or directory: indexing_tutorial_medium.b2nd" + "persistent medium: kind=medium, persistent=True, ooc=True, stale=False\n", + "Reopened index count: 1\n", + "Persisted sidecar path: indexing_tutorial_medium.__index__.id.medium.reduced.values.b2nd\n" ] } ], @@ -419,7 +409,12 @@ { "cell_type": "code", "id": "11f0cd1b910b409a", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:58.852040Z", + "start_time": "2026-04-09T06:27:58.814043Z" + } + }, "source": [ "mutable_arr = blosc2.arange(20, dtype=np.int64)\n", "mutable_arr.create_index(kind=\"full\")\n", @@ -429,8 +424,17 @@ "mutable_arr.rebuild_index()\n", "print(\"Stale after rebuild:\", mutable_arr.indexes[0][\"stale\"])" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stale after direct mutation: True\n", + "Stale after rebuild: False\n" + ] + } + ], + "execution_count": 8 }, { "cell_type": "markdown", @@ -454,7 +458,12 @@ { "cell_type": "code", "id": "2e1a47a9cf7246e6", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:59.968401Z", + "start_time": "2026-04-09T06:27:58.852830Z" + } + }, "source": [ "append_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int32)])\n", "base_rows = 200_000\n", 
@@ -492,8 +501,22 @@ "print(f\"Median mask time after compaction: {after_ms:.3f} ms\")\n", "print(f\"Speedup after compaction: {before_ms / after_ms:.2f}x\")" ], - "outputs": [], - "execution_count": null + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'run-bounded-ooc', 'full_runs': 40}\n", + "Pending runs: 40\n", + "Median mask time before compaction: 3.293 ms\n", + "After compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'compact-selective-ooc', 'full_runs': 0}\n", + "Pending runs: 0\n", + "Median mask time after compaction: 0.689 ms\n", + "Speedup after compaction: 4.78x\n" + ] + } + ], + "execution_count": 9 }, { "cell_type": "markdown", @@ -513,21 +536,31 @@ { "cell_type": "code", "id": "9833102355db4ec0", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:59.991418Z", + "start_time": "2026-04-09T06:27:59.978217Z" + } + }, "source": [ "for path in paths:\n", " blosc2.remove_urlpath(path)" ], "outputs": [], - "execution_count": null + "execution_count": 10 }, { "cell_type": "code", "id": "17489b2c3d2ac57", - "metadata": {}, + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:28:00.015548Z", + "start_time": "2026-04-09T06:27:59.998661Z" + } + }, "source": [], "outputs": [], - "execution_count": null + "execution_count": 10 } ], "metadata": { From 40fc4abb6bab1ff73e6c9e04d6e36cad9ee7fc5c Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 08:37:24 +0200 Subject: [PATCH 59/68] Honor copy-inducing kwargs in asarray for NDArray inputs --- src/blosc2/ndarray.py | 54 ++++++++++++++++++++++++++++++----- tests/ndarray/test_ndarray.py | 25 ++++++++++++++++ 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/src/blosc2/ndarray.py 
b/src/blosc2/ndarray.py index a6085057..a571152b 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -6327,6 +6327,32 @@ def save(array: NDArray, urlpath: str, contiguous=True, **kwargs: Any) -> None: array.save(urlpath, contiguous, **kwargs) +def _ndarray_asarray_requires_copy( + array: NDArray, dtype: np.dtype, chunks, blocks, user_kwargs: dict[str, Any] +) -> bool: + if np.dtype(dtype) != np.dtype(array.dtype): + return True + if "chunks" in user_kwargs and tuple(chunks) != tuple(array.chunks): + return True + if "blocks" in user_kwargs and tuple(blocks) != tuple(array.blocks): + return True + + copy_keys = { + "cparams", + "dparams", + "meta", + "urlpath", + "contiguous", + "mode", + "mmap_mode", + "initial_mapping_size", + "storage", + "out", + "_chunksize_reduc_factor", + } + return builtins.any(key in user_kwargs for key in copy_keys) + + def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: Any) -> NDArray: """Convert the `array` to an `NDArray`. @@ -6338,7 +6364,8 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: copy: bool | None, optional Whether to copy the input. If True, the function copies. If False, raise a ValueError if copy is necessary. If None and - input is NDArray, avoid copy by returning lazyexpr. + input is NDArray, return the original array when no dtype, + partition, or storage-related changes are requested. Default: None. kwargs: dict, optional @@ -6346,8 +6373,9 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: Returns ------- - out: :ref:`NDArray` or :ref:`LazyExpr` - An new NDArray or LazyExpr made of :paramref:`array`. + out: :ref:`NDArray` + A new :ref:`NDArray` made of :paramref:`array`, or the original + array when a copy is not required. 
Notes ----- @@ -6365,7 +6393,11 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: >>> a = np.arange(0, np.prod(shape), dtype=np.int64).reshape(shape) >>> # Create a NDArray from a NumPy array >>> nda = blosc2.asarray(a) + >>> # NDArray inputs are returned as-is unless a copy is requested + >>> blosc2.asarray(nda) is nda + True """ + user_kwargs = kwargs.copy() # Convert scalars to numpy array casting = kwargs.pop("casting", "unsafe") if casting != "unsafe": @@ -6373,7 +6405,7 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: if not hasattr(array, "shape"): array = np.asarray(array) # defaults if dtype=None dtype_ = blosc2.proxy.convert_dtype(array.dtype) - dtype = kwargs.pop("dtype", dtype_) # check if dtype provided + dtype = blosc2.proxy.convert_dtype(kwargs.pop("dtype", dtype_)) # check if dtype provided kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) @@ -6385,9 +6417,17 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: if blocks is None and hasattr(array, "blocks") and isinstance(array.blocks, tuple | list): blocks = array.blocks - copy = True if copy is None and not isinstance(array, NDArray) else copy + requires_copy = isinstance(array, NDArray) and _ndarray_asarray_requires_copy( + array, dtype, chunks, blocks, user_kwargs + ) + if copy is None: + copy = not isinstance(array, NDArray) or requires_copy + elif copy is False and requires_copy: + raise ValueError( + "Cannot satisfy dtype, partition, or storage changes with copy=False for NDArray input." + ) if copy: - chunks, blocks = compute_chunks_blocks(array.shape, chunks, blocks, dtype_, **kwargs) + chunks, blocks = compute_chunks_blocks(array.shape, chunks, blocks, dtype, **kwargs) # Fast path for small arrays. This is not too expensive in terms of memory consumption. 
shape = array.shape small_size = 2**24 # 16 MB @@ -6402,7 +6442,7 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: return blosc2_ext.asarray(array, chunks, blocks, **kwargs) # Create the empty array - ndarr = empty(shape, dtype_, chunks=chunks, blocks=blocks, **kwargs) + ndarr = empty(shape, dtype, chunks=chunks, blocks=blocks, **kwargs) behaved = are_partitions_behaved(shape, chunks, blocks) # Get the coordinates of the chunks diff --git a/tests/ndarray/test_ndarray.py b/tests/ndarray/test_ndarray.py index 820c881f..cbdf1af7 100644 --- a/tests/ndarray/test_ndarray.py +++ b/tests/ndarray/test_ndarray.py @@ -103,6 +103,31 @@ def test_asarray(a): np.testing.assert_allclose(a, b[:]) +def test_asarray_ndarray_persists_copy_when_urlpath_requested(tmp_path): + array = blosc2.asarray(np.arange(10, dtype=np.int64), chunks=(5,), blocks=(2,)) + path = tmp_path / "persisted_copy.b2nd" + + persisted = blosc2.asarray(array, urlpath=path, mode="w") + + assert persisted is not array + assert persisted.urlpath == str(path) + assert path.exists() + np.testing.assert_array_equal(persisted[:], array[:]) + + +def test_asarray_ndarray_copies_for_dtype_changes_and_rejects_copy_false(tmp_path): + array = blosc2.asarray(np.arange(10, dtype=np.int64), chunks=(5,), blocks=(2,)) + + cast = blosc2.asarray(array, dtype=np.float32) + + assert cast is not array + assert cast.dtype == np.float32 + np.testing.assert_allclose(cast[:], array[:].astype(np.float32)) + + with pytest.raises(ValueError, match="copy=False"): + blosc2.asarray(array, urlpath=tmp_path / "persisted_copy_false.b2nd", mode="w", copy=False) + + def test_ndarray_info_has_human_sizes(): array = blosc2.asarray(np.arange(16, dtype=np.int32)) From c5bbc40e38ff47fb28bb5a23ced3a61526b28e52 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 14:00:46 +0200 Subject: [PATCH 60/68] Add persistent query-result cache for indexed lookups MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit Implements a two-level cache for range/value queries on indexed NDArrays: - Hot cache: in-process LRU (≤128 KB) keyed by BLAKE2b digest of the query descriptor (expression, tokens, order). - Persistent cache: results stored as a VLArray in the same .b2nd file (vlmeta catalog + compressed payload), surviving process restarts. Cache is invalidated automatically on append, drop_index, compact_index, and mark_indexes_stale. Only persistent (on-disk) arrays use the persistent layer; in-memory arrays use the hot cache only. Wire-up in lazyexpr.slices_eval: - Value path (arr[cond][:]): check/store persistent + hot cache. - Indices path (.indices().compute()): check/store hot cache. - Cache hit short-circuits plan_query via a minimal IndexPlan. Add 38 new tests in tests/ndarray/test_indexing.py covering: - Cold/warm hits for both in-memory and on-disk arrays. - Cross-array isolation (no cache poisoning between arrays). - Invalidation on append and index rebuild. - Persistence across process-simulated reopens (clear hot cache). - LRU eviction and entry size cap enforcement. 
--- src/blosc2/indexing.py | 328 ++++++++++++++++++ src/blosc2/lazyexpr.py | 41 ++- tests/ndarray/test_indexing.py | 615 +++++++++++++++++++++++++++++++++ 3 files changed, 982 insertions(+), 2 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 46a3fd0b..06ef1e9c 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -8,6 +8,7 @@ from __future__ import annotations import ast +import contextlib import enum import hashlib import math @@ -51,6 +52,25 @@ _PERSISTENT_INDEXES: dict[tuple[str, str | int], dict] = {} _DATA_CACHE: dict[tuple[int, str | None, str, str], np.ndarray] = {} _SIDECAR_HANDLE_CACHE: dict[tuple[int, str | None, str, str], object] = {} + +# --------------------------------------------------------------------------- +# Query-result cache constants and global state +# --------------------------------------------------------------------------- +QUERY_CACHE_VLMETA_KEY = "_blosc2_query_cache" +QUERY_CACHE_FORMAT_VERSION = 1 +QUERY_CACHE_MAX_ENTRY_CBYTES = 4096 # 4 KB per persistent entry +QUERY_CACHE_MAX_MEM_CBYTES = 131_072 # 128 KB for the in-process hot cache +QUERY_CACHE_MAX_PERSISTENT_CBYTES = 2_147_483_648 # 2 GB for the payload store + +# In-process hot cache: digest -> decoded np.ndarray of coordinates. +_HOT_CACHE: dict[str, np.ndarray] = {} +# Insertion-order list for LRU eviction. +_HOT_CACHE_ORDER: list[str] = [] +# Total bytes of arrays currently in the hot cache. +_HOT_CACHE_BYTES: int = 0 +# Persistent VLArray handles: resolved urlpath -> open VLArray object. 
+_QUERY_CACHE_STORE_HANDLES: dict[str, object] = {} + FULL_OOC_RUN_ITEMS = 2_000_000 FULL_OOC_MERGE_BUFFER_ITEMS = 500_000 FULL_SELECTIVE_OOC_MAX_SPANS = 128 @@ -267,6 +287,309 @@ def _save_store(array: blosc2.NDArray, store: dict) -> None: _IN_MEMORY_INDEX_FINALIZERS.setdefault(key, weakref.finalize(array, _cleanup_in_memory_store, key)) +# --------------------------------------------------------------------------- +# Stage 1 – Query cache: metadata helpers and container plumbing +# --------------------------------------------------------------------------- + + +def _query_cache_payload_path(array: blosc2.NDArray) -> str: + """Return the path for the persistent query-cache VLArray payload store.""" + path, root = _sanitize_sidecar_root(array.urlpath) + return str(path.with_name(f"{root}.__query_cache__.b2frame")) + + +def _default_query_cache_catalog(payload_path: str) -> dict: + return { + "version": QUERY_CACHE_FORMAT_VERSION, + "payload_ref": {"kind": "urlpath", "version": 1, "urlpath": payload_path}, + "max_entry_cbytes": QUERY_CACHE_MAX_ENTRY_CBYTES, + "max_mem_cbytes": QUERY_CACHE_MAX_MEM_CBYTES, + "max_persistent_cbytes": QUERY_CACHE_MAX_PERSISTENT_CBYTES, + "persistent_cbytes": 0, + "next_slot": 0, + "entries": {}, + } + + +def _load_query_cache_catalog(array: blosc2.NDArray) -> dict | None: + """Read the query-cache catalog from *array* vlmeta, or return None.""" + if not _is_persistent_array(array): + return None + try: + cat = array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] + except KeyError: + return None + if not isinstance(cat, dict) or cat.get("version") != QUERY_CACHE_FORMAT_VERSION: + return None + return cat + + +def _save_query_cache_catalog(array: blosc2.NDArray, catalog: dict) -> None: + """Write *catalog* back to *array* vlmeta.""" + array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] = catalog + + +def _open_query_cache_store(array: blosc2.NDArray, *, create: bool = False): + """Return an open (writable) VLArray for the persistent payload store. 
+ + Returns ``None`` if the array is not persistent. When *create* is True the + store is created if it does not yet exist. + """ + if not _is_persistent_array(array): + return None + path = _query_cache_payload_path(array) + cached = _QUERY_CACHE_STORE_HANDLES.get(path) + if cached is not None: + return cached + if Path(path).exists(): + vla = blosc2.VLArray(storage=blosc2.Storage(urlpath=path, mode="a")) + _QUERY_CACHE_STORE_HANDLES[path] = vla + return vla + if not create: + return None + vla = blosc2.VLArray(storage=blosc2.Storage(urlpath=path, mode="w")) + _QUERY_CACHE_STORE_HANDLES[path] = vla + return vla + + +def _close_query_cache_store(path: str) -> None: + """Drop a cached VLArray handle for *path*.""" + _QUERY_CACHE_STORE_HANDLES.pop(path, None) + + +# --------------------------------------------------------------------------- +# Stage 2 – Cache key normalization +# --------------------------------------------------------------------------- + + +def _normalize_query_descriptor( + expression: str, + tokens: list[str], + order: list[str] | None, +) -> dict: + """Build a canonical, order-stable query descriptor for cache keying.""" + try: + normalized_expr = ast.unparse(ast.parse(expression, mode="eval")) + except Exception: + normalized_expr = expression + return { + "version": QUERY_CACHE_FORMAT_VERSION, + "kind": "indices", + "tokens": sorted(tokens), + "expr": normalized_expr, + "order": sorted(order) if order is not None else None, + } + + +def _query_cache_digest(descriptor: dict) -> str: + """Return a 32-character hex digest for *descriptor*.""" + import json + + canonical = json.dumps(descriptor, sort_keys=True, separators=(",", ":")) + return hashlib.blake2b(canonical.encode(), digest_size=16).hexdigest() + + +# --------------------------------------------------------------------------- +# Stage 3 – Payload encode/decode and hot/persistent cache helpers +# --------------------------------------------------------------------------- + + +def 
_encode_coords_payload(coords: np.ndarray) -> dict: + """Encode a coordinate array as a compact msgpack-safe mapping.""" + dtype = np.dtype(" np.ndarray: + """Reconstruct a coordinate array from a cached payload mapping.""" + return np.frombuffer(payload["data"], dtype=np.dtype(payload["dtype"])).copy() + + +def _hot_cache_get(digest: str) -> np.ndarray | None: + """Return the cached coordinate array for *digest*, or ``None``.""" + arr = _HOT_CACHE.get(digest) + if arr is None: + return None + # Move to most-recently-used position. + with contextlib.suppress(ValueError): + _HOT_CACHE_ORDER.remove(digest) + _HOT_CACHE_ORDER.append(digest) + return arr + + +def _hot_cache_put(digest: str, coords: np.ndarray) -> None: + """Insert *coords* into the hot cache, evicting LRU entries if needed.""" + global _HOT_CACHE_BYTES + entry_bytes = coords.nbytes + if entry_bytes > QUERY_CACHE_MAX_MEM_CBYTES: + # Single entry too large; skip. + return + # If already present, remove old accounting first. + if digest in _HOT_CACHE: + _HOT_CACHE_BYTES -= _HOT_CACHE[digest].nbytes + with contextlib.suppress(ValueError): + _HOT_CACHE_ORDER.remove(digest) + # Evict LRU entries until there is room. 
+ while _HOT_CACHE_ORDER and _HOT_CACHE_BYTES + entry_bytes > QUERY_CACHE_MAX_MEM_CBYTES: + oldest = _HOT_CACHE_ORDER.pop(0) + evicted = _HOT_CACHE.pop(oldest, None) + if evicted is not None: + _HOT_CACHE_BYTES -= evicted.nbytes + _HOT_CACHE[digest] = coords + _HOT_CACHE_ORDER.append(digest) + _HOT_CACHE_BYTES += entry_bytes + + +def _hot_cache_clear() -> None: + """Clear all in-process hot cache entries.""" + global _HOT_CACHE_BYTES + _HOT_CACHE.clear() + _HOT_CACHE_ORDER.clear() + _HOT_CACHE_BYTES = 0 + + +def _persistent_cache_lookup(array: blosc2.NDArray, digest: str) -> np.ndarray | None: + """Return coordinates from the persistent cache for *digest*, or ``None``.""" + catalog = _load_query_cache_catalog(array) + if catalog is None: + return None + entry = catalog.get("entries", {}).get(digest) + if entry is None: + return None + slot = entry["slot"] + store = _open_query_cache_store(array) + if store is None or slot >= len(store): + return None + payload = store[slot] + if not isinstance(payload, dict) or payload.get("version") != QUERY_CACHE_FORMAT_VERSION: + return None + try: + coords = _decode_coords_payload(payload) + except Exception: + return None + return coords + + +def _persistent_cache_insert( + array: blosc2.NDArray, + digest: str, + coords: np.ndarray, + query_descriptor: dict, +) -> bool: + """Append *coords* to the persistent cache and update the catalog. + + Returns ``True`` on success, ``False`` if the entry is too large or the + persistent budget is exceeded. + """ + catalog = _load_query_cache_catalog(array) + payload_path = _query_cache_payload_path(array) + if catalog is None: + catalog = _default_query_cache_catalog(payload_path) + + payload_mapping = _encode_coords_payload(coords) + raw_data = payload_mapping["data"] + + # Measure the compressed size of the coordinate bytes directly so the + # per-entry limit is independent of VLArray/msgpack encoding overhead. 
+ coord_dtype = np.dtype(payload_mapping["dtype"]) + compressed_coords = blosc2.compress2(raw_data, cparams=blosc2.CParams(typesize=coord_dtype.itemsize)) + cbytes = len(compressed_coords) + + max_entry = catalog.get("max_entry_cbytes", QUERY_CACHE_MAX_ENTRY_CBYTES) + if cbytes > max_entry: + return False + + max_persistent = catalog.get("max_persistent_cbytes", QUERY_CACHE_MAX_PERSISTENT_CBYTES) + current_persistent = int(catalog.get("persistent_cbytes", 0)) + if current_persistent + cbytes > max_persistent: + return False + + store = _open_query_cache_store(array, create=True) + if store is None: + return False + + slot = len(store) + store.append(payload_mapping) + + catalog["entries"][digest] = { + "slot": slot, + "cbytes": cbytes, + "nrows": len(coords), + "dtype": payload_mapping["dtype"], + "query": query_descriptor, + } + catalog["persistent_cbytes"] = current_persistent + cbytes + catalog["next_slot"] = slot + 1 + _save_query_cache_catalog(array, catalog) + return True + + +# --------------------------------------------------------------------------- +# Stage 5 – Query cache invalidation +# --------------------------------------------------------------------------- + + +def _invalidate_query_cache(array: blosc2.NDArray) -> None: + """Drop the entire query cache for *array* (persistent file + hot cache).""" + if not _is_persistent_array(array): + _hot_cache_clear() + return + payload_path = _query_cache_payload_path(array) + _close_query_cache_store(payload_path) + blosc2.remove_urlpath(payload_path) + # Clear the catalog in vlmeta. 
+ with contextlib.suppress(KeyError, Exception): + del array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] + _hot_cache_clear() + + +# --------------------------------------------------------------------------- +# Public helper: cached coordinate lookup (used by lazyexpr.py integration) +# --------------------------------------------------------------------------- + + +def get_cached_coords( + array: blosc2.NDArray, + expression: str, + tokens: list[str], + order: list[str] | None, +) -> np.ndarray | None: + """Return cached coordinates for *expression*/*tokens*/*order*, or ``None``.""" + descriptor = _normalize_query_descriptor(expression, tokens, order) + digest = _query_cache_digest(descriptor) + # 1. In-process hot cache. + coords = _hot_cache_get(digest) + if coords is not None: + return coords + # 2. Persistent cache (persistent arrays only). + if _is_persistent_array(array): + coords = _persistent_cache_lookup(array, digest) + if coords is not None: + _hot_cache_put(digest, coords) + return coords + return None + + +def store_cached_coords( + array: blosc2.NDArray, + expression: str, + tokens: list[str], + order: list[str] | None, + coords: np.ndarray, +) -> None: + """Store *coords* in both the hot cache and (if persistent) the payload store.""" + descriptor = _normalize_query_descriptor(expression, tokens, order) + digest = _query_cache_digest(descriptor) + _hot_cache_put(digest, coords) + if _is_persistent_array(array): + _persistent_cache_insert(array, digest, coords, descriptor) + + def _supported_index_dtype(dtype: np.dtype) -> bool: return np.dtype(dtype).kind in {"b", "i", "u", "f", "m", "M"} @@ -2983,6 +3306,7 @@ def append_to_indexes(array: blosc2.NDArray, old_size: int, appended_values: np. 
descriptor["blocks"] = tuple(array.blocks) descriptor["stale"] = False _save_store(array, store) + _invalidate_query_cache(array) def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> None: @@ -2993,6 +3317,7 @@ def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None descriptor = store["indexes"].pop(token) _save_store(array, store) _drop_descriptor_sidecars(descriptor) + _invalidate_query_cache(array) def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> dict: @@ -3077,6 +3402,7 @@ def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | N _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) _clear_full_merge_cache(array, descriptor["token"]) _save_store(array, store) + _invalidate_query_cache(array) return _copy_descriptor(descriptor) dtype = np.dtype(descriptor["dtype"]) @@ -3112,6 +3438,7 @@ def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | N _clear_full_merge_cache(array, descriptor["token"]) _save_store(array, store) + _invalidate_query_cache(array) return _copy_descriptor(descriptor) @@ -3137,6 +3464,7 @@ def mark_indexes_stale(array: blosc2.NDArray) -> None: changed = True if changed: _save_store(array, store) + _invalidate_query_cache(array) def _descriptor_for(array: blosc2.NDArray, field: str | None) -> dict | None: diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index c8e7c85f..4abde075 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1844,19 +1844,56 @@ def slices_eval( # noqa: C901 if where is not None and len(where) == 1 and use_index and _slice == (): from . 
import indexing - index_plan = indexing.plan_query(expression, operands, where, use_index=use_index) + _cache_array = where["_where_x"] + _cache_tokens = [indexing.SELF_TARGET_NAME] + + # --- Ordered path --- if _order is not None: ordered_plan = indexing.plan_ordered_query(expression, operands, where, _order) if ordered_plan.usable: + cached_coords = indexing.get_cached_coords(_cache_array, expression, _cache_tokens, _order) + if cached_coords is not None: + return cached_coords ordered_positions = indexing.ordered_query_indices(expression, operands, where, _order) if ordered_positions is not None: + indexing.store_cached_coords( + _cache_array, expression, _cache_tokens, _order, ordered_positions + ) return ordered_positions elif indexing.is_expression_order(where["_where_x"], _order): raise ValueError("expression order requires a matching full expression index") + + # --- Indices-only path (.indices().compute()) --- + if _indices and _order is None: + cached_coords = indexing.get_cached_coords(_cache_array, expression, _cache_tokens, None) + if cached_coords is not None: + return cached_coords + + # --- Value-returning path (arr[cond][:]) — cache check before plan_query --- + # Only cache for persistent arrays: in-memory arrays use id() which can be + # reused after GC, making stale hot-cache hits possible. 
+ _cache_urlpath = getattr(_cache_array, "urlpath", None) or getattr( + getattr(_cache_array, "ndarr", None), "urlpath", None + ) + if not _indices and _order is None and _cache_urlpath is not None: + cached_coords = indexing.get_cached_coords(_cache_array, expression, _cache_tokens, None) + if cached_coords is not None: + cached_plan = indexing.IndexPlan( + usable=True, reason="cache-hit", base=_cache_array, exact_positions=cached_coords + ) + return indexing.evaluate_full_query(where, cached_plan) + + index_plan = indexing.plan_query(expression, operands, where, use_index=use_index) + if _indices and _order is None and index_plan.usable and index_plan.exact_positions is not None: - return np.asarray(index_plan.exact_positions, dtype=np.int64) + coords = np.asarray(index_plan.exact_positions, dtype=np.int64) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return coords if index_plan.usable and not (_indices or _order): if index_plan.exact_positions is not None: + if _cache_urlpath is not None: + coords = np.asarray(index_plan.exact_positions, dtype=np.int64) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) return indexing.evaluate_full_query(where, index_plan) if index_plan.bucket_masks is not None: return indexing.evaluate_light_query(expression, operands, ne_args, where, index_plan) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 2523e358..ace222cf 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -4,12 +4,15 @@ # # SPDX-License-Identifier: BSD-3-Clause ####################################################################### +import gc import math +from pathlib import Path import numpy as np import pytest import blosc2 +import blosc2.indexing as indexing @pytest.mark.parametrize("kind", ["ultralight", "light", "medium", "full"]) @@ -1070,3 +1073,615 @@ def guarded_load_full_arrays(*args, **kwargs): snapshot = arr[:] 
expected = snapshot[(np.abs(snapshot["x"]) >= 22) & (np.abs(snapshot["x"]) <= 25)] np.testing.assert_array_equal(expr.compute()[:], expected) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_persistent_array(tmpdir, n=50_000): + """Create a persistent structured NDArray with a full index.""" + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + data = np.empty(n, dtype=dtype) + data["id"] = np.arange(n, dtype=np.int64) + data["val"] = np.linspace(0, 1, n, dtype=np.float32) + urlpath = str(Path(tmpdir) / "arr.b2nd") + arr = blosc2.asarray(data, chunks=(5_000,), blocks=(1_000,), urlpath=urlpath, mode="w") + arr.create_index(field="id", kind="full") + return arr, urlpath + + +def _make_scalar_persistent_array(tmpdir, n=50_000): + """Create a persistent 1-D int64 NDArray with a full index.""" + data = np.arange(n, dtype=np.int64) + urlpath = str(Path(tmpdir) / "scalar.b2nd") + arr = blosc2.asarray(data, chunks=(5_000,), blocks=(1_000,), urlpath=urlpath, mode="w") + arr.create_index(kind="full") + return arr, urlpath + + +def _clear_caches(): + """Clear all in-process index and query caches between tests.""" + indexing._hot_cache_clear() + indexing._QUERY_CACHE_STORE_HANDLES.clear() + indexing._PERSISTENT_INDEXES.clear() + + +# --------------------------------------------------------------------------- +# Stage 2 – Cache key normalization +# --------------------------------------------------------------------------- + + +def test_canonical_digest_is_stable(): + """The same query always hashes to the same digest.""" + d1 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], None) + d2 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], None) + assert indexing._query_cache_digest(d1) == indexing._query_cache_digest(d2) + + +def test_canonical_digest_differs_on_expression_change(): 
+ d1 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], None) + d2 = indexing._normalize_query_descriptor("(id >= 3) & (id < 7)", ["__self__"], None) + assert indexing._query_cache_digest(d1) != indexing._query_cache_digest(d2) + + +def test_canonical_digest_differs_on_order_change(): + d1 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], None) + d2 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], ["id"]) + assert indexing._query_cache_digest(d1) != indexing._query_cache_digest(d2) + + +def test_ast_normalization_ignores_whitespace(): + """ast.unparse normalizes whitespace so queries match regardless of spacing.""" + d1 = indexing._normalize_query_descriptor("(id>=3)&(id<6)", ["__self__"], None) + d2 = indexing._normalize_query_descriptor("( id >= 3 ) & ( id < 6 )", ["__self__"], None) + assert indexing._query_cache_digest(d1) == indexing._query_cache_digest(d2) + + +# --------------------------------------------------------------------------- +# Stage 3 – Payload encode / decode +# --------------------------------------------------------------------------- + + +def test_encode_decode_roundtrip_u4(): + coords = np.array([0, 5, 100, 200], dtype=np.int64) + payload = indexing._encode_coords_payload(coords) + assert payload["dtype"] == " 131072); expect oldest evicted. + entry_size = 100 + for i in range(165): + coords = np.arange(entry_size, dtype=np.int64) + indexing._hot_cache_put(f"key{i}", coords) + + # First keys should have been evicted. + assert indexing._hot_cache_get("key0") is None + # Most recent keys should still be present. 
+ assert indexing._hot_cache_get("key164") is not None + assert indexing._HOT_CACHE_BYTES <= indexing.QUERY_CACHE_MAX_MEM_CBYTES + + +def test_hot_cache_clear(): + _clear_caches() + indexing._hot_cache_put("k1", np.array([1, 2, 3], dtype=np.int64)) + indexing._hot_cache_clear() + assert indexing._hot_cache_get("k1") is None + assert indexing._HOT_CACHE_BYTES == 0 + + +# --------------------------------------------------------------------------- +# Stage 4 – End-to-end: cache miss then hit (in-memory array, hot cache only) +# --------------------------------------------------------------------------- + + +def test_in_memory_array_hot_cache_hit(): + """A second identical .indices().compute() reuses the hot cache.""" + _clear_caches() + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + data = np.empty(30_000, dtype=dtype) + data["id"] = np.arange(30_000, dtype=np.int64) + data["val"] = np.zeros(30_000, dtype=np.float32) + arr = blosc2.asarray(data, chunks=(3_000,), blocks=(600,)) + arr.create_index(field="id", kind="full") + + expr = blosc2.lazyexpr("(id >= 10_000) & (id < 15_000)", arr.fields).where(arr) + result1 = expr.indices().compute() + + assert indexing._HOT_CACHE_BYTES > 0, "hot cache should be populated after first query" + + result2 = expr.indices().compute() + np.testing.assert_array_equal(result1, result2) + + +# --------------------------------------------------------------------------- +# Stage 4 – Persistent cache: cross-session hit +# --------------------------------------------------------------------------- + + +def test_persistent_cache_survives_reopen(tmp_path): + """After reopening the array the persistent cache should serve the result.""" + arr, urlpath = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 20_000) & (id < 25_000)", arr.fields).where(arr) + result1 = expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists(), "persistent 
payload store should be created" + + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + # Re-open the array in a fresh process-local state. + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + result2 = blosc2.lazyexpr("(id >= 20_000) & (id < 25_000)", arr2.fields).where(arr2).indices().compute() + + np.testing.assert_array_equal(result1, result2) + + +def test_persistent_cache_not_created_for_non_persistent_array(): + _clear_caches() + data = np.arange(10_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(1_000,), blocks=(200,)) + arr.create_index(kind="full") + result = indexing._persistent_cache_lookup(arr, "any_digest") + assert result is None + + +# --------------------------------------------------------------------------- +# Stage 3 – 4 KB per-entry size limit +# --------------------------------------------------------------------------- + + +def test_persistent_entry_size_limit_rejected(tmp_path): + """Entries whose compressed size exceeds 4 KB must not be stored.""" + arr, _ = _make_persistent_array(tmp_path, n=50_000) + _clear_caches() + + # Random (non-sequential) coordinates compress poorly and should exceed 4 KB. + rng = np.random.default_rng(42) + coords = np.sort(rng.choice(50_000, size=5_000, replace=False)).astype(np.int64) + + # Verify this is actually > 4KB compressed with the same method used internally. 
+ payload_mapping = indexing._encode_coords_payload(coords) + raw_data = payload_mapping["data"] + coord_dtype = np.dtype(payload_mapping["dtype"]) + compressed = blosc2.compress2(raw_data, cparams=blosc2.CParams(typesize=coord_dtype.itemsize)) + assert len(compressed) > indexing.QUERY_CACHE_MAX_ENTRY_CBYTES, ( + f"test setup error: compressed size {len(compressed)} must exceed " + f"{indexing.QUERY_CACHE_MAX_ENTRY_CBYTES} for this test to be meaningful" + ) + + descriptor = indexing._normalize_query_descriptor("(id >= 0) & (id < 50000)", ["__self__"], None) + digest = indexing._query_cache_digest(descriptor) + + result = indexing._persistent_cache_insert(arr, digest, coords, descriptor) + assert result is False, "oversized entry must be rejected" + + +# --------------------------------------------------------------------------- +# Stage 5 – Invalidation +# --------------------------------------------------------------------------- + + +def test_invalidation_on_drop_index(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists() + + arr.drop_index() + assert not Path(payload_path).exists(), "payload file should be removed after drop_index" + assert indexing._HOT_CACHE_BYTES == 0 + assert indexing._load_query_cache_catalog(arr) is None + + +def test_invalidation_on_rebuild_index(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists() + + arr.rebuild_index() + assert not Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES == 0 + + +def test_invalidation_on_compact_index(tmp_path): + arr, _ = 
_make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + arr.compact_index() + assert not Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES == 0 + + +def test_invalidation_on_mark_indexes_stale(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists() + + indexing.mark_indexes_stale(arr) + assert not Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES == 0 + + +def test_invalidation_on_append(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists() + + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + extra = np.empty(1_000, dtype=dtype) + extra["id"] = np.arange(50_000, 51_000, dtype=np.int64) + extra["val"] = np.zeros(1_000, dtype=np.float32) + arr.append(extra) + # append calls append_to_indexes which calls _invalidate_query_cache. 
+ assert not Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES == 0 + + +# --------------------------------------------------------------------------- +# Stage 4 – Ordered-coordinate query caching +# --------------------------------------------------------------------------- + + +def test_ordered_query_indices_cached(tmp_path): + """Ordered .indices(order=...).compute() results are cached and reused.""" + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + lazy = blosc2.lazyexpr("(id >= 10_000) & (id < 20_000)", arr.fields).where(arr) + result1 = lazy.indices(order="id").compute() + + assert indexing._HOT_CACHE_BYTES > 0 + + _clear_caches() + arr2 = blosc2.open(arr.urlpath, mode="r") + result2 = ( + blosc2.lazyexpr("(id >= 10_000) & (id < 20_000)", arr2.fields) + .where(arr2) + .indices(order="id") + .compute() + ) + + np.testing.assert_array_equal(result1, result2) + + +# --------------------------------------------------------------------------- +# Stage 4 – Multiple distinct queries stored in same array cache +# --------------------------------------------------------------------------- + + +def test_multiple_distinct_queries_in_same_cache(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr1 = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr2 = blosc2.lazyexpr("(id >= 20_000) & (id < 25_000)", arr.fields).where(arr) + + r1 = expr1.indices().compute() + r2 = expr2.indices().compute() + + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 2 + + # Verify both results are consistent with scan. 
+ dtype = arr.dtype + data = arr[:] + np.testing.assert_array_equal(r1, np.where((data["id"] >= 5_000) & (data["id"] < 10_000))[0]) + np.testing.assert_array_equal(r2, np.where((data["id"] >= 20_000) & (data["id"] < 25_000))[0]) + + +# --------------------------------------------------------------------------- +# Stage 4 – In-memory (hot cache only) for structured array query +# --------------------------------------------------------------------------- + + +def test_hot_cache_avoids_recompute(tmp_path): + """Second call returns cached result without re-planning the index.""" + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 10_000) & (id < 12_000)", arr.fields).where(arr) + result1 = expr.indices().compute() + hot_bytes_after_first = indexing._HOT_CACHE_BYTES + assert hot_bytes_after_first > 0 + + result2 = expr.indices().compute() + # Hot cache should not have grown (same digest, same entry). + assert hot_bytes_after_first == indexing._HOT_CACHE_BYTES + np.testing.assert_array_equal(result1, result2) + + +# --------------------------------------------------------------------------- +# Value-path (arr[cond][:]) caching for persistent arrays +# --------------------------------------------------------------------------- + + +def test_value_path_cache_hit_persistent(tmp_path): + """arr[cond][:] on a persistent full-indexed array caches coords and serves warm calls.""" + arr, urlpath = _make_persistent_array(tmp_path) + _clear_caches() + + cond = blosc2.lazyexpr("(id >= 10_000) & (id < 12_000)", arr.fields) + result1 = arr[cond][:] + + # After first call, cache should have an entry. + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + # Warm call: serve from cache. 
+ _clear_caches() # only clears hot cache; persistent VLArray remains + arr2 = blosc2.open(urlpath, mode="r") + cond2 = blosc2.lazyexpr("(id >= 10_000) & (id < 12_000)", arr2.fields) + result2 = arr2[cond2][:] + + np.testing.assert_array_equal(result1, result2) + # Verify against scan. + data = arr[:] + expected = data[(data["id"] >= 10_000) & (data["id"] < 12_000)] + np.testing.assert_array_equal(result1, expected) + + +# =========================================================================== +# In-memory vs on-disk cache scenarios (value path: arr[cond][:]) +# =========================================================================== + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +def _make_structured_array(tmpdir=None, n=20_000, kind="full"): + """Create a structured NDArray (persistent if tmpdir, in-memory otherwise).""" + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + data = np.empty(n, dtype=dtype) + data["id"] = np.arange(n, dtype=np.int64) + data["val"] = np.linspace(0.0, 1.0, n, dtype=np.float32) + kwargs = {} + if tmpdir is not None: + kwargs["urlpath"] = str(Path(tmpdir) / f"arr_{kind}.b2nd") + kwargs["mode"] = "w" + arr = blosc2.asarray(data, chunks=(2_000,), blocks=(500,), **kwargs) + arr.create_index(field="id", kind=kind) + return arr + + +def _make_scalar_array(tmpdir=None, n=20_000, kind="full"): + """Create a 1-D int64 NDArray (persistent if tmpdir, in-memory otherwise).""" + data = np.arange(n, dtype=np.int64) + kwargs = {} + if tmpdir is not None: + kwargs["urlpath"] = str(Path(tmpdir) / f"scalar_{kind}.b2nd") + kwargs["mode"] = "w" + arr = blosc2.asarray(data, chunks=(2_000,), blocks=(500,), **kwargs) + arr.create_index(kind=kind) + return arr + + +def _value_query(arr, lo=5_000, hi=7_000): + """Run arr[cond][:] and return the values.""" + cond = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", 
arr.fields) + return arr[cond][:] + + +def _scalar_value_query(arr, lo=5_000, hi=7_000): + """Run arr[cond][:] for a scalar (non-structured) array.""" + cond = (arr >= lo) & (arr < hi) + return arr[cond][:] + + +# --------------------------------------------------------------------------- +# In-memory arrays – value path +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("kind", ["full", "medium", "light"]) +def test_inmem_value_path_correct(kind): + """In-memory value-path queries return correct results for all index kinds.""" + arr = _make_structured_array(kind=kind) + _clear_caches() + + result = _value_query(arr) + data = arr[:] + expected = data[(data["id"] >= 5_000) & (data["id"] < 7_000)] + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize("kind", ["full", "medium", "light"]) +def test_inmem_value_path_repeated_calls_stable(kind): + """Repeated in-memory value-path calls on the same object are stable.""" + arr = _make_structured_array(kind=kind) + _clear_caches() + + r1 = _value_query(arr) + r2 = _value_query(arr) + np.testing.assert_array_equal(r1, r2) + + +def test_inmem_value_path_no_cross_array_contamination(): + """Different in-memory arrays with the same expression never share cache entries. + + This guards against the Python id() address-reuse bug: when array A is GC'd + and array B reuses the same address, a stale hot-cache hit must not occur. 
+ """ + # int32 array: values 0..19999; query value 137 → exactly 1 match + arr_i32 = blosc2.asarray(np.arange(20_000, dtype=np.int32), chunks=(2_000,), blocks=(500,)) + arr_i32.create_index(kind="full") + _clear_caches() + cond_i32 = arr_i32 == np.int32(137) + r1 = arr_i32[cond_i32][:] + assert len(r1) == 1, "int32 query should find exactly 1 match" + + # GC the first array so Python may reuse its id() + del arr_i32, cond_i32 + gc.collect() + + # uint8 array with same values 0..19999 (wraps every 256): 137 matches 78 times + arr_u8 = blosc2.asarray(np.arange(20_000, dtype=np.uint8), chunks=(2_000,), blocks=(500,)) + arr_u8.create_index(kind="full") + cond_u8 = arr_u8 == np.uint8(137) + r2 = arr_u8[cond_u8][:] + expected_count = int(np.sum(np.arange(20_000, dtype=np.uint8) == 137)) + assert len(r2) == expected_count, f"Expected {expected_count} matches for uint8==137, got {len(r2)}" + + +# --------------------------------------------------------------------------- +# On-disk arrays – value path +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("kind", ["full", "medium", "light"]) +def test_ondisk_value_path_correct(tmp_path, kind): + """On-disk value-path queries return correct results for all index kinds.""" + arr = _make_structured_array(tmp_path, kind=kind) + _clear_caches() + + result = _value_query(arr) + data = arr[:] + expected = data[(data["id"] >= 5_000) & (data["id"] < 7_000)] + np.testing.assert_array_equal(result, expected) + + +def test_ondisk_value_path_full_warm_hits_cache(tmp_path): + """After the first on-disk full-index value query, warm calls use the cache.""" + arr = _make_structured_array(tmp_path, kind="full") + urlpath = arr.urlpath + _clear_caches() + + # Cold call – populates persistent cache + r1 = _value_query(arr) + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + # Warm call after clearing hot cache 
(simulates a new process re-opening the file) + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + r2 = _value_query(arr2) + np.testing.assert_array_equal(r1, r2) + + +@pytest.mark.parametrize("kind", ["medium", "light"]) +def test_ondisk_value_path_non_full_correct(tmp_path, kind): + """Light/medium on-disk value queries are correct (no coord caching, but correct).""" + arr = _make_structured_array(tmp_path, kind=kind) + _clear_caches() + + r1 = _value_query(arr) + r2 = _value_query(arr) # repeated call + data = arr[:] + expected = data[(data["id"] >= 5_000) & (data["id"] < 7_000)] + np.testing.assert_array_equal(r1, expected) + np.testing.assert_array_equal(r2, expected) + + +# --------------------------------------------------------------------------- +# On-disk arrays – indices path (.indices().compute()) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("kind", ["full"]) +def test_ondisk_indices_path_warm_hits_cache(tmp_path, kind): + """After the first on-disk .indices().compute(), warm calls use the cache.""" + arr = _make_structured_array(tmp_path, kind=kind) + urlpath = arr.urlpath + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr.fields).where(arr) + r1 = expr.indices().compute() + + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + expr2 = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr2.fields).where(arr2) + r2 = expr2.indices().compute() + + np.testing.assert_array_equal(r1, r2) + # Verify against scan. 
+ data = arr[:] + expected = np.where((data["id"] >= 5_000) & (data["id"] < 7_000))[0] + np.testing.assert_array_equal(r1, expected) + + +# --------------------------------------------------------------------------- +# In-memory arrays – indices path (.indices().compute()) +# --------------------------------------------------------------------------- + + +def test_inmem_indices_path_hot_cache_hit(): + """Second .indices().compute() call on an in-memory array is served from hot cache.""" + arr = _make_structured_array(kind="full") + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr.fields).where(arr) + r1 = expr.indices().compute() + hot_before = indexing._HOT_CACHE_BYTES + + r2 = expr.indices().compute() + assert hot_before == indexing._HOT_CACHE_BYTES # no new entry added + np.testing.assert_array_equal(r1, r2) + + data = arr[:] + expected = np.where((data["id"] >= 5_000) & (data["id"] < 7_000))[0] + np.testing.assert_array_equal(r1, expected) From b112f3a003d07d5225f8c63d531fd1424a90d9bc Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 14:37:18 +0200 Subject: [PATCH 61/68] Fix query-result cache isolation and non-exact warm reuse Scope hot query-cache entries per array instead of by digest alone, so different arrays no longer share cached coordinates incorrectly. Preserve ordered query field sequence in cache keys and handle empty coordinate results safely. Restore in-memory value-path caching for arr[cond][:] now that hot-cache entries are array-scoped. Extend light and segment evaluators to return exact positions when needed so value/indices queries can populate the cache even when the planner does not start with exact_positions. 
Add regressions for: - cross-array hot-cache contamination - ordered cache-key collisions - empty cached results - in-memory GC cleanup of scoped hot-cache entries - warm cache hits for light/ultralight/medium/full value-path queries --- src/blosc2/indexing.py | 271 +++++++++++++++++++++++++-------- src/blosc2/lazyexpr.py | 44 ++++-- tests/ndarray/test_indexing.py | 144 +++++++++++++++++- 3 files changed, 380 insertions(+), 79 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 06ef1e9c..c477bb2d 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -62,14 +62,15 @@ QUERY_CACHE_MAX_MEM_CBYTES = 131_072 # 128 KB for the in-process hot cache QUERY_CACHE_MAX_PERSISTENT_CBYTES = 2_147_483_648 # 2 GB for the payload store -# In-process hot cache: digest -> decoded np.ndarray of coordinates. -_HOT_CACHE: dict[str, np.ndarray] = {} +# In-process hot cache: (array-scope, digest) -> decoded np.ndarray of coordinates. +_HOT_CACHE: dict[tuple[tuple[str, str | int], str], np.ndarray] = {} # Insertion-order list for LRU eviction. -_HOT_CACHE_ORDER: list[str] = [] +_HOT_CACHE_ORDER: list[tuple[tuple[str, str | int], str]] = [] # Total bytes of arrays currently in the hot cache. _HOT_CACHE_BYTES: int = 0 # Persistent VLArray handles: resolved urlpath -> open VLArray object. 
_QUERY_CACHE_STORE_HANDLES: dict[str, object] = {} +_HOT_CACHE_GLOBAL_SCOPE = ("global", 0) FULL_OOC_RUN_ITEMS = 2_000_000 FULL_OOC_MERGE_BUFFER_ITEMS = 500_000 @@ -93,6 +94,7 @@ def _sanitize_token(token: str) -> str: def _cleanup_in_memory_store(key: int) -> None: _IN_MEMORY_INDEXES.pop(key, None) _IN_MEMORY_INDEX_FINALIZERS.pop(key, None) + _hot_cache_clear(scope=("memory", key)) @dataclass(slots=True) @@ -298,6 +300,24 @@ def _query_cache_payload_path(array: blosc2.NDArray) -> str: return str(path.with_name(f"{root}.__query_cache__.b2frame")) +def _query_cache_owner(array: blosc2.NDArray) -> blosc2.NDArray: + owner = getattr(array, "ndarr", None) + return owner if owner is not None else array + + +def _ensure_in_memory_array_finalizer(array: blosc2.NDArray) -> None: + if _is_persistent_array(array): + return + key = id(array) + _IN_MEMORY_INDEX_FINALIZERS.setdefault(key, weakref.finalize(array, _cleanup_in_memory_store, key)) + + +def _query_cache_scope(array: blosc2.NDArray) -> tuple[str, str | int]: + owner = _query_cache_owner(array) + _ensure_in_memory_array_finalizer(owner) + return _array_key(owner) + + def _default_query_cache_catalog(payload_path: str) -> dict: return { "version": QUERY_CACHE_FORMAT_VERSION, @@ -377,7 +397,7 @@ def _normalize_query_descriptor( "kind": "indices", "tokens": sorted(tokens), "expr": normalized_expr, - "order": sorted(order) if order is not None else None, + "order": list(order) if order is not None else None, } @@ -396,7 +416,10 @@ def _query_cache_digest(descriptor: dict) -> str: def _encode_coords_payload(coords: np.ndarray) -> dict: """Encode a coordinate array as a compact msgpack-safe mapping.""" - dtype = np.dtype(" np.ndarray: return np.frombuffer(payload["data"], dtype=np.dtype(payload["dtype"])).copy() -def _hot_cache_get(digest: str) -> np.ndarray | None: +def _hot_cache_key( + digest: str, scope: tuple[str, str | int] | None = None +) -> tuple[tuple[str, str | int], str]: + return (_HOT_CACHE_GLOBAL_SCOPE if 
scope is None else scope, digest) + + +def _hot_cache_get(digest: str, scope: tuple[str, str | int] | None = None) -> np.ndarray | None: """Return the cached coordinate array for *digest*, or ``None``.""" - arr = _HOT_CACHE.get(digest) + key = _hot_cache_key(digest, scope) + arr = _HOT_CACHE.get(key) if arr is None: return None # Move to most-recently-used position. with contextlib.suppress(ValueError): - _HOT_CACHE_ORDER.remove(digest) - _HOT_CACHE_ORDER.append(digest) + _HOT_CACHE_ORDER.remove(key) + _HOT_CACHE_ORDER.append(key) return arr -def _hot_cache_put(digest: str, coords: np.ndarray) -> None: +def _hot_cache_put(digest: str, coords: np.ndarray, scope: tuple[str, str | int] | None = None) -> None: """Insert *coords* into the hot cache, evicting LRU entries if needed.""" global _HOT_CACHE_BYTES + key = _hot_cache_key(digest, scope) entry_bytes = coords.nbytes if entry_bytes > QUERY_CACHE_MAX_MEM_CBYTES: # Single entry too large; skip. return # If already present, remove old accounting first. - if digest in _HOT_CACHE: - _HOT_CACHE_BYTES -= _HOT_CACHE[digest].nbytes + if key in _HOT_CACHE: + _HOT_CACHE_BYTES -= _HOT_CACHE[key].nbytes with contextlib.suppress(ValueError): - _HOT_CACHE_ORDER.remove(digest) + _HOT_CACHE_ORDER.remove(key) # Evict LRU entries until there is room. 
while _HOT_CACHE_ORDER and _HOT_CACHE_BYTES + entry_bytes > QUERY_CACHE_MAX_MEM_CBYTES: oldest = _HOT_CACHE_ORDER.pop(0) evicted = _HOT_CACHE.pop(oldest, None) if evicted is not None: _HOT_CACHE_BYTES -= evicted.nbytes - _HOT_CACHE[digest] = coords - _HOT_CACHE_ORDER.append(digest) + _HOT_CACHE[key] = coords + _HOT_CACHE_ORDER.append(key) _HOT_CACHE_BYTES += entry_bytes -def _hot_cache_clear() -> None: - """Clear all in-process hot cache entries.""" +def _hot_cache_clear(scope: tuple[str, str | int] | None = None) -> None: + """Clear all in-process hot cache entries for *scope* (or all scopes).""" global _HOT_CACHE_BYTES + if scope is not None: + keys = [key for key in _HOT_CACHE if key[0] == scope] + for key in keys: + _HOT_CACHE_BYTES -= _HOT_CACHE.pop(key).nbytes + _HOT_CACHE_ORDER[:] = [key for key in _HOT_CACHE_ORDER if key[0] != scope] + return _HOT_CACHE.clear() _HOT_CACHE_ORDER.clear() _HOT_CACHE_BYTES = 0 @@ -536,8 +573,9 @@ def _persistent_cache_insert( def _invalidate_query_cache(array: blosc2.NDArray) -> None: """Drop the entire query cache for *array* (persistent file + hot cache).""" + scope = _query_cache_scope(array) if not _is_persistent_array(array): - _hot_cache_clear() + _hot_cache_clear(scope=scope) return payload_path = _query_cache_payload_path(array) _close_query_cache_store(payload_path) @@ -545,7 +583,7 @@ def _invalidate_query_cache(array: blosc2.NDArray) -> None: # Clear the catalog in vlmeta. 
with contextlib.suppress(KeyError, Exception): del array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] - _hot_cache_clear() + _hot_cache_clear(scope=scope) # --------------------------------------------------------------------------- @@ -560,17 +598,19 @@ def get_cached_coords( order: list[str] | None, ) -> np.ndarray | None: """Return cached coordinates for *expression*/*tokens*/*order*, or ``None``.""" + owner = _query_cache_owner(array) + scope = _query_cache_scope(owner) descriptor = _normalize_query_descriptor(expression, tokens, order) digest = _query_cache_digest(descriptor) # 1. In-process hot cache. - coords = _hot_cache_get(digest) + coords = _hot_cache_get(digest, scope=scope) if coords is not None: return coords # 2. Persistent cache (persistent arrays only). - if _is_persistent_array(array): - coords = _persistent_cache_lookup(array, digest) + if _is_persistent_array(owner): + coords = _persistent_cache_lookup(owner, digest) if coords is not None: - _hot_cache_put(digest, coords) + _hot_cache_put(digest, coords, scope=scope) return coords return None @@ -583,11 +623,13 @@ def store_cached_coords( coords: np.ndarray, ) -> None: """Store *coords* in both the hot cache and (if persistent) the payload store.""" + owner = _query_cache_owner(array) + scope = _query_cache_scope(owner) descriptor = _normalize_query_descriptor(expression, tokens, order) digest = _query_cache_digest(descriptor) - _hot_cache_put(digest, coords) - if _is_persistent_array(array): - _persistent_cache_insert(array, digest, coords, descriptor) + _hot_cache_put(digest, coords, scope=scope) + if _is_persistent_array(owner): + _persistent_cache_insert(owner, digest, coords, descriptor) def _supported_index_dtype(dtype: np.dtype) -> bool: @@ -4516,9 +4558,15 @@ def _light_match_from_span(span: np.ndarray, plan: IndexPlan) -> np.ndarray: def _process_light_chunk_batch( - chunk_ids: np.ndarray, where_x, plan: IndexPlan, total_len: int, chunk_len: int -) -> np.ndarray: - parts = [] + chunk_ids: 
np.ndarray, + where_x, + plan: IndexPlan, + total_len: int, + chunk_len: int, + return_positions: bool = False, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: + value_parts = [] + position_parts = [] local_where_x = _light_worker_source(where_x) for chunk_id in chunk_ids: bucket_mask = plan.bucket_masks[int(chunk_id)] @@ -4538,10 +4586,14 @@ def _process_light_chunk_batch( span = local_where_x[start:stop] match = _light_match_from_span(span, plan) if np.any(match): - parts.append(np.require(span[match], requirements="C")) - if not parts: + value_parts.append(np.require(span[match], requirements="C")) + if return_positions: + position_parts.append(np.flatnonzero(match).astype(np.int64, copy=False) + start) + if return_positions: + return _merge_value_position_batches(value_parts, position_parts, _light_batch_result_dtype(where_x)) + if not value_parts: return np.empty(0, dtype=_light_batch_result_dtype(where_x)) - return np.concatenate(parts) if len(parts) > 1 else parts[0] + return np.concatenate(value_parts) if len(value_parts) > 1 else value_parts[0] def _merge_result_batches(parts: list[np.ndarray], dtype: np.dtype) -> np.ndarray: @@ -4551,6 +4603,70 @@ def _merge_result_batches(parts: list[np.ndarray], dtype: np.dtype) -> np.ndarra return np.concatenate(parts) if len(parts) > 1 else parts[0] +def _merge_value_position_batches( + value_batches: list[np.ndarray], position_batches: list[np.ndarray], dtype: np.dtype +) -> tuple[np.ndarray, np.ndarray]: + return _merge_result_batches(value_batches, dtype), _merge_position_batches(position_batches) + + +def _merge_segment_query_batches( + parts: list[np.ndarray] | list[tuple[np.ndarray, np.ndarray]], + dtype: np.dtype, + *, + return_positions: bool, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: + if return_positions: + value_batches = [] + position_batches = [] + for values, positions in parts: + if len(values) > 0: + value_batches.append(values) + if len(positions) > 0: + position_batches.append(positions) + 
return _merge_value_position_batches(value_batches, position_batches, dtype) + + value_batches = [part for part in parts if len(part) > 0] + if value_batches: + return np.concatenate(value_batches) if len(value_batches) > 1 else value_batches[0] + return np.empty(0, dtype=dtype) + + +def _process_segment_query_batch( + units: np.ndarray, + expression: str, + operands: dict, + ne_args: dict, + where: dict, + plan: IndexPlan, + result_dtype: np.dtype, + return_positions: bool, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: + from .lazyexpr import _get_result, ne_evaluate + from .utils import get_chunk_operands + + chunk_operands = {} + value_parts = [] + position_parts = [] + for unit in units: + start = int(unit) * plan.segment_len + stop = min(start + plan.segment_len, plan.base.shape[0]) + cslice = (slice(start, stop, 1),) + get_chunk_operands(operands, cslice, chunk_operands, plan.base.shape) + if return_positions: + match = ne_evaluate(expression, chunk_operands, **ne_args) + if np.any(match): + value_parts.append(np.require(chunk_operands["_where_x"][match], requirements="C")) + absolute = np.arange(start, stop, dtype=np.int64) + position_parts.append(absolute[match]) + else: + result, _ = _get_result(expression, chunk_operands, ne_args, where) + if len(result) > 0: + value_parts.append(np.require(result, requirements="C")) + if return_positions: + return _merge_value_position_batches(value_parts, position_parts, result_dtype) + return _merge_result_batches(value_parts, result_dtype) + + def _reduced_positions_from_cython_batches( candidate_chunk_ids: np.ndarray, thread_count: int, process_batch ) -> tuple[np.ndarray, int]: @@ -5131,48 +5247,63 @@ def _where_output_dtype(where_x) -> np.dtype: def evaluate_segment_query( - expression: str, operands: dict, ne_args: dict, where: dict, plan: IndexPlan -) -> np.ndarray: - from .lazyexpr import _get_result - from .utils import get_chunk_operands - + expression: str, + operands: dict, + ne_args: dict, + where: 
dict, + plan: IndexPlan, + *, + return_positions: bool = False, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: if plan.base is None or plan.candidate_units is None or plan.segment_len is None: raise ValueError("segment evaluation requires a segment-based plan") candidate_units = np.flatnonzero(plan.candidate_units).astype(np.intp, copy=False) - - def process_batch(units: np.ndarray) -> np.ndarray: - chunk_operands = {} - parts = [] - for unit in units: - start = int(unit) * plan.segment_len - stop = min(start + plan.segment_len, plan.base.shape[0]) - cslice = (slice(start, stop, 1),) - get_chunk_operands(operands, cslice, chunk_operands, plan.base.shape) - result, _ = _get_result(expression, chunk_operands, ne_args, where) - if len(result) > 0: - parts.append(np.require(result, requirements="C")) - if not parts: - return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) - return np.concatenate(parts) if len(parts) > 1 else parts[0] + result_dtype = _where_output_dtype(where["_where_x"]) thread_count = _downstream_query_thread_count(len(candidate_units), plan) if thread_count <= 1: - parts = [process_batch(candidate_units)] + parts = [ + _process_segment_query_batch( + candidate_units, + expression, + operands, + ne_args, + where, + plan, + result_dtype, + return_positions=return_positions, + ) + ] else: batches = _chunk_batches(candidate_units, thread_count) with ThreadPoolExecutor(max_workers=thread_count) as executor: - parts = list(executor.map(process_batch, batches)) + parts = list( + executor.map( + _process_segment_query_batch, + batches, + [expression] * len(batches), + [operands] * len(batches), + [ne_args] * len(batches), + [where] * len(batches), + [plan] * len(batches), + [result_dtype] * len(batches), + [return_positions] * len(batches), + ) + ) - parts = [part for part in parts if len(part) > 0] - if parts: - return np.concatenate(parts) if len(parts) > 1 else parts[0] - return np.empty(0, dtype=_where_output_dtype(where["_where_x"])) + 
return _merge_segment_query_batches(parts, result_dtype, return_positions=return_positions) def evaluate_light_query( - expression: str, operands: dict, ne_args: dict, where: dict, plan: IndexPlan -) -> np.ndarray: + expression: str, + operands: dict, + ne_args: dict, + where: dict, + plan: IndexPlan, + *, + return_positions: bool = False, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: del expression, operands, ne_args if plan.base is None or plan.bucket_masks is None or plan.chunk_len is None or plan.bucket_len is None: @@ -5182,10 +5313,15 @@ def evaluate_light_query( chunk_len = int(plan.base.chunks[0]) where_x = where["_where_x"] candidate_chunk_ids = np.flatnonzero(np.any(plan.bucket_masks, axis=1)).astype(np.intp, copy=False) + result_dtype = _where_output_dtype(where["_where_x"]) thread_count = _downstream_query_thread_count(len(candidate_chunk_ids), plan) if thread_count <= 1: - parts = [_process_light_chunk_batch(candidate_chunk_ids, where_x, plan, total_len, chunk_len)] + parts = [ + _process_light_chunk_batch( + candidate_chunk_ids, where_x, plan, total_len, chunk_len, return_positions + ) + ] else: batches = _chunk_batches(candidate_chunk_ids, thread_count) with ThreadPoolExecutor(max_workers=thread_count) as executor: @@ -5197,10 +5333,21 @@ def evaluate_light_query( [plan] * len(batches), [total_len] * len(batches), [chunk_len] * len(batches), + [return_positions] * len(batches), ) ) - return _merge_result_batches(parts, _where_output_dtype(where["_where_x"])) + if return_positions: + value_batches = [] + position_batches = [] + for values, positions in parts: + if len(values) > 0: + value_batches.append(values) + if len(positions) > 0: + position_batches.append(positions) + return _merge_value_position_batches(value_batches, position_batches, result_dtype) + + return _merge_result_batches(parts, result_dtype) def _gather_positions(where_x, positions: np.ndarray) -> np.ndarray: diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 
4abde075..5e53a97b 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1870,12 +1870,10 @@ def slices_eval( # noqa: C901 return cached_coords # --- Value-returning path (arr[cond][:]) — cache check before plan_query --- - # Only cache for persistent arrays: in-memory arrays use id() which can be - # reused after GC, making stale hot-cache hits possible. _cache_urlpath = getattr(_cache_array, "urlpath", None) or getattr( getattr(_cache_array, "ndarr", None), "urlpath", None ) - if not _indices and _order is None and _cache_urlpath is not None: + if not _indices and _order is None: cached_coords = indexing.get_cached_coords(_cache_array, expression, _cache_tokens, None) if cached_coords is not None: cached_plan = indexing.IndexPlan( @@ -1885,20 +1883,40 @@ def slices_eval( # noqa: C901 index_plan = indexing.plan_query(expression, operands, where, use_index=use_index) - if _indices and _order is None and index_plan.usable and index_plan.exact_positions is not None: - coords = np.asarray(index_plan.exact_positions, dtype=np.int64) - indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) - return coords + if _indices and _order is None and index_plan.usable: + if index_plan.exact_positions is not None: + coords = np.asarray(index_plan.exact_positions, dtype=np.int64) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return coords + if index_plan.bucket_masks is not None: + _, coords = indexing.evaluate_light_query( + expression, operands, ne_args, where, index_plan, return_positions=True + ) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return coords + if index_plan.candidate_units is not None and index_plan.segment_len is not None: + _, coords = indexing.evaluate_segment_query( + expression, operands, ne_args, where, index_plan, return_positions=True + ) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + 
return coords if index_plan.usable and not (_indices or _order): if index_plan.exact_positions is not None: - if _cache_urlpath is not None: - coords = np.asarray(index_plan.exact_positions, dtype=np.int64) - indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + coords = np.asarray(index_plan.exact_positions, dtype=np.int64) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) return indexing.evaluate_full_query(where, index_plan) if index_plan.bucket_masks is not None: - return indexing.evaluate_light_query(expression, operands, ne_args, where, index_plan) - if index_plan.level not in (None, "chunk"): - return indexing.evaluate_segment_query(expression, operands, ne_args, where, index_plan) + result, coords = indexing.evaluate_light_query( + expression, operands, ne_args, where, index_plan, return_positions=True + ) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return result + if index_plan.candidate_units is not None and index_plan.segment_len is not None: + result, coords = indexing.evaluate_segment_query( + expression, operands, ne_args, where, index_plan, return_positions=True + ) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return result for chunk_slice in intersecting_chunks: # Check whether current cslice intersects with _slice diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index ace222cf..cf813b22 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -1132,6 +1132,12 @@ def test_canonical_digest_differs_on_order_change(): assert indexing._query_cache_digest(d1) != indexing._query_cache_digest(d2) +def test_canonical_digest_preserves_order_field_sequence(): + d1 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], ["a", "b"]) + d2 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], ["b", "a"]) + assert 
indexing._query_cache_digest(d1) != indexing._query_cache_digest(d2) + + def test_ast_normalization_ignores_whitespace(): """ast.unparse normalizes whitespace so queries match regardless of spacing.""" d1 = indexing._normalize_query_descriptor("(id>=3)&(id<6)", ["__self__"], None) @@ -1160,6 +1166,14 @@ def test_encode_decode_roundtrip_u8(): np.testing.assert_array_equal(recovered, coords.astype(np.uint64)) +def test_encode_decode_roundtrip_empty(): + coords = np.array([], dtype=np.int64) + payload = indexing._encode_coords_payload(coords) + assert payload["dtype"] == "= 1)", arr.fields).where(arr) + ordered_ab = expr.indices(order=["a", "b"]).compute()[:] + ordered_ba = expr.indices(order=["b", "a"]).compute()[:] + + np.testing.assert_array_equal(ordered_ab, np.argsort(data, order=["a", "b"])) + np.testing.assert_array_equal(ordered_ba, np.argsort(data, order=["b", "a"])) + + # --------------------------------------------------------------------------- # Stage 4 – Multiple distinct queries stored in same array cache # --------------------------------------------------------------------------- @@ -1537,7 +1578,7 @@ def _scalar_value_query(arr, lo=5_000, hi=7_000): # --------------------------------------------------------------------------- -@pytest.mark.parametrize("kind", ["full", "medium", "light"]) +@pytest.mark.parametrize("kind", ["ultralight", "full", "medium", "light"]) def test_inmem_value_path_correct(kind): """In-memory value-path queries return correct results for all index kinds.""" arr = _make_structured_array(kind=kind) @@ -1549,7 +1590,7 @@ def test_inmem_value_path_correct(kind): np.testing.assert_array_equal(result, expected) -@pytest.mark.parametrize("kind", ["full", "medium", "light"]) +@pytest.mark.parametrize("kind", ["ultralight", "full", "medium", "light"]) def test_inmem_value_path_repeated_calls_stable(kind): """Repeated in-memory value-path calls on the same object are stable.""" arr = _make_structured_array(kind=kind) @@ -1560,6 +1601,21 
@@ def test_inmem_value_path_repeated_calls_stable(kind): np.testing.assert_array_equal(r1, r2) +@pytest.mark.parametrize("kind", ["ultralight", "full", "medium", "light"]) +def test_inmem_value_path_hot_cache_hit(kind): + """Second in-memory arr[cond][:] call should reuse the scoped hot cache.""" + arr = _make_structured_array(kind=kind) + _clear_caches() + + r1 = _value_query(arr) + hot_before = indexing._HOT_CACHE_BYTES + assert hot_before > 0 + + r2 = _value_query(arr) + assert hot_before == indexing._HOT_CACHE_BYTES + np.testing.assert_array_equal(r1, r2) + + def test_inmem_value_path_no_cross_array_contamination(): """Different in-memory arrays with the same expression never share cache entries. @@ -1592,7 +1648,7 @@ def test_inmem_value_path_no_cross_array_contamination(): # --------------------------------------------------------------------------- -@pytest.mark.parametrize("kind", ["full", "medium", "light"]) +@pytest.mark.parametrize("kind", ["ultralight", "full", "medium", "light"]) def test_ondisk_value_path_correct(tmp_path, kind): """On-disk value-path queries return correct results for all index kinds.""" arr = _make_structured_array(tmp_path, kind=kind) @@ -1623,9 +1679,28 @@ def test_ondisk_value_path_full_warm_hits_cache(tmp_path): np.testing.assert_array_equal(r1, r2) +@pytest.mark.parametrize("kind", ["ultralight", "light"]) +def test_ondisk_value_path_non_exact_warm_hits_cache(tmp_path, kind): + """Ultralight/light on-disk value queries should populate the coordinate cache.""" + arr = _make_structured_array(tmp_path, kind=kind) + urlpath = arr.urlpath + _clear_caches() + + r1 = _value_query(arr) + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + r2 = _value_query(arr2) + + np.testing.assert_array_equal(r1, r2) + + @pytest.mark.parametrize("kind", ["medium", "light"]) def 
test_ondisk_value_path_non_full_correct(tmp_path, kind): - """Light/medium on-disk value queries are correct (no coord caching, but correct).""" + """Light/medium on-disk value queries are correct.""" arr = _make_structured_array(tmp_path, kind=kind) _clear_caches() @@ -1685,3 +1760,64 @@ def test_inmem_indices_path_hot_cache_hit(): data = arr[:] expected = np.where((data["id"] >= 5_000) & (data["id"] < 7_000))[0] np.testing.assert_array_equal(r1, expected) + + +def test_inmem_indices_cache_entries_are_dropped_on_gc(): + arr = _make_structured_array(kind="full") + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr.fields).where(arr) + result = expr.indices().compute() + assert result.shape[0] == 2_000 + assert indexing._HOT_CACHE_BYTES > 0 + + del expr, result, arr + gc.collect() + + assert indexing._HOT_CACHE_BYTES == 0 + assert indexing._HOT_CACHE == {} + + +def test_ondisk_indices_path_no_cross_array_hot_cache_contamination(tmp_path): + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + data1 = np.empty(1_000, dtype=dtype) + data2 = np.empty(1_000, dtype=dtype) + data1["id"] = np.arange(1_000, dtype=np.int64) + data2["id"] = np.arange(1_000, dtype=np.int64) + 1_000 + data1["val"] = 0 + data2["val"] = 0 + + arr1 = blosc2.asarray(data1, urlpath=tmp_path / "arr1.b2nd", mode="w", chunks=(200,), blocks=(50,)) + arr2 = blosc2.asarray(data2, urlpath=tmp_path / "arr2.b2nd", mode="w", chunks=(200,), blocks=(50,)) + arr1.create_index(field="id", kind="full") + arr2.create_index(field="id", kind="full") + _clear_caches() + + expr1 = blosc2.lazyexpr("(id >= 10) & (id < 20)", arr1.fields).where(arr1) + expr2 = blosc2.lazyexpr("(id >= 10) & (id < 20)", arr2.fields).where(arr2) + + r1 = expr1.indices().compute()[:] + r2 = expr2.indices().compute()[:] + + np.testing.assert_array_equal(r1, np.arange(10, 20, dtype=np.int64)) + assert r2.size == 0 + + +def test_ondisk_empty_indices_result_cached(tmp_path): + arr, urlpath = 
_make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 60_000) & (id < 61_000)", arr.fields).where(arr) + result1 = expr.indices().compute()[:] + assert result1.size == 0 + + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + result2 = ( + blosc2.lazyexpr("(id >= 60_000) & (id < 61_000)", arr2.fields).where(arr2).indices().compute()[:] + ) + assert result2.size == 0 From 038b6f64052a2f2d9a32ef07b41b405e59b9243d Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 14:44:45 +0200 Subject: [PATCH 62/68] Implement FIFO pruning for persistent query cache Add persistent-cache pruning when a new query result would exceed the configured max_persistent_cbytes budget. Rebuild the query-cache VLArray with FIFO retention of the newest entries, append the new payload, and rewrite the vlmeta slot map with updated persistent_cbytes and next_slot. Skip duplicate persistent inserts for digests that are already cached, and add regression coverage that forces a tiny cache budget to exercise the rebuild path without allocating large files. Update the query-cache plan document to reflect the shipped behavior: scoped per-array hot cache, non-exact warm reuse on value/indices paths, and completed Stage 6 FIFO pruning. 
--- src/blosc2/indexing.py | 100 ++++++++++++++++++++++++++++++--- tests/ndarray/test_indexing.py | 45 +++++++++++++++ 2 files changed, 137 insertions(+), 8 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index c477bb2d..9831dfb5 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -512,6 +512,82 @@ def _persistent_cache_lookup(array: blosc2.NDArray, digest: str) -> np.ndarray | return coords +def _query_cache_entry_cbytes(payload_mapping: dict) -> int: + """Return the compressed coordinate payload size used for budget accounting.""" + coord_dtype = np.dtype(payload_mapping["dtype"]) + compressed_coords = blosc2.compress2( + payload_mapping["data"], cparams=blosc2.CParams(typesize=coord_dtype.itemsize) + ) + return len(compressed_coords) + + +def _query_cache_entries_fifo(catalog: dict) -> list[tuple[str, dict]]: + """Return catalog entries ordered from oldest to newest insertion.""" + entries = catalog.get("entries", {}) + return sorted(entries.items(), key=lambda item: int(item[1]["slot"])) + + +def _query_cache_rebuild_store( + array: blosc2.NDArray, + catalog: dict, + retained_entries: list[tuple[str, dict]], + appended: tuple[str, dict, dict, int] | None = None, +) -> bool: + """Rewrite the persistent store with retained FIFO entries and an optional appended entry.""" + payload_path = _query_cache_payload_path(array) + temp_path = f"{payload_path}.tmp" + _close_query_cache_store(payload_path) + _close_query_cache_store(temp_path) + blosc2.remove_urlpath(temp_path) + + old_store = _open_query_cache_store(array) + temp_store = blosc2.VLArray(storage=blosc2.Storage(urlpath=temp_path, mode="w")) + new_entries = {} + persistent_cbytes = 0 + slot = 0 + + try: + for digest, entry in retained_entries: + if old_store is None or int(entry["slot"]) >= len(old_store): + continue + payload = old_store[int(entry["slot"])] + if not isinstance(payload, dict) or payload.get("version") != QUERY_CACHE_FORMAT_VERSION: + continue + 
temp_store.append(payload) + updated = entry.copy() + updated["slot"] = slot + new_entries[digest] = updated + persistent_cbytes += int(updated["cbytes"]) + slot += 1 + + if appended is not None: + digest, payload_mapping, query_descriptor, cbytes = appended + temp_store.append(payload_mapping) + new_entries[digest] = { + "slot": slot, + "cbytes": cbytes, + "nrows": payload_mapping["nrows"], + "dtype": payload_mapping["dtype"], + "query": query_descriptor, + } + persistent_cbytes += cbytes + slot += 1 + finally: + del temp_store + del old_store + _close_query_cache_store(payload_path) + _close_query_cache_store(temp_path) + + blosc2.remove_urlpath(payload_path) + os.replace(temp_path, payload_path) + + catalog["entries"] = new_entries + catalog["persistent_cbytes"] = persistent_cbytes + catalog["next_slot"] = slot + _save_query_cache_catalog(array, catalog) + return True + + def _persistent_cache_insert( array: blosc2.NDArray, digest: str, @@ -527,15 +603,11 @@ def _persistent_cache_insert( payload_path = _query_cache_payload_path(array) if catalog is None: catalog = _default_query_cache_catalog(payload_path) + elif digest in catalog.get("entries", {}): + return True payload_mapping = _encode_coords_payload(coords) - raw_data = payload_mapping["data"] - - # Measure the compressed size of the coordinate bytes directly so the - # per-entry limit is independent of VLArray/msgpack encoding overhead. 
- coord_dtype = np.dtype(payload_mapping["dtype"]) - compressed_coords = blosc2.compress2(raw_data, cparams=blosc2.CParams(typesize=coord_dtype.itemsize)) - cbytes = len(compressed_coords) + cbytes = _query_cache_entry_cbytes(payload_mapping) max_entry = catalog.get("max_entry_cbytes", QUERY_CACHE_MAX_ENTRY_CBYTES) if cbytes > max_entry: @@ -544,7 +616,19 @@ def _persistent_cache_insert( max_persistent = catalog.get("max_persistent_cbytes", QUERY_CACHE_MAX_PERSISTENT_CBYTES) current_persistent = int(catalog.get("persistent_cbytes", 0)) if current_persistent + cbytes > max_persistent: - return False + retained_entries = _query_cache_entries_fifo(catalog) + retained_cbytes = current_persistent + while retained_entries and retained_cbytes + cbytes > max_persistent: + _, oldest = retained_entries.pop(0) + retained_cbytes -= int(oldest["cbytes"]) + if retained_cbytes + cbytes > max_persistent: + return False + return _query_cache_rebuild_store( + array, + catalog, + retained_entries, + appended=(digest, payload_mapping, query_descriptor, cbytes), + ) store = _open_query_cache_store(array, create=True) if store is None: diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index cf813b22..aa66fbb6 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -1319,6 +1319,51 @@ def test_persistent_entry_size_limit_rejected(tmp_path): assert result is False, "oversized entry must be rejected" +def test_persistent_cache_prunes_oldest_entries_and_rebuilds_slots(tmp_path, monkeypatch): + arr, urlpath = _make_persistent_array(tmp_path, n=8_000) + _clear_caches() + + rng = np.random.default_rng(123) + payloads = [] + for i in range(3): + coords = np.sort(rng.choice(8_000, size=256, replace=False)).astype(np.int64) + descriptor = indexing._normalize_query_descriptor( + f"(id >= {i}) & (id < {i + 1})", ["__self__"], None + ) + digest = indexing._query_cache_digest(descriptor) + payload_mapping = indexing._encode_coords_payload(coords) 
+ cbytes = indexing._query_cache_entry_cbytes(payload_mapping) + payloads.append((digest, descriptor, coords, cbytes)) + + budget = max(payloads[0][3] + payloads[1][3], payloads[1][3] + payloads[2][3]) + monkeypatch.setattr(indexing, "QUERY_CACHE_MAX_PERSISTENT_CBYTES", budget) + + for digest, descriptor, coords, _ in payloads: + assert indexing._persistent_cache_insert(arr, digest, coords, descriptor) is True + + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert catalog["max_persistent_cbytes"] == budget + assert set(catalog["entries"]) == {payloads[1][0], payloads[2][0]} + assert catalog["entries"][payloads[1][0]]["slot"] == 0 + assert catalog["entries"][payloads[2][0]]["slot"] == 1 + assert catalog["next_slot"] == 2 + assert catalog["persistent_cbytes"] == payloads[1][3] + payloads[2][3] + + assert indexing._persistent_cache_lookup(arr, payloads[0][0]) is None + np.testing.assert_array_equal(indexing._persistent_cache_lookup(arr, payloads[1][0]), payloads[1][2]) + np.testing.assert_array_equal(indexing._persistent_cache_lookup(arr, payloads[2][0]), payloads[2][2]) + + _clear_caches() + reopened = blosc2.open(urlpath, mode="r") + np.testing.assert_array_equal( + indexing._persistent_cache_lookup(reopened, payloads[1][0]), payloads[1][2] + ) + np.testing.assert_array_equal( + indexing._persistent_cache_lookup(reopened, payloads[2][0]), payloads[2][2] + ) + + # --------------------------------------------------------------------------- # Stage 5 – Invalidation # --------------------------------------------------------------------------- From 8062d4a89c6a51e09f6791d5f62135dacbf925a4 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 18:52:14 +0200 Subject: [PATCH 63/68] Use cached mmap handles in evaluate_full_query for zero-copy block reads When gathering values at cached exact positions, open the data array with mmap_mode="r" (via a new _gather_mmap_source helper) so that blosc2_schunk_get_lazychunk returns a 
direct pointer into the mapped region instead of malloc+pread per block. The handle is cached in _GATHER_MMAP_HANDLES (keyed by urlpath) and is dropped from _invalidate_query_cache whenever the array is written to. Disabled on Windows where mmap holds file locks that prevent writes (same policy as the existing _INDEX_MMAP_MODE guard). Warm query speedup observed on Apple M2 (query_width=1, dist=random, 10M rows, on-disk): ~16% for full, ~18% for medium index kinds. --- src/blosc2/indexing.py | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 9831dfb5..83011971 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -70,6 +70,8 @@ _HOT_CACHE_BYTES: int = 0 # Persistent VLArray handles: resolved urlpath -> open VLArray object. _QUERY_CACHE_STORE_HANDLES: dict[str, object] = {} +# Cached mmap handles for data arrays used in full-query gather: urlpath -> NDArray. +_GATHER_MMAP_HANDLES: dict[str, object] = {} _HOT_CACHE_GLOBAL_SCOPE = ("global", 0) FULL_OOC_RUN_ITEMS = 2_000_000 @@ -668,6 +670,11 @@ def _invalidate_query_cache(array: blosc2.NDArray) -> None: with contextlib.suppress(KeyError, Exception): del array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] _hot_cache_clear(scope=scope) + # Drop any cached mmap handle for this array's data file so a re-opened or + # extended array is not served from a stale mapping. + urlpath = getattr(array, "urlpath", None) + if urlpath is not None: + _GATHER_MMAP_HANDLES.pop(str(urlpath), None) # --------------------------------------------------------------------------- @@ -4628,6 +4635,25 @@ def _light_worker_source(where_x): return where_x +def _gather_mmap_source(where_x): + """Return a cached mmap handle for *where_x* for use in repeated gather operations. + + On Windows mmap is disabled (see ``_INDEX_MMAP_MODE``), so the original handle + is returned unchanged. 
+ """ + if _INDEX_MMAP_MODE is None: + return where_x + urlpath = getattr(where_x, "urlpath", None) + if not _supports_block_reads(where_x) or urlpath is None: + return where_x + urlpath = str(urlpath) + handle = _GATHER_MMAP_HANDLES.get(urlpath) + if handle is None: + handle = blosc2.open(urlpath, mmap_mode=_INDEX_MMAP_MODE) + _GATHER_MMAP_HANDLES[urlpath] = handle + return handle + + def _light_match_from_span(span: np.ndarray, plan: IndexPlan) -> np.ndarray: if plan.target is not None and plan.target.get("source") == "expression": field_values = _values_from_numpy_target(span, plan.target) @@ -5531,16 +5557,19 @@ def evaluate_full_query(where: dict, plan: IndexPlan) -> np.ndarray: if plan.exact_positions is None: raise ValueError("full evaluation requires exact positions") if plan.base is not None: + # Use a cached mmap handle when available so blosc2_schunk_get_lazychunk can return + # a zero-copy pointer into the mapped region instead of malloc+pread per block. + gather_source = _gather_mmap_source(where["_where_x"]) block_gather_threshold = int(plan.base.blocks[0]) if len(plan.exact_positions) <= block_gather_threshold: return _gather_positions_by_block( - where["_where_x"], + gather_source, plan.exact_positions, int(plan.base.chunks[0]), int(plan.base.blocks[0]), int(plan.base.shape[0]), ) - return _gather_positions_by_chunk(where["_where_x"], plan.exact_positions, int(plan.base.chunks[0])) + return _gather_positions_by_chunk(gather_source, plan.exact_positions, int(plan.base.chunks[0])) return _gather_positions(where["_where_x"], plan.exact_positions) From 38bc290a903c83bee15b422ac357b8fd362f55f9 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 19:33:15 +0200 Subject: [PATCH 64/68] Update comparison with DuckDB on a MacMini w/ M4 Pro, 24GB RAM --- bench/indexing/blosc2-vs-duckdb-indexes.md | 100 +++++++++++---------- 1 file changed, 54 insertions(+), 46 deletions(-) diff --git a/bench/indexing/blosc2-vs-duckdb-indexes.md 
b/bench/indexing/blosc2-vs-duckdb-indexes.md index 5025948f..b65022d3 100644 --- a/bench/indexing/blosc2-vs-duckdb-indexes.md +++ b/bench/indexing/blosc2-vs-duckdb-indexes.md @@ -10,6 +10,9 @@ The goal is not to claim a universal winner, but to document the current observe - total storage footprint - sensitivity to query shape +The latest width-1 single-value figures below come from a fresh run on a Mac mini with an M4 Pro CPU +and 24 GB of RAM. + ## Benchmark Setup @@ -56,7 +59,7 @@ There are two different DuckDB query shapes that matter a lot: - single-value form: - `id = value` -For Blosc2, switching between a collapsed width-1 range and `==` makes almost no practical difference. +For Blosc2, switching between a collapsed width-1 range and `==` makes only a small difference in practice. For DuckDB, this difference is very important: @@ -167,22 +170,24 @@ python index_query_bench.py \ Observed results: - `light` - - cold lookup: `1.463 ms` - - warm lookup: `1.286 ms` + - cold lookup: `0.841 ms` + - warm lookup: `0.184 ms` - `medium` - - cold lookup: `1.089 ms` - - warm lookup: `0.986 ms` + - cold lookup: `0.564 ms` + - warm lookup: `0.168 ms` - `full` - - cold lookup: `0.618 ms` - - warm lookup: `0.544 ms` + - cold lookup: `0.554 ms` + - warm lookup: `0.167 ms` ### Interpretation -With the generic range form, Blosc2 is much faster than DuckDB: +With the generic width-1 range form, Blosc2 is much faster than DuckDB: -- Blosc2 `light` is already about `9x` faster than DuckDB `zonemap` -- Blosc2 exact indexes (`medium`, `full`) are much faster still +- Blosc2 `light` is already much faster than DuckDB `zonemap`, and comfortably faster than the + generic-range DuckDB `art-index` behavior +- Blosc2 `medium` and `full` are in a different regime on warm hits, at about `0.17 ms` - DuckDB `art-index` does not show its real point-lookup behavior in this predicate form +- Blosc2 warm reuse changes the picture substantially for repeated lookups ## Width-1 Comparison: 
Single-Value Predicate @@ -205,13 +210,15 @@ python duckdb_query_bench.py \ Observed results: - `zonemap` - - build: `1193.665 ms` - - filtered lookup: `8.646 ms` + - build: `509.338 ms` + - cold lookup: `4.595 ms` + - warm lookup: `2.857 ms` - DB size: `56,111,104` bytes - `art-index` - - build: `2849.869 ms` - - filtered lookup: `0.755 ms` - - DB size: `478,687,232` bytes + - build: `2000.316 ms` + - cold lookup: `0.613 ms` + - warm lookup: `0.246 ms` + - DB size: `478,425,088` bytes ### Blosc2 @@ -230,19 +237,19 @@ python index_query_bench.py \ Observed results: - `light` - - build: `1225.637 ms` - - cold lookup: `1.290 ms` - - warm lookup: `2.351 ms` + - build: `960.048 ms` + - cold lookup: `2.489 ms` + - warm lookup: `0.172 ms` - index sidecars: `27,497,393` bytes - `medium` - - build: `5511.863 ms` - - cold lookup: `1.081 ms` - - warm lookup: `0.964 ms` + - build: `4745.880 ms` + - cold lookup: `2.202 ms` + - warm lookup: `0.147 ms` - index sidecars: `37,645,201` bytes - `full` - - build: `10954.844 ms` - - cold lookup: `0.603 ms` - - warm lookup: `0.525 ms` + - build: `9539.843 ms` + - cold lookup: `1.753 ms` + - warm lookup: `0.144 ms` - index sidecars: `29,888,673` bytes ### Interpretation @@ -250,22 +257,22 @@ Observed results: Once DuckDB is allowed to use the more planner-friendly single-value predicate: - `art-index` becomes very fast -- `art-index` is now faster than Blosc2 `light` -- Blosc2 `full` still remains slightly faster than DuckDB `art-index` on this measured point-lookup case +- `art-index` is clearly faster than Blosc2 on cold point lookups in this run +- Blosc2 is clearly faster on warm repeated point lookups across `light`, `medium`, and `full` However, the storage costs are very different: -- DuckDB `art-index` database size: about `478.7 MB` +- DuckDB `art-index` database size: about `478.4 MB` - DuckDB zonemap baseline size: about `56.1 MB` -- estimated ART overhead over baseline: about `422.6 MB` +- estimated ART overhead over 
baseline: about `422.3 MB` - Blosc2 `full` base + index footprint: about `31 MB + 29.9 MB = 60.9 MB` So for true point lookups: -- DuckDB `art-index` is competitive on latency -- Blosc2 `full` is still faster in the measured run -- Blosc2 `full` is much smaller overall -- DuckDB `art-index` is much faster to build than Blosc2 `full` +- DuckDB `art-index` wins on cold point-lookup latency in this measurement +- Blosc2 `full` remains much smaller overall +- Blosc2 `light`, `medium`, and `full` all become faster than DuckDB `art-index` on warm repeated hits +- DuckDB `art-index` still has a very large storage premium over both Blosc2 `light` and `full` ## Blosc2 Light vs DuckDB Zonemap @@ -280,7 +287,8 @@ Main observations: - Blosc2 base + `light`: about `58 MB` - Blosc2 `light` lookup speed is much better - width `50`: about `6.25 ms` vs `13.33 ms` - - width `1`: about `1.3-1.5 ms` vs `8.6-12.6 ms` + - width `1` range: about `0.18 ms` warm vs `12.61 ms` generic-range DuckDB + - width `1` equality: about `0.17 ms` warm vs `2.94 ms` DuckDB zonemap warm Conclusion: @@ -295,20 +303,21 @@ This is the most relevant exact-index comparison. 
Main observations: - point-lookup latency - - DuckDB `art-index`: `0.755 ms` - - Blosc2 `full`: `0.603 ms` cold, `0.525 ms` warm + - DuckDB `art-index`: `0.613 ms` cold, `0.245 ms` warm + - Blosc2 `full`: `1.753 ms` cold, `0.144 ms` warm - build time - - DuckDB `art-index`: `2849.869 ms` - - Blosc2 `full`: `10954.844 ms` + - DuckDB `art-index`: `2000.316 ms` + - Blosc2 `full`: `9539.843 ms` - footprint - - DuckDB `art-index` DB: about `478.7 MB` + - DuckDB `art-index` DB: about `478.4 MB` - Blosc2 `full` base + index: about `60.9 MB` Conclusion: -- DuckDB ART wins on build time - Blosc2 `full` wins on storage efficiency -- Blosc2 `full` was slightly faster on the measured point lookup +- DuckDB `art-index` wins on cold point-lookup latency +- Warm repeated point lookups favor Blosc2 `full` more clearly +- DuckDB `art-index` is much faster to build than Blosc2 `full` - DuckDB ART is much more sensitive to predicate shape @@ -317,7 +326,7 @@ Conclusion: Observed behavior: - Blosc2: - - width-1 range form and `==` are nearly equivalent in performance + - width-1 range form and `==` are close, with `==` giving a small but measurable improvement - DuckDB: - width-1 range form was much slower than `id = value` @@ -343,10 +352,9 @@ Practical implication: 1. Blosc2 `light` is very competitive against DuckDB zonemap-like pruning. 2. Blosc2 `light` offers much faster selective lookups than DuckDB zonemap at a similar total storage cost. 3. DuckDB `art-index` becomes strong only when queries are written as true equality predicates. -4. Blosc2 `full` compares very well against DuckDB `art-index` on point lookups: - - slightly faster in the measured run - - much smaller on disk - - slower to build -5. Query-shape sensitivity is a major difference: +4. On true point lookups, DuckDB `art-index` wins on cold latency in the current M4 Pro run, but + Blosc2 exact indexes are markedly better on warm repeated lookups. +5. 
Blosc2 exact indexes remain dramatically smaller on disk than DuckDB `art-index`. +6. Query-shape sensitivity is a major difference: - small for Blosc2 - large for DuckDB ART From af8a205b18ee03a0a570d03d01afdcc704f37764 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Thu, 9 Apr 2026 19:34:27 +0200 Subject: [PATCH 65/68] Use latest c-blosc2 sources --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c1dd0f17..e14edcde 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,7 +129,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG b9617d145ed46cd77afbbc56fbe5474e3c3269d3 + GIT_TAG 0568990388e6201240b170947d4c2199572f795d # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) From 5b82cf96d9ccfcf4ceaab98f70c20e769ed03770 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 10 Apr 2026 06:07:04 +0200 Subject: [PATCH 66/68] Use nbytes instead of cbytes as the main metric for cache accounting --- bench/indexing/query_cache_store_bench.py | 458 ++++++++++++++++++++++ src/blosc2/indexing.py | 99 +++-- tests/ndarray/test_indexing.py | 31 +- 3 files changed, 538 insertions(+), 50 deletions(-) create mode 100644 bench/indexing/query_cache_store_bench.py diff --git a/bench/indexing/query_cache_store_bench.py b/bench/indexing/query_cache_store_bench.py new file mode 100644 index 00000000..bf85a853 --- /dev/null +++ b/bench/indexing/query_cache_store_bench.py @@ -0,0 +1,458 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import cProfile +import io +import pstats +import statistics +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + +import blosc2 +from blosc2 import indexing + +STRATEGIES = ("baseline", "cache_catalog", "skip_cbytes", "defer_vlmeta", "all") + + +@dataclass +class InsertState: + catalog: dict | None = None + store: object | None = None + + +def _make_array(path: Path, *, size: int, chunks: int, blocks: int) -> blosc2.NDArray: + return blosc2.asarray( + np.arange(size, dtype=np.int64), + urlpath=path, + mode="w", + chunks=(chunks,), + blocks=(blocks,), + ) + + +def _clear_process_caches() -> None: + indexing._hot_cache_clear() + indexing._QUERY_CACHE_STORE_HANDLES.clear() + indexing._PERSISTENT_INDEXES.clear() + + +def _coords_for_count(count: int, spacing: int, modulo: int) -> np.ndarray: + coords = (np.arange(count, dtype=np.int64) * spacing) % modulo + return np.sort(coords, kind="stable") + + +def _median(values: list[float]) -> float: + return statistics.median(values) if values else 0.0 + + +def _build_query_bits(arr: blosc2.NDArray, expr: str, coords: np.ndarray) -> tuple[str, dict, dict]: + descriptor = indexing._normalize_query_descriptor(expr, [indexing.SELF_TARGET_NAME], None) + digest = indexing._query_cache_digest(descriptor) + scope = indexing._query_cache_scope(arr) + indexing._hot_cache_put(digest, coords, scope=scope) + payload_mapping = indexing._encode_coords_payload(coords) + return digest, descriptor, payload_mapping + + +def _load_or_create_catalog(arr: blosc2.NDArray, state: InsertState | None, strategy: str) -> dict: + if strategy in {"cache_catalog", "defer_vlmeta", "all"} and state is not None and state.catalog is not None: + return state.catalog + + catalog = indexing._load_query_cache_catalog(arr) + if 
catalog is None: + catalog = indexing._default_query_cache_catalog(indexing._query_cache_payload_path(arr)) + + if strategy in {"cache_catalog", "defer_vlmeta", "all"} and state is not None: + state.catalog = catalog + return catalog + + +def _load_or_create_store(arr: blosc2.NDArray, state: InsertState | None, strategy: str): + if strategy in {"cache_catalog", "defer_vlmeta", "all"} and state is not None and state.store is not None: + return state.store + + store = indexing._open_query_cache_store(arr, create=True) + if strategy in {"cache_catalog", "defer_vlmeta", "all"} and state is not None: + state.store = store + return store + + +def _entry_nbytes(coords: np.ndarray, payload_mapping: dict, strategy: str) -> int: + if strategy in {"skip_cbytes", "all"}: + return len(payload_mapping["data"]) + return indexing._query_cache_entry_nbytes(coords) + + +def _insert_with_strategy( + arr: blosc2.NDArray, + expr: str, + coords: np.ndarray, + strategy: str, + state: InsertState | None = None, +) -> float: + start = time.perf_counter_ns() + digest, descriptor, payload_mapping = _build_query_bits(arr, expr, coords) + nbytes = _entry_nbytes(coords, payload_mapping, strategy) + catalog = _load_or_create_catalog(arr, state, strategy) + if digest in catalog.get("entries", {}): + end = time.perf_counter_ns() + return (end - start) / 1_000_000 + + store = _load_or_create_store(arr, state, strategy) + slot = len(store) + store.append(payload_mapping) + + catalog["entries"][digest] = { + "slot": slot, + "nbytes": nbytes, + "nrows": len(coords), + "dtype": payload_mapping["dtype"], + "query": descriptor, + } + catalog["persistent_nbytes"] = int(catalog.get("persistent_nbytes", 0)) + nbytes + catalog["next_slot"] = slot + 1 + + if strategy not in {"defer_vlmeta", "all"}: + indexing._save_query_cache_catalog(arr, catalog) + elif state is not None: + state.catalog = catalog + + end = time.perf_counter_ns() + return (end - start) / 1_000_000 + + +def _flush_state(arr: blosc2.NDArray, 
state: InsertState | None, strategy: str) -> None: + if strategy not in {"defer_vlmeta", "all"} or state is None or state.catalog is None: + return + indexing._save_query_cache_catalog(arr, state.catalog) + + +def _benchmark_fresh( + root: Path, + *, + strategy: str, + coords: np.ndarray, + size: int, + chunks: int, + blocks: int, + repeats: int, +) -> float: + runs = [] + for idx in range(repeats): + arr = _make_array(root / f"fresh-{strategy}-{idx}.b2nd", size=size, chunks=chunks, blocks=blocks) + _clear_process_caches() + state = InsertState() if strategy in {"cache_catalog", "defer_vlmeta", "all"} else None + expr = f"(id >= {idx}) & (id <= {idx})" + start = time.perf_counter_ns() + _insert_with_strategy(arr, expr, coords, strategy, state) + _flush_state(arr, state, strategy) + end = time.perf_counter_ns() + runs.append((end - start) / 1_000_000) + return _median(runs) + + +def _benchmark_steady( + root: Path, + *, + strategy: str, + coords: np.ndarray, + size: int, + chunks: int, + blocks: int, + inserts: int, +) -> float: + arr = _make_array(root / f"steady-{strategy}.b2nd", size=size, chunks=chunks, blocks=blocks) + _clear_process_caches() + state = InsertState() if strategy in {"cache_catalog", "defer_vlmeta", "all"} else None + start = time.perf_counter_ns() + for idx in range(inserts): + expr = f"(id >= {idx}) & (id <= {idx})" + _insert_with_strategy(arr, expr, coords, strategy, state) + _flush_state(arr, state, strategy) + end = time.perf_counter_ns() + return ((end - start) / 1_000_000) / max(1, inserts) + + +def _baseline_step_breakdown( + arr: blosc2.NDArray, expr: str, coords: np.ndarray +) -> dict[str, float | int]: + t0 = time.perf_counter_ns() + descriptor = indexing._normalize_query_descriptor(expr, [indexing.SELF_TARGET_NAME], None) + digest = indexing._query_cache_digest(descriptor) + t1 = time.perf_counter_ns() + + scope = indexing._query_cache_scope(arr) + indexing._hot_cache_put(digest, coords, scope=scope) + t2 = time.perf_counter_ns() + + 
payload_mapping = indexing._encode_coords_payload(coords) + nbytes = indexing._query_cache_entry_nbytes(coords) + t3 = time.perf_counter_ns() + + catalog = indexing._load_query_cache_catalog(arr) + payload_path = indexing._query_cache_payload_path(arr) + if catalog is None: + catalog = indexing._default_query_cache_catalog(payload_path) + store = indexing._open_query_cache_store(arr, create=True) + t4 = time.perf_counter_ns() + + slot = len(store) + store.append(payload_mapping) + t5 = time.perf_counter_ns() + + catalog["entries"][digest] = { + "slot": slot, + "nbytes": nbytes, + "nrows": len(coords), + "dtype": payload_mapping["dtype"], + "query": descriptor, + } + catalog["persistent_nbytes"] = int(catalog.get("persistent_nbytes", 0)) + nbytes + catalog["next_slot"] = slot + 1 + indexing._save_query_cache_catalog(arr, catalog) + t6 = time.perf_counter_ns() + + return { + "digest_ms": (t1 - t0) / 1_000_000, + "hot_ms": (t2 - t1) / 1_000_000, + "encode_nbytes_ms": (t3 - t2) / 1_000_000, + "open_store_ms": (t4 - t3) / 1_000_000, + "append_ms": (t5 - t4) / 1_000_000, + "catalog_ms": (t6 - t5) / 1_000_000, + "step_total_ms": (t6 - t0) / 1_000_000, + "entry_nbytes": nbytes, + } + + +def _profile_store(arr: blosc2.NDArray, coords: np.ndarray, repeats: int, top: int) -> str: + profiler = cProfile.Profile() + + def run(): + for idx in range(repeats): + expr = f"(id >= {idx}) & (id <= {idx})" + indexing.store_cached_coords(arr, expr, [indexing.SELF_TARGET_NAME], None, coords) + + profiler.enable() + run() + profiler.disable() + + out = io.StringIO() + stats = pstats.Stats(profiler, stream=out).sort_stats("cumulative") + stats.print_stats(top) + return out.getvalue() + + +def _active_cache_store_cparams(arr: blosc2.NDArray) -> blosc2.CParams: + coords = np.asarray([0], dtype=np.int64) + indexing.store_cached_coords(arr, "(id >= 0) & (id <= 0)", [indexing.SELF_TARGET_NAME], None, coords) + payload_path = indexing._query_cache_payload_path(arr) + store = 
blosc2.VLArray(storage=blosc2.Storage(urlpath=payload_path, mode="r")) + return store.cparams + + +def _print_strategy_table(title: str, rows: list[dict[str, object]]) -> None: + columns = [ + ("coords", lambda row: f"{row['coords_count']:,}"), + ("strategy", lambda row: str(row["strategy"])), + ("time_ms", lambda row: f"{row['time_ms']:.3f}"), + ("speedup", lambda row: f"{row['speedup']:.2f}x"), + ] + widths = [] + for name, render in columns: + width = len(name) + for row in rows: + width = max(width, len(render(row))) + widths.append(width) + + print(title) + header = " ".join(name.ljust(width) for (name, _), width in zip(columns, widths, strict=True)) + rule = " ".join("-" * width for width in widths) + print(header) + print(rule) + for row in rows: + print( + " ".join( + render(row).ljust(width) for (_, render), width in zip(columns, widths, strict=True) + ) + ) + print() + + +def _print_breakdown(rows: list[dict[str, object]]) -> None: + columns = [ + ("coords", lambda row: f"{row['coords_count']:,}"), + ("entry_nbytes", lambda row: f"{row['entry_nbytes']:,}"), + ("digest_ms", lambda row: f"{row['digest_ms']:.3f}"), + ("hot_ms", lambda row: f"{row['hot_ms']:.3f}"), + ("encode_nbytes_ms", lambda row: f"{row['encode_nbytes_ms']:.3f}"), + ("open_store_ms", lambda row: f"{row['open_store_ms']:.3f}"), + ("append_ms", lambda row: f"{row['append_ms']:.3f}"), + ("catalog_ms", lambda row: f"{row['catalog_ms']:.3f}"), + ("step_total_ms", lambda row: f"{row['step_total_ms']:.3f}"), + ] + widths = [] + for name, render in columns: + width = len(name) + for row in rows: + width = max(width, len(render(row))) + widths.append(width) + + print("Baseline Step Breakdown") + header = " ".join(name.ljust(width) for (name, _), width in zip(columns, widths, strict=True)) + rule = " ".join("-" * width for width in widths) + print(header) + print(rule) + for row in rows: + print( + " ".join( + render(row).ljust(width) for (_, render), width in zip(columns, widths, strict=True) + ) + 
) + print() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Microbenchmark persistent query-cache insert strategies.") + parser.add_argument("--size", type=int, default=1_000_000, help="Array size for the backing persistent array.") + parser.add_argument("--chunks", type=int, default=100_000, help="Chunk length for the backing array.") + parser.add_argument("--blocks", type=int, default=10_000, help="Block length for the backing array.") + parser.add_argument( + "--coords-counts", + type=int, + nargs="+", + default=[1, 10, 100, 1_000], + help="Coordinate counts to benchmark.", + ) + parser.add_argument("--fresh-repeats", type=int, default=20, help="Repeated fresh first-insert runs.") + parser.add_argument("--steady-inserts", type=int, default=100, help="Repeated inserts into one array.") + parser.add_argument( + "--breakdown-repeats", type=int, default=20, help="Repeated baseline step breakdown runs." + ) + parser.add_argument( + "--spacing", + type=int, + default=9973, + help="Stride used to synthesize sparse sorted coordinates.", + ) + parser.add_argument( + "--profile-repeats", + type=int, + default=200, + help="Number of repeated baseline inserts to include in the cProfile run.", + ) + parser.add_argument( + "--profile-top", + type=int, + default=25, + help="Number of cProfile entries to print.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + fresh_rows = [] + steady_rows = [] + breakdown_rows = [] + + with tempfile.TemporaryDirectory(prefix="blosc2-query-cache-bench-") as tmpdir: + root = Path(tmpdir) + probe = _make_array(root / "cparams-probe.b2nd", size=args.size, chunks=args.chunks, blocks=args.blocks) + _clear_process_caches() + active_cparams = _active_cache_store_cparams(probe) + _clear_process_caches() + + for coords_count in args.coords_counts: + coords = _coords_for_count(coords_count, args.spacing, args.size) + + fresh_times = {} + steady_times = {} + for strategy in 
STRATEGIES: + fresh_times[strategy] = _benchmark_fresh( + root, + strategy=strategy, + coords=coords, + size=args.size, + chunks=args.chunks, + blocks=args.blocks, + repeats=args.fresh_repeats, + ) + steady_times[strategy] = _benchmark_steady( + root, + strategy=strategy, + coords=coords, + size=args.size, + chunks=args.chunks, + blocks=args.blocks, + inserts=args.steady_inserts, + ) + + fresh_baseline = fresh_times["baseline"] + steady_baseline = steady_times["baseline"] + for strategy in STRATEGIES: + fresh_rows.append( + { + "coords_count": coords_count, + "strategy": strategy, + "time_ms": fresh_times[strategy], + "speedup": fresh_baseline / fresh_times[strategy] if fresh_times[strategy] else 0.0, + } + ) + steady_rows.append( + { + "coords_count": coords_count, + "strategy": strategy, + "time_ms": steady_times[strategy], + "speedup": steady_baseline / steady_times[strategy] if steady_times[strategy] else 0.0, + } + ) + + baseline_steps = [] + for idx in range(args.breakdown_repeats): + arr = _make_array(root / f"breakdown-{coords_count}-{idx}.b2nd", size=args.size, chunks=args.chunks, blocks=args.blocks) + _clear_process_caches() + expr = f"(id >= {idx}) & (id <= {idx})" + baseline_steps.append(_baseline_step_breakdown(arr, expr, coords)) + breakdown_rows.append( + { + "coords_count": coords_count, + "entry_nbytes": int(_median([float(row["entry_nbytes"]) for row in baseline_steps])), + "digest_ms": _median([float(row["digest_ms"]) for row in baseline_steps]), + "hot_ms": _median([float(row["hot_ms"]) for row in baseline_steps]), + "encode_nbytes_ms": _median([float(row["encode_nbytes_ms"]) for row in baseline_steps]), + "open_store_ms": _median([float(row["open_store_ms"]) for row in baseline_steps]), + "append_ms": _median([float(row["append_ms"]) for row in baseline_steps]), + "catalog_ms": _median([float(row["catalog_ms"]) for row in baseline_steps]), + "step_total_ms": _median([float(row["step_total_ms"]) for row in baseline_steps]), + } + ) + + print( + 
"Persistent query-cache insert microbenchmark " + f"(codec={active_cparams.codec.name}, clevel={active_cparams.clevel}, use_dict={active_cparams.use_dict})" + ) + print() + _print_strategy_table("Fresh Insert Comparison", fresh_rows) + _print_strategy_table("Steady Insert Comparison", steady_rows) + _print_breakdown(breakdown_rows) + + profile_coords = _coords_for_count(args.coords_counts[0], args.spacing, args.size) + profile_arr = _make_array(root / "profile.b2nd", size=args.size, chunks=args.chunks, blocks=args.blocks) + _clear_process_caches() + print(f"Baseline cProfile for coords_count={args.coords_counts[0]:,} over {args.profile_repeats} inserts") + print(_profile_store(profile_arr, profile_coords, args.profile_repeats, args.profile_top)) + + +if __name__ == "__main__": + main() diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 83011971..07c795a0 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -58,9 +58,9 @@ # --------------------------------------------------------------------------- QUERY_CACHE_VLMETA_KEY = "_blosc2_query_cache" QUERY_CACHE_FORMAT_VERSION = 1 -QUERY_CACHE_MAX_ENTRY_CBYTES = 4096 # 4 KB per persistent entry +QUERY_CACHE_MAX_ENTRY_NBYTES = 65_536 # 64 KB of logical int64 positions per persistent entry QUERY_CACHE_MAX_MEM_CBYTES = 131_072 # 128 KB for the in-process hot cache -QUERY_CACHE_MAX_PERSISTENT_CBYTES = 2_147_483_648 # 2 GB for the payload store +QUERY_CACHE_MAX_PERSISTENT_NBYTES = 2_147_483_648 # 2 GB of logical int64 positions in the payload store # In-process hot cache: (array-scope, digest) -> decoded np.ndarray of coordinates. 
_HOT_CACHE: dict[tuple[tuple[str, str | int], str], np.ndarray] = {} @@ -324,15 +324,54 @@ def _default_query_cache_catalog(payload_path: str) -> dict: return { "version": QUERY_CACHE_FORMAT_VERSION, "payload_ref": {"kind": "urlpath", "version": 1, "urlpath": payload_path}, - "max_entry_cbytes": QUERY_CACHE_MAX_ENTRY_CBYTES, + "max_entry_nbytes": QUERY_CACHE_MAX_ENTRY_NBYTES, "max_mem_cbytes": QUERY_CACHE_MAX_MEM_CBYTES, - "max_persistent_cbytes": QUERY_CACHE_MAX_PERSISTENT_CBYTES, - "persistent_cbytes": 0, + "max_persistent_nbytes": QUERY_CACHE_MAX_PERSISTENT_NBYTES, + "persistent_nbytes": 0, "next_slot": 0, "entries": {}, } +def _normalize_query_cache_catalog(catalog: dict) -> dict: + """Normalize legacy compressed-byte cache catalogs to logical-byte accounting.""" + if not isinstance(catalog, dict): + return _default_query_cache_catalog("") + catalog.setdefault("version", QUERY_CACHE_FORMAT_VERSION) + catalog.setdefault("entries", {}) + + if "max_entry_nbytes" not in catalog: + catalog["max_entry_nbytes"] = int(catalog.pop("max_entry_cbytes", QUERY_CACHE_MAX_ENTRY_NBYTES)) + else: + catalog.pop("max_entry_cbytes", None) + + catalog.setdefault("max_mem_cbytes", QUERY_CACHE_MAX_MEM_CBYTES) + + if "max_persistent_nbytes" not in catalog: + catalog["max_persistent_nbytes"] = int( + catalog.pop("max_persistent_cbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) + ) + else: + catalog.pop("max_persistent_cbytes", None) + + if "persistent_nbytes" not in catalog: + catalog["persistent_nbytes"] = int(catalog.pop("persistent_cbytes", 0)) + else: + catalog.pop("persistent_cbytes", None) + + total_nbytes = 0 + for entry in catalog["entries"].values(): + if "nbytes" not in entry: + entry["nbytes"] = int(entry.pop("cbytes", 0)) + else: + entry.pop("cbytes", None) + total_nbytes += int(entry.get("nbytes", 0)) + + if catalog["entries"]: + catalog["persistent_nbytes"] = total_nbytes + return catalog + + def _load_query_cache_catalog(array: blosc2.NDArray) -> dict | None: """Read the 
query-cache catalog from *array* vlmeta, or return None.""" if not _is_persistent_array(array): @@ -343,7 +382,7 @@ def _load_query_cache_catalog(array: blosc2.NDArray) -> dict | None: return None if not isinstance(cat, dict) or cat.get("version") != QUERY_CACHE_FORMAT_VERSION: return None - return cat + return _normalize_query_cache_catalog(cat) def _save_query_cache_catalog(array: blosc2.NDArray, catalog: dict) -> None: @@ -514,13 +553,9 @@ def _persistent_cache_lookup(array: blosc2.NDArray, digest: str) -> np.ndarray | return coords -def _query_cache_entry_cbytes(payload_mapping: dict) -> int: - """Return the compressed coordinate payload size used for budget accounting.""" - coord_dtype = np.dtype(payload_mapping["dtype"]) - compressed_coords = blosc2.compress2( - payload_mapping["data"], cparams=blosc2.CParams(typesize=coord_dtype.itemsize) - ) - return len(compressed_coords) +def _query_cache_entry_nbytes(coords: np.ndarray) -> int: + """Return the logical int64 position bytes used for persistent budget accounting.""" + return int(np.asarray(coords).size) * np.dtype(np.int64).itemsize def _query_cache_entries_fifo(catalog: dict) -> list[tuple[str, dict]]: @@ -545,7 +580,7 @@ def _query_cache_rebuild_store( old_store = _open_query_cache_store(array) temp_store = blosc2.VLArray(storage=blosc2.Storage(urlpath=temp_path, mode="w")) new_entries = {} - persistent_cbytes = 0 + persistent_nbytes = 0 slot = 0 try: @@ -559,20 +594,20 @@ def _query_cache_rebuild_store( updated = entry.copy() updated["slot"] = slot new_entries[digest] = updated - persistent_cbytes += int(updated["cbytes"]) + persistent_nbytes += int(updated["nbytes"]) slot += 1 if appended is not None: - digest, payload_mapping, query_descriptor, cbytes = appended + digest, payload_mapping, query_descriptor, nbytes = appended temp_store.append(payload_mapping) new_entries[digest] = { "slot": slot, - "cbytes": cbytes, + "nbytes": nbytes, "nrows": payload_mapping["nrows"], "dtype": 
payload_mapping["dtype"], "query": query_descriptor, } - persistent_cbytes += cbytes + persistent_nbytes += nbytes slot += 1 finally: del temp_store @@ -584,7 +619,7 @@ def _query_cache_rebuild_store( os.replace(temp_path, payload_path) catalog["entries"] = new_entries - catalog["persistent_cbytes"] = persistent_cbytes + catalog["persistent_nbytes"] = persistent_nbytes catalog["next_slot"] = slot _save_query_cache_catalog(array, catalog) return True @@ -609,27 +644,27 @@ def _persistent_cache_insert( return True payload_mapping = _encode_coords_payload(coords) - cbytes = _query_cache_entry_cbytes(payload_mapping) + nbytes = _query_cache_entry_nbytes(coords) - max_entry = catalog.get("max_entry_cbytes", QUERY_CACHE_MAX_ENTRY_CBYTES) - if cbytes > max_entry: + max_entry = catalog.get("max_entry_nbytes", QUERY_CACHE_MAX_ENTRY_NBYTES) + if nbytes > max_entry: return False - max_persistent = catalog.get("max_persistent_cbytes", QUERY_CACHE_MAX_PERSISTENT_CBYTES) - current_persistent = int(catalog.get("persistent_cbytes", 0)) - if current_persistent + cbytes > max_persistent: + max_persistent = catalog.get("max_persistent_nbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) + current_persistent = int(catalog.get("persistent_nbytes", 0)) + if current_persistent + nbytes > max_persistent: retained_entries = _query_cache_entries_fifo(catalog) - retained_cbytes = current_persistent - while retained_entries and retained_cbytes + cbytes > max_persistent: + retained_nbytes = current_persistent + while retained_entries and retained_nbytes + nbytes > max_persistent: _, oldest = retained_entries.pop(0) - retained_cbytes -= int(oldest["cbytes"]) - if retained_cbytes + cbytes > max_persistent: + retained_nbytes -= int(oldest["nbytes"]) + if retained_nbytes + nbytes > max_persistent: return False return _query_cache_rebuild_store( array, catalog, retained_entries, - appended=(digest, payload_mapping, query_descriptor, cbytes), + appended=(digest, payload_mapping, query_descriptor, nbytes), ) 
store = _open_query_cache_store(array, create=True) @@ -641,12 +676,12 @@ def _persistent_cache_insert( catalog["entries"][digest] = { "slot": slot, - "cbytes": cbytes, + "nbytes": nbytes, "nrows": len(coords), "dtype": payload_mapping["dtype"], "query": query_descriptor, } - catalog["persistent_cbytes"] = current_persistent + cbytes + catalog["persistent_nbytes"] = current_persistent + nbytes catalog["next_slot"] = slot + 1 _save_query_cache_catalog(array, catalog) return True diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index aa66fbb6..49b9a7ba 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -1294,22 +1294,18 @@ def test_persistent_cache_not_created_for_non_persistent_array(): def test_persistent_entry_size_limit_rejected(tmp_path): - """Entries whose compressed size exceeds 4 KB must not be stored.""" + """Entries whose logical int64 position bytes exceed the entry limit must not be stored.""" arr, _ = _make_persistent_array(tmp_path, n=50_000) _clear_caches() - # Random (non-sequential) coordinates compress poorly and should exceed 4 KB. + # 10k coordinates imply 80 KB of logical int64 positions and should exceed the 64 KB limit. rng = np.random.default_rng(42) - coords = np.sort(rng.choice(50_000, size=5_000, replace=False)).astype(np.int64) - - # Verify this is actually > 4KB compressed with the same method used internally. 
- payload_mapping = indexing._encode_coords_payload(coords) - raw_data = payload_mapping["data"] - coord_dtype = np.dtype(payload_mapping["dtype"]) - compressed = blosc2.compress2(raw_data, cparams=blosc2.CParams(typesize=coord_dtype.itemsize)) - assert len(compressed) > indexing.QUERY_CACHE_MAX_ENTRY_CBYTES, ( - f"test setup error: compressed size {len(compressed)} must exceed " - f"{indexing.QUERY_CACHE_MAX_ENTRY_CBYTES} for this test to be meaningful" + coords = np.sort(rng.choice(50_000, size=10_000, replace=False)).astype(np.int64) + + entry_nbytes = indexing._query_cache_entry_nbytes(coords) + assert entry_nbytes > indexing.QUERY_CACHE_MAX_ENTRY_NBYTES, ( + f"test setup error: logical size {entry_nbytes} must exceed " + f"{indexing.QUERY_CACHE_MAX_ENTRY_NBYTES} for this test to be meaningful" ) descriptor = indexing._normalize_query_descriptor("(id >= 0) & (id < 50000)", ["__self__"], None) @@ -1331,24 +1327,23 @@ def test_persistent_cache_prunes_oldest_entries_and_rebuilds_slots(tmp_path, mon f"(id >= {i}) & (id < {i + 1})", ["__self__"], None ) digest = indexing._query_cache_digest(descriptor) - payload_mapping = indexing._encode_coords_payload(coords) - cbytes = indexing._query_cache_entry_cbytes(payload_mapping) - payloads.append((digest, descriptor, coords, cbytes)) + nbytes = indexing._query_cache_entry_nbytes(coords) + payloads.append((digest, descriptor, coords, nbytes)) budget = max(payloads[0][3] + payloads[1][3], payloads[1][3] + payloads[2][3]) - monkeypatch.setattr(indexing, "QUERY_CACHE_MAX_PERSISTENT_CBYTES", budget) + monkeypatch.setattr(indexing, "QUERY_CACHE_MAX_PERSISTENT_NBYTES", budget) for digest, descriptor, coords, _ in payloads: assert indexing._persistent_cache_insert(arr, digest, coords, descriptor) is True catalog = indexing._load_query_cache_catalog(arr) assert catalog is not None - assert catalog["max_persistent_cbytes"] == budget + assert catalog["max_persistent_nbytes"] == budget assert set(catalog["entries"]) == 
{payloads[1][0], payloads[2][0]} assert catalog["entries"][payloads[1][0]]["slot"] == 0 assert catalog["entries"][payloads[2][0]]["slot"] == 1 assert catalog["next_slot"] == 2 - assert catalog["persistent_cbytes"] == payloads[1][3] + payloads[2][3] + assert catalog["persistent_nbytes"] == payloads[1][3] + payloads[2][3] assert indexing._persistent_cache_lookup(arr, payloads[0][0]) is None np.testing.assert_array_equal(indexing._persistent_cache_lookup(arr, payloads[1][0]), payloads[1][2]) From 49bbc9adc45fbe4fe2aacd6defd5b65bec01ec5a Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 10 Apr 2026 06:25:34 +0200 Subject: [PATCH 67/68] Simplify query cache accounting and overflow policy --- bench/indexing/query_cache_store_bench.py | 4 +- src/blosc2/indexing.py | 136 +++++----------------- tests/ndarray/test_indexing.py | 51 ++++++-- 3 files changed, 68 insertions(+), 123 deletions(-) diff --git a/bench/indexing/query_cache_store_bench.py b/bench/indexing/query_cache_store_bench.py index bf85a853..46f2cbaf 100644 --- a/bench/indexing/query_cache_store_bench.py +++ b/bench/indexing/query_cache_store_bench.py @@ -22,7 +22,7 @@ import blosc2 from blosc2 import indexing -STRATEGIES = ("baseline", "cache_catalog", "skip_cbytes", "defer_vlmeta", "all") +STRATEGIES = ("baseline", "cache_catalog", "skip_observer", "defer_vlmeta", "all") @dataclass @@ -89,7 +89,7 @@ def _load_or_create_store(arr: blosc2.NDArray, state: InsertState | None, strate def _entry_nbytes(coords: np.ndarray, payload_mapping: dict, strategy: str) -> int: - if strategy in {"skip_cbytes", "all"}: + if strategy in {"skip_observer", "all"}: return len(payload_mapping["data"]) return indexing._query_cache_entry_nbytes(coords) diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py index 07c795a0..951dbf7f 100644 --- a/src/blosc2/indexing.py +++ b/src/blosc2/indexing.py @@ -59,8 +59,8 @@ QUERY_CACHE_VLMETA_KEY = "_blosc2_query_cache" QUERY_CACHE_FORMAT_VERSION = 1 QUERY_CACHE_MAX_ENTRY_NBYTES = 
65_536 # 64 KB of logical int64 positions per persistent entry -QUERY_CACHE_MAX_MEM_CBYTES = 131_072 # 128 KB for the in-process hot cache -QUERY_CACHE_MAX_PERSISTENT_NBYTES = 2_147_483_648 # 2 GB of logical int64 positions in the payload store +QUERY_CACHE_MAX_MEM_NBYTES = 131_072 # 128 KB for the in-process hot cache +QUERY_CACHE_MAX_PERSISTENT_NBYTES = 4 * 1024 * 1024 # 4 MB of logical int64 positions in the payload store # In-process hot cache: (array-scope, digest) -> decoded np.ndarray of coordinates. _HOT_CACHE: dict[tuple[tuple[str, str | int], str], np.ndarray] = {} @@ -325,7 +325,7 @@ def _default_query_cache_catalog(payload_path: str) -> dict: "version": QUERY_CACHE_FORMAT_VERSION, "payload_ref": {"kind": "urlpath", "version": 1, "urlpath": payload_path}, "max_entry_nbytes": QUERY_CACHE_MAX_ENTRY_NBYTES, - "max_mem_cbytes": QUERY_CACHE_MAX_MEM_CBYTES, + "max_mem_nbytes": QUERY_CACHE_MAX_MEM_NBYTES, "max_persistent_nbytes": QUERY_CACHE_MAX_PERSISTENT_NBYTES, "persistent_nbytes": 0, "next_slot": 0, @@ -334,41 +334,17 @@ def _default_query_cache_catalog(payload_path: str) -> dict: def _normalize_query_cache_catalog(catalog: dict) -> dict: - """Normalize legacy compressed-byte cache catalogs to logical-byte accounting.""" + """Ensure the prototype query-cache catalog has the current nbytes schema.""" if not isinstance(catalog, dict): return _default_query_cache_catalog("") catalog.setdefault("version", QUERY_CACHE_FORMAT_VERSION) + catalog.setdefault("payload_ref", {"kind": "urlpath", "version": 1, "urlpath": ""}) + catalog.setdefault("max_entry_nbytes", QUERY_CACHE_MAX_ENTRY_NBYTES) + catalog.setdefault("max_mem_nbytes", QUERY_CACHE_MAX_MEM_NBYTES) + catalog.setdefault("max_persistent_nbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) + catalog.setdefault("persistent_nbytes", 0) + catalog.setdefault("next_slot", 0) catalog.setdefault("entries", {}) - - if "max_entry_nbytes" not in catalog: - catalog["max_entry_nbytes"] = int(catalog.pop("max_entry_cbytes", 
QUERY_CACHE_MAX_ENTRY_NBYTES)) - else: - catalog.pop("max_entry_cbytes", None) - - catalog.setdefault("max_mem_cbytes", QUERY_CACHE_MAX_MEM_CBYTES) - - if "max_persistent_nbytes" not in catalog: - catalog["max_persistent_nbytes"] = int( - catalog.pop("max_persistent_cbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) - ) - else: - catalog.pop("max_persistent_cbytes", None) - - if "persistent_nbytes" not in catalog: - catalog["persistent_nbytes"] = int(catalog.pop("persistent_cbytes", 0)) - else: - catalog.pop("persistent_cbytes", None) - - total_nbytes = 0 - for entry in catalog["entries"].values(): - if "nbytes" not in entry: - entry["nbytes"] = int(entry.pop("cbytes", 0)) - else: - entry.pop("cbytes", None) - total_nbytes += int(entry.get("nbytes", 0)) - - if catalog["entries"]: - catalog["persistent_nbytes"] = total_nbytes return catalog @@ -498,7 +474,7 @@ def _hot_cache_put(digest: str, coords: np.ndarray, scope: tuple[str, str | int] global _HOT_CACHE_BYTES key = _hot_cache_key(digest, scope) entry_bytes = coords.nbytes - if entry_bytes > QUERY_CACHE_MAX_MEM_CBYTES: + if entry_bytes > QUERY_CACHE_MAX_MEM_NBYTES: # Single entry too large; skip. return # If already present, remove old accounting first. @@ -507,7 +483,7 @@ def _hot_cache_put(digest: str, coords: np.ndarray, scope: tuple[str, str | int] with contextlib.suppress(ValueError): _HOT_CACHE_ORDER.remove(key) # Evict LRU entries until there is room. 
- while _HOT_CACHE_ORDER and _HOT_CACHE_BYTES + entry_bytes > QUERY_CACHE_MAX_MEM_CBYTES: + while _HOT_CACHE_ORDER and _HOT_CACHE_BYTES + entry_bytes > QUERY_CACHE_MAX_MEM_NBYTES: oldest = _HOT_CACHE_ORDER.pop(0) evicted = _HOT_CACHE.pop(oldest, None) if evicted is not None: @@ -558,71 +534,21 @@ def _query_cache_entry_nbytes(coords: np.ndarray) -> int: return int(np.asarray(coords).size) * np.dtype(np.int64).itemsize -def _query_cache_entries_fifo(catalog: dict) -> list[tuple[str, dict]]: - """Return catalog entries ordered from oldest to newest insertion.""" - entries = catalog.get("entries", {}) - return sorted(entries.items(), key=lambda item: int(item[1]["slot"])) - - -def _query_cache_rebuild_store( - array: blosc2.NDArray, - catalog: dict, - retained_entries: list[tuple[str, dict]], - appended: tuple[str, dict, dict, int] | None = None, -) -> bool: - """Rewrite the persistent store with retained FIFO entries and an optional appended entry.""" +def _reset_persistent_query_cache_catalog(array: blosc2.NDArray, catalog: dict | None = None) -> dict: + """Drop persistent cache storage and return a fresh empty catalog preserving limits.""" payload_path = _query_cache_payload_path(array) - temp_path = f"{payload_path}.tmp" _close_query_cache_store(payload_path) - _close_query_cache_store(temp_path) - blosc2.remove_urlpath(temp_path) - - old_store = _open_query_cache_store(array) - temp_store = blosc2.VLArray(storage=blosc2.Storage(urlpath=temp_path, mode="w")) - new_entries = {} - persistent_nbytes = 0 - slot = 0 - - try: - for digest, entry in retained_entries: - if old_store is None or int(entry["slot"]) >= len(old_store): - continue - payload = old_store[int(entry["slot"])] - if not isinstance(payload, dict) or payload.get("version") != QUERY_CACHE_FORMAT_VERSION: - continue - temp_store.append(payload) - updated = entry.copy() - updated["slot"] = slot - new_entries[digest] = updated - persistent_nbytes += int(updated["nbytes"]) - slot += 1 - - if appended is not 
None: - digest, payload_mapping, query_descriptor, nbytes = appended - temp_store.append(payload_mapping) - new_entries[digest] = { - "slot": slot, - "nbytes": nbytes, - "nrows": payload_mapping["nrows"], - "dtype": payload_mapping["dtype"], - "query": query_descriptor, - } - persistent_nbytes += nbytes - slot += 1 - finally: - del temp_store - del old_store - _close_query_cache_store(payload_path) - _close_query_cache_store(temp_path) - blosc2.remove_urlpath(payload_path) - os.replace(temp_path, payload_path) - catalog["entries"] = new_entries - catalog["persistent_nbytes"] = persistent_nbytes - catalog["next_slot"] = slot - _save_query_cache_catalog(array, catalog) - return True + fresh = _default_query_cache_catalog(payload_path) + if catalog is not None: + fresh["max_entry_nbytes"] = int(catalog.get("max_entry_nbytes", QUERY_CACHE_MAX_ENTRY_NBYTES)) + fresh["max_mem_nbytes"] = int(catalog.get("max_mem_nbytes", QUERY_CACHE_MAX_MEM_NBYTES)) + fresh["max_persistent_nbytes"] = int( + catalog.get("max_persistent_nbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) + ) + _save_query_cache_catalog(array, fresh) + return fresh def _persistent_cache_insert( @@ -653,19 +579,10 @@ def _persistent_cache_insert( max_persistent = catalog.get("max_persistent_nbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) current_persistent = int(catalog.get("persistent_nbytes", 0)) if current_persistent + nbytes > max_persistent: - retained_entries = _query_cache_entries_fifo(catalog) - retained_nbytes = current_persistent - while retained_entries and retained_nbytes + nbytes > max_persistent: - _, oldest = retained_entries.pop(0) - retained_nbytes -= int(oldest["nbytes"]) - if retained_nbytes + nbytes > max_persistent: + if nbytes > max_persistent: return False - return _query_cache_rebuild_store( - array, - catalog, - retained_entries, - appended=(digest, payload_mapping, query_descriptor, nbytes), - ) + catalog = _reset_persistent_query_cache_catalog(array, catalog) + current_persistent = 0 store = 
_open_query_cache_store(array, create=True) if store is None: @@ -701,7 +618,6 @@ def _invalidate_query_cache(array: blosc2.NDArray) -> None: payload_path = _query_cache_payload_path(array) _close_query_cache_store(payload_path) blosc2.remove_urlpath(payload_path) - # Clear the catalog in vlmeta. with contextlib.suppress(KeyError, Exception): del array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] _hot_cache_clear(scope=scope) diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py index 49b9a7ba..2829fc18 100644 --- a/tests/ndarray/test_indexing.py +++ b/tests/ndarray/test_indexing.py @@ -1216,7 +1216,7 @@ def test_hot_cache_byte_limit_evicts_lru(): assert indexing._hot_cache_get("key0") is None # Most recent keys should still be present. assert indexing._hot_cache_get("key164") is not None - assert indexing._HOT_CACHE_BYTES <= indexing.QUERY_CACHE_MAX_MEM_CBYTES + assert indexing._HOT_CACHE_BYTES <= indexing.QUERY_CACHE_MAX_MEM_NBYTES def test_hot_cache_clear(): @@ -1289,7 +1289,7 @@ def test_persistent_cache_not_created_for_non_persistent_array(): # --------------------------------------------------------------------------- -# Stage 3 – 4 KB per-entry size limit +# Stage 3 – Per-entry logical-byte size limit # --------------------------------------------------------------------------- @@ -1315,7 +1315,7 @@ def test_persistent_entry_size_limit_rejected(tmp_path): assert result is False, "oversized entry must be rejected" -def test_persistent_cache_prunes_oldest_entries_and_rebuilds_slots(tmp_path, monkeypatch): +def test_persistent_cache_overflow_nukes_persistent_entries_and_keeps_newest(tmp_path, monkeypatch): arr, urlpath = _make_persistent_array(tmp_path, n=8_000) _clear_caches() @@ -1339,23 +1339,52 @@ def test_persistent_cache_prunes_oldest_entries_and_rebuilds_slots(tmp_path, mon catalog = indexing._load_query_cache_catalog(arr) assert catalog is not None assert catalog["max_persistent_nbytes"] == budget - assert set(catalog["entries"]) == 
{payloads[1][0], payloads[2][0]} - assert catalog["entries"][payloads[1][0]]["slot"] == 0 - assert catalog["entries"][payloads[2][0]]["slot"] == 1 - assert catalog["next_slot"] == 2 - assert catalog["persistent_nbytes"] == payloads[1][3] + payloads[2][3] + assert set(catalog["entries"]) == {payloads[2][0]} + assert catalog["entries"][payloads[2][0]]["slot"] == 0 + assert catalog["next_slot"] == 1 + assert catalog["persistent_nbytes"] == payloads[2][3] assert indexing._persistent_cache_lookup(arr, payloads[0][0]) is None - np.testing.assert_array_equal(indexing._persistent_cache_lookup(arr, payloads[1][0]), payloads[1][2]) + assert indexing._persistent_cache_lookup(arr, payloads[1][0]) is None np.testing.assert_array_equal(indexing._persistent_cache_lookup(arr, payloads[2][0]), payloads[2][2]) _clear_caches() reopened = blosc2.open(urlpath, mode="r") + assert indexing._persistent_cache_lookup(reopened, payloads[1][0]) is None np.testing.assert_array_equal( - indexing._persistent_cache_lookup(reopened, payloads[1][0]), payloads[1][2] + indexing._persistent_cache_lookup(reopened, payloads[2][0]), payloads[2][2] + ) + + +def test_persistent_cache_overflow_preserves_hot_cache(tmp_path, monkeypatch): + arr, _ = _make_persistent_array(tmp_path, n=8_000) + _clear_caches() + + coords1 = np.arange(0, 256, dtype=np.int64) + coords2 = np.arange(256, 512, dtype=np.int64) + expr1 = "(id >= 0) & (id < 256)" + expr2 = "(id >= 256) & (id < 512)" + + budget = indexing._query_cache_entry_nbytes(coords1) + monkeypatch.setattr(indexing, "QUERY_CACHE_MAX_PERSISTENT_NBYTES", budget) + + indexing.store_cached_coords(arr, expr1, [indexing.SELF_TARGET_NAME], None, coords1) + indexing.store_cached_coords(arr, expr2, [indexing.SELF_TARGET_NAME], None, coords2) + + assert ( + indexing._persistent_cache_lookup( + arr, + indexing._query_cache_digest( + indexing._normalize_query_descriptor(expr1, [indexing.SELF_TARGET_NAME], None) + ), + ) + is None ) np.testing.assert_array_equal( - 
indexing._persistent_cache_lookup(reopened, payloads[2][0]), payloads[2][2] + indexing.get_cached_coords(arr, expr1, [indexing.SELF_TARGET_NAME], None), coords1 + ) + np.testing.assert_array_equal( + indexing.get_cached_coords(arr, expr2, [indexing.SELF_TARGET_NAME], None), coords2 ) From 9856f2fff84e36e0b08a64acdb21eda96f937e26 Mon Sep 17 00:00:00 2001 From: Francesc Alted Date: Fri, 10 Apr 2026 07:22:36 +0200 Subject: [PATCH 68/68] Refine DuckDB query benchmark scan and table reporting --- bench/indexing/duckdb_query_bench.py | 49 +++++++++++++++++++--------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/bench/indexing/duckdb_query_bench.py b/bench/indexing/duckdb_query_bench.py index fddad80d..283d29b5 100644 --- a/bench/indexing/duckdb_query_bench.py +++ b/bench/indexing/duckdb_query_bench.py @@ -32,8 +32,8 @@ ("dist", lambda result: result["dist"]), ("layout", lambda result: result["layout"]), ("create_ms", lambda result: f"{result['create_ms']:.3f}"), - ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), - ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['cold_scan_ms']:.3f}"), + ("query_ms", lambda result: f"{result['cold_ms']:.3f}"), ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), ("db_bytes", lambda result: f"{result['db_bytes']:,}"), ("query_rows", lambda result: f"{result['query_rows']:,}"), @@ -44,8 +44,8 @@ ("dist", lambda result: result["dist"]), ("layout", lambda result: result["layout"]), ("create_ms", lambda result: f"{result['create_ms']:.3f}"), - ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), - ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), + ("scan_ms", lambda result: f"{result['warm_scan_ms']:.3f}"), + ("query_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), ("speedup", lambda result: f"{result['warm_speedup']:.2f}x" if result["warm_speedup"] is not None else "-"), 
("db_bytes", lambda result: f"{result['db_bytes']:,}"), ("query_rows", lambda result: f"{result['query_rows']:,}"), @@ -355,15 +355,28 @@ def _condition_sql(lo: object, hi: object, dtype: np.dtype, *, exact_query: bool return f"id >= {_literal(lo, dtype)} AND id <= {_literal(hi, dtype)}" -def benchmark_scan_once(path: Path, lo, hi) -> tuple[float, int]: +def benchmark_scan_once(path: Path, lo, hi, dtype: np.dtype, *, exact_query: bool = False) -> tuple[float, float, float, int]: con = duckdb.connect(str(path), read_only=True) try: + condition_sql = _condition_sql(lo, hi, dtype, exact_query=exact_query) + # Force the filtered baseline down the table-scan path instead of the ART index path. + con.execute("SET index_scan_max_count = 0") + con.execute("SET index_scan_percentage = 0") + query = f"SELECT * FROM data WHERE {condition_sql}" + + cold_start = time.perf_counter() + table = con.execute(query).arrow().read_all() + cold_elapsed = time.perf_counter() - cold_start + start = time.perf_counter() - table = con.execute("SELECT * FROM data").arrow().read_all() - ids = table["id"].to_numpy() - result_len = int(np.count_nonzero((ids >= lo) & (ids <= hi))) - elapsed = time.perf_counter() - start - return elapsed, result_len + table = con.execute(query).arrow().read_all() + result_len = len(table) + warm_elapsed = time.perf_counter() - start + + third_start = time.perf_counter() + con.execute(query).arrow().read_all() + third_elapsed = time.perf_counter() - third_start + return cold_elapsed, warm_elapsed, third_elapsed, result_len finally: con.close() @@ -413,7 +426,9 @@ def benchmark_layout( create_s = _open_or_build_duckdb_file(size, dist, id_dtype, path, layout=layout, batch_size=batch_size) lo, hi = _query_bounds(size, query_width, id_dtype) - scan_elapsed, scan_rows = benchmark_scan_once(path, lo, hi) + cold_scan_elapsed, warm_scan_elapsed, third_scan_elapsed, scan_rows = benchmark_scan_once( + path, lo, hi, id_dtype, exact_query=exact_query + ) con = 
duckdb.connect(str(path), read_only=True) try: @@ -428,20 +443,24 @@ def benchmark_layout( if scan_rows != filtered_rows: raise AssertionError(f"filtered rows mismatch: scan={scan_rows}, filtered={filtered_rows}") - scan_ms = scan_elapsed * 1_000 + cold_scan_ms = cold_scan_elapsed * 1_000 + warm_scan_ms = warm_scan_elapsed * 1_000 cold_ms = cold_elapsed * 1_000 warm_ms = median(warm_times) if warm_times else None + if layout == "zonemap": + cold_ms = third_scan_elapsed * 1_000 return { "size": size, "dist": dist, "layout": layout, "create_ms": create_s * 1_000, - "scan_ms": scan_ms, + "cold_scan_ms": cold_scan_ms, + "warm_scan_ms": warm_scan_ms, "cold_ms": cold_ms, - "cold_speedup": scan_ms / cold_ms, + "cold_speedup": cold_scan_ms / cold_ms, "warm_ms": warm_ms, - "warm_speedup": None if warm_ms is None else scan_ms / warm_ms, + "warm_speedup": None if warm_ms is None else warm_scan_ms / warm_ms, "db_bytes": os.path.getsize(path), "query_rows": int(filtered_rows), "path": path,