diff --git a/AGENTS.md b/AGENTS.md new file mode 100644 index 00000000..47c6aaf8 --- /dev/null +++ b/AGENTS.md @@ -0,0 +1,48 @@ +# Repository Guidelines + +## Project Structure & Module Organization +The Python package lives in `src/blosc2/`, including the C/Cython extension sources +(`blosc2_ext.*`) and core modules such as `core.py`, `ndarray.py`, and `schunk.py`. +Tests are under `tests/`, with additional doctests enabled for select modules per +`pytest.ini`. Documentation sources are in `doc/` and build output lands in `html/`. +Examples are in `examples/`, and performance/benchmark scripts live in `bench/`. + +## Build, Test, and Development Commands +- `pip install .` builds the bundled C-Blosc2 and installs the package. +- `pip install -e .` installs in editable mode for local development. +- `CMAKE_PREFIX_PATH=/usr/local USE_SYSTEM_BLOSC2=1 pip install -e .` builds + against a separately installed C-Blosc2. +- `pytest` runs the default test suite (excludes `heavy` and `network` markers). +- `pytest -m "heavy"` runs long-running tests. +- `pytest -m "network"` runs tests requiring network access. +- `cd doc && rm -rf ../html _build && python -m sphinx . ../html` builds docs. + +## Coding Style & Naming Conventions +Use Ruff for formatting and linting (line length 109). Enable pre-commit hooks: +`python -m pip install pre-commit` then `pre-commit install`. Follow Python +conventions: 4-space indentation, `snake_case` for functions/variables, and +`PascalCase` for classes. Pytest discovery expects `tests/test_*.py` and +`test_*` functions. Do not use leading underscores in module-level helper +function names when those helpers are imported from other modules; reserve +leading underscores for file-local implementation details. Avoid leading +underscores in core module filenames under `src/blosc2/`; prefer non-underscored +module names unless there is a strong reason to keep a module private. 
+ +For documentation and tutorial query examples, prefer the shortest idiom that +matches the intended result type. Use `expr[:]` or `arr[mask][:]` when showing +values, use `expr.compute()` when materializing an `NDArray`, and use +`expr.compute(_use_index=False)` when demonstrating scan-vs-index behavior. +Avoid `expr.compute()[:]` unless a NumPy array is specifically required. + +## Testing Guidelines +Pytest is required; warnings are treated as errors. The default configuration +adds `--doctest-modules`, so keep doctest examples in `blosc2/core.py`, +`blosc2/ndarray.py`, and `blosc2/schunk.py` accurate. Use markers `heavy` and +`network` for slow or network-dependent tests. + +## Commit & Pull Request Guidelines +Recent commit messages are short, imperative sentences (e.g., “Add …”, “Fix …”) +without ticket prefixes. For pull requests: branch from `main`, add tests for +behavior changes, update docs for API changes, ensure the test suite passes, +and avoid introducing new compiler warnings. Link issues when applicable and +include clear reproduction steps for bug fixes. 
diff --git a/CMakeLists.txt b/CMakeLists.txt index ed326b2a..e14edcde 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -41,11 +41,20 @@ add_custom_command( DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/blosc2_ext.pyx" VERBATIM) +add_custom_command( + OUTPUT indexing_ext.c + COMMAND Python::Interpreter -m cython + "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/indexing_ext.pyx" --output-file indexing_ext.c + DEPENDS "${CMAKE_CURRENT_SOURCE_DIR}/src/blosc2/indexing_ext.pyx" + VERBATIM) + # ...and add it to the target Python_add_library(blosc2_ext MODULE blosc2_ext.c WITH_SOABI) +Python_add_library(indexing_ext MODULE indexing_ext.c WITH_SOABI) # We need to link against NumPy target_link_libraries(blosc2_ext PRIVATE Python::NumPy) +target_link_libraries(indexing_ext PRIVATE Python::NumPy) # Fetch and build miniexpr library include(FetchContent) @@ -63,7 +72,7 @@ endif() FetchContent_Declare(miniexpr GIT_REPOSITORY https://github.com/Blosc/miniexpr.git - GIT_TAG feadbc633a887bafd84b2fbc370ef2962d01b7ee + GIT_TAG f2faef741c4c507bf6a03167c72ce7f92c6f0ae8 # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../miniexpr ) FetchContent_MakeAvailable(miniexpr) @@ -72,6 +81,7 @@ FetchContent_MakeAvailable(miniexpr) target_link_libraries(blosc2_ext PRIVATE miniexpr_static) target_compile_features(blosc2_ext PRIVATE c_std_11) +target_compile_features(indexing_ext PRIVATE c_std_11) if(WIN32 AND CMAKE_C_COMPILER_ID STREQUAL "Clang") execute_process( COMMAND "${CMAKE_C_COMPILER}" -print-resource-dir @@ -119,7 +129,7 @@ else() include(FetchContent) FetchContent_Declare(blosc2 GIT_REPOSITORY https://github.com/Blosc/c-blosc2 - GIT_TAG b32256fc1287b6e24c22f09ac202265c7054e2bc + GIT_TAG 0568990388e6201240b170947d4c2199572f795d # SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../c-blosc2 ) FetchContent_MakeAvailable(blosc2) @@ -148,7 +158,7 @@ endif() # Python extension -> site-packages/blosc2 install( - TARGETS blosc2_ext + TARGETS blosc2_ext indexing_ext LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/blosc2 ) 
diff --git a/bench/indexing/blosc2-vs-duckdb-indexes.md b/bench/indexing/blosc2-vs-duckdb-indexes.md new file mode 100644 index 00000000..b65022d3 --- /dev/null +++ b/bench/indexing/blosc2-vs-duckdb-indexes.md @@ -0,0 +1,360 @@ +# Blosc2 vs DuckDB Indexes + +This note summarizes the benchmark comparisons we ran between Blosc2 indexes and DuckDB indexing/pruning +mechanisms on a 10M-row structured dataset. + +The goal is not to claim a universal winner, but to document the current observed tradeoffs around: + +- index creation time +- lookup latency +- total storage footprint +- sensitivity to query shape + +The latest width-1 single-value figures below come from a fresh run on a Mac mini with an M4 Pro CPU +and 24 GB of RAM. + + +## Benchmark Setup + +### Dataset + +- Rows: `10,000,000` +- Schema: + - `id`: indexed field, `float64` + - `payload`: deterministic nontrivial ramp payload +- Distribution: `random` + - true random shuffle of `id` +- Query widths tested: + - `50` + - `1` + +### Blosc2 + +- Script: `index_query_bench.py` +- Index kinds: + - `ultralight` + - `light` + - `medium` + - `full` +- Default geometry in these runs: + - `chunks=1,250,000` + - `blocks=10,000` + +### DuckDB + +- Script: `duckdb_query_bench.py` +- Layouts: + - `zonemap` + - `art-index` +- Batch size used while loading: + - `1,250,000` + + +## Important Context + +There are two different DuckDB query shapes that matter a lot: + +- range form: + - `id >= lo AND id <= hi` +- single-value form: + - `id = value` + +For Blosc2, switching between a collapsed width-1 range and `==` makes only a small difference in practice. + +For DuckDB, this difference is very important: + +- `art-index` was much slower with the range form +- `art-index` became much faster with the single-value `=` predicate + +So any DuckDB comparison must state which predicate shape was used. 
+ + +## Width-50 Comparison + +### DuckDB + +Command: + +```bash +python duckdb_query_bench.py \ + --size 10M \ + --outdir /tmp/duckdb-bench-smoke2 \ + --dist random \ + --query-width 50 \ + --layout all \ + --repeats 1 +``` + +Observed results: + +- `zonemap` + - build: `1180.630 ms` + - filtered lookup: `13.326 ms` + - DB size: `56,111,104` bytes +- `art-index` + - build: `2844.010 ms` + - filtered lookup: `12.419 ms` + - DB size: `478,687,232` bytes + +### Blosc2 + +Command: + +```bash +python index_query_bench.py \ + --size 10M \ + --outdir /tmp/indexes-10M \ + --kind light \ + --query-width 50 \ + --in-mem \ + --dist random +``` + +Observed `light` results: + +- build: `705.193 ms` +- cold lookup: `6.370 ms` +- warm lookup: `6.250 ms` +- base array size: about `31 MB` +- `light` index sidecars: about `27 MB` +- total footprint: about `58 MB` + +### Interpretation + +For this moderately selective random workload: + +- Blosc2 `light` is about `2x` faster than DuckDB `zonemap` +- Blosc2 `light` has a total footprint similar to DuckDB `zonemap` +- DuckDB `art-index` is only slightly faster than `zonemap` here, but much larger + +This suggests that Blosc2 `light` is more than a simple zonemap. It behaves like an active lossy lookup +structure rather than only coarse pruning metadata. 
+ + +## Width-1 Comparison: Generic Range Form + +### DuckDB + +Command: + +```bash +python duckdb_query_bench.py \ + --size 10M \ + --outdir /tmp/duckdb-bench-smoke2 \ + --dist random \ + --query-width 1 \ + --layout all \ + --repeats 3 +``` + +Observed results: + +- `zonemap` + - filtered lookup: `12.612 ms` +- `art-index` + - filtered lookup: `13.641 ms` + +### Blosc2 + +Command: + +```bash +python index_query_bench.py \ + --size 10M \ + --outdir /tmp/indexes-10M \ + --kind all \ + --query-width 1 \ + --dist random +``` + +Observed results: + +- `light` + - cold lookup: `0.841 ms` + - warm lookup: `0.184 ms` +- `medium` + - cold lookup: `0.564 ms` + - warm lookup: `0.168 ms` +- `full` + - cold lookup: `0.554 ms` + - warm lookup: `0.167 ms` + +### Interpretation + +With the generic width-1 range form, Blosc2 is much faster than DuckDB: + +- Blosc2 `light` is already much faster than DuckDB `zonemap`, and comfortably faster than the + generic-range DuckDB `art-index` behavior +- Blosc2 `medium` and `full` are in a different regime on warm hits, at about `0.17 ms` +- DuckDB `art-index` does not show its real point-lookup behavior in this predicate form +- Blosc2 warm reuse changes the picture substantially for repeated lookups + + +## Width-1 Comparison: Single-Value Predicate + +### DuckDB + +Command: + +```bash +python duckdb_query_bench.py \ + --size 10M \ + --outdir /tmp/duckdb-bench-smoke2 \ + --dist random \ + --query-width 1 \ + --layout all \ + --repeats 3 \ + --query-single-value +``` + +Observed results: + +- `zonemap` + - build: `509.338 ms` + - cold lookup: `4.595 ms` + - warm lookup: `2.857 ms` + - DB size: `56,111,104` bytes +- `art-index` + - build: `2000.316 ms` + - cold lookup: `0.613 ms` + - warm lookup: `0.246 ms` + - DB size: `478,425,088` bytes + +### Blosc2 + +Command: + +```bash +python index_query_bench.py \ + --size 10M \ + --outdir /tmp/indexes-10M \ + --kind all \ + --query-width 1 \ + --dist random \ + --query-single-value +``` + 
+Observed results: + +- `light` + - build: `960.048 ms` + - cold lookup: `2.489 ms` + - warm lookup: `0.172 ms` + - index sidecars: `27,497,393` bytes +- `medium` + - build: `4745.880 ms` + - cold lookup: `2.202 ms` + - warm lookup: `0.147 ms` + - index sidecars: `37,645,201` bytes +- `full` + - build: `9539.843 ms` + - cold lookup: `1.753 ms` + - warm lookup: `0.144 ms` + - index sidecars: `29,888,673` bytes + +### Interpretation + +Once DuckDB is allowed to use the more planner-friendly single-value predicate: + +- `art-index` becomes very fast +- `art-index` is clearly faster than Blosc2 on cold point lookups in this run +- Blosc2 is clearly faster on warm repeated point lookups across `light`, `medium`, and `full` + +However, the storage costs are very different: + +- DuckDB `art-index` database size: about `478.4 MB` +- DuckDB zonemap baseline size: about `56.1 MB` +- estimated ART overhead over baseline: about `422.3 MB` +- Blosc2 `full` base + index footprint: about `31 MB + 29.9 MB = 60.9 MB` + +So for true point lookups: + +- DuckDB `art-index` wins on cold point-lookup latency in this measurement +- Blosc2 `full` remains much smaller overall +- Blosc2 `light`, `medium`, and `full` all become faster than DuckDB `art-index` on warm repeated hits +- DuckDB `art-index` still has a very large storage premium over both Blosc2 `light` and `full` + + +## Blosc2 Light vs DuckDB Zonemap + +This is the cleanest cross-system comparison, because both are lossy pruning structures rather than exact +secondary indexes. 
+
+Main observations:
+
+- storage footprint is in roughly the same ballpark
+  - DuckDB zonemap DB: about `56 MB`
+  - Blosc2 base + `light`: about `58 MB`
+- Blosc2 `light` lookup speed is much better
+  - width `50`: about `6.25 ms` vs `13.33 ms`
+  - width `1` range: about `0.18 ms` warm vs `12.61 ms` generic-range DuckDB
+  - width `1` equality: about `0.17 ms` warm vs `2.86 ms` DuckDB zonemap warm
+
+Conclusion:
+
+- DuckDB zonemap is closer in spirit to Blosc2 `light` than DuckDB ART is
+- but Blosc2 `light` is a materially stronger lookup structure on these workloads
+
+
+## Blosc2 Full vs DuckDB ART
+
+This is the most relevant exact-index comparison.
+
+Main observations:
+
+- point-lookup latency
+  - DuckDB `art-index`: `0.613 ms` cold, `0.246 ms` warm
+  - Blosc2 `full`: `1.753 ms` cold, `0.144 ms` warm
+- build time
+  - DuckDB `art-index`: `2000.316 ms`
+  - Blosc2 `full`: `9539.843 ms`
+- footprint
+  - DuckDB `art-index` DB: about `478.4 MB`
+  - Blosc2 `full` base + index: about `60.9 MB`
+
+Conclusion:
+
+- Blosc2 `full` wins on storage efficiency
+- DuckDB `art-index` wins on cold point-lookup latency
+- Warm repeated point lookups favor Blosc2 `full` more clearly
+- DuckDB `art-index` is much faster to build than Blosc2 `full`
+- DuckDB ART is much more sensitive to predicate shape
+
+
+## Why `--query-single-value` Matters More in DuckDB
+
+Observed behavior:
+
+- Blosc2:
+  - width-1 range form and `==` are close, with `==` giving a small but measurable improvement
+- DuckDB:
+  - width-1 range form was much slower than `id = value`
+
+Practical implication:
+
+- Blosc2 benchmarks are fairly robust to whether a point lookup is written as `==` or as a collapsed range
+- DuckDB benchmarks must distinguish those two forms explicitly, otherwise ART performance is understated
+
+
+## Caveats
+
+- These results come from one hardware/software setup and one dataset shape.
+- DuckDB stores table data and indexes in one DB file, so payload and index bytes cannot be separated as cleanly + as in Blosc2. +- DuckDB zonemap is built-in table pruning metadata, not a separately managed index. +- Blosc2 and DuckDB are not identical systems: + - Blosc2 benchmark operates over compressed array storage and explicit index sidecars + - DuckDB benchmark operates over a columnar SQL engine with its own optimizer behavior + + +## Current Takeaways + +1. Blosc2 `light` is very competitive against DuckDB zonemap-like pruning. +2. Blosc2 `light` offers much faster selective lookups than DuckDB zonemap at a similar total storage cost. +3. DuckDB `art-index` becomes strong only when queries are written as true equality predicates. +4. On true point lookups, DuckDB `art-index` wins on cold latency in the current M4 Pro run, but + Blosc2 exact indexes are markedly better on warm repeated lookups. +5. Blosc2 exact indexes remain dramatically smaller on disk than DuckDB `art-index`. +6. Query-shape sensitivity is a major difference: + - small for Blosc2 + - large for DuckDB ART diff --git a/bench/indexing/duckdb_query_bench.py b/bench/indexing/duckdb_query_bench.py new file mode 100644 index 00000000..283d29b5 --- /dev/null +++ b/bench/indexing/duckdb_query_bench.py @@ -0,0 +1,604 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import math +import os +import re +import statistics +import time +from pathlib import Path + +import duckdb +import numpy as np +import pyarrow as pa + +SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) +DEFAULT_REPEATS = 3 +DISTS = ("sorted", "block-shuffled", "permuted", "random") +LAYOUTS = ("zonemap", "art-index") +RNG_SEED = 0 +DEFAULT_BATCH_SIZE = 1_250_000 +DATASET_LAYOUT_VERSION = "payload-ramp-v1" + +COLD_COLUMNS = [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("layout", lambda result: result["layout"]), + ("create_ms", lambda result: f"{result['create_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['cold_scan_ms']:.3f}"), + ("query_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), + ("db_bytes", lambda result: f"{result['db_bytes']:,}"), + ("query_rows", lambda result: f"{result['query_rows']:,}"), +] + +WARM_COLUMNS = [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("layout", lambda result: result["layout"]), + ("create_ms", lambda result: f"{result['create_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['warm_scan_ms']:.3f}"), + ("query_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), + ("speedup", lambda result: f"{result['warm_speedup']:.2f}x" if result["warm_speedup"] is not None else "-"), + ("db_bytes", lambda result: f"{result['db_bytes']:,}"), + ("query_rows", lambda result: f"{result['query_rows']:,}"), +] + + +def dtype_token(dtype: np.dtype) -> str: + return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") + + +def payload_slice(start: int, stop: int) -> np.ndarray: + return np.arange(start, stop, dtype=np.float32) + + +def 
make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + values = np.zeros(size, dtype=dtype) + values[size // 2 :] = True + return values + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + start = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + start = max(int(info.min), -(unique_count // 2)) + positions = np.arange(size, dtype=np.int64) + values = start + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + return np.linspace(-span / 2, span / 2, num=size, endpoint=False, dtype=dtype) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_slice(size: int, start: int, stop: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if stop <= start: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + values = np.zeros(stop - start, dtype=dtype) + true_start = max(start, size // 2) + if true_start < stop: + values[true_start - start :] = True + return values + + positions = np.arange(start, stop, dtype=np.int64) + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_at(size: int, index: int, dtype: np.dtype) -> object: + return ordered_id_slice(size, index, index + 1, 
dtype)[0].item() + + +def ordered_ids_from_positions(positions: np.ndarray, size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if positions.size == 0: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + return (positions >= (size // 2)).astype(dtype, copy=False) + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def _block_order(size: int, block_len: int) -> np.ndarray: + nblocks = (size + block_len - 1) // block_len + return np.random.default_rng(RNG_SEED).permutation(nblocks) + + +def _fill_block_shuffled_ids( + ids: np.ndarray, size: int, start: int, stop: int, block_len: int, order: np.ndarray +) -> None: + cursor = start + out_cursor = 0 + while cursor < stop: + dest_block = cursor // block_len + block_offset = cursor % block_len + src_block = int(order[dest_block]) + src_start = src_block * block_len + block_offset + take = min(stop - cursor, block_len - block_offset, size - src_start) + ids[out_cursor : out_cursor + take] = ordered_id_slice(size, src_start, src_start + take, ids.dtype) + cursor += take + out_cursor += take + + +def _permuted_position_params(size: int) -> tuple[int, int]: + if size <= 1: + return 1, 0 + rng = np.random.default_rng(RNG_SEED) + step = int(rng.integers(1, size)) + while math.gcd(step, size) != 1: + step += 1 + if step >= size: + step = 1 + offset = int(rng.integers(0, size)) + return step, offset + + +def _fill_permuted_ids(ids: 
np.ndarray, size: int, start: int, stop: int, step: int, offset: int) -> None: + positions = np.arange(start, stop, dtype=np.int64) + shuffled_positions = (positions * step + offset) % size + ids[:] = ordered_ids_from_positions(shuffled_positions, size, ids.dtype) + + +def _randomized_ids(size: int, dtype: np.dtype) -> np.ndarray: + ids = make_ordered_ids(size, dtype) + np.random.default_rng(RNG_SEED).shuffle(ids) + return ids + + +def duckdb_sql_type(dtype: np.dtype) -> str: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + return "BOOLEAN" + if dtype == np.dtype(np.int8): + return "TINYINT" + if dtype == np.dtype(np.int16): + return "SMALLINT" + if dtype == np.dtype(np.int32): + return "INTEGER" + if dtype == np.dtype(np.int64): + return "BIGINT" + if dtype == np.dtype(np.uint8): + return "UTINYINT" + if dtype == np.dtype(np.uint16): + return "USMALLINT" + if dtype == np.dtype(np.uint32): + return "UINTEGER" + if dtype == np.dtype(np.uint64): + return "UBIGINT" + if dtype == np.dtype(np.float32): + return "REAL" + if dtype == np.dtype(np.float64): + return "DOUBLE" + raise ValueError(f"unsupported duckdb dtype: {dtype}") + + +def duckdb_path(outdir: Path, size: int, dist: str, id_dtype: np.dtype, layout: str, batch_size: int) -> Path: + return ( + outdir + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.layout-{layout}.batch-{batch_size}.duckdb" + ) + + +def _duckdb_wal_path(path: Path) -> Path: + return path.with_name(f"{path.name}.wal") + + +def _remove_duckdb_path(path: Path) -> None: + if path.exists(): + path.unlink() + wal_path = _duckdb_wal_path(path) + if wal_path.exists(): + wal_path.unlink() + + +def _valid_duckdb_file(path: Path, layout: str) -> bool: + if not path.exists(): + return False + + con = None + try: + con = duckdb.connect(str(path), read_only=True) + has_data = bool( + con.execute( + "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema = 'main' AND table_name = 'data'" + ).fetchone()[0] + ) 
+ if not has_data: + return False + if layout == "art-index": + index_count = con.execute( + "SELECT COUNT(*) FROM duckdb_indexes() WHERE schema_name = 'main' AND table_name = 'data' " + "AND index_name = 'data_id_idx'" + ).fetchone()[0] + return bool(index_count) + return layout == "zonemap" + except duckdb.Error: + return False + finally: + if con is not None: + con.close() + + +def build_duckdb_file( + size: int, + dist: str, + id_dtype: np.dtype, + path: Path, + *, + layout: str, + batch_size: int, +) -> float: + path.parent.mkdir(parents=True, exist_ok=True) + _remove_duckdb_path(path) + + id_type = duckdb_sql_type(id_dtype) + block_order = _block_order(size, batch_size) if dist == "block-shuffled" else None + permuted_step, permuted_offset = _permuted_position_params(size) if dist == "permuted" else (1, 0) + random_ids = _randomized_ids(size, id_dtype) if dist == "random" else None + + start_time = time.perf_counter() + con = duckdb.connect(str(path)) + try: + con.execute("PRAGMA threads=8") + con.execute(f"CREATE TABLE data (id {id_type}, payload FLOAT)") + for start in range(0, size, batch_size): + stop = min(start + batch_size, size) + ids = np.empty(stop - start, dtype=id_dtype) + if dist == "sorted": + ids[:] = ordered_id_slice(size, start, stop, id_dtype) + elif dist == "block-shuffled": + _fill_block_shuffled_ids(ids, size, start, stop, batch_size, block_order) + elif dist == "permuted": + _fill_permuted_ids(ids, size, start, stop, permuted_step, permuted_offset) + elif dist == "random": + ids[:] = random_ids[start:stop] + else: + raise ValueError(f"unsupported distribution {dist!r}") + + payload = payload_slice(start, stop) + batch = pa.table({"id": ids, "payload": payload}) + con.register("batch_arrow", batch) + con.execute("INSERT INTO data SELECT * FROM batch_arrow") + con.unregister("batch_arrow") + + if layout == "art-index": + con.execute("CREATE INDEX data_id_idx ON data(id)") + elif layout != "zonemap": + raise ValueError(f"unsupported layout 
{layout!r}") + + con.execute("CHECKPOINT") + finally: + con.close() + return time.perf_counter() - start_time + + +def _open_or_build_duckdb_file( + size: int, + dist: str, + id_dtype: np.dtype, + path: Path, + *, + layout: str, + batch_size: int, +) -> float: + if _valid_duckdb_file(path, layout): + return 0.0 + return build_duckdb_file(size, dist, id_dtype, path, layout=layout, batch_size=batch_size) + + +def _query_bounds(size: int, query_width: int, dtype: np.dtype) -> tuple[object, object]: + lo_idx = size // 2 + hi_idx = min(size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_id_at(size, lo_idx, dtype), ordered_id_at(size, hi_idx, dtype) + + +def _literal(value: object, dtype: np.dtype) -> str: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + return "TRUE" if bool(value) else "FALSE" + if dtype.kind == "f": + return repr(float(value)) + if dtype.kind in {"i", "u"}: + return str(int(value)) + raise ValueError(f"unsupported dtype for literal formatting: {dtype}") + + +def _condition_sql(lo: object, hi: object, dtype: np.dtype, *, exact_query: bool = False) -> str: + if exact_query: + if lo != hi: + raise ValueError(f"exact queries require a single lookup value, got lo={lo!r}, hi={hi!r}") + return f"id = {_literal(lo, dtype)}" + return f"id >= {_literal(lo, dtype)} AND id <= {_literal(hi, dtype)}" + + +def benchmark_scan_once(path: Path, lo, hi, dtype: np.dtype, *, exact_query: bool = False) -> tuple[float, float, float, int]: + con = duckdb.connect(str(path), read_only=True) + try: + condition_sql = _condition_sql(lo, hi, dtype, exact_query=exact_query) + # Force the filtered baseline down the table-scan path instead of the ART index path. 
+ con.execute("SET index_scan_max_count = 0") + con.execute("SET index_scan_percentage = 0") + query = f"SELECT * FROM data WHERE {condition_sql}" + + cold_start = time.perf_counter() + table = con.execute(query).arrow().read_all() + cold_elapsed = time.perf_counter() - cold_start + + start = time.perf_counter() + table = con.execute(query).arrow().read_all() + result_len = len(table) + warm_elapsed = time.perf_counter() - start + + third_start = time.perf_counter() + con.execute(query).arrow().read_all() + third_elapsed = time.perf_counter() - third_start + return cold_elapsed, warm_elapsed, third_elapsed, result_len + finally: + con.close() + + +def benchmark_filtered_once(path: Path, lo, hi, dtype: np.dtype, *, exact_query: bool = False) -> tuple[float, int]: + con = duckdb.connect(str(path), read_only=True) + try: + condition_sql = _condition_sql(lo, hi, dtype, exact_query=exact_query) + start = time.perf_counter() + table = con.execute(f"SELECT * FROM data WHERE {condition_sql}").arrow().read_all() + ids = table["id"].to_numpy() + result_len = int(np.count_nonzero((ids >= lo) & (ids <= hi))) + elapsed = time.perf_counter() - start + return elapsed, result_len + finally: + con.close() + + +def benchmark_filtered_once_con( + con: duckdb.DuckDBPyConnection, lo, hi, dtype: np.dtype, *, exact_query: bool = False +) -> tuple[float, int]: + condition_sql = _condition_sql(lo, hi, dtype, exact_query=exact_query) + start = time.perf_counter() + table = con.execute(f"SELECT * FROM data WHERE {condition_sql}").arrow().read_all() + ids = table["id"].to_numpy() + result_len = int(np.count_nonzero((ids >= lo) & (ids <= hi))) + elapsed = time.perf_counter() - start + return elapsed, result_len + + +def median(values: list[float]) -> float: + return statistics.median(values) + + +def benchmark_layout( + size: int, + outdir: Path, + dist: str, + query_width: int, + id_dtype: np.dtype, + layout: str, + batch_size: int, + repeats: int, + exact_query: bool, +) -> dict: + path = 
duckdb_path(outdir, size, dist, id_dtype, layout, batch_size) + create_s = _open_or_build_duckdb_file(size, dist, id_dtype, path, layout=layout, batch_size=batch_size) + lo, hi = _query_bounds(size, query_width, id_dtype) + + cold_scan_elapsed, warm_scan_elapsed, third_scan_elapsed, scan_rows = benchmark_scan_once( + path, lo, hi, id_dtype, exact_query=exact_query + ) + + con = duckdb.connect(str(path), read_only=True) + try: + cold_elapsed, filtered_rows = benchmark_filtered_once_con(con, lo, hi, id_dtype, exact_query=exact_query) + warm_times = [ + benchmark_filtered_once_con(con, lo, hi, id_dtype, exact_query=exact_query)[0] * 1_000 + for _ in range(repeats) + ] + finally: + con.close() + + if scan_rows != filtered_rows: + raise AssertionError(f"filtered rows mismatch: scan={scan_rows}, filtered={filtered_rows}") + + cold_scan_ms = cold_scan_elapsed * 1_000 + warm_scan_ms = warm_scan_elapsed * 1_000 + cold_ms = cold_elapsed * 1_000 + warm_ms = median(warm_times) if warm_times else None + if layout == "zonemap": + cold_ms = third_scan_elapsed * 1_000 + + return { + "size": size, + "dist": dist, + "layout": layout, + "create_ms": create_s * 1_000, + "cold_scan_ms": cold_scan_ms, + "warm_scan_ms": warm_scan_ms, + "cold_ms": cold_ms, + "cold_speedup": cold_scan_ms / cold_ms, + "warm_ms": warm_ms, + "warm_speedup": None if warm_ms is None else warm_scan_ms / warm_ms, + "db_bytes": os.path.getsize(path), + "query_rows": int(filtered_rows), + "path": path, + } + + +def parse_human_int(value: str) -> int: + value = value.strip().lower().replace("_", "") + multipliers = {"k": 1_000, "m": 1_000_000} + if value[-1:] in multipliers: + return int(float(value[:-1]) * multipliers[value[-1]]) + return int(value) + + +def print_results( + results: list[dict], + *, + batch_size: int, + repeats: int, + dist: str, + query_width: int, + id_dtype: np.dtype, + exact_query: bool, +) -> None: + print("DuckDB range-query benchmark via SQL filtered reads") + print( + 
f"batch_size={batch_size:,}, repeats={repeats}, dist={dist}, query_width={query_width:,}, " + f"dtype={id_dtype.name}, query_single_value={exact_query}" + ) + print("Note: 'zonemap' is DuckDB's default table layout with automatic min/max pruning.") + print(" 'art-index' adds an explicit secondary index on id.") + if exact_query: + print(" Filter predicate uses `id = value`.") + else: + print(" Filter predicate uses `id >= lo AND id <= hi`.") + cold_widths = table_widths(results, COLD_COLUMNS) + print() + print("Cold Query Table") + print_table(results, COLD_COLUMNS, cold_widths) + if repeats > 0: + warm_widths = table_widths(results, WARM_COLUMNS) + shared_width_by_header = {} + for (header, _), width in zip(COLD_COLUMNS, cold_widths, strict=True): + shared_width_by_header[header] = width + for (header, _), width in zip(WARM_COLUMNS, warm_widths, strict=True): + shared_width_by_header[header] = max(shared_width_by_header.get(header, 0), width) + warm_widths = [shared_width_by_header[header] for header, _ in WARM_COLUMNS] + print() + print("Warm Query Table") + print_table(results, WARM_COLUMNS, warm_widths) + + +def _format_row(cells: list[str], widths: list[int]) -> str: + return " ".join(cell.ljust(width) for cell, width in zip(cells, widths, strict=True)) + + +def _table_rows(results: list[dict], columns: list[tuple[str, callable]]) -> tuple[list[str], list[list[str]], list[int]]: + headers = [header for header, _ in columns] + widths = [len(header) for header in headers] + rows = [[formatter(result) for _, formatter in columns] for result in results] + for row in rows: + widths = [max(width, len(cell)) for width, cell in zip(widths, row, strict=True)] + return headers, rows, widths + + +def print_table(results: list[dict], columns: list[tuple[str, callable]], widths: list[int] | None = None) -> None: + headers, rows, computed_widths = _table_rows(results, columns) + widths = computed_widths if widths is None else widths + print(_format_row(headers, widths)) + 
print(_format_row(["-" * width for width in widths], widths)) + for row in rows: + print(_format_row(row, widths)) + + +def table_widths(results: list[dict], columns: list[tuple[str, callable]]) -> list[int]: + _, _, widths = _table_rows(results, columns) + return widths + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--size", default="10M", help="Number of rows, or 'all'. Default: 10M.") + parser.add_argument("--outdir", type=Path, required=True, help="Directory for generated DuckDB files.") + parser.add_argument("--dist", choices=(*DISTS, "all"), default="permuted", help="Row distribution.") + parser.add_argument("--layout", choices=(*LAYOUTS, "all"), default="all", help="DuckDB layout to benchmark.") + parser.add_argument("--query-width", type=parse_human_int, default=1, help="Query width. Default: 1.") + parser.add_argument( + "--query-single-value", + action=argparse.BooleanOptionalAction, + default=False, + help="Use `id = value` instead of a range predicate. Requires query-width=1.", + ) + parser.add_argument("--dtype", default="float64", help="Indexed id dtype. Default: float64.") + parser.add_argument( + "--batch-size", + type=parse_human_int, + default=DEFAULT_BATCH_SIZE, + help="Batch size used while loading the table. Default: 1.25M.", + ) + parser.add_argument("--repeats", type=int, default=DEFAULT_REPEATS, help="Benchmark repeats. 
Default: 3.") + args = parser.parse_args() + + if args.query_single_value and args.query_width != 1: + raise ValueError("--query-single-value requires --query-width 1") + + id_dtype = np.dtype(args.dtype) + sizes = SIZES if args.size == "all" else (parse_human_int(args.size),) + dists = DISTS if args.dist == "all" else (args.dist,) + layouts = LAYOUTS if args.layout == "all" else (args.layout,) + + results = [] + for size in sizes: + for dist in dists: + for layout in layouts: + results.append( + benchmark_layout( + size, + args.outdir, + dist, + args.query_width, + id_dtype, + layout, + args.batch_size, + args.repeats, + args.query_single_value, + ) + ) + + print_results( + results, + batch_size=args.batch_size, + repeats=args.repeats, + dist=args.dist, + query_width=args.query_width, + id_dtype=id_dtype, + exact_query=args.query_single_value, + ) + + +if __name__ == "__main__": + main() diff --git a/bench/indexing/index_query_bench.py b/bench/indexing/index_query_bench.py new file mode 100644 index 00000000..54d844f9 --- /dev/null +++ b/bench/indexing/index_query_bench.py @@ -0,0 +1,927 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
#
# SPDX-License-Identifier: BSD-3-Clause
#######################################################################

from __future__ import annotations

import argparse
import math
import os
import re
import statistics
import tempfile
import time
from pathlib import Path

import numpy as np

import blosc2
from blosc2 import indexing as blosc2_indexing

# Benchmark defaults and catalog of supported options.
SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000)  # default row counts
DEFAULT_REPEATS = 3
KINDS = ("ultralight", "light", "medium", "full")  # index kinds benchmarked
DEFAULT_KIND = "light"
DISTS = ("sorted", "block-shuffled", "permuted", "random")  # id distributions
RNG_SEED = 0  # fixed seed so generated datasets are reproducible across runs
DEFAULT_OPLEVEL = 5
FULL_QUERY_MODES = ("auto", "selective-ooc", "whole-load")
DATASET_LAYOUT_VERSION = "payload-ramp-v1"  # embedded in file names; bump to invalidate cached datasets

# (header, formatter) pairs describing the cold-query results table.
COLD_COLUMNS = [
    ("rows", lambda result: f"{result['size']:,}"),
    ("dist", lambda result: result["dist"]),
    ("builder", lambda result: "mem" if result["in_mem"] else "ooc"),
    ("kind", lambda result: result["kind"]),
    ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"),
    ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"),
    ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"),
    ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"),
    ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"),
    ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"),
    ("index_pct", lambda result: f"{result['index_pct']:.4f}%"),
    ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"),
]

# Warm-query table; warm cells render as "-" until warm timings are measured.
WARM_COLUMNS = [
    ("rows", lambda result: f"{result['size']:,}"),
    ("dist", lambda result: result["dist"]),
    ("builder", lambda result: "mem" if result["in_mem"] else "ooc"),
    ("kind", lambda result: result["kind"]),
    ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"),
    ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"),
    ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"),
    (
        "speedup",
        lambda result: f"{result['warm_speedup']:.2f}x" if result["warm_speedup"] is not None else "-",
    ),
    ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"),
    ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"),
    ("index_pct", lambda result: f"{result['index_pct']:.4f}%"),
    ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"),
]


def dtype_token(dtype: np.dtype) -> str:
    """Return a filename-safe token for `dtype` (runs of non-alphanumerics become '_')."""
    return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_")


def source_dtype(id_dtype: np.dtype) -> np.dtype:
    """Structured row dtype: the indexed 'id' field plus a float32 'payload' field."""
    return np.dtype([("id", np.dtype(id_dtype)), ("payload", np.float32)])


def payload_slice(start: int, stop: int) -> np.ndarray:
    """Deterministic nontrivial payload values for structured benchmark rows."""
    return np.arange(start, stop, dtype=np.float32)


def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray:
    """Build `size` non-decreasing ids for `dtype`.

    bool: first half False, second half True.  Integers: an integer ramp using
    at most the number of representable values (values repeat when `size`
    exceeds it; signed ramps are re-centered around zero in that case).
    Floats: an evenly spaced ramp over [-size/2, size/2).  Raises ValueError
    for any other dtype kind.
    """
    dtype = np.dtype(dtype)
    if dtype == np.dtype(np.bool_):
        values = np.zeros(size, dtype=dtype)
        values[size // 2 :] = True
        return values

    if dtype.kind in {"i", "u"}:
        info = np.iinfo(dtype)
        # How many distinct id values this dtype can actually represent.
        unique_count = min(size, int(info.max) - int(info.min) + 1)
        start = int(info.min) if unique_count < size and dtype.kind == "i" else 0
        if dtype.kind == "i" and unique_count < size:
            # Signed and saturated: center the ramp around zero.
            start = max(int(info.min), -(unique_count // 2))
        positions = np.arange(size, dtype=np.int64)
        values = start + (positions * unique_count) // size
        return values.astype(dtype, copy=False)

    if dtype.kind == "f":
        span = max(1, size)
        return np.linspace(-span / 2, span / 2, num=size, endpoint=False, dtype=dtype)

    raise ValueError(f"unsupported dtype for benchmark: {dtype}")


def ordered_id_slice(size: int, start: int, stop: int, dtype: np.dtype) -> np.ndarray:
    """Return make_ordered_ids(size, dtype)[start:stop] without materializing all ids.

    Must stay numerically consistent with make_ordered_ids for every dtype kind.
    Returns an empty array when stop <= start.
    """
    dtype = np.dtype(dtype)
    if stop <= start:
        return np.empty(0, dtype=dtype)

    if dtype == np.dtype(np.bool_):
        values = np.zeros(stop - start, dtype=dtype)
        # Ids switch to True at the midpoint of the full array.
        true_start = max(start, size // 2)
        if true_start < stop:
            values[true_start - start :] = True
        return values

    positions = np.arange(start, stop, dtype=np.int64)
    if dtype.kind in {"i", "u"}:
        info = np.iinfo(dtype)
        unique_count = min(size, int(info.max) - int(info.min) + 1)
        base = int(info.min) if unique_count < size and dtype.kind == "i" else 0
        if dtype.kind == "i" and unique_count < size:
            base = max(int(info.min), -(unique_count // 2))
        values = base + (positions * unique_count) // size
        return values.astype(dtype, copy=False)

    if dtype.kind == "f":
        # Matches the linspace ramp in make_ordered_ids: step 1, offset -span/2.
        span = max(1, size)
        values = positions.astype(np.float64, copy=False) - (span / 2)
        return values.astype(dtype, copy=False)

    raise ValueError(f"unsupported dtype for benchmark: {dtype}")


def ordered_id_at(size: int, index: int, dtype: np.dtype) -> object:
    """Return the single ordered id at `index` as a Python scalar."""
    return ordered_id_slice(size, index, index + 1, dtype)[0].item()


def ordered_ids_from_positions(positions: np.ndarray, size: int, dtype: np.dtype) -> np.ndarray:
    """Vectorized map from arbitrary row positions to their ordered-id values.

    Same value formulas as make_ordered_ids / ordered_id_slice.
    """
    dtype = np.dtype(dtype)
    if positions.size == 0:
        return np.empty(0, dtype=dtype)

    if dtype == np.dtype(np.bool_):
        return (positions >= (size // 2)).astype(dtype, copy=False)

    if dtype.kind in {"i", "u"}:
        info = np.iinfo(dtype)
        unique_count = min(size, int(info.max) - int(info.min) + 1)
        base = int(info.min) if unique_count < size and dtype.kind == "i" else 0
        if dtype.kind == "i" and unique_count < size:
            base = max(int(info.min), -(unique_count // 2))
        values = base + (positions * unique_count) // size
        return values.astype(dtype, copy=False)

    if dtype.kind == "f":
        span = max(1, size)
        values = positions.astype(np.float64, copy=False) - (span / 2)
        return values.astype(dtype, copy=False)

    raise ValueError(f"unsupported dtype for benchmark: {dtype}")


def fill_ids(ids: np.ndarray, ordered_ids: np.ndarray, dist: str, rng: np.random.Generator, block_len: int) -> None:
    """Fill `ids` in place with `ordered_ids` arranged per the requested distribution."""
    size = ids.shape[0]
    if dist == "sorted":
        ids[:] = ordered_ids
        return

    if dist == "block-shuffled":
        # Ids stay sorted within each block_len-sized block; block order is shuffled.
        nblocks = (size + block_len - 1) // block_len
        order = rng.permutation(nblocks)
dest = 0 + for src_block in order: + src_start = int(src_block) * block_len + src_stop = min(src_start + block_len, size) + block_size = src_stop - src_start + ids[dest : dest + block_size] = ordered_ids[src_start:src_stop] + dest += block_size + return + + if dist == "random": + ids[:] = ordered_ids + rng.shuffle(ids) + return + + raise ValueError(f"unsupported distribution {dist!r}") + + +def _geometry_value_token(value: int | None) -> str: + return "auto" if value is None else f"{value}" + + +def geometry_token(chunks: int | None, blocks: int | None) -> str: + return f"chunks-{_geometry_value_token(chunks)}.blocks-{_geometry_value_token(blocks)}" + + +def format_geometry_value(value: int | None) -> str: + return "auto" if value is None else f"{value:,}" + + +def resolve_geometry(shape: tuple[int, ...], dtype: np.dtype, chunks: int | None, blocks: int | None) -> tuple[int, int]: + chunk_spec = None if chunks is None else (chunks,) + block_spec = None if blocks is None else (blocks,) + resolved_chunks, resolved_blocks = blosc2.compute_chunks_blocks(shape, chunk_spec, block_spec, dtype=dtype) + return int(resolved_chunks[0]), int(resolved_blocks[0]) + + +def _block_order(size: int, block_len: int) -> np.ndarray: + nblocks = (size + block_len - 1) // block_len + return np.random.default_rng(RNG_SEED).permutation(nblocks) + + +def _fill_block_shuffled_ids( + ids: np.ndarray, size: int, start: int, stop: int, block_len: int, order: np.ndarray +) -> None: + cursor = start + out_cursor = 0 + while cursor < stop: + dest_block = cursor // block_len + block_offset = cursor % block_len + src_block = int(order[dest_block]) + src_start = src_block * block_len + block_offset + take = min(stop - cursor, block_len - block_offset, size - src_start) + ids[out_cursor : out_cursor + take] = ordered_id_slice(size, src_start, src_start + take, ids.dtype) + cursor += take + out_cursor += take + + +def _permuted_position_params(size: int) -> tuple[int, int]: + if size <= 1: + return 1, 
0 + rng = np.random.default_rng(RNG_SEED) + step = int(rng.integers(1, size)) + while math.gcd(step, size) != 1: + step += 1 + if step >= size: + step = 1 + offset = int(rng.integers(0, size)) + return step, offset + + +def _fill_permuted_ids(ids: np.ndarray, size: int, start: int, stop: int, step: int, offset: int) -> None: + positions = np.arange(start, stop, dtype=np.int64) + shuffled_positions = (positions * step + offset) % size + ids[:] = ordered_ids_from_positions(shuffled_positions, size, ids.dtype) + + +def _randomized_ids(size: int, dtype: np.dtype) -> np.ndarray: + ids = make_ordered_ids(size, dtype) + np.random.default_rng(RNG_SEED).shuffle(ids) + return ids + + +def build_persistent_array( + size: int, dist: str, id_dtype: np.dtype, path: Path, chunks: int | None, blocks: int | None +) -> blosc2.NDArray: + dtype = source_dtype(id_dtype) + kwargs = {"urlpath": path, "mode": "w"} + if chunks is not None: + kwargs["chunks"] = (chunks,) + if blocks is not None: + kwargs["blocks"] = (blocks,) + arr = blosc2.zeros((size,), dtype=dtype, **kwargs) + chunk_len = int(arr.chunks[0]) + block_len = int(arr.blocks[0]) + block_order = _block_order(size, block_len) if dist == "block-shuffled" else None + permuted_step, permuted_offset = _permuted_position_params(size) if dist == "permuted" else (1, 0) + random_ids = _randomized_ids(size, id_dtype) if dist == "random" else None + for start in range(0, size, chunk_len): + stop = min(start + chunk_len, size) + chunk = np.zeros(stop - start, dtype=dtype) + if dist == "sorted": + chunk["id"] = ordered_id_slice(size, start, stop, id_dtype) + elif dist == "block-shuffled": + _fill_block_shuffled_ids(chunk["id"], size, start, stop, block_len, block_order) + elif dist == "permuted": + _fill_permuted_ids(chunk["id"], size, start, stop, permuted_step, permuted_offset) + elif dist == "random": + chunk["id"] = random_ids[start:stop] + else: + raise ValueError(f"unsupported distribution {dist!r}") + chunk["payload"] = 
payload_slice(start, stop) + arr[start:stop] = chunk + return arr + + +def base_array_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None) -> Path: + return ( + size_dir + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.{geometry_token(chunks, blocks)}.b2nd" + ) + + +def indexed_array_path( + size_dir: Path, + size: int, + dist: str, + kind: str, + optlevel: int, + id_dtype: np.dtype, + in_mem: bool, + chunks: int | None, + blocks: int | None, + codec: blosc2.Codec | None, + clevel: int | None, + nthreads: int | None, +) -> Path: + mode = "mem" if in_mem else "ooc" + codec_token = "codec-auto" if codec is None else f"codec-{codec.name}" + clevel_token = "clevel-auto" if clevel is None else f"clevel-{clevel}" + thread_token = "threads-auto" if nthreads is None else f"threads-{nthreads}" + return ( + size_dir + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.{geometry_token(chunks, blocks)}.{codec_token}.{clevel_token}.{thread_token}" + f".{kind}.opt{optlevel}.{mode}.b2nd" + ) + + +def benchmark_scan_once(expr) -> tuple[float, int]: + start = time.perf_counter() + result = expr.compute(_use_index=False)[:] + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def benchmark_index_once(arr: blosc2.NDArray, cond) -> tuple[float, int]: + start = time.perf_counter() + result = arr[cond][:] + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def _with_full_query_mode(full_query_mode: str): + class _FullQueryModeScope: + def __enter__(self): + self.previous = os.environ.get("BLOSC2_FULL_EXACT_QUERY_MODE") + os.environ["BLOSC2_FULL_EXACT_QUERY_MODE"] = full_query_mode + + def __exit__(self, exc_type, exc, tb): + if self.previous is None: + os.environ.pop("BLOSC2_FULL_EXACT_QUERY_MODE", None) + else: + os.environ["BLOSC2_FULL_EXACT_QUERY_MODE"] = self.previous + + return _FullQueryModeScope() + + +def index_sizes(descriptor: dict) -> 
tuple[int, int]: + logical = 0 + disk = 0 + for level_info in descriptor["levels"].values(): + dtype = np.dtype(level_info["dtype"]) + logical += dtype.itemsize * level_info["nsegments"] + if level_info["path"]: + disk += os.path.getsize(level_info["path"]) + + light = descriptor.get("light") + if light is not None: + for key in ("values_path", "bucket_positions_path", "offsets_path"): + array = blosc2.open(light[key]) + logical += int(np.prod(array.shape)) * array.dtype.itemsize + disk += os.path.getsize(light[key]) + + reduced = descriptor.get("reduced") + if reduced is not None: + values = blosc2.open(reduced["values_path"]) + positions = blosc2.open(reduced["positions_path"]) + offsets = blosc2.open(reduced["offsets_path"]) + logical += values.shape[0] * values.dtype.itemsize + logical += positions.shape[0] * positions.dtype.itemsize + logical += offsets.shape[0] * offsets.dtype.itemsize + disk += os.path.getsize(reduced["values_path"]) + disk += os.path.getsize(reduced["positions_path"]) + disk += os.path.getsize(reduced["offsets_path"]) + + full = descriptor.get("full") + if full is not None: + values = blosc2.open(full["values_path"]) + positions = blosc2.open(full["positions_path"]) + logical += values.shape[0] * values.dtype.itemsize + logical += positions.shape[0] * positions.dtype.itemsize + disk += os.path.getsize(full["values_path"]) + disk += os.path.getsize(full["positions_path"]) + return logical, disk + + +def _query_bounds(size: int, query_width: int, dtype: np.dtype) -> tuple[object, object]: + if size <= 0: + raise ValueError("benchmark arrays must not be empty") + + lo_idx = size // 2 + hi_idx = min(size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_id_at(size, lo_idx, dtype), ordered_id_at(size, hi_idx, dtype) + + +def _literal(value: object, dtype: np.dtype) -> str: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + return "True" if bool(value) else "False" + if dtype.kind == "f": + return repr(float(value)) + if 
dtype.kind in {"i", "u"}: + return str(int(value)) + raise ValueError(f"unsupported dtype for literal formatting: {dtype}") + + +def _condition_expr(lo: object, hi: object, dtype: np.dtype, *, query_single_value: bool = False) -> str: + lo_literal = _literal(lo, dtype) + if query_single_value: + if lo != hi: + raise ValueError(f"single-value queries require a single lookup value, got lo={lo!r}, hi={hi!r}") + return f"id == {lo_literal}" + hi_literal = _literal(hi, dtype) + return f"(id >= {lo_literal}) & (id <= {hi_literal})" + + +def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int, in_mem: bool) -> dict | None: + for descriptor in arr.indexes: + if descriptor.get("version") != blosc2_indexing.INDEX_FORMAT_VERSION: + continue + expected_ooc = descriptor.get("ooc", False) if kind == "ultralight" else (not bool(in_mem)) + if ( + descriptor.get("field") == "id" + and descriptor.get("kind") == kind + and int(descriptor.get("optlevel", -1)) == int(optlevel) + and bool(descriptor.get("ooc", False)) is bool(expected_ooc) + and not descriptor.get("stale", False) + ): + return descriptor + return None + + +def _open_or_build_persistent_array( + path: Path, size: int, dist: str, id_dtype: np.dtype, chunks: int | None, blocks: int | None +) -> blosc2.NDArray: + if path.exists(): + return blosc2.open(path, mode="a") + blosc2.remove_urlpath(path) + return build_persistent_array(size, dist, id_dtype, path, chunks, blocks) + + +def _open_or_build_indexed_array( + path: Path, + size: int, + dist: str, + id_dtype: np.dtype, + kind: str, + optlevel: int, + in_mem: bool, + chunks: int | None, + blocks: int | None, + codec: blosc2.Codec | None, + clevel: int | None, + nthreads: int | None, +) -> tuple[blosc2.NDArray, float]: + if path.exists(): + arr = blosc2.open(path, mode="a") + if _valid_index_descriptor(arr, kind, optlevel, in_mem) is not None: + return arr, 0.0 + if arr.indexes: + arr.drop_index(field="id") + blosc2.remove_urlpath(path) + + arr = 
build_persistent_array(size, dist, id_dtype, path, chunks, blocks) + build_start = time.perf_counter() + kwargs = {"field": "id", "kind": kind, "optlevel": optlevel, "in_mem": in_mem} + cparams = {} + if codec is not None: + cparams["codec"] = codec + if clevel is not None: + cparams["clevel"] = clevel + if nthreads is not None: + cparams["nthreads"] = nthreads + if cparams: + kwargs["cparams"] = cparams + arr.create_index(**kwargs) + return arr, time.perf_counter() - build_start + + +def benchmark_size( + size: int, + size_dir: Path, + dist: str, + query_width: int, + query_single_value: bool, + optlevel: int, + id_dtype: np.dtype, + in_mem: bool, + full_query_mode: str, + chunks: int | None, + blocks: int | None, + codec: blosc2.Codec | None, + clevel: int | None, + nthreads: int | None, + kinds: tuple[str, ...], + cold_row_callback=None, +) -> list[dict]: + arr = _open_or_build_persistent_array( + base_array_path(size_dir, size, dist, id_dtype, chunks, blocks), size, dist, id_dtype, chunks, blocks + ) + lo, hi = _query_bounds(size, query_width, id_dtype) + condition_str = _condition_expr(lo, hi, id_dtype, query_single_value=query_single_value) + condition = blosc2.lazyexpr(condition_str, arr.fields) + expr = condition.where(arr) + base_bytes = size * arr.dtype.itemsize + compressed_base_bytes = os.path.getsize(arr.urlpath) + + scan_ms = benchmark_scan_once(expr)[0] * 1_000 + + rows = [] + for kind in kinds: + idx_arr, build_time = _open_or_build_indexed_array( + indexed_array_path( + size_dir, size, dist, kind, optlevel, id_dtype, in_mem, chunks, blocks, codec, clevel, nthreads + ), + size, + dist, + id_dtype, + kind, + optlevel, + in_mem, + chunks, + blocks, + codec, + clevel, + nthreads, + ) + idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) + idx_expr = idx_cond.where(idx_arr) + with _with_full_query_mode(full_query_mode): + explanation = idx_expr.explain() + cold_time, index_len = benchmark_index_once(idx_arr, idx_cond) + descriptor = 
idx_arr.indexes[0] + logical_index_bytes, disk_index_bytes = index_sizes(descriptor) + + row = { + "size": size, + "dist": dist, + "kind": kind, + "optlevel": optlevel, + "in_mem": in_mem, + "query_rows": index_len, + "build_s": build_time, + "create_idx_ms": build_time * 1_000, + "scan_ms": scan_ms, + "cold_ms": cold_time * 1_000, + "cold_speedup": scan_ms / (cold_time * 1_000), + "warm_ms": None, + "warm_speedup": None, + "candidate_units": explanation["candidate_units"], + "total_units": explanation["total_units"], + "lookup_path": explanation.get("lookup_path"), + "full_query_mode": full_query_mode, + "logical_index_bytes": logical_index_bytes, + "disk_index_bytes": disk_index_bytes, + "index_pct": logical_index_bytes / base_bytes * 100, + "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, + "_arr": idx_arr, + "_cond": idx_cond, + } + rows.append(row) + if cold_row_callback is not None: + cold_row_callback(row) + return rows + + +def measure_warm_queries(rows: list[dict], repeats: int) -> None: + if repeats <= 0: + return + for result in rows: + arr = result["_arr"] + cond = result["_cond"] + with _with_full_query_mode(result["full_query_mode"]): + index_runs = [benchmark_index_once(arr, cond)[0] for _ in range(repeats)] + warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None + result["warm_ms"] = warm_ms + result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms + + +def parse_human_size(value: str) -> int: + value = value.strip() + if not value: + raise argparse.ArgumentTypeError("size must not be empty") + + suffixes = {"k": 1_000, "m": 1_000_000, "g": 1_000_000_000} + suffix = value[-1].lower() + if suffix in suffixes: + number = value[:-1] + if not number: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") + try: + parsed = int(number) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + size = parsed * suffixes[suffix] + else: + try: + size 
= int(value) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + + if size <= 0: + raise argparse.ArgumentTypeError("size must be a positive integer") + return size + + +def parse_human_size_or_auto(value: str) -> int | None: + value = value.strip() + if value.lower() == "auto": + return None + return parse_human_size(value) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Benchmark python-blosc2 index kinds.") + parser.add_argument( + "--size", + type=parse_human_size, + help="Benchmark a single array size. Supports suffixes like 1k, 1K, 1M, 1G.", + ) + parser.add_argument( + "--query-width", + type=parse_human_size, + default=1, + help="Width of the range predicate. Supports suffixes like 1k, 1K, 1M, 1G. Default: 1.", + ) + parser.add_argument( + "--query-single-value", + action=argparse.BooleanOptionalAction, + default=False, + help="Use `id == value` instead of a range predicate. Requires query-width=1.", + ) + parser.add_argument( + "--chunks", + type=parse_human_size_or_auto, + default=None, + help="Chunk size for the base array. Supports suffixes like 10k, 1M, and 'auto'. Default: auto.", + ) + parser.add_argument( + "--blocks", + type=parse_human_size_or_auto, + default=None, + help="Block size for the base array. Supports suffixes like 10k, 1M, and 'auto'. Default: auto.", + ) + parser.add_argument( + "--repeats", + type=int, + default=DEFAULT_REPEATS, + help="Number of repeated warm-query measurements after the first cold query. Default: 3.", + ) + parser.add_argument( + "--outdir", + type=Path, + help="Directory where benchmark arrays and index sidecars should be written and kept.", + ) + parser.add_argument( + "--optlevel", + type=int, + default=DEFAULT_OPLEVEL, + help="Index optlevel to use when creating indexes. Default: 5.", + ) + parser.add_argument( + "--dtype", + default="float64", + help="NumPy dtype for the indexed field. 
Examples: float64, float32, int16, bool. Default: float64.", + ) + parser.add_argument( + "--dist", + choices=(*DISTS, "all"), + default="permuted", + help="Distribution for the indexed field. Use 'all' to benchmark every distribution.", + ) + parser.add_argument( + "--kind", + choices=(*KINDS, "all"), + default=DEFAULT_KIND, + help=f"Index kind to benchmark. Use 'all' to benchmark every kind. Default: {DEFAULT_KIND}.", + ) + parser.add_argument( + "--in-mem", + action=argparse.BooleanOptionalAction, + default=False, + help="Use the in-memory index builders. Disabled by default; pass --in-mem to force them.", + ) + parser.add_argument( + "--full-query-mode", + choices=FULL_QUERY_MODES, + default="auto", + help="How full exact queries should run during the benchmark: auto, selective-ooc, or whole-load.", + ) + parser.add_argument( + "--codec", + type=str, + default=None, + choices=[codec.name for codec in blosc2.Codec], + help="Codec to use for index sidecars. Default: library default.", + ) + parser.add_argument( + "--clevel", + type=int, + default=None, + help="Compression level to use for index sidecars. Default: library default.", + ) + parser.add_argument( + "--nthreads", + type=int, + default=None, + help="Number of threads to use for index creation. 
Default: use blosc2.nthreads.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + if args.repeats < 0: + raise SystemExit("--repeats must be >= 0") + if args.query_single_value and args.query_width != 1: + raise SystemExit("--query-single-value requires --query-width 1") + try: + id_dtype = np.dtype(args.dtype) + except TypeError as exc: + raise SystemExit(f"unsupported dtype {args.dtype!r}") from exc + if id_dtype.kind not in {"b", "i", "u", "f"}: + raise SystemExit(f"--dtype only supports bool, integer, and floating-point dtypes; got {id_dtype}") + codec = None if args.codec is None else blosc2.Codec[args.codec] + if args.clevel is not None and args.clevel < 0: + raise SystemExit("--clevel must be >= 0") + if args.nthreads is not None and args.nthreads <= 0: + raise SystemExit("--nthreads must be a positive integer") + sizes = (args.size,) if args.size is not None else SIZES + dists = DISTS if args.dist == "all" else (args.dist,) + kinds = KINDS if args.kind == "all" else (args.kind,) + + if args.outdir is None: + with tempfile.TemporaryDirectory() as tmpdir: + run_benchmarks( + sizes, + dists, + kinds, + Path(tmpdir), + args.dist, + args.query_width, + args.query_single_value, + args.repeats, + args.optlevel, + id_dtype, + args.in_mem, + args.full_query_mode, + args.chunks, + args.blocks, + codec, + args.clevel, + args.nthreads, + ) + else: + args.outdir.mkdir(parents=True, exist_ok=True) + run_benchmarks( + sizes, + dists, + kinds, + args.outdir, + args.dist, + args.query_width, + args.query_single_value, + args.repeats, + args.optlevel, + id_dtype, + args.in_mem, + args.full_query_mode, + args.chunks, + args.blocks, + codec, + args.clevel, + args.nthreads, + ) + + +def run_benchmarks( + sizes: tuple[int, ...], + dists: tuple[str, ...], + kinds: tuple[str, ...], + size_dir: Path, + dist_label: str, + query_width: int, + query_single_value: bool, + repeats: int, + optlevel: int, + id_dtype: np.dtype, + in_mem: bool, + 
    full_query_mode: str,
    chunks: int | None,
    blocks: int | None,
    codec: blosc2.Codec | None,
    clevel: int | None,
    nthreads: int | None,
) -> None:
    all_results = []

    array_dtype = source_dtype(id_dtype)
    # Resolve the chunk/block geometry for every requested size; collapse the
    # banner label to concrete numbers only when all sizes resolve identically.
    resolved_geometries = {resolve_geometry((size,), array_dtype, chunks, blocks) for size in sizes}
    if len(resolved_geometries) == 1:
        resolved_chunk_len, resolved_block_len = next(iter(resolved_geometries))
        geometry_label = f"chunks={resolved_chunk_len:,}, blocks={resolved_block_len:,}"
    else:
        geometry_label = "chunks=varies, blocks=varies"
    print("Structured range-query benchmark across index kinds")
    print(
        f"{geometry_label}, repeats={repeats}, dist={dist_label}, "
        f"query_width={query_width:,}, optlevel={optlevel}, dtype={id_dtype.name}, in_mem={in_mem}, "
        f"query_single_value={query_single_value}, "
        f"full_query_mode={full_query_mode}, index_codec={'auto' if codec is None else codec.name}, "
        f"index_clevel={'auto' if clevel is None else clevel}, "
        f"index_nthreads={'auto' if nthreads is None else nthreads}"
    )
    # Cold measurements for every (dist, size) combination, all index kinds.
    for dist in dists:
        for size in sizes:
            size_results = benchmark_size(
                size,
                size_dir,
                dist,
                query_width,
                query_single_value,
                optlevel,
                id_dtype,
                in_mem,
                full_query_mode,
                chunks,
                blocks,
                codec,
                clevel,
                nthreads,
                kinds,
            )
            all_results.extend(size_results)
    cold_widths = table_widths(all_results, COLD_COLUMNS)
    print()
    print("Cold Query Table")
    print_table(all_results, COLD_COLUMNS, cold_widths)
    if repeats > 0:
        measure_warm_queries(all_results, repeats)
        # Share column widths between cold and warm tables so headers line up.
        warm_widths = table_widths(all_results, WARM_COLUMNS)
        shared_width_by_header = {}
        for (header, _), width in zip(COLD_COLUMNS, cold_widths, strict=True):
            shared_width_by_header[header] = width
        for (header, _), width in zip(WARM_COLUMNS, warm_widths, strict=True):
            shared_width_by_header[header] = max(shared_width_by_header.get(header, 0), width)
        warm_widths = [shared_width_by_header[header] for header, _ in WARM_COLUMNS]
        print()
        print("Warm Query Table")
        print_table(all_results, WARM_COLUMNS, warm_widths)


def _format_row(cells: list[str], widths: list[int]) -> str:
    """Left-justify each cell to its column width and join them into one row string."""
    return " ".join(cell.ljust(width) for cell, width in zip(cells, widths, strict=True))


def _table_rows(results: list[dict], columns: list[tuple[str, callable]]) -> tuple[list[str], list[list[str]], list[int]]:
    """Render `results` through the per-column formatters.

    Returns (headers, formatted rows, column widths), where each width is the
    maximum of the header length and every cell length in that column.
    """
    headers = [header for header, _ in columns]
    widths = [len(header) for header in headers]
    rows = [[formatter(result) for _, formatter in columns] for result in results]
    for row in rows:
        widths = [max(width, len(cell)) for width, cell in zip(widths, row, strict=True)]
    return headers, rows, widths


def print_table(results: list[dict], columns: list[tuple[str, callable]], widths: list[int] | None = None) -> None:
    """Print a header row, a dashed separator, and one formatted row per result."""
    headers, rows, computed_widths = _table_rows(results, columns)
    # Caller-supplied widths (e.g. cold/warm shared widths) override computed ones.
    widths = computed_widths if widths is None else widths
    print(_format_row(headers, widths))
    print(_format_row(["-" * width for width in widths], widths))
    for row in rows:
        print(_format_row(row, widths))


def print_table_header(columns: list[tuple[str, callable]], widths: list[int] | None = None) -> None:
    """Print only the header and dashed separator rows.

    NOTE(review): no caller is visible in this chunk; presumably paired with
    print_table_row for incremental row-by-row output - confirm against callers.
    """
    headers = [header for header, _ in columns]
    if widths is None:
        widths = [len(header) for header in headers]
    print(_format_row(headers, widths))
    print(_format_row(["-" * width for width in widths], widths))


def print_table_row(result: dict, columns: list[tuple[str, callable]], widths: list[int] | None = None) -> None:
    """Print a single formatted result row, sizing columns locally if no widths given."""
    cells = [formatter(result) for _, formatter in columns]
    if widths is None:
        widths = [max(len(header), len(cell)) for (header, _), cell in zip(columns, cells, strict=True)]
    print(_format_row(cells, widths))


def progress_widths(
    columns: list[tuple[str, callable]],
    sizes: tuple[int, ...],
    dists: tuple[str, ...],
    kinds: tuple[str, ...],
    id_dtype: np.dtype,
) -> list[int]:
    """Estimate column widths ahead of time from worst-case cell strings,
    so rows can be printed before all results exist."""
    max_size = max(sizes)
    # Worst-case index size estimate; presumably id value plus an 8-byte
    # position per row (floor 16 bytes/row) - TODO confirm against index format.
    max_index_bytes = max_size * max(np.dtype(id_dtype).itemsize + 8, 16)
    max_cells = {
        "rows": f"{max_size:,}",
        "dist": max(dists, key=len),
        "builder": "ooc",
        "kind": max(kinds, key=len),
        "create_idx_ms": "999999.999",
        "scan_ms": "9999.999",
        "cold_ms": "9999.999",
        "warm_ms": "9999.999",
        "speedup": "9999.99x",
        "logical_bytes": f"{max_index_bytes:,}",
        "disk_bytes": f"{max_index_bytes:,}",
        "index_pct": "100.0000%",
        "index_pct_disk": "100.0000%",
    }
    widths = []
    for header, _ in columns:
        widths.append(max(len(header), len(max_cells.get(header, ""))))
    return widths


def table_widths(results: list[dict], columns: list[tuple[str, callable]]) -> list[int]:
    """Column widths for `results` under `columns` (header lengths included)."""
    _, _, widths = _table_rows(results, columns)
    return widths


if __name__ == "__main__":
    main()
diff --git a/bench/indexing/index_query_bench_tables.py b/bench/indexing/index_query_bench_tables.py
new file mode 100644
index 00000000..246ee50d
--- /dev/null
+++ b/bench/indexing/index_query_bench_tables.py
@@ -0,0 +1,458 @@
+#######################################################################
+# Copyright (c) 2019-present, Blosc Development Team
+# All rights reserved.
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import os +import re +import statistics +import tempfile +import time +from pathlib import Path + +import numpy as np +import tables + +SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) +CHUNK_LEN = 100_000 +DEFAULT_REPEATS = 3 +KINDS = ("ultralight", "light", "medium", "full") +DISTS = ("sorted", "block-shuffled", "random") +RNG_SEED = 0 +TABLE_NAME = "data" +DATA_FILTERS = tables.Filters(complevel=5, complib="blosc2:zstd", shuffle=True) + + +def dtype_token(dtype: np.dtype) -> str: + return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") + + +def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + values = np.zeros(size, dtype=dtype) + values[size // 2 :] = True + return values + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + start = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + start = max(int(info.min), -(unique_count // 2)) + positions = np.arange(size, dtype=np.int64) + values = start + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + return np.linspace(-span / 2, span / 2, num=size, endpoint=False, dtype=dtype) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def fill_ids(ids: np.ndarray, ordered_ids: np.ndarray, dist: str, rng: np.random.Generator) -> None: + size = ids.shape[0] + if dist == "sorted": + ids[:] = ordered_ids + return + + if dist == "block-shuffled": + nblocks = (size + CHUNK_LEN - 1) // CHUNK_LEN + order = rng.permutation(nblocks) + dest = 0 + for src_block in order: + src_start = int(src_block) * CHUNK_LEN + src_stop = min(src_start + 
CHUNK_LEN, size) + block_size = src_stop - src_start + ids[dest : dest + block_size] = ordered_ids[src_start:src_stop] + dest += block_size + return + + if dist == "random": + ids[:] = ordered_ids + rng.shuffle(ids) + return + + raise ValueError(f"unsupported distribution {dist!r}") + + +def make_source_data(size: int, dist: str, id_dtype: np.dtype) -> np.ndarray: + dtype = np.dtype([("id", id_dtype), ("payload", np.float32)]) + data = np.zeros(size, dtype=dtype) + fill_ids(data["id"], make_ordered_ids(size, id_dtype), dist, np.random.default_rng(RNG_SEED)) + return data + + +def _source_data_factory(size: int, dist: str, id_dtype: np.dtype): + data = None + + def get_data() -> np.ndarray: + nonlocal data + if data is None: + data = make_source_data(size, dist, id_dtype) + return data + + return get_data + + +def _ordered_ids_factory(size: int, id_dtype: np.dtype): + ordered_ids = None + + def get_ordered_ids() -> np.ndarray: + nonlocal ordered_ids + if ordered_ids is None: + ordered_ids = make_ordered_ids(size, id_dtype) + return ordered_ids + + return get_ordered_ids + + +def base_table_path(size_dir: Path, size: int, dist: str, id_dtype: np.dtype) -> Path: + return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.h5" + + +def indexed_table_path(size_dir: Path, size: int, dist: str, kind: str, id_dtype: np.dtype) -> Path: + return size_dir / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{kind}.h5" + + +def build_persistent_table(data: np.ndarray, path: Path) -> tuple[tables.File, tables.Table]: + h5 = tables.open_file(path, mode="w") + table = h5.create_table( + "/", + TABLE_NAME, + obj=data, + filters=DATA_FILTERS, + expectedrows=len(data), + chunkshape=CHUNK_LEN, + ) + h5.flush() + return h5, table + + +def benchmark_once(table: tables.Table, condition: str) -> tuple[float, int]: + start = time.perf_counter() + result = table.read_where(condition) + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def pytables_index_sizes(h5: 
tables.File) -> int: + total = 0 + if "/_i_data" not in h5: + return total + for node in h5.walk_nodes("/_i_data"): + dtype = getattr(node, "dtype", None) + shape = getattr(node, "shape", None) + if dtype is None or shape is None: + continue + nitems = 1 + for dim in shape: + nitems *= int(dim) + total += nitems * dtype.itemsize + return total + + +def _valid_index(table: tables.Table, kind: str) -> bool: + if not table.cols.id.is_indexed: + return False + return table.colindexes["id"].kind == kind + + +def _open_or_build_base_table(path: Path, get_data) -> tuple[tables.File, tables.Table]: + if path.exists(): + h5 = tables.open_file(path, mode="a") + return h5, getattr(h5.root, TABLE_NAME) + path.unlink(missing_ok=True) + return build_persistent_table(get_data(), path) + + +def _open_or_build_indexed_table(path: Path, get_data, kind: str) -> tuple[tables.File, tables.Table, float]: + if path.exists(): + h5 = tables.open_file(path, mode="a") + table = getattr(h5.root, TABLE_NAME) + if _valid_index(table, kind): + return h5, table, 0.0 + h5.close() + path.unlink() + + h5, table = build_persistent_table(get_data(), path) + build_start = time.perf_counter() + table.cols.id.create_index(kind=kind) + h5.flush() + return h5, table, time.perf_counter() - build_start + + +def _query_bounds(ordered_ids: np.ndarray, query_width: int) -> tuple[object, object]: + if ordered_ids.size == 0: + raise ValueError("benchmark arrays must not be empty") + + lo_idx = ordered_ids.size // 2 + hi_idx = min(ordered_ids.size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_ids[lo_idx].item(), ordered_ids[hi_idx].item() + + +def _literal(value: object, dtype: np.dtype) -> str: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + return "True" if bool(value) else "False" + if dtype.kind == "f": + return repr(float(value)) + if dtype.kind in {"i", "u"}: + return str(int(value)) + raise ValueError(f"unsupported dtype for literal formatting: {dtype}") + + +def _condition_expr(lo: 
object, hi: object, dtype: np.dtype) -> str: + return f"(id >= {_literal(lo, dtype)}) & (id <= {_literal(hi, dtype)})" + + +def benchmark_size(size: int, size_dir: Path, dist: str, query_width: int, id_dtype: np.dtype) -> list[dict]: + get_data = _source_data_factory(size, dist, id_dtype) + get_ordered_ids = _ordered_ids_factory(size, id_dtype) + base_h5, base_table = _open_or_build_base_table(base_table_path(size_dir, size, dist, id_dtype), get_data) + lo, hi = _query_bounds(get_ordered_ids(), query_width) + condition = _condition_expr(lo, hi, id_dtype) + base_bytes = size * np.dtype([("id", id_dtype), ("payload", np.float32)]).itemsize + compressed_base_bytes = os.path.getsize(base_h5.filename) + + scan_ms = benchmark_once(base_table, condition)[0] * 1_000 + + rows = [] + for kind in KINDS: + idx_h5, idx_table, build_time = _open_or_build_indexed_table( + indexed_table_path(size_dir, size, dist, kind, id_dtype), get_data, kind + ) + cold_time, index_len = benchmark_once(idx_table, condition) + indexed_file_bytes = os.path.getsize(idx_h5.filename) + disk_index_bytes = max(0, indexed_file_bytes - compressed_base_bytes) + logical_index_bytes = pytables_index_sizes(idx_h5) + + rows.append( + { + "size": size, + "dist": dist, + "kind": kind, + "query_rows": index_len, + "create_idx_ms": build_time * 1_000, + "scan_ms": scan_ms, + "cold_ms": cold_time * 1_000, + "cold_speedup": scan_ms / (cold_time * 1_000), + "warm_ms": None, + "warm_speedup": None, + "logical_index_bytes": logical_index_bytes, + "disk_index_bytes": disk_index_bytes, + "index_pct": logical_index_bytes / base_bytes * 100, + "index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, + "_h5": idx_h5, + "_table": idx_table, + "_condition": condition, + } + ) + + base_h5.close() + return rows + + +def measure_warm_queries(rows: list[dict], repeats: int) -> None: + if repeats <= 0: + return + for result in rows: + table = result["_table"] + condition = result["_condition"] + index_runs = 
[benchmark_once(table, condition)[0] for _ in range(repeats)] + warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None + result["warm_ms"] = warm_ms + result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms + + +def close_rows(rows: list[dict]) -> None: + for result in rows: + h5 = result.pop("_h5", None) + result.pop("_table", None) + result.pop("_condition", None) + if h5 is not None and h5.isopen: + h5.close() + + +def parse_human_size(value: str) -> int: + value = value.strip() + if not value: + raise argparse.ArgumentTypeError("size must not be empty") + + suffixes = {"k": 1_000, "m": 1_000_000, "g": 1_000_000_000} + suffix = value[-1].lower() + if suffix in suffixes: + number = value[:-1] + if not number: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") + try: + parsed = int(number) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + size = parsed * suffixes[suffix] + else: + try: + size = int(value) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + + if size <= 0: + raise argparse.ArgumentTypeError("size must be a positive integer") + return size + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Benchmark PyTables OPSI index kinds.") + parser.add_argument( + "--size", + type=parse_human_size, + help="Benchmark a single array size. Supports suffixes like 1k, 1K, 1M, 1G.", + ) + parser.add_argument( + "--query-width", + type=parse_human_size, + default=1_000, + help="Width of the range predicate. Supports suffixes like 1k, 1K, 1M, 1G. Default: 1000.", + ) + parser.add_argument( + "--repeats", + type=int, + default=DEFAULT_REPEATS, + help="Number of repeated warm-query measurements after the first cold query. Default: 3.", + ) + parser.add_argument( + "--dtype", + default="float64", + help="NumPy dtype for the indexed field. Examples: float64, float32, int16, bool. 
Default: float64.", + ) + parser.add_argument( + "--dist", + choices=(*DISTS, "all"), + default="all", + help="Data distribution to benchmark. Default: all.", + ) + parser.add_argument( + "--outdir", + type=Path, + help="Optional directory to keep and reuse generated HDF5 files.", + ) + return parser.parse_args() + + +def _format_row(cells: list[str], widths: list[int]) -> str: + return " ".join(cell.ljust(width) for cell, width in zip(cells, widths, strict=True)) + + +def print_table(rows: list[dict], columns: list[tuple[str, callable]]) -> None: + header = [name for name, _ in columns] + body = [[formatter(row) for _, formatter in columns] for row in rows] + widths = [len(name) for name in header] + for row in body: + for index, cell in enumerate(row): + widths[index] = max(widths[index], len(cell)) + + print(_format_row(header, widths)) + print(_format_row(["-" * width for width in widths], widths)) + for row in body: + print(_format_row(row, widths)) + + +def run_benchmark() -> None: + args = parse_args() + try: + id_dtype = np.dtype(args.dtype) + except TypeError as exc: + raise SystemExit(f"unsupported dtype {args.dtype!r}") from exc + if id_dtype.kind not in {"b", "i", "u", "f"}: + raise SystemExit(f"--dtype only supports bool, integer, and floating-point dtypes; got {id_dtype}") + sizes = (args.size,) if args.size is not None else SIZES + dists = DISTS if args.dist == "all" else (args.dist,) + dist_label = args.dist + repeats = max(0, args.repeats) + query_width = args.query_width + + if args.outdir is None: + with tempfile.TemporaryDirectory() as tmpdir: + _run_benchmark(Path(tmpdir), sizes, dists, dist_label, repeats, query_width, id_dtype) + else: + size_dir = args.outdir.expanduser() + size_dir.mkdir(parents=True, exist_ok=True) + _run_benchmark(size_dir, sizes, dists, dist_label, repeats, query_width, id_dtype) + + +def _run_benchmark( + size_dir: Path, + sizes: tuple[int, ...], + dists: tuple[str, ...], + dist_label: str, + repeats: int, + 
query_width: int, + id_dtype: np.dtype, +) -> None: + all_results = [] + print("Structured range-query benchmark across PyTables index kinds") + print( + f"chunks={CHUNK_LEN:,}, repeats={repeats}, dist={dist_label}, " + f"query_width={query_width:,}, dtype={id_dtype.name}, complib={DATA_FILTERS.complib}" + ) + try: + for dist in dists: + for size in sizes: + size_results = benchmark_size(size, size_dir, dist, query_width, id_dtype) + all_results.extend(size_results) + + print() + print("Cold Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ("index_pct", lambda result: f"{result['index_pct']:.4f}%"), + ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), + ], + ) + if repeats > 0: + measure_warm_queries(all_results, repeats) + print() + print("Warm Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), + ( + "speedup", + lambda result: f"{result['warm_speedup']:.2f}x" + if result["warm_speedup"] is not None + else "-", + ), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ("index_pct", 
lambda result: f"{result['index_pct']:.4f}%"), + ("index_pct_disk", lambda result: f"{result['index_pct_disk']:.4f}%"), + ], + ) + finally: + close_rows(all_results) + + +if __name__ == "__main__": + run_benchmark() diff --git a/bench/indexing/parquet_query_bench.py b/bench/indexing/parquet_query_bench.py new file mode 100644 index 00000000..1db29940 --- /dev/null +++ b/bench/indexing/parquet_query_bench.py @@ -0,0 +1,441 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import math +import os +import re +import statistics +import time +from pathlib import Path + +import numpy as np +import pyarrow as pa +import pyarrow.parquet as pq + +SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) +DEFAULT_REPEATS = 3 +DISTS = ("sorted", "block-shuffled", "permuted", "random") +LAYOUTS = ("row-group", "page-index") +RNG_SEED = 0 +DEFAULT_ROW_GROUP_SIZE = 1_250_000 +DEFAULT_MAX_ROWS_PER_PAGE = 10_000 +DEFAULT_COMPRESSION = "snappy" +DATASET_LAYOUT_VERSION = "payload-ramp-v1" + + +def dtype_token(dtype: np.dtype) -> str: + return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") + + +def payload_slice(start: int, stop: int) -> np.ndarray: + return np.arange(start, stop, dtype=np.float32) + + +def make_ordered_ids(size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype == np.dtype(np.bool_): + values = np.zeros(size, dtype=dtype) + values[size // 2 :] = True + return values + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + start = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + start = max(int(info.min), -(unique_count // 2)) + 
positions = np.arange(size, dtype=np.int64) + values = start + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + return np.linspace(-span / 2, span / 2, num=size, endpoint=False, dtype=dtype) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_slice(size: int, start: int, stop: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if stop <= start: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + values = np.zeros(stop - start, dtype=dtype) + true_start = max(start, size // 2) + if true_start < stop: + values[true_start - start :] = True + return values + + positions = np.arange(start, stop, dtype=np.int64) + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def ordered_id_at(size: int, index: int, dtype: np.dtype) -> object: + return ordered_id_slice(size, index, index + 1, dtype)[0].item() + + +def ordered_ids_from_positions(positions: np.ndarray, size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if positions.size == 0: + return np.empty(0, dtype=dtype) + + if dtype == np.dtype(np.bool_): + return (positions >= (size // 2)).astype(dtype, copy=False) + + if dtype.kind in {"i", "u"}: + info = np.iinfo(dtype) + unique_count = min(size, int(info.max) - int(info.min) + 1) + base = int(info.min) if unique_count < size and dtype.kind == "i" else 0 + if 
dtype.kind == "i" and unique_count < size: + base = max(int(info.min), -(unique_count // 2)) + values = base + (positions * unique_count) // size + return values.astype(dtype, copy=False) + + if dtype.kind == "f": + span = max(1, size) + values = positions.astype(np.float64, copy=False) - (span / 2) + return values.astype(dtype, copy=False) + + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def _block_order(size: int, block_len: int) -> np.ndarray: + nblocks = (size + block_len - 1) // block_len + return np.random.default_rng(RNG_SEED).permutation(nblocks) + + +def _fill_block_shuffled_ids( + ids: np.ndarray, size: int, start: int, stop: int, block_len: int, order: np.ndarray +) -> None: + cursor = start + out_cursor = 0 + while cursor < stop: + dest_block = cursor // block_len + block_offset = cursor % block_len + src_block = int(order[dest_block]) + src_start = src_block * block_len + block_offset + take = min(stop - cursor, block_len - block_offset, size - src_start) + ids[out_cursor : out_cursor + take] = ordered_id_slice(size, src_start, src_start + take, ids.dtype) + cursor += take + out_cursor += take + + +def _permuted_position_params(size: int) -> tuple[int, int]: + if size <= 1: + return 1, 0 + rng = np.random.default_rng(RNG_SEED) + step = int(rng.integers(1, size)) + while math.gcd(step, size) != 1: + step += 1 + if step >= size: + step = 1 + offset = int(rng.integers(0, size)) + return step, offset + + +def _fill_permuted_ids(ids: np.ndarray, size: int, start: int, stop: int, step: int, offset: int) -> None: + positions = np.arange(start, stop, dtype=np.int64) + shuffled_positions = (positions * step + offset) % size + ids[:] = ordered_ids_from_positions(shuffled_positions, size, ids.dtype) + + +def _randomized_ids(size: int, dtype: np.dtype) -> np.ndarray: + ids = make_ordered_ids(size, dtype) + np.random.default_rng(RNG_SEED).shuffle(ids) + return ids + + +def parquet_path( + outdir: Path, + size: int, + dist: str, + id_dtype: 
np.dtype, + layout: str, + row_group_size: int, + max_rows_per_page: int, + compression: str, +) -> Path: + return ( + outdir + / f"size_{size}_{dist}_{dtype_token(id_dtype)}.{DATASET_LAYOUT_VERSION}.layout-{layout}.rg-{row_group_size}.page-{max_rows_per_page}.codec-{compression}.parquet" + ) + + +def build_parquet_file( + size: int, + dist: str, + id_dtype: np.dtype, + path: Path, + *, + row_group_size: int, + max_rows_per_page: int, + compression: str, + write_page_index: bool, +) -> float: + path.parent.mkdir(parents=True, exist_ok=True) + if path.exists(): + path.unlink() + + schema = pa.schema([("id", pa.from_numpy_dtype(id_dtype)), ("payload", pa.float32())]) + block_order = _block_order(size, max_rows_per_page) if dist == "block-shuffled" else None + permuted_step, permuted_offset = _permuted_position_params(size) if dist == "permuted" else (1, 0) + random_ids = _randomized_ids(size, id_dtype) if dist == "random" else None + + start_time = time.perf_counter() + writer = pq.ParquetWriter( + path, + schema, + compression=compression, + write_statistics=True, + write_page_index=write_page_index, + max_rows_per_page=max_rows_per_page, + ) + try: + for start in range(0, size, row_group_size): + stop = min(start + row_group_size, size) + ids = np.empty(stop - start, dtype=id_dtype) + if dist == "sorted": + ids[:] = ordered_id_slice(size, start, stop, id_dtype) + elif dist == "block-shuffled": + _fill_block_shuffled_ids(ids, size, start, stop, max_rows_per_page, block_order) + elif dist == "permuted": + _fill_permuted_ids(ids, size, start, stop, permuted_step, permuted_offset) + elif dist == "random": + ids[:] = random_ids[start:stop] + else: + raise ValueError(f"unsupported distribution {dist!r}") + + payload = payload_slice(start, stop) + table = pa.table({"id": ids, "payload": payload}, schema=schema) + writer.write_table(table, row_group_size=row_group_size) + finally: + writer.close() + return time.perf_counter() - start_time + + +def _query_bounds(size: int, 
query_width: int, dtype: np.dtype) -> tuple[object, object]: + lo_idx = size // 2 + hi_idx = min(size - 1, lo_idx + max(query_width - 1, 0)) + return ordered_id_at(size, lo_idx, dtype), ordered_id_at(size, hi_idx, dtype) + + +def benchmark_scan_once(path: Path, lo, hi) -> tuple[float, int]: + start = time.perf_counter() + table = pq.read_table(path, use_threads=True) + ids = table["id"].to_numpy() + mask = (ids >= lo) & (ids <= hi) + result_len = int(np.count_nonzero(mask)) + elapsed = time.perf_counter() - start + return elapsed, result_len + + +def benchmark_filtered_once(path: Path, lo, hi) -> tuple[float, int]: + start = time.perf_counter() + table = pq.read_table(path, filters=[("id", ">=", lo), ("id", "<=", hi)], use_threads=True) + ids = table["id"].to_numpy() + result_len = int(np.count_nonzero((ids >= lo) & (ids <= hi))) + elapsed = time.perf_counter() - start + return elapsed, result_len + + +def parquet_payload_bytes(path: Path) -> int: + metadata = pq.ParquetFile(path).metadata + payload = 0 + for row_group_idx in range(metadata.num_row_groups): + row_group = metadata.row_group(row_group_idx) + for column_idx in range(row_group.num_columns): + payload += int(row_group.column(column_idx).total_compressed_size) + return payload + + +def median(values: list[float]) -> float: + return statistics.median(values) + + +def benchmark_layout( + size: int, + outdir: Path, + dist: str, + query_width: int, + id_dtype: np.dtype, + layout: str, + row_group_size: int, + max_rows_per_page: int, + compression: str, + repeats: int, +) -> dict: + path = parquet_path(outdir, size, dist, id_dtype, layout, row_group_size, max_rows_per_page, compression) + write_page_index = layout == "page-index" + create_s = build_parquet_file( + size, + dist, + id_dtype, + path, + row_group_size=row_group_size, + max_rows_per_page=max_rows_per_page, + compression=compression, + write_page_index=write_page_index, + ) + lo, hi = _query_bounds(size, query_width, id_dtype) + + scan_times = [] + 
filtered_times = [] + scan_rows = None + filtered_rows = None + for _ in range(repeats): + scan_elapsed, scan_rows = benchmark_scan_once(path, lo, hi) + filtered_elapsed, filtered_rows = benchmark_filtered_once(path, lo, hi) + scan_times.append(scan_elapsed * 1_000) + filtered_times.append(filtered_elapsed * 1_000) + + if scan_rows != filtered_rows: + raise AssertionError(f"filtered rows mismatch: scan={scan_rows}, filtered={filtered_rows}") + + file_bytes = os.path.getsize(path) + payload_bytes = parquet_payload_bytes(path) + overhead_bytes = file_bytes - payload_bytes + + return { + "size": size, + "dist": dist, + "layout": layout, + "create_ms": create_s * 1_000, + "scan_ms": median(scan_times), + "filtered_ms": median(filtered_times), + "speedup": median(scan_times) / median(filtered_times), + "file_bytes": file_bytes, + "payload_bytes": payload_bytes, + "overhead_bytes": overhead_bytes, + "payload_pct": (payload_bytes / file_bytes * 100) if file_bytes else 0.0, + "overhead_pct": (overhead_bytes / file_bytes * 100) if file_bytes else 0.0, + "query_rows": int(filtered_rows), + "path": path, + } + + +def print_results( + results: list[dict], + *, + row_group_size: int, + max_rows_per_page: int, + repeats: int, + dist: str, + query_width: int, + id_dtype: np.dtype, + compression: str, +) -> None: + print("Parquet range-query benchmark via pyarrow filtered reads") + print( + f"row_group_size={row_group_size:,}, max_rows_per_page={max_rows_per_page:,}, repeats={repeats}, " + f"dist={dist}, query_width={query_width:,}, dtype={id_dtype.name}, compression={compression}" + ) + print("Note: filtered reads are measured with pyarrow.parquet.read_table(filters=...).") + print(" Pruning behavior depends on what the current PyArrow reader can exploit.") + print() + print( + f"{'rows':<10} {'dist':<8} {'layout':<11} {'create_ms':>12} {'scan_ms':>9} {'filtered_ms':>12} " + f"{'speedup':>9} {'file_bytes':>12} {'payload':>12} {'overhead':>12} {'query_rows':>11}" + ) + print( + 
f"{'-' * 10} {'-' * 8} {'-' * 11} {'-' * 12} {'-' * 9} {'-' * 12} {'-' * 9} {'-' * 12} {'-' * 12} {'-' * 12} {'-' * 11}" + ) + for row in results: + print( + f"{row['size']:<10,} {row['dist']:<8} {row['layout']:<11} {row['create_ms']:12.3f} " + f"{row['scan_ms']:9.3f} {row['filtered_ms']:12.3f} {row['speedup']:9.2f}x " + f"{row['file_bytes']:12,} {row['payload_bytes']:12,} {row['overhead_bytes']:12,} {row['query_rows']:11,}" + ) + + +def parse_human_int(value: str) -> int: + value = value.strip().lower().replace("_", "") + multipliers = {"k": 1_000, "m": 1_000_000} + if value[-1:] in multipliers: + return int(float(value[:-1]) * multipliers[value[-1]]) + return int(value) + + +def main() -> None: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--size", default="10M", help="Number of rows, or 'all'. Default: 10M.") + parser.add_argument("--outdir", type=Path, required=True, help="Directory for generated Parquet files.") + parser.add_argument("--dist", choices=(*DISTS, "all"), default="permuted", help="Row distribution.") + parser.add_argument("--layout", choices=(*LAYOUTS, "all"), default="all", help="Parquet layout to benchmark.") + parser.add_argument("--query-width", type=parse_human_int, default=1, help="Query width. Default: 1.") + parser.add_argument("--dtype", default="float64", help="Indexed id dtype. Default: float64.") + parser.add_argument( + "--row-group-size", + type=parse_human_int, + default=DEFAULT_ROW_GROUP_SIZE, + help="Parquet row group size. Default: 1.25M.", + ) + parser.add_argument( + "--max-rows-per-page", + type=parse_human_int, + default=DEFAULT_MAX_ROWS_PER_PAGE, + help="Parquet max rows per page. Default: 10k.", + ) + parser.add_argument("--compression", default=DEFAULT_COMPRESSION, help="Parquet compression codec.") + parser.add_argument("--repeats", type=int, default=DEFAULT_REPEATS, help="Benchmark repeats. 
Default: 3.") + args = parser.parse_args() + + id_dtype = np.dtype(args.dtype) + sizes = SIZES if args.size == "all" else (parse_human_int(args.size),) + dists = DISTS if args.dist == "all" else (args.dist,) + layouts = LAYOUTS if args.layout == "all" else (args.layout,) + + results = [] + for size in sizes: + for dist in dists: + for layout in layouts: + results.append( + benchmark_layout( + size, + args.outdir, + dist, + args.query_width, + id_dtype, + layout, + args.row_group_size, + args.max_rows_per_page, + args.compression, + args.repeats, + ) + ) + + print_results( + results, + row_group_size=args.row_group_size, + max_rows_per_page=args.max_rows_per_page, + repeats=args.repeats, + dist=args.dist, + query_width=args.query_width, + id_dtype=id_dtype, + compression=args.compression, + ) + + +if __name__ == "__main__": + main() diff --git a/bench/indexing/query_cache_store_bench.py b/bench/indexing/query_cache_store_bench.py new file mode 100644 index 00000000..46f2cbaf --- /dev/null +++ b/bench/indexing/query_cache_store_bench.py @@ -0,0 +1,458 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import cProfile +import io +import pstats +import statistics +import tempfile +import time +from dataclasses import dataclass +from pathlib import Path + +import numpy as np + +import blosc2 +from blosc2 import indexing + +STRATEGIES = ("baseline", "cache_catalog", "skip_observer", "defer_vlmeta", "all") + + +@dataclass +class InsertState: + catalog: dict | None = None + store: object | None = None + + +def _make_array(path: Path, *, size: int, chunks: int, blocks: int) -> blosc2.NDArray: + return blosc2.asarray( + np.arange(size, dtype=np.int64), + urlpath=path, + mode="w", + chunks=(chunks,), + blocks=(blocks,), + ) + + +def _clear_process_caches() -> None: + indexing._hot_cache_clear() + indexing._QUERY_CACHE_STORE_HANDLES.clear() + indexing._PERSISTENT_INDEXES.clear() + + +def _coords_for_count(count: int, spacing: int, modulo: int) -> np.ndarray: + coords = (np.arange(count, dtype=np.int64) * spacing) % modulo + return np.sort(coords, kind="stable") + + +def _median(values: list[float]) -> float: + return statistics.median(values) if values else 0.0 + + +def _build_query_bits(arr: blosc2.NDArray, expr: str, coords: np.ndarray) -> tuple[str, dict, dict]: + descriptor = indexing._normalize_query_descriptor(expr, [indexing.SELF_TARGET_NAME], None) + digest = indexing._query_cache_digest(descriptor) + scope = indexing._query_cache_scope(arr) + indexing._hot_cache_put(digest, coords, scope=scope) + payload_mapping = indexing._encode_coords_payload(coords) + return digest, descriptor, payload_mapping + + +def _load_or_create_catalog(arr: blosc2.NDArray, state: InsertState | None, strategy: str) -> dict: + if strategy in {"cache_catalog", "defer_vlmeta", "all"} and state is not None and state.catalog is not None: + return state.catalog + + catalog = indexing._load_query_cache_catalog(arr) + if 
catalog is None: + catalog = indexing._default_query_cache_catalog(indexing._query_cache_payload_path(arr)) + + if strategy in {"cache_catalog", "defer_vlmeta", "all"} and state is not None: + state.catalog = catalog + return catalog + + +def _load_or_create_store(arr: blosc2.NDArray, state: InsertState | None, strategy: str): + if strategy in {"cache_catalog", "defer_vlmeta", "all"} and state is not None and state.store is not None: + return state.store + + store = indexing._open_query_cache_store(arr, create=True) + if strategy in {"cache_catalog", "defer_vlmeta", "all"} and state is not None: + state.store = store + return store + + +def _entry_nbytes(coords: np.ndarray, payload_mapping: dict, strategy: str) -> int: + if strategy in {"skip_observer", "all"}: + return len(payload_mapping["data"]) + return indexing._query_cache_entry_nbytes(coords) + + +def _insert_with_strategy( + arr: blosc2.NDArray, + expr: str, + coords: np.ndarray, + strategy: str, + state: InsertState | None = None, +) -> float: + start = time.perf_counter_ns() + digest, descriptor, payload_mapping = _build_query_bits(arr, expr, coords) + nbytes = _entry_nbytes(coords, payload_mapping, strategy) + catalog = _load_or_create_catalog(arr, state, strategy) + if digest in catalog.get("entries", {}): + end = time.perf_counter_ns() + return (end - start) / 1_000_000 + + store = _load_or_create_store(arr, state, strategy) + slot = len(store) + store.append(payload_mapping) + + catalog["entries"][digest] = { + "slot": slot, + "nbytes": nbytes, + "nrows": len(coords), + "dtype": payload_mapping["dtype"], + "query": descriptor, + } + catalog["persistent_nbytes"] = int(catalog.get("persistent_nbytes", 0)) + nbytes + catalog["next_slot"] = slot + 1 + + if strategy not in {"defer_vlmeta", "all"}: + indexing._save_query_cache_catalog(arr, catalog) + elif state is not None: + state.catalog = catalog + + end = time.perf_counter_ns() + return (end - start) / 1_000_000 + + +def _flush_state(arr: 
blosc2.NDArray, state: InsertState | None, strategy: str) -> None: + if strategy not in {"defer_vlmeta", "all"} or state is None or state.catalog is None: + return + indexing._save_query_cache_catalog(arr, state.catalog) + + +def _benchmark_fresh( + root: Path, + *, + strategy: str, + coords: np.ndarray, + size: int, + chunks: int, + blocks: int, + repeats: int, +) -> float: + runs = [] + for idx in range(repeats): + arr = _make_array(root / f"fresh-{strategy}-{idx}.b2nd", size=size, chunks=chunks, blocks=blocks) + _clear_process_caches() + state = InsertState() if strategy in {"cache_catalog", "defer_vlmeta", "all"} else None + expr = f"(id >= {idx}) & (id <= {idx})" + start = time.perf_counter_ns() + _insert_with_strategy(arr, expr, coords, strategy, state) + _flush_state(arr, state, strategy) + end = time.perf_counter_ns() + runs.append((end - start) / 1_000_000) + return _median(runs) + + +def _benchmark_steady( + root: Path, + *, + strategy: str, + coords: np.ndarray, + size: int, + chunks: int, + blocks: int, + inserts: int, +) -> float: + arr = _make_array(root / f"steady-{strategy}.b2nd", size=size, chunks=chunks, blocks=blocks) + _clear_process_caches() + state = InsertState() if strategy in {"cache_catalog", "defer_vlmeta", "all"} else None + start = time.perf_counter_ns() + for idx in range(inserts): + expr = f"(id >= {idx}) & (id <= {idx})" + _insert_with_strategy(arr, expr, coords, strategy, state) + _flush_state(arr, state, strategy) + end = time.perf_counter_ns() + return ((end - start) / 1_000_000) / max(1, inserts) + + +def _baseline_step_breakdown( + arr: blosc2.NDArray, expr: str, coords: np.ndarray +) -> dict[str, float | int]: + t0 = time.perf_counter_ns() + descriptor = indexing._normalize_query_descriptor(expr, [indexing.SELF_TARGET_NAME], None) + digest = indexing._query_cache_digest(descriptor) + t1 = time.perf_counter_ns() + + scope = indexing._query_cache_scope(arr) + indexing._hot_cache_put(digest, coords, scope=scope) + t2 = 
time.perf_counter_ns() + + payload_mapping = indexing._encode_coords_payload(coords) + nbytes = indexing._query_cache_entry_nbytes(coords) + t3 = time.perf_counter_ns() + + catalog = indexing._load_query_cache_catalog(arr) + payload_path = indexing._query_cache_payload_path(arr) + if catalog is None: + catalog = indexing._default_query_cache_catalog(payload_path) + store = indexing._open_query_cache_store(arr, create=True) + t4 = time.perf_counter_ns() + + slot = len(store) + store.append(payload_mapping) + t5 = time.perf_counter_ns() + + catalog["entries"][digest] = { + "slot": slot, + "nbytes": nbytes, + "nrows": len(coords), + "dtype": payload_mapping["dtype"], + "query": descriptor, + } + catalog["persistent_nbytes"] = int(catalog.get("persistent_nbytes", 0)) + nbytes + catalog["next_slot"] = slot + 1 + indexing._save_query_cache_catalog(arr, catalog) + t6 = time.perf_counter_ns() + + return { + "digest_ms": (t1 - t0) / 1_000_000, + "hot_ms": (t2 - t1) / 1_000_000, + "encode_nbytes_ms": (t3 - t2) / 1_000_000, + "open_store_ms": (t4 - t3) / 1_000_000, + "append_ms": (t5 - t4) / 1_000_000, + "catalog_ms": (t6 - t5) / 1_000_000, + "step_total_ms": (t6 - t0) / 1_000_000, + "entry_nbytes": nbytes, + } + + +def _profile_store(arr: blosc2.NDArray, coords: np.ndarray, repeats: int, top: int) -> str: + profiler = cProfile.Profile() + + def run(): + for idx in range(repeats): + expr = f"(id >= {idx}) & (id <= {idx})" + indexing.store_cached_coords(arr, expr, [indexing.SELF_TARGET_NAME], None, coords) + + profiler.enable() + run() + profiler.disable() + + out = io.StringIO() + stats = pstats.Stats(profiler, stream=out).sort_stats("cumulative") + stats.print_stats(top) + return out.getvalue() + + +def _active_cache_store_cparams(arr: blosc2.NDArray) -> blosc2.CParams: + coords = np.asarray([0], dtype=np.int64) + indexing.store_cached_coords(arr, "(id >= 0) & (id <= 0)", [indexing.SELF_TARGET_NAME], None, coords) + payload_path = indexing._query_cache_payload_path(arr) + 
store = blosc2.VLArray(storage=blosc2.Storage(urlpath=payload_path, mode="r")) + return store.cparams + + +def _print_strategy_table(title: str, rows: list[dict[str, object]]) -> None: + columns = [ + ("coords", lambda row: f"{row['coords_count']:,}"), + ("strategy", lambda row: str(row["strategy"])), + ("time_ms", lambda row: f"{row['time_ms']:.3f}"), + ("speedup", lambda row: f"{row['speedup']:.2f}x"), + ] + widths = [] + for name, render in columns: + width = len(name) + for row in rows: + width = max(width, len(render(row))) + widths.append(width) + + print(title) + header = " ".join(name.ljust(width) for (name, _), width in zip(columns, widths, strict=True)) + rule = " ".join("-" * width for width in widths) + print(header) + print(rule) + for row in rows: + print( + " ".join( + render(row).ljust(width) for (_, render), width in zip(columns, widths, strict=True) + ) + ) + print() + + +def _print_breakdown(rows: list[dict[str, object]]) -> None: + columns = [ + ("coords", lambda row: f"{row['coords_count']:,}"), + ("entry_nbytes", lambda row: f"{row['entry_nbytes']:,}"), + ("digest_ms", lambda row: f"{row['digest_ms']:.3f}"), + ("hot_ms", lambda row: f"{row['hot_ms']:.3f}"), + ("encode_nbytes_ms", lambda row: f"{row['encode_nbytes_ms']:.3f}"), + ("open_store_ms", lambda row: f"{row['open_store_ms']:.3f}"), + ("append_ms", lambda row: f"{row['append_ms']:.3f}"), + ("catalog_ms", lambda row: f"{row['catalog_ms']:.3f}"), + ("step_total_ms", lambda row: f"{row['step_total_ms']:.3f}"), + ] + widths = [] + for name, render in columns: + width = len(name) + for row in rows: + width = max(width, len(render(row))) + widths.append(width) + + print("Baseline Step Breakdown") + header = " ".join(name.ljust(width) for (name, _), width in zip(columns, widths, strict=True)) + rule = " ".join("-" * width for width in widths) + print(header) + print(rule) + for row in rows: + print( + " ".join( + render(row).ljust(width) for (_, render), width in zip(columns, widths, 
strict=True) + ) + ) + print() + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Microbenchmark persistent query-cache insert strategies.") + parser.add_argument("--size", type=int, default=1_000_000, help="Array size for the backing persistent array.") + parser.add_argument("--chunks", type=int, default=100_000, help="Chunk length for the backing array.") + parser.add_argument("--blocks", type=int, default=10_000, help="Block length for the backing array.") + parser.add_argument( + "--coords-counts", + type=int, + nargs="+", + default=[1, 10, 100, 1_000], + help="Coordinate counts to benchmark.", + ) + parser.add_argument("--fresh-repeats", type=int, default=20, help="Repeated fresh first-insert runs.") + parser.add_argument("--steady-inserts", type=int, default=100, help="Repeated inserts into one array.") + parser.add_argument( + "--breakdown-repeats", type=int, default=20, help="Repeated baseline step breakdown runs." + ) + parser.add_argument( + "--spacing", + type=int, + default=9973, + help="Stride used to synthesize sparse sorted coordinates.", + ) + parser.add_argument( + "--profile-repeats", + type=int, + default=200, + help="Number of repeated baseline inserts to include in the cProfile run.", + ) + parser.add_argument( + "--profile-top", + type=int, + default=25, + help="Number of cProfile entries to print.", + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + fresh_rows = [] + steady_rows = [] + breakdown_rows = [] + + with tempfile.TemporaryDirectory(prefix="blosc2-query-cache-bench-") as tmpdir: + root = Path(tmpdir) + probe = _make_array(root / "cparams-probe.b2nd", size=args.size, chunks=args.chunks, blocks=args.blocks) + _clear_process_caches() + active_cparams = _active_cache_store_cparams(probe) + _clear_process_caches() + + for coords_count in args.coords_counts: + coords = _coords_for_count(coords_count, args.spacing, args.size) + + fresh_times = {} + steady_times = {} + 
for strategy in STRATEGIES: + fresh_times[strategy] = _benchmark_fresh( + root, + strategy=strategy, + coords=coords, + size=args.size, + chunks=args.chunks, + blocks=args.blocks, + repeats=args.fresh_repeats, + ) + steady_times[strategy] = _benchmark_steady( + root, + strategy=strategy, + coords=coords, + size=args.size, + chunks=args.chunks, + blocks=args.blocks, + inserts=args.steady_inserts, + ) + + fresh_baseline = fresh_times["baseline"] + steady_baseline = steady_times["baseline"] + for strategy in STRATEGIES: + fresh_rows.append( + { + "coords_count": coords_count, + "strategy": strategy, + "time_ms": fresh_times[strategy], + "speedup": fresh_baseline / fresh_times[strategy] if fresh_times[strategy] else 0.0, + } + ) + steady_rows.append( + { + "coords_count": coords_count, + "strategy": strategy, + "time_ms": steady_times[strategy], + "speedup": steady_baseline / steady_times[strategy] if steady_times[strategy] else 0.0, + } + ) + + baseline_steps = [] + for idx in range(args.breakdown_repeats): + arr = _make_array(root / f"breakdown-{coords_count}-{idx}.b2nd", size=args.size, chunks=args.chunks, blocks=args.blocks) + _clear_process_caches() + expr = f"(id >= {idx}) & (id <= {idx})" + baseline_steps.append(_baseline_step_breakdown(arr, expr, coords)) + breakdown_rows.append( + { + "coords_count": coords_count, + "entry_nbytes": int(_median([float(row["entry_nbytes"]) for row in baseline_steps])), + "digest_ms": _median([float(row["digest_ms"]) for row in baseline_steps]), + "hot_ms": _median([float(row["hot_ms"]) for row in baseline_steps]), + "encode_nbytes_ms": _median([float(row["encode_nbytes_ms"]) for row in baseline_steps]), + "open_store_ms": _median([float(row["open_store_ms"]) for row in baseline_steps]), + "append_ms": _median([float(row["append_ms"]) for row in baseline_steps]), + "catalog_ms": _median([float(row["catalog_ms"]) for row in baseline_steps]), + "step_total_ms": _median([float(row["step_total_ms"]) for row in baseline_steps]), + } + 
) + + print( + "Persistent query-cache insert microbenchmark " + f"(codec={active_cparams.codec.name}, clevel={active_cparams.clevel}, use_dict={active_cparams.use_dict})" + ) + print() + _print_strategy_table("Fresh Insert Comparison", fresh_rows) + _print_strategy_table("Steady Insert Comparison", steady_rows) + _print_breakdown(breakdown_rows) + + profile_coords = _coords_for_count(args.coords_counts[0], args.spacing, args.size) + profile_arr = _make_array(root / "profile.b2nd", size=args.size, chunks=args.chunks, blocks=args.blocks) + _clear_process_caches() + print(f"Baseline cProfile for coords_count={args.coords_counts[0]:,} over {args.profile_repeats} inserts") + print(_profile_store(profile_arr, profile_coords, args.profile_repeats, args.profile_top)) + + +if __name__ == "__main__": + main() diff --git a/bench/ndarray/expression_index_bench.py b/bench/ndarray/expression_index_bench.py new file mode 100644 index 00000000..87d8ba8c --- /dev/null +++ b/bench/ndarray/expression_index_bench.py @@ -0,0 +1,479 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import argparse +import os +import re +import statistics +import tempfile +import time +from pathlib import Path + +import numpy as np + +import blosc2 +from blosc2 import indexing as blosc2_indexing + +SIZES = (1_000_000, 2_000_000, 5_000_000, 10_000_000) +CHUNK_LEN = 100_000 +BLOCK_LEN = 20_000 +DEFAULT_REPEATS = 3 +KINDS = ("ultralight", "light", "medium", "full") +DISTS = ("sorted", "block-shuffled", "random") +RNG_SEED = 0 +DEFAULT_OPLEVEL = 5 +EXPRESSION = "abs(x)" +FULL_QUERY_MODES = ("auto", "selective-ooc", "whole-load") + + +def dtype_token(dtype: np.dtype) -> str: + return re.sub(r"[^0-9A-Za-z]+", "_", np.dtype(dtype).name).strip("_") + + +def make_ordered_x(size: int, dtype: np.dtype) -> np.ndarray: + dtype = np.dtype(dtype) + if dtype.kind in {"i", "u"}: + return np.arange(-(size // 2), -(size // 2) + size, dtype=np.int64).astype(dtype, copy=False) + if dtype.kind == "f": + return np.linspace(-(size / 2), size / 2, num=size, endpoint=False, dtype=dtype) + raise ValueError(f"unsupported dtype for benchmark: {dtype}") + + +def fill_x(x: np.ndarray, ordered_x: np.ndarray, dist: str, rng: np.random.Generator) -> None: + size = x.shape[0] + if dist == "sorted": + x[:] = ordered_x + return + if dist == "block-shuffled": + nblocks = (size + BLOCK_LEN - 1) // BLOCK_LEN + order = rng.permutation(nblocks) + dest = 0 + for src_block in order: + src_start = int(src_block) * BLOCK_LEN + src_stop = min(src_start + BLOCK_LEN, size) + block_size = src_stop - src_start + x[dest : dest + block_size] = ordered_x[src_start:src_stop] + dest += block_size + return + if dist == "random": + x[:] = ordered_x + rng.shuffle(x) + return + raise ValueError(f"unsupported distribution {dist!r}") + + +def make_source_data(size: int, dist: str, x_dtype: np.dtype) -> np.ndarray: + dtype = np.dtype([("x", x_dtype), ("payload", 
np.float32)]) + data = np.zeros(size, dtype=dtype) + fill_x(data["x"], make_ordered_x(size, x_dtype), dist, np.random.default_rng(RNG_SEED)) + return data + + +def build_persistent_array(data: np.ndarray, path: Path) -> blosc2.NDArray: + return blosc2.asarray(data, urlpath=path, mode="w", chunks=(CHUNK_LEN,), blocks=(BLOCK_LEN,)) + + +def base_array_path(size_dir: Path, size: int, dist: str, x_dtype: np.dtype) -> Path: + return size_dir / f"expr_size_{size}_{dist}_{dtype_token(x_dtype)}.b2nd" + + +def indexed_array_path( + size_dir: Path, size: int, dist: str, kind: str, optlevel: int, x_dtype: np.dtype, in_mem: bool +) -> Path: + mode = "mem" if in_mem else "ooc" + return size_dir / f"expr_size_{size}_{dist}_{dtype_token(x_dtype)}.{kind}.opt{optlevel}.{mode}.b2nd" + + +def benchmark_scan_once(expr) -> tuple[float, int]: + start = time.perf_counter() + result = expr.compute(_use_index=False)[:] + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def benchmark_index_once(arr: blosc2.NDArray, cond) -> tuple[float, int]: + start = time.perf_counter() + result = arr[cond][:] + elapsed = time.perf_counter() - start + return elapsed, len(result) + + +def _with_full_query_mode(full_query_mode: str): + class _FullQueryModeScope: + def __enter__(self): + self.previous = os.environ.get("BLOSC2_FULL_EXACT_QUERY_MODE") + os.environ["BLOSC2_FULL_EXACT_QUERY_MODE"] = full_query_mode + + def __exit__(self, exc_type, exc, tb): + if self.previous is None: + os.environ.pop("BLOSC2_FULL_EXACT_QUERY_MODE", None) + else: + os.environ["BLOSC2_FULL_EXACT_QUERY_MODE"] = self.previous + + return _FullQueryModeScope() + + +def index_sizes(descriptor: dict) -> tuple[int, int]: + logical = 0 + disk = 0 + for level_info in descriptor["levels"].values(): + dtype = np.dtype(level_info["dtype"]) + logical += dtype.itemsize * level_info["nsegments"] + if level_info["path"]: + disk += os.path.getsize(level_info["path"]) + + for key in ("light", "reduced", "full"): + section = 
descriptor.get(key) + if section is None: + continue + for path_key in section: + if not path_key.endswith("_path"): + continue + arr = blosc2.open(section[path_key]) + logical += int(np.prod(arr.shape)) * arr.dtype.itemsize + disk += os.path.getsize(section[path_key]) + return logical, disk + + +def _source_data_factory(size: int, dist: str, x_dtype: np.dtype): + data = None + + def get_data() -> np.ndarray: + nonlocal data + if data is None: + data = make_source_data(size, dist, x_dtype) + return data + + return get_data + + +def _condition_expr(limit: object, dtype: np.dtype) -> str: + if np.dtype(dtype).kind == "f": + literal = repr(float(limit)) + else: + literal = str(int(limit)) + return f"(abs(x) >= 0) & (abs(x) < {literal})" + + +def _valid_index_descriptor(arr: blosc2.NDArray, kind: str, optlevel: int, in_mem: bool) -> dict | None: + for descriptor in arr.indexes: + if descriptor.get("version") != blosc2_indexing.INDEX_FORMAT_VERSION: + continue + target = descriptor.get("target") or {} + if ( + target.get("source") == "expression" + and target.get("expression_key") == EXPRESSION + and descriptor.get("kind") == kind + and int(descriptor.get("optlevel", -1)) == int(optlevel) + and bool(descriptor.get("ooc", False)) is (not bool(in_mem)) + and not descriptor.get("stale", False) + ): + return descriptor + return None + + +def _open_or_build_persistent_array(path: Path, get_data) -> blosc2.NDArray: + if path.exists(): + return blosc2.open(path, mode="a") + blosc2.remove_urlpath(path) + return build_persistent_array(get_data(), path) + + +def _open_or_build_indexed_array( + path: Path, get_data, kind: str, optlevel: int, in_mem: bool +) -> tuple[blosc2.NDArray, float]: + if path.exists(): + arr = blosc2.open(path, mode="a") + if _valid_index_descriptor(arr, kind, optlevel, in_mem) is not None: + return arr, 0.0 + if arr.indexes: + arr.drop_index(name=arr.indexes[0]["name"]) + blosc2.remove_urlpath(path) + + arr = build_persistent_array(get_data(), path) + 
build_start = time.perf_counter() + arr.create_expr_index(EXPRESSION, kind=kind, optlevel=optlevel, in_mem=in_mem) + return arr, time.perf_counter() - build_start + + +def benchmark_size( + size: int, + size_dir: Path, + dist: str, + query_width: int, + optlevel: int, + x_dtype: np.dtype, + in_mem: bool, + full_query_mode: str, +) -> list[dict]: + get_data = _source_data_factory(size, dist, x_dtype) + arr = _open_or_build_persistent_array(base_array_path(size_dir, size, dist, x_dtype), get_data) + condition_str = _condition_expr(query_width, x_dtype) + condition = blosc2.lazyexpr(condition_str, arr.fields) + expr = condition.where(arr) + base_bytes = size * arr.dtype.itemsize + compressed_base_bytes = os.path.getsize(arr.urlpath) + + scan_ms = benchmark_scan_once(expr)[0] * 1_000 + + rows = [] + for kind in KINDS: + idx_arr, build_time = _open_or_build_indexed_array( + indexed_array_path(size_dir, size, dist, kind, optlevel, x_dtype, in_mem), + get_data, + kind, + optlevel, + in_mem, + ) + idx_cond = blosc2.lazyexpr(condition_str, idx_arr.fields) + idx_expr = idx_cond.where(idx_arr) + with _with_full_query_mode(full_query_mode): + explanation = idx_expr.explain() + cold_time, index_len = benchmark_index_once(idx_arr, idx_cond) + logical_index_bytes, disk_index_bytes = index_sizes(idx_arr.indexes[0]) + + rows.append( + { + "size": size, + "dist": dist, + "kind": kind, + "optlevel": optlevel, + "in_mem": in_mem, + "query_rows": index_len, + "build_s": build_time, + "create_idx_ms": build_time * 1_000, + "scan_ms": scan_ms, + "cold_ms": cold_time * 1_000, + "cold_speedup": scan_ms / (cold_time * 1_000), + "warm_ms": None, + "warm_speedup": None, + "candidate_units": explanation["candidate_units"], + "total_units": explanation["total_units"], + "lookup_path": explanation.get("lookup_path"), + "full_query_mode": full_query_mode, + "logical_index_bytes": logical_index_bytes, + "disk_index_bytes": disk_index_bytes, + "index_pct": logical_index_bytes / base_bytes * 100, + 
"index_pct_disk": disk_index_bytes / compressed_base_bytes * 100, + "_arr": idx_arr, + "_cond": idx_cond, + } + ) + return rows + + +def measure_warm_queries(rows: list[dict], repeats: int) -> None: + if repeats <= 0: + return + for result in rows: + arr = result["_arr"] + cond = result["_cond"] + with _with_full_query_mode(result["full_query_mode"]): + index_runs = [benchmark_index_once(arr, cond)[0] for _ in range(repeats)] + warm_ms = statistics.median(index_runs) * 1_000 if index_runs else None + result["warm_ms"] = warm_ms + result["warm_speedup"] = None if warm_ms is None else result["scan_ms"] / warm_ms + + +def parse_human_size(value: str) -> int: + value = value.strip() + if not value: + raise argparse.ArgumentTypeError("size must not be empty") + + suffixes = {"k": 1_000, "m": 1_000_000, "g": 1_000_000_000} + suffix = value[-1].lower() + if suffix in suffixes: + number = value[:-1] + if not number: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") + try: + parsed = int(number) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + size = parsed * suffixes[suffix] + else: + try: + size = int(value) + except ValueError as exc: + raise argparse.ArgumentTypeError(f"invalid size {value!r}") from exc + + if size <= 0: + raise argparse.ArgumentTypeError("size must be a positive integer") + return size + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Benchmark python-blosc2 expression index kinds.") + parser.add_argument("--size", type=parse_human_size, help="Benchmark a single array size.") + parser.add_argument( + "--query-width", + type=parse_human_size, + default=1_000, + help="Upper bound for the `abs(x) < query_width` predicate. 
Default: 1000.", + ) + parser.add_argument("--repeats", type=int, default=DEFAULT_REPEATS, help="Warm-query repetitions.") + parser.add_argument("--outdir", type=Path, help="Directory where benchmark arrays and sidecars are kept.") + parser.add_argument("--optlevel", type=int, default=DEFAULT_OPLEVEL, help="Index optlevel. Default: 5.") + parser.add_argument( + "--dtype", + default="int64", + help="NumPy dtype for the source field. Examples: int64, int32, float64. Default: int64.", + ) + parser.add_argument( + "--dist", + choices=(*DISTS, "all"), + default="random", + help="Distribution for the source field. Use 'all' to benchmark every distribution.", + ) + parser.add_argument( + "--in-mem", + action=argparse.BooleanOptionalAction, + default=False, + help="Use the in-memory index builders. Disabled by default; pass --in-mem to force them.", + ) + parser.add_argument( + "--full-query-mode", + choices=FULL_QUERY_MODES, + default="auto", + help="How full exact queries should run during the benchmark: auto, selective-ooc, or whole-load.", + ) + return parser.parse_args() + + +def _format_row(cells: list[str], widths: list[int]) -> str: + return " ".join(cell.ljust(width) for cell, width in zip(cells, widths, strict=True)) + + +def _table_rows(results: list[dict], columns: list[tuple[str, callable]]) -> tuple[list[str], list[list[str]], list[int]]: + headers = [header for header, _ in columns] + widths = [len(header) for header in headers] + rows = [[formatter(result) for _, formatter in columns] for result in results] + for row in rows: + widths = [max(width, len(cell)) for width, cell in zip(widths, row, strict=True)] + return headers, rows, widths + + +def print_table(results: list[dict], columns: list[tuple[str, callable]]) -> None: + headers, rows, widths = _table_rows(results, columns) + print(_format_row(headers, widths)) + print(_format_row(["-" * width for width in widths], widths)) + for row in rows: + print(_format_row(row, widths)) + + +def run_benchmarks( 
+ sizes: tuple[int, ...], + dists: tuple[str, ...], + size_dir: Path, + dist_label: str, + query_width: int, + repeats: int, + optlevel: int, + x_dtype: np.dtype, + in_mem: bool, + full_query_mode: str, +) -> None: + all_results = [] + print("Expression range-query benchmark across index kinds") + print( + f"expr={EXPRESSION}, chunks={CHUNK_LEN:,}, blocks={BLOCK_LEN:,}, repeats={repeats}, dist={dist_label}, " + f"query_width={query_width:,}, optlevel={optlevel}, dtype={x_dtype.name}, in_mem={in_mem}, " + f"full_query_mode={full_query_mode}" + ) + for dist in dists: + for size in sizes: + size_results = benchmark_size(size, size_dir, dist, query_width, optlevel, x_dtype, in_mem, full_query_mode) + all_results.extend(size_results) + + print() + print("Cold Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("cold_ms", lambda result: f"{result['cold_ms']:.3f}"), + ("speedup", lambda result: f"{result['cold_speedup']:.2f}x"), + ("logical_bytes", lambda result: f"{result['logical_index_bytes']:,}"), + ("disk_bytes", lambda result: f"{result['disk_index_bytes']:,}"), + ], + ) + if repeats > 0: + measure_warm_queries(all_results, repeats) + print() + print("Warm Query Table") + print_table( + all_results, + [ + ("rows", lambda result: f"{result['size']:,}"), + ("dist", lambda result: result["dist"]), + ("builder", lambda result: "mem" if result["in_mem"] else "ooc"), + ("kind", lambda result: result["kind"]), + ("create_idx_ms", lambda result: f"{result['create_idx_ms']:.3f}"), + ("scan_ms", lambda result: f"{result['scan_ms']:.3f}"), + ("warm_ms", lambda result: f"{result['warm_ms']:.3f}" if result["warm_ms"] is not None else "-"), + ( + 
"speedup", + lambda result: f"{result['warm_speedup']:.2f}x" + if result["warm_speedup"] is not None + else "-", + ), + ], + ) + + +def main() -> None: + args = parse_args() + if args.repeats < 0: + raise SystemExit("--repeats must be >= 0") + try: + x_dtype = np.dtype(args.dtype) + except TypeError as exc: + raise SystemExit(f"unsupported dtype {args.dtype!r}") from exc + if x_dtype.kind not in {"i", "u", "f"}: + raise SystemExit(f"--dtype only supports integer and floating-point dtypes; got {x_dtype}") + sizes = (args.size,) if args.size is not None else SIZES + dists = DISTS if args.dist == "all" else (args.dist,) + + if args.outdir is None: + with tempfile.TemporaryDirectory() as tmpdir: + run_benchmarks( + sizes, + dists, + Path(tmpdir), + args.dist, + args.query_width, + args.repeats, + args.optlevel, + x_dtype, + args.in_mem, + args.full_query_mode, + ) + else: + args.outdir.mkdir(parents=True, exist_ok=True) + run_benchmarks( + sizes, + dists, + args.outdir, + args.dist, + args.query_width, + args.repeats, + args.optlevel, + x_dtype, + args.in_mem, + args.full_query_mode, + ) + + +if __name__ == "__main__": + main() diff --git a/doc/getting_started/tutorials.rst b/doc/getting_started/tutorials.rst index 563ba8ea..44bfdca6 100644 --- a/doc/getting_started/tutorials.rst +++ b/doc/getting_started/tutorials.rst @@ -17,3 +17,4 @@ Tutorials tutorials/09.ucodecs-ufilters tutorials/10.prefilters tutorials/11.vlarray + tutorials/14.indexing-arrays diff --git a/doc/getting_started/tutorials/14.indexing-arrays.ipynb b/doc/getting_started/tutorials/14.indexing-arrays.ipynb new file mode 100644 index 00000000..a89d54ca --- /dev/null +++ b/doc/getting_started/tutorials/14.indexing-arrays.ipynb @@ -0,0 +1,587 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "b5f43e5fb3d24d4e", + "metadata": {}, + "source": [ + "# Indexing Arrays\n", + "\n", + "Blosc2 can attach indexes to 1-D `NDArray` objects and to fields inside 1-D structured arrays. 
These indexes accelerate selective masks, and `full` indexes can also drive ordered access directly through `sort(order=...)`, `indices(order=...)`, and `itersorted(...)`.\n", + "\n", + "This tutorial covers:\n", + "\n", + "- how to create field and expression indexes,\n", + "- how to tell whether a mask is using an index,\n", + "- what sort of acceleration different index kinds can deliver on a selective mask,\n", + "- how index persistence works,\n", + "- when to rebuild indexes,\n", + "- and a recommended workflow for keeping append-heavy `full` indexes compact.\n" + ] + }, + { + "cell_type": "markdown", + "id": "2b6f2bb4ad3a4cb8", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "id": "8c510216bc394cf9", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:37.470903Z", + "start_time": "2026-04-09T06:27:37.098590Z" + } + }, + "source": [ + "import statistics\n", + "import time\n", + "from pathlib import Path\n", + "\n", + "import numpy as np\n", + "\n", + "import blosc2\n", + "\n", + "\n", + "def format_bytes(nbytes):\n", + " units = (\"B\", \"KiB\", \"MiB\", \"GiB\", \"TiB\")\n", + " value = float(nbytes)\n", + " for unit in units:\n", + " if value < 1024.0 or unit == units[-1]:\n", + " if unit == \"B\":\n", + " return f\"{int(value)} {unit}\"\n", + " return f\"{value:.2f} {unit}\"\n", + " value /= 1024.0\n", + " return f\"{value:.2f} {units[-1]}\"\n", + "\n", + "\n", + "def show_index_summary(label, descriptor):\n", + " print(\n", + " f\"{label}: kind={descriptor['kind']}, persistent={descriptor['persistent']}, \"\n", + " f\"ooc={descriptor['ooc']}, stale={descriptor['stale']}\"\n", + " )\n", + "\n", + "\n", + "def explain_subset(expr):\n", + " info = expr.explain()\n", + " keep = {}\n", + " for key in (\"will_use_index\", \"reason\", \"kind\", \"level\", \"lookup_path\", \"full_runs\"):\n", + " if key in info:\n", + " keep[key] = info[key]\n", + " return keep\n", + "\n", + "\n", + "def median_ms(func, repeats=5, 
warmup=1):\n", + " for _ in range(warmup):\n", + " func()\n", + " samples = []\n", + " for _ in range(repeats):\n", + " t0 = time.perf_counter()\n", + " func()\n", + " samples.append((time.perf_counter() - t0) * 1e3)\n", + " return statistics.median(samples)\n", + "\n", + "\n", + "paths = [\n", + " Path(\"indexing_tutorial_medium.b2nd\"),\n", + " Path(\"indexing_tutorial_append_full.b2nd\"),\n", + "]\n", + "for path in paths:\n", + " blosc2.remove_urlpath(path)" + ], + "outputs": [], + "execution_count": 1 + }, + { + "cell_type": "markdown", + "id": "28fbc94b52634f32", + "metadata": {}, + "source": [ + "## Index kinds and how to create them\n", + "\n", + "Blosc2 currently supports four index kinds:\n", + "\n", + "- `ultralight`: compact summaries only,\n", + "- `light`: summary levels plus lightweight per-block payloads,\n", + "- `medium`: richer payloads for exact masks,\n", + "- `full`: globally sorted payloads for exact masks and ordered reuse.\n", + "\n", + "There is one active index per target field or expression. If you create another index on the same target, it replaces the previous one. The easiest way to compare kinds is to build them on separate arrays.\n", + "\n", + "The next cell times index creation and reports the compressed storage footprint of each index relative to the compressed base array." 
+ ] + }, + { + "cell_type": "code", + "id": "d1a5a37585a045ca", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:52.097578Z", + "start_time": "2026-04-09T06:27:37.471828Z" + } + }, + "source": [ + "N_ROWS = 10_000_000\n", + "MASK_TEXT = \"(id >= -5.0) & (id < 5.0)\"\n", + "\n", + "rng = np.random.default_rng(0)\n", + "dtype = np.dtype([(\"id\", np.float64), (\"payload\", np.int32)])\n", + "ids = np.arange(-N_ROWS // 2, N_ROWS // 2, dtype=np.float64)\n", + "rng.shuffle(ids)\n", + "data = blosc2.fromiter(((id_, i) for i, id_ in enumerate(ids)), shape=(N_ROWS,), dtype=dtype)\n", + "\n", + "indexed_arrays = {}\n", + "build_rows = []\n", + "base_cbytes = data.cbytes\n", + "for kind in (\"ultralight\", \"light\", \"medium\", \"full\"):\n", + " arr = data.copy()\n", + " t0 = time.perf_counter()\n", + " arr.create_index(field=\"id\", kind=kind)\n", + " build_ms = (time.perf_counter() - t0) * 1e3\n", + " index_obj = arr.index(\"id\")\n", + " indexed_arrays[kind] = arr\n", + " build_rows.append((kind, build_ms, index_obj.cbytes, index_obj.cbytes / base_cbytes))\n", + "\n", + "print(f\"Compressed base array size: {format_bytes(base_cbytes)}\")\n", + "print(f\"{'kind':<12} {'build_ms':>10} {'index_size':>12} {'overhead':>10}\")\n", + "for kind, build_ms, index_cbytes, overhead in build_rows:\n", + " print(f\"{kind:<12} {build_ms:10.3f} {format_bytes(index_cbytes):>12} {overhead:>9.2f}x\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Compressed base array size: 30.74 MiB\n", + "kind build_ms index_size overhead\n", + "ultralight 45.528 142 B 0.00x\n", + "light 679.027 26.04 MiB 0.85x\n", + "medium 2342.959 34.99 MiB 1.14x\n", + "full 8925.948 28.44 MiB 0.93x\n" + ] + } + ], + "execution_count": 2 + }, + { + "cell_type": "markdown", + "id": "bc1cc9b122fe4052", + "metadata": {}, + "source": [ + "## Using an index for masks\n", + "\n", + "Range predicates are planned automatically when you use `where(...)`. 
If you just want the matching values, `expr[:]` is the shortest form. In the comparisons below we use `compute()` so the result stays as an `NDArray`, and we force a scan by passing `_use_index=False`." + ] + }, + { + "cell_type": "code", + "id": "f1b3aaec965b42d6", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:52.220533Z", + "start_time": "2026-04-09T06:27:52.120176Z" + } + }, + "source": [ + "medium_arr = indexed_arrays[\"medium\"]\n", + "expr = blosc2.lazyexpr(MASK_TEXT, medium_arr.fields).where(medium_arr)\n", + "\n", + "print(explain_subset(expr))\n", + "\n", + "indexed = expr.compute()\n", + "scanned = expr.compute(_use_index=False)\n", + "np.testing.assert_array_equal(indexed, scanned)\n", + "print(f\"Matched rows: {len(indexed)}\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'medium', 'level': 'exact', 'lookup_path': 'chunk-nav', 'full_runs': 0}\n", + "Matched rows: 10\n" + ] + } + ], + "execution_count": 3 + }, + { + "cell_type": "markdown", + "id": "1db4bd16a95a48dd", + "metadata": {}, + "source": [ + "### Timing the mask with and without indexes\n", + "\n", + "The next cell measures the same selective mask on all four index kinds and compares it with a forced full scan. On this exact workload, `medium` and `full` usually show the clearest benefit because they carry richer payloads for exact masks." 
+ ] + }, + { + "cell_type": "code", + "id": "c9e932b7561b4ff4", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:53.696948Z", + "start_time": "2026-04-09T06:27:52.222040Z" + } + }, + "source": [ + "timing_rows = []\n", + "expected = None\n", + "for kind, arr in indexed_arrays.items():\n", + " expr = blosc2.lazyexpr(MASK_TEXT, arr.fields).where(arr)\n", + " result = expr.compute()\n", + " if expected is None:\n", + " expected = result\n", + " else:\n", + " np.testing.assert_array_equal(result, expected)\n", + "\n", + " scan_ms = median_ms(lambda expr=expr: expr.compute(_use_index=False), repeats=3)\n", + " index_ms = median_ms(lambda expr=expr: expr.compute(), repeats=3)\n", + " timing_rows.append((kind, scan_ms, index_ms, scan_ms / index_ms))\n", + "\n", + "print(f\"Selective mask over {N_ROWS:,} rows\")\n", + "print(f\"{'kind':<12} {'scan_ms':>11} {'index_ms':>10} {'speedup':>10}\")\n", + "for kind, scan_ms, index_ms, speedup in timing_rows:\n", + " print(f\"{kind:<12} {scan_ms:11.3f} {index_ms:10.3f} {speedup:10.2f}x\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Selective mask over 10,000,000 rows\n", + "kind scan_ms index_ms speedup\n", + "ultralight 73.371 70.249 1.04x\n", + "light 65.966 1.478 44.63x\n", + "medium 65.349 1.253 52.16x\n", + "full 65.108 1.221 53.31x\n" + ] + } + ], + "execution_count": 4 + }, + { + "cell_type": "markdown", + "id": "7679d86361304087", + "metadata": {}, + "source": [ + "## `full` indexes and ordered access\n", + "\n", + "A `full` index stores a global sorted payload. This is the required index tier for direct ordered reuse. `create_csindex()` is just a convenience wrapper for `create_index(kind=\"full\")`." 
+ ] + }, + { + "cell_type": "code", + "id": "9ffcb0d8d06a4daa", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:53.735085Z", + "start_time": "2026-04-09T06:27:53.707924Z" + } + }, + "source": [ + "ordered_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int64)])\n", + "ordered_data = np.array(\n", + " [(2, 9), (1, 8), (2, 7), (1, 6), (2, 5), (1, 4), (2, 3), (1, 2)],\n", + " dtype=ordered_dtype,\n", + ")\n", + "ordered_arr = blosc2.asarray(ordered_data)\n", + "ordered_arr.create_csindex(\"id\")\n", + "\n", + "print(\"Sorted positions:\", ordered_arr.indices(order=[\"id\", \"payload\"])[:])\n", + "print(\"Sorted rows:\")\n", + "print(ordered_arr.sort(order=[\"id\", \"payload\"])[:])" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Sorted positions: [7 5 3 1 6 4 2 0]\n", + "Sorted rows:\n", + "[(1, 2) (1, 4) (1, 6) (1, 8) (2, 3) (2, 5) (2, 7) (2, 9)]\n" + ] + } + ], + "execution_count": 5 + }, + { + "cell_type": "markdown", + "id": "a77189a036524546", + "metadata": {}, + "source": [ + "## Expression indexes\n", + "\n", + "You can also index a deterministic scalar expression stream. Expression indexes are matched by normalized expression identity, so the same expression can be reused for masks and ordered access." 
+ ] + }, + { + "cell_type": "code", + "id": "7d337ce2f9fb4f32", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:53.759337Z", + "start_time": "2026-04-09T06:27:53.736407Z" + } + }, + "source": [ + "expr_dtype = np.dtype([(\"x\", np.int64), (\"payload\", np.int32)])\n", + "expr_data = np.array([(-8, 0), (5, 1), (-2, 2), (11, 3), (3, 4), (-3, 5), (2, 6), (-5, 7)], dtype=expr_dtype)\n", + "expr_arr = blosc2.asarray(expr_data)\n", + "expr_arr.create_expr_index(\"abs(x)\", kind=\"full\", name=\"abs_x\")\n", + "\n", + "ordered_expr = blosc2.lazyexpr(\"(abs(x) >= 2) & (abs(x) < 8)\", expr_arr.fields).where(expr_arr)\n", + "print(explain_subset(ordered_expr))\n", + "print(\"Expression-order positions:\", ordered_expr.indices(order=\"abs(x)\")[:])" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'in-memory', 'full_runs': 0}\n", + "Expression-order positions: [2 6 4 5 1 7]\n" + ] + } + ], + "execution_count": 6 + }, + { + "cell_type": "markdown", + "id": "0a0a629ffed5480d", + "metadata": {}, + "source": [ + "## Persistence: automatic or manual?\n", + "\n", + "Index persistence follows the base array by default:\n", + "\n", + "- for a persistent array (`urlpath=...`), `persistent=None` means the index sidecars are persisted automatically,\n", + "- for an in-memory array, the index lives only in memory,\n", + "- on a persistent array, `persistent=False` keeps the index process-local instead of writing sidecars.\n", + "\n", + "In practice, if you want an index to survive reopen, persist the array and use the default behavior." 
+ ] + }, + { + "cell_type": "code", + "id": "0be5f512928f48db", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:58.801567Z", + "start_time": "2026-04-09T06:27:53.761336Z" + } + }, + "source": [ + "persistent_arr = data.copy(urlpath=paths[0], mode=\"w\")\n", + "persistent_descriptor = persistent_arr.create_index(field=\"id\", kind=\"medium\")\n", + "show_index_summary(\"persistent medium\", persistent_descriptor)\n", + "\n", + "reopened = blosc2.open(paths[0], mode=\"a\")\n", + "print(f\"Reopened index count: {len(reopened.indexes)}\")\n", + "print(f\"Persisted sidecar path: {reopened.indexes[0]['reduced']['values_path']}\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "persistent medium: kind=medium, persistent=True, ooc=True, stale=False\n", + "Reopened index count: 1\n", + "Persisted sidecar path: indexing_tutorial_medium.__index__.id.medium.reduced.values.b2nd\n" + ] + } + ], + "execution_count": 7 + }, + { + "cell_type": "markdown", + "id": "5bfb14d1e0f945b7", + "metadata": {}, + "source": [ + "## When to rebuild an index\n", + "\n", + "Appending is special-cased and keeps compatible indexes current. General mutation and resize operations do not. After unsupported mutations, the index is marked stale and should be refreshed explicitly with `rebuild_index()`." 
+ ] + }, + { + "cell_type": "code", + "id": "11f0cd1b910b409a", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:58.852040Z", + "start_time": "2026-04-09T06:27:58.814043Z" + } + }, + "source": [ + "mutable_arr = blosc2.arange(20, dtype=np.int64)\n", + "mutable_arr.create_index(kind=\"full\")\n", + "mutable_arr[:3] = -1\n", + "\n", + "print(\"Stale after direct mutation:\", mutable_arr.indexes[0][\"stale\"])\n", + "mutable_arr.rebuild_index()\n", + "print(\"Stale after rebuild:\", mutable_arr.indexes[0][\"stale\"])" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Stale after direct mutation: True\n", + "Stale after rebuild: False\n" + ] + } + ], + "execution_count": 8 + }, + { + "cell_type": "markdown", + "id": "328a2c209dc246ba", + "metadata": {}, + "source": [ + "## Recommended workflow for append-heavy `full` indexes\n", + "\n", + "Appending to a `full` index is intentionally cheap: appended tails become sorted runs instead of forcing an immediate rewrite of the compact base sidecars.\n", + "\n", + "That means the recommended workflow is:\n", + "\n", + "1. create a persistent `full` index once,\n", + "2. append freely during ingestion,\n", + "3. let masks keep working while runs accumulate,\n", + "4. call `compact_index()` after ingestion windows or before latency-sensitive read phases.\n", + "\n", + "The next example uses a larger append-heavy array and times the same selective mask before and after compaction. The exact mask path reports whether it is using a compact lookup layout or a run-aware fallback. After compaction, `full[\"runs\"]` becomes empty again." 
+ ] + }, + { + "cell_type": "code", + "id": "2e1a47a9cf7246e6", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:59.968401Z", + "start_time": "2026-04-09T06:27:58.852830Z" + } + }, + "source": [ + "append_dtype = np.dtype([(\"id\", np.int64), (\"payload\", np.int32)])\n", + "base_rows = 200_000\n", + "append_batch = 500\n", + "num_runs = 40\n", + "\n", + "append_data = blosc2.zeros(base_rows, dtype=append_dtype)[:]\n", + "append_data[\"id\"] = blosc2.arange(base_rows, dtype=np.int64)\n", + "append_data[\"payload\"] = blosc2.arange(base_rows, dtype=np.int32)\n", + "\n", + "append_arr = blosc2.asarray(append_data, urlpath=paths[1], mode=\"w\")\n", + "append_arr.create_index(field=\"id\", kind=\"full\")\n", + "\n", + "for run in range(num_runs):\n", + " start = 300_000 + run * append_batch\n", + " batch = blosc2.zeros(append_batch, dtype=append_dtype)[:]\n", + " batch[\"id\"] = blosc2.arange(start, start + append_batch, dtype=np.int64)\n", + " batch[\"payload\"] = blosc2.arange(append_batch, dtype=np.int32)\n", + " append_arr.append(batch)\n", + "\n", + "mask_str = \"(id >= 310_000) & (id < 310_020)\"\n", + "append_expr = blosc2.lazyexpr(mask_str, append_arr.fields).where(append_arr)\n", + "before_info = explain_subset(append_expr)\n", + "before_ms = median_ms(lambda: append_expr.compute(), repeats=5)\n", + "print(\"Before compaction:\", before_info)\n", + "print(\"Pending runs:\", len(append_arr.indexes[0][\"full\"][\"runs\"]))\n", + "print(f\"Median mask time before compaction: {before_ms:.3f} ms\")\n", + "\n", + "append_arr.compact_index(\"id\")\n", + "append_expr = blosc2.lazyexpr(mask_str, append_arr.fields).where(append_arr)\n", + "after_info = explain_subset(append_expr)\n", + "after_ms = median_ms(lambda: append_expr.compute(), repeats=5)\n", + "print(\"After compaction:\", after_info)\n", + "print(\"Pending runs:\", len(append_arr.indexes[0][\"full\"][\"runs\"]))\n", + "print(f\"Median mask time after compaction: {after_ms:.3f} ms\")\n", + 
"print(f\"Speedup after compaction: {before_ms / after_ms:.2f}x\")" + ], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Before compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'run-bounded-ooc', 'full_runs': 40}\n", + "Pending runs: 40\n", + "Median mask time before compaction: 3.293 ms\n", + "After compaction: {'will_use_index': True, 'reason': 'multi-field exact indexes selected', 'kind': 'full', 'level': 'exact', 'lookup_path': 'compact-selective-ooc', 'full_runs': 0}\n", + "Pending runs: 0\n", + "Median mask time after compaction: 0.689 ms\n", + "Speedup after compaction: 4.78x\n" + ] + } + ], + "execution_count": 9 + }, + { + "cell_type": "markdown", + "id": "1eb8f667d1ff4aba", + "metadata": {}, + "source": [ + "## Practical guidance\n", + "\n", + "- Use `medium` when your main goal is faster selective masks.\n", + "- Use `full` when you also want ordered reuse through `sort(order=...)`, `indices(order=...)`, or `itersorted(...)`.\n", + "- Persist the base array if you want indexes to survive reopen automatically.\n", + "- After unsupported mutations, use `rebuild_index()`.\n", + "- For append-heavy `full` indexes, compact explicitly at convenient maintenance boundaries instead of on every append.\n", + "- Measure your own workload: compact indexes, predicate selectivity, and ordered access needs all affect which kind is best.\n" + ] + }, + { + "cell_type": "code", + "id": "9833102355db4ec0", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:27:59.991418Z", + "start_time": "2026-04-09T06:27:59.978217Z" + } + }, + "source": [ + "for path in paths:\n", + " blosc2.remove_urlpath(path)" + ], + "outputs": [], + "execution_count": 10 + }, + { + "cell_type": "code", + "id": "17489b2c3d2ac57", + "metadata": { + "ExecuteTime": { + "end_time": "2026-04-09T06:28:00.015548Z", + "start_time": "2026-04-09T06:27:59.998661Z" + } + }, + "source": 
[], + "outputs": [], + "execution_count": 10 + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.5" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/doc/reference/classes.rst b/doc/reference/classes.rst index 0ce750e3..65385d87 100644 --- a/doc/reference/classes.rst +++ b/doc/reference/classes.rst @@ -8,6 +8,7 @@ Main Classes .. autosummary:: NDArray + Index NDField LazyArray C2Array @@ -28,6 +29,7 @@ Main Classes :maxdepth: 1 ndarray + index ndfield lazyarray c2array diff --git a/doc/reference/index.rst b/doc/reference/index.rst index cf4ff7fa..78f9a276 100644 --- a/doc/reference/index.rst +++ b/doc/reference/index.rst @@ -1,14 +1,8 @@ -API Reference -============= +Index +===== -.. toctree:: - :maxdepth: 2 +.. currentmodule:: blosc2 - classes - save_load - msgpack_serialization - storage - array_operations - utilities - low_level - misc +.. autoclass:: Index + :members: + :member-order: groupwise diff --git a/doc/reference/lazyarray.rst b/doc/reference/lazyarray.rst index 1080a62b..d3a21a1e 100644 --- a/doc/reference/lazyarray.rst +++ b/doc/reference/lazyarray.rst @@ -33,10 +33,12 @@ See the `LazyExpr`_ and `LazyUDF`_ sections for more information. .. autosummary:: __getitem__ + will_use_index Methods --------------- .. automethod:: __getitem__ + .. 
automethod:: will_use_index Attributes ---------- diff --git a/doc/reference/ufuncs.rst b/doc/reference/ufuncs.rst index 3ae5397a..5fda0d87 100644 --- a/doc/reference/ufuncs.rst +++ b/doc/reference/ufuncs.rst @@ -25,8 +25,8 @@ Note: The functions ``real``, ``imag``, ``contains``, ``where`` are not technica asin asinh atan - atan2 atanh + atan2 bitwise_and bitwise_invert bitwise_left_shift @@ -53,9 +53,9 @@ Note: The functions ``real``, ``imag``, ``contains``, ``where`` are not technica less less_equal log + log10 log1p log2 - log10 logaddexp logical_and logical_not @@ -100,8 +100,8 @@ Note: The functions ``real``, ``imag``, ``contains``, ``where`` are not technica .. autofunction:: blosc2.asin .. autofunction:: blosc2.asinh .. autofunction:: blosc2.atan -.. autofunction:: blosc2.atan2 .. autofunction:: blosc2.atanh +.. autofunction:: blosc2.atan2 .. autofunction:: blosc2.bitwise_and .. autofunction:: blosc2.bitwise_invert .. autofunction:: blosc2.bitwise_left_shift @@ -128,9 +128,9 @@ Note: The functions ``real``, ``imag``, ``contains``, ``where`` are not technica .. autofunction:: blosc2.less .. autofunction:: blosc2.less_equal .. autofunction:: blosc2.log +.. autofunction:: blosc2.log10 .. autofunction:: blosc2.log1p .. autofunction:: blosc2.log2 -.. autofunction:: blosc2.log10 .. autofunction:: blosc2.logaddexp .. autofunction:: blosc2.logical_and .. autofunction:: blosc2.logical_not diff --git a/examples/ndarray/expression_index.py b/examples/ndarray/expression_index.py new file mode 100644 index 00000000..3b2992b3 --- /dev/null +++ b/examples/ndarray/expression_index.py @@ -0,0 +1,33 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import numpy as np + +import blosc2 + +# Intent: show how to build an index on a derived expression stream and +# reuse it for both filtering and direct ordered reads. + +dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) +data = np.array( + [(-8, 0), (5, 1), (-2, 2), (11, 3), (3, 4), (-3, 5), (2, 6), (-5, 7)], + dtype=dtype, +) + +arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) +arr.create_expr_index("abs(x)", kind="full", name="abs_x") + +expr = blosc2.lazyexpr("(abs(x) >= 2) & (abs(x) < 8)", arr.fields).where(arr) + +print("Expression-indexed filter result:") +print(expr[:]) + +print("\nRows ordered by abs(x) via the full expression index:") +print(arr.sort(order="abs(x)")[:]) + +print("\nFiltered rows ordered by abs(x):") +print(expr.sort(order="abs(x)")[:]) diff --git a/examples/ndarray/index_append_maintenance.py b/examples/ndarray/index_append_maintenance.py new file mode 100644 index 00000000..21076e48 --- /dev/null +++ b/examples/ndarray/index_append_maintenance.py @@ -0,0 +1,31 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import numpy as np + +import blosc2 + +# Intent: show that appending to a 1-D indexed array keeps the index sidecars +# usable, so indexed queries and sorted reads continue to work without an +# explicit rebuild after append(). 
+ +dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) +data = np.array([(2, 20), (0, 0), (3, 30), (1, 10)], dtype=dtype) +arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + +arr.create_csindex("id") + +to_append = np.array([(6, 60), (4, 40), (5, 50)], dtype=dtype) +arr.append(to_append) + +expr = blosc2.lazyexpr("(id >= 4) & (id < 7)", arr.fields).where(arr) + +print("Indexed query after append:") +print(expr[:]) + +print("\nSorted rows after append:") +print(arr.sort(order="id")[:]) diff --git a/examples/ndarray/index_sorted_iteration.py b/examples/ndarray/index_sorted_iteration.py new file mode 100644 index 00000000..5a562f84 --- /dev/null +++ b/examples/ndarray/index_sorted_iteration.py @@ -0,0 +1,40 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +import numpy as np + +import blosc2 + +# Intent: show how a full/csindex can be reused for direct sorted reads, +# sorted logical positions, and streaming ordered iteration. 
+ +dtype = np.dtype([("id", np.int64), ("score", np.float64)]) +data = np.array( + [ + (4, 0.3), + (1, 1.5), + (3, 0.8), + (1, 0.2), + (2, 3.1), + (3, 0.1), + (2, 1.2), + ], + dtype=dtype, +) + +arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) +arr.create_csindex("id") + +print("Sorted rows via full index:") +print(arr.sort(order=["id", "score"])[:]) + +print("\nSorted logical positions:") +print(arr.indices(order=["id", "score"])[:]) + +print("\nIterating in sorted order:") +for row in arr.itersorted(order=["id", "score"], batch_size=3): + print(row) diff --git a/src/blosc2/__init__.py b/src/blosc2/__init__.py index 8ba00c5e..ddee6390 100644 --- a/src/blosc2/__init__.py +++ b/src/blosc2/__init__.py @@ -557,6 +557,7 @@ def _raise(exc): can_cast, ) from .proxy import Proxy, ProxySource, ProxyNDSource, ProxyNDField, SimpleProxy, jit, as_simpleproxy +from .indexing import Index from .schunk import SChunk, open from . import linalg @@ -730,6 +731,7 @@ def _raise(exc): "DictStore", "EmbedStore", "Filter", + "Index", "LazyArray", "DSLKernel", "DSLSyntaxError", diff --git a/src/blosc2/blosc2_ext.pyx b/src/blosc2/blosc2_ext.pyx index 2ba002e5..a263c0fb 100644 --- a/src/blosc2/blosc2_ext.pyx +++ b/src/blosc2/blosc2_ext.pyx @@ -59,6 +59,7 @@ ctypedef fused T: int32_t int64_t + cdef extern from "": int printf(const char *format, ...) 
nogil @@ -1041,7 +1042,9 @@ cdef create_cparams_from_kwargs(blosc2_cparams *cparams, kwargs): cparams.clevel = kwargs.get('clevel', blosc2.cparams_dflts['clevel']) cparams.use_dict = kwargs.get('use_dict', blosc2.cparams_dflts['use_dict']) cparams.typesize = typesize = kwargs.get('typesize', blosc2.cparams_dflts['typesize']) - cparams.nthreads = kwargs.get('nthreads', blosc2.nthreads) + cparams.nthreads = kwargs.get('nthreads', 1 if blosc2.IS_WASM else blosc2.nthreads) + if blosc2.IS_WASM: + cparams.nthreads = 1 cparams.blocksize = kwargs.get('blocksize', blosc2.cparams_dflts['blocksize']) splitmode = kwargs.get('splitmode', blosc2.cparams_dflts['splitmode']) cparams.splitmode = splitmode.value @@ -1122,7 +1125,9 @@ def compress2(src, **kwargs): cdef create_dparams_from_kwargs(blosc2_dparams *dparams, kwargs, blosc2_cparams* cparams=NULL): memcpy(dparams, &BLOSC2_DPARAMS_DEFAULTS, sizeof(BLOSC2_DPARAMS_DEFAULTS)) - dparams.nthreads = kwargs.get('nthreads', blosc2.nthreads) + dparams.nthreads = kwargs.get('nthreads', 1 if blosc2.IS_WASM else blosc2.nthreads) + if blosc2.IS_WASM: + dparams.nthreads = 1 dparams.schunk = NULL dparams.postfilter = NULL dparams.postparams = NULL @@ -2831,7 +2836,9 @@ def open(urlpath, mode, offset, **kwargs): if cparams is not None: res.schunk.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) else: - res.schunk.cparams = dataclasses.replace(res.schunk.cparams, nthreads=blosc2.nthreads) + res.schunk.cparams = dataclasses.replace( + res.schunk.cparams, nthreads=(1 if blosc2.IS_WASM else blosc2.nthreads) + ) if dparams is not None: res.schunk.dparams = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) res.schunk.mode = mode @@ -2841,7 +2848,7 @@ def open(urlpath, mode, offset, **kwargs): if cparams is not None: res.cparams = cparams if isinstance(cparams, blosc2.CParams) else blosc2.CParams(**cparams) else: - res.cparams = dataclasses.replace(res.cparams, 
nthreads=blosc2.nthreads) + res.cparams = dataclasses.replace(res.cparams, nthreads=(1 if blosc2.IS_WASM else blosc2.nthreads)) if dparams is not None: res.dparams = dparams if isinstance(dparams, blosc2.DParams) else blosc2.DParams(**dparams) @@ -3235,6 +3242,66 @@ cdef class NDArray: return arr + def get_1d_span_numpy(self, arr, int64_t nchunk, int32_t start, int32_t nitems): + if self.ndim != 1: + raise ValueError("get_1d_span_numpy is only supported for 1-D arrays") + if nchunk < 0 or nchunk >= self.array.sc.nchunks: + raise IndexError("chunk index out of range") + if start < 0 or nitems < 0: + raise ValueError("start and nitems must be >= 0") + if start + nitems > self.array.chunknitems: + raise ValueError("requested span exceeds chunk size") + + cdef uint8_t *chunk = NULL + cdef c_bool needs_free + cdef int32_t chunk_nbytes + cdef int32_t chunk_cbytes + cdef int32_t block_nbytes + cdef blosc2_context *dctx = self.array.sc.dctx + cdef Py_buffer view + cdef int rc + cdef c_bool owns_dctx = False + + rc = blosc2_schunk_get_lazychunk(self.array.sc, nchunk, &chunk, &needs_free) + if rc < 0: + raise RuntimeError("Error while getting the lazy chunk") + + rc = blosc2_cbuffer_sizes(chunk, &chunk_nbytes, &chunk_cbytes, &block_nbytes) + if rc < 0: + if needs_free: + free(chunk) + raise RuntimeError("Error while getting compressed buffer sizes") + if start + nitems > chunk_nbytes // self.array.sc.typesize: + if needs_free: + free(chunk) + raise ValueError("requested span exceeds decoded chunk size") + + PyObject_GetBuffer(arr, &view, PyBUF_SIMPLE) + if view.len < nitems * self.array.sc.typesize: + PyBuffer_Release(&view) + if needs_free: + free(chunk) + raise ValueError("destination buffer is smaller than the requested decoded span") + + if dctx == NULL: + dctx = blosc2_create_dctx(BLOSC2_DPARAMS_DEFAULTS) + owns_dctx = True + if dctx == NULL: + PyBuffer_Release(&view) + if needs_free: + free(chunk) + raise RuntimeError("Could not create decompression context") + rc = 
blosc2_getitem_ctx(dctx, chunk, chunk_cbytes, start, nitems, view.buf, view.len) + if owns_dctx: + blosc2_free_ctx(dctx) + PyBuffer_Release(&view) + if needs_free: + free(chunk) + if rc < 0: + raise RuntimeError("Error while decoding the requested span") + + return arr + def get_oindex_numpy(self, arr, key): """ Orthogonal indexing. Key is a tuple of lists of integer indices. diff --git a/src/blosc2/core.py b/src/blosc2/core.py index e3d9d4ed..872e5a88 100644 --- a/src/blosc2/core.py +++ b/src/blosc2/core.py @@ -1439,7 +1439,10 @@ def get_chunksize(blocksize, l3_minimum=4 * 2**20, l3_maximum=2**26, reduc_facto if isinstance(l2_cache_size, int) and l2_cache_size > chunksize: # Apple Silicon has a large L2 cache, and memory bandwidth is high, # so we can use a larger chunksize based on L2 cache size. - chunksize = l2_cache_size * 4 + # chunksize = l2_cache_size * 4 + # But experiments show that using such a large chunksize + # can make indexes too large. Going back to using just L2. + chunksize = l2_cache_size # Ensure a minimum size if chunksize < l3_minimum: diff --git a/src/blosc2/indexing.py b/src/blosc2/indexing.py new file mode 100644 index 00000000..951dbf7f --- /dev/null +++ b/src/blosc2/indexing.py @@ -0,0 +1,5887 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### + +from __future__ import annotations + +import ast +import contextlib +import enum +import hashlib +import math +import os +import re +import sys +import tempfile +import weakref +from collections.abc import Mapping +from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict, dataclass +from pathlib import Path + +import numpy as np + +import blosc2 + +from . 
import indexing_ext + +INDEXES_VLMETA_KEY = "blosc2_indexes" +INDEX_FORMAT_VERSION = 1 +SELF_TARGET_NAME = "__self__" + +# On Windows, mmap holds file locks that prevent later writes (vlmeta updates, +# sidecar recreation during rebuild_index, etc.). Disable mmap for all index +# I/O on that platform. +_INDEX_MMAP_MODE = None if sys.platform == "win32" else "r" + +FLAG_ALL_NAN = np.uint8(1 << 0) +FLAG_HAS_NAN = np.uint8(1 << 1) + +SEGMENT_LEVELS_BY_KIND = { + "ultralight": ("chunk",), + "light": ("chunk", "block"), + "medium": ("chunk", "block", "subblock"), + "full": ("chunk", "block", "subblock"), +} + +_IN_MEMORY_INDEXES: dict[int, dict] = {} +_IN_MEMORY_INDEX_FINALIZERS: dict[int, weakref.finalize] = {} +_PERSISTENT_INDEXES: dict[tuple[str, str | int], dict] = {} +_DATA_CACHE: dict[tuple[int, str | None, str, str], np.ndarray] = {} +_SIDECAR_HANDLE_CACHE: dict[tuple[int, str | None, str, str], object] = {} + +# --------------------------------------------------------------------------- +# Query-result cache constants and global state +# --------------------------------------------------------------------------- +QUERY_CACHE_VLMETA_KEY = "_blosc2_query_cache" +QUERY_CACHE_FORMAT_VERSION = 1 +QUERY_CACHE_MAX_ENTRY_NBYTES = 65_536 # 64 KB of logical int64 positions per persistent entry +QUERY_CACHE_MAX_MEM_NBYTES = 131_072 # 128 KB for the in-process hot cache +QUERY_CACHE_MAX_PERSISTENT_NBYTES = 4 * 1024 * 1024 # 4 MB of logical int64 positions in the payload store + +# In-process hot cache: (array-scope, digest) -> decoded np.ndarray of coordinates. +_HOT_CACHE: dict[tuple[tuple[str, str | int], str], np.ndarray] = {} +# Insertion-order list for LRU eviction. +_HOT_CACHE_ORDER: list[tuple[tuple[str, str | int], str]] = [] +# Total bytes of arrays currently in the hot cache. +_HOT_CACHE_BYTES: int = 0 +# Persistent VLArray handles: resolved urlpath -> open VLArray object. 
+_QUERY_CACHE_STORE_HANDLES: dict[str, object] = {} +# Cached mmap handles for data arrays used in full-query gather: urlpath -> NDArray. +_GATHER_MMAP_HANDLES: dict[str, object] = {} +_HOT_CACHE_GLOBAL_SCOPE = ("global", 0) + +FULL_OOC_RUN_ITEMS = 2_000_000 +FULL_OOC_MERGE_BUFFER_ITEMS = 500_000 +FULL_SELECTIVE_OOC_MAX_SPANS = 128 +FULL_RUN_BOUNDED_FALLBACK_RUNS = 8 +FULL_RUN_BOUNDED_FALLBACK_ITEMS = 1_000_000 +INDEX_QUERY_MIN_CHUNKS_PER_THREAD = 8 + + +def _python_executor_threads(requested_threads: int) -> int: + # wasm32 builds do not support spawning Python worker threads reliably. + if blosc2.IS_WASM: + return 1 + return max(1, int(requested_threads)) + + +def _sanitize_token(token: str) -> str: + return re.sub(r"[^0-9A-Za-z_.-]+", "_", token) + + +def _cleanup_in_memory_store(key: int) -> None: + _IN_MEMORY_INDEXES.pop(key, None) + _IN_MEMORY_INDEX_FINALIZERS.pop(key, None) + _hot_cache_clear(scope=("memory", key)) + + +@dataclass(slots=True) +class IndexPlan: + usable: bool + reason: str + descriptor: dict | None = None + base: blosc2.NDArray | None = None + target: dict | None = None + field: str | None = None + level: str | None = None + segment_len: int | None = None + candidate_units: np.ndarray | None = None + total_units: int = 0 + selected_units: int = 0 + exact_positions: np.ndarray | None = None + bucket_masks: np.ndarray | None = None + bucket_len: int | None = None + chunk_len: int | None = None + block_len: int | None = None + lower: object | None = None + lower_inclusive: bool = True + upper: object | None = None + upper_inclusive: bool = True + candidate_chunks: int = 0 + candidate_nav_segments: int = 0 + candidate_base_spans: int = 0 + lookup_path: str | None = None + + +@dataclass(slots=True) +class SegmentPredicatePlan: + base: blosc2.NDArray + candidate_units: np.ndarray + descriptor: dict + target: dict + field: str | None + level: str + segment_len: int + + +@dataclass(slots=True) +class ExactPredicatePlan: + base: blosc2.NDArray + 
descriptor: dict + target: dict + field: str | None + lower: object | None = None + lower_inclusive: bool = True + upper: object | None = None + upper_inclusive: bool = True + + +@dataclass(slots=True) +class SortedRun: + values_path: Path + positions_path: Path + length: int + + +@dataclass(slots=True) +class TempRunTracker: + current_disk_bytes: int = 0 + peak_disk_bytes: int = 0 + total_written_bytes: int = 0 + + +@dataclass(slots=True) +class OrderedIndexPlan: + usable: bool + reason: str + descriptor: dict | None = None + base: blosc2.NDArray | None = None + field: str | None = None + order_fields: list[str | None] | None = None + total_rows: int = 0 + selected_rows: int = 0 + secondary_refinement: bool = False + + +@dataclass(frozen=True, slots=True) +class IndexComponent: + label: str + category: str + name: str + path: str | None + + +def _default_index_store() -> dict: + return {"version": INDEX_FORMAT_VERSION, "indexes": {}} + + +def _array_key(array: blosc2.NDArray) -> tuple[str, str | int]: + if _is_persistent_array(array): + return ("persistent", str(Path(array.urlpath).resolve())) + return ("memory", id(array)) + + +def _field_token(field: str | None) -> str: + return "__self__" if field is None else field + + +def _target_token(target: dict) -> str: + source = target.get("source") + if source == "field": + return _field_token(target.get("field")) + if source == "expression": + digest = hashlib.sha1(target["expression_key"].encode("utf-8")).hexdigest()[:12] + return f"__expr__{digest}" + raise ValueError(f"unsupported index target source {source!r}") + + +def _copy_nested_dict(value: dict | None) -> dict | None: + if value is None: + return None + copied = value.copy() + for key, item in list(copied.items()): + if isinstance(item, dict): + copied[key] = item.copy() + return copied + + +def _copy_descriptor(descriptor: dict) -> dict: + copied = descriptor.copy() + if descriptor.get("cparams") is not None: + copied["cparams"] = 
descriptor["cparams"].copy() + copied["levels"] = _copy_nested_dict(descriptor.get("levels")) + if descriptor.get("target") is not None: + copied["target"] = descriptor["target"].copy() + if descriptor.get("light") is not None: + copied["light"] = descriptor["light"].copy() + if descriptor.get("reduced") is not None: + copied["reduced"] = descriptor["reduced"].copy() + if descriptor.get("full") is not None: + copied["full"] = descriptor["full"].copy() + if "runs" in copied["full"]: + copied["full"]["runs"] = [run.copy() for run in copied["full"]["runs"]] + return copied + + +def _descriptor_for_token(array: blosc2.NDArray, token: str) -> dict: + descriptor = _load_store(array)["indexes"].get(token) + if descriptor is None: + raise KeyError("index not found") + return descriptor + + +def _copy_descriptor_for_token(array: blosc2.NDArray, token: str) -> dict: + return _copy_descriptor(_descriptor_for_token(array, token)) + + +def _is_persistent_array(array: blosc2.NDArray) -> bool: + return array.urlpath is not None + + +def _load_store(array: blosc2.NDArray) -> dict: + if _is_persistent_array(array): + key = _array_key(array) + cached = _PERSISTENT_INDEXES.get(key) + if cached is not None: + return cached + try: + store = array.schunk.vlmeta[INDEXES_VLMETA_KEY] + except KeyError: + store = _default_index_store() + if not isinstance(store, dict): + store = _default_index_store() + store.setdefault("version", INDEX_FORMAT_VERSION) + store.setdefault("indexes", {}) + _PERSISTENT_INDEXES[key] = store + return store + + key = id(array) + cached = _IN_MEMORY_INDEXES.get(key) + if cached is not None: + return cached + store = _default_index_store() + _IN_MEMORY_INDEXES[key] = store + _IN_MEMORY_INDEX_FINALIZERS[key] = weakref.finalize(array, _cleanup_in_memory_store, key) + return store + + +def _save_store(array: blosc2.NDArray, store: dict) -> None: + store.setdefault("version", INDEX_FORMAT_VERSION) + store.setdefault("indexes", {}) + if _is_persistent_array(array): + 
_PERSISTENT_INDEXES[_array_key(array)] = store + array.schunk.vlmeta[INDEXES_VLMETA_KEY] = store + else: + key = id(array) + _IN_MEMORY_INDEXES[key] = store + _IN_MEMORY_INDEX_FINALIZERS.setdefault(key, weakref.finalize(array, _cleanup_in_memory_store, key)) + + +# --------------------------------------------------------------------------- +# Stage 1 – Query cache: metadata helpers and container plumbing +# --------------------------------------------------------------------------- + + +def _query_cache_payload_path(array: blosc2.NDArray) -> str: + """Return the path for the persistent query-cache VLArray payload store.""" + path, root = _sanitize_sidecar_root(array.urlpath) + return str(path.with_name(f"{root}.__query_cache__.b2frame")) + + +def _query_cache_owner(array: blosc2.NDArray) -> blosc2.NDArray: + owner = getattr(array, "ndarr", None) + return owner if owner is not None else array + + +def _ensure_in_memory_array_finalizer(array: blosc2.NDArray) -> None: + if _is_persistent_array(array): + return + key = id(array) + _IN_MEMORY_INDEX_FINALIZERS.setdefault(key, weakref.finalize(array, _cleanup_in_memory_store, key)) + + +def _query_cache_scope(array: blosc2.NDArray) -> tuple[str, str | int]: + owner = _query_cache_owner(array) + _ensure_in_memory_array_finalizer(owner) + return _array_key(owner) + + +def _default_query_cache_catalog(payload_path: str) -> dict: + return { + "version": QUERY_CACHE_FORMAT_VERSION, + "payload_ref": {"kind": "urlpath", "version": 1, "urlpath": payload_path}, + "max_entry_nbytes": QUERY_CACHE_MAX_ENTRY_NBYTES, + "max_mem_nbytes": QUERY_CACHE_MAX_MEM_NBYTES, + "max_persistent_nbytes": QUERY_CACHE_MAX_PERSISTENT_NBYTES, + "persistent_nbytes": 0, + "next_slot": 0, + "entries": {}, + } + + +def _normalize_query_cache_catalog(catalog: dict) -> dict: + """Ensure the prototype query-cache catalog has the current nbytes schema.""" + if not isinstance(catalog, dict): + return _default_query_cache_catalog("") + 
catalog.setdefault("version", QUERY_CACHE_FORMAT_VERSION) + catalog.setdefault("payload_ref", {"kind": "urlpath", "version": 1, "urlpath": ""}) + catalog.setdefault("max_entry_nbytes", QUERY_CACHE_MAX_ENTRY_NBYTES) + catalog.setdefault("max_mem_nbytes", QUERY_CACHE_MAX_MEM_NBYTES) + catalog.setdefault("max_persistent_nbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) + catalog.setdefault("persistent_nbytes", 0) + catalog.setdefault("next_slot", 0) + catalog.setdefault("entries", {}) + return catalog + + +def _load_query_cache_catalog(array: blosc2.NDArray) -> dict | None: + """Read the query-cache catalog from *array* vlmeta, or return None.""" + if not _is_persistent_array(array): + return None + try: + cat = array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] + except KeyError: + return None + if not isinstance(cat, dict) or cat.get("version") != QUERY_CACHE_FORMAT_VERSION: + return None + return _normalize_query_cache_catalog(cat) + + +def _save_query_cache_catalog(array: blosc2.NDArray, catalog: dict) -> None: + """Write *catalog* back to *array* vlmeta.""" + array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] = catalog + + +def _open_query_cache_store(array: blosc2.NDArray, *, create: bool = False): + """Return an open (writable) VLArray for the persistent payload store. + + Returns ``None`` if the array is not persistent. When *create* is True the + store is created if it does not yet exist. 
+ """ + if not _is_persistent_array(array): + return None + path = _query_cache_payload_path(array) + cached = _QUERY_CACHE_STORE_HANDLES.get(path) + if cached is not None: + return cached + if Path(path).exists(): + vla = blosc2.VLArray(storage=blosc2.Storage(urlpath=path, mode="a")) + _QUERY_CACHE_STORE_HANDLES[path] = vla + return vla + if not create: + return None + vla = blosc2.VLArray(storage=blosc2.Storage(urlpath=path, mode="w")) + _QUERY_CACHE_STORE_HANDLES[path] = vla + return vla + + +def _close_query_cache_store(path: str) -> None: + """Drop a cached VLArray handle for *path*.""" + _QUERY_CACHE_STORE_HANDLES.pop(path, None) + + +# --------------------------------------------------------------------------- +# Stage 2 – Cache key normalization +# --------------------------------------------------------------------------- + + +def _normalize_query_descriptor( + expression: str, + tokens: list[str], + order: list[str] | None, +) -> dict: + """Build a canonical, order-stable query descriptor for cache keying.""" + try: + normalized_expr = ast.unparse(ast.parse(expression, mode="eval")) + except Exception: + normalized_expr = expression + return { + "version": QUERY_CACHE_FORMAT_VERSION, + "kind": "indices", + "tokens": sorted(tokens), + "expr": normalized_expr, + "order": list(order) if order is not None else None, + } + + +def _query_cache_digest(descriptor: dict) -> str: + """Return a 32-character hex digest for *descriptor*.""" + import json + + canonical = json.dumps(descriptor, sort_keys=True, separators=(",", ":")) + return hashlib.blake2b(canonical.encode(), digest_size=16).hexdigest() + + +# --------------------------------------------------------------------------- +# Stage 3 – Payload encode/decode and hot/persistent cache helpers +# --------------------------------------------------------------------------- + + +def _encode_coords_payload(coords: np.ndarray) -> dict: + """Encode a coordinate array as a compact msgpack-safe mapping.""" + if 
coords.size == 0:
+        # NOTE(review): this span was lost to markup stripping in transit
+        # (everything between `<` of the dtype string and `>` of the next
+        # return annotation). Reconstructed from the surviving decoder and
+        # catalog code; verify against the original patch.
+        dtype = np.dtype("<i8")
+        return {
+            "version": QUERY_CACHE_FORMAT_VERSION,
+            "dtype": dtype.str,
+            "data": b"",
+        }
+    return {
+        "version": QUERY_CACHE_FORMAT_VERSION,
+        "dtype": coords.dtype.str,
+        "data": coords.tobytes(),
+    }
+
+
+def _decode_coords_payload(payload: dict) -> np.ndarray:
+    """Reconstruct a coordinate array from a cached payload mapping."""
+    return np.frombuffer(payload["data"], dtype=np.dtype(payload["dtype"])).copy()
+
+
+def _hot_cache_key(
+    digest: str, scope: tuple[str, str | int] | None = None
+) -> tuple[tuple[str, str | int], str]:
+    return (_HOT_CACHE_GLOBAL_SCOPE if scope is None else scope, digest)
+
+
+def _hot_cache_get(digest: str, scope: tuple[str, str | int] | None = None) -> np.ndarray | None:
+    """Return the cached coordinate array for *digest*, or ``None``."""
+    key = _hot_cache_key(digest, scope)
+    arr = _HOT_CACHE.get(key)
+    if arr is None:
+        return None
+    # Move to most-recently-used position.
+    with contextlib.suppress(ValueError):
+        _HOT_CACHE_ORDER.remove(key)
+    _HOT_CACHE_ORDER.append(key)
+    return arr
+
+
+def _hot_cache_put(digest: str, coords: np.ndarray, scope: tuple[str, str | int] | None = None) -> None:
+    """Insert *coords* into the hot cache, evicting LRU entries if needed."""
+    global _HOT_CACHE_BYTES
+    key = _hot_cache_key(digest, scope)
+    entry_bytes = coords.nbytes
+    if entry_bytes > QUERY_CACHE_MAX_MEM_NBYTES:
+        # Single entry too large; skip.
+        return
+    # If already present, remove old accounting first.
+    if key in _HOT_CACHE:
+        _HOT_CACHE_BYTES -= _HOT_CACHE[key].nbytes
+        with contextlib.suppress(ValueError):
+            _HOT_CACHE_ORDER.remove(key)
+    # Evict LRU entries until there is room.
+ while _HOT_CACHE_ORDER and _HOT_CACHE_BYTES + entry_bytes > QUERY_CACHE_MAX_MEM_NBYTES: + oldest = _HOT_CACHE_ORDER.pop(0) + evicted = _HOT_CACHE.pop(oldest, None) + if evicted is not None: + _HOT_CACHE_BYTES -= evicted.nbytes + _HOT_CACHE[key] = coords + _HOT_CACHE_ORDER.append(key) + _HOT_CACHE_BYTES += entry_bytes + + +def _hot_cache_clear(scope: tuple[str, str | int] | None = None) -> None: + """Clear all in-process hot cache entries for *scope* (or all scopes).""" + global _HOT_CACHE_BYTES + if scope is not None: + keys = [key for key in _HOT_CACHE if key[0] == scope] + for key in keys: + _HOT_CACHE_BYTES -= _HOT_CACHE.pop(key).nbytes + _HOT_CACHE_ORDER[:] = [key for key in _HOT_CACHE_ORDER if key[0] != scope] + return + _HOT_CACHE.clear() + _HOT_CACHE_ORDER.clear() + _HOT_CACHE_BYTES = 0 + + +def _persistent_cache_lookup(array: blosc2.NDArray, digest: str) -> np.ndarray | None: + """Return coordinates from the persistent cache for *digest*, or ``None``.""" + catalog = _load_query_cache_catalog(array) + if catalog is None: + return None + entry = catalog.get("entries", {}).get(digest) + if entry is None: + return None + slot = entry["slot"] + store = _open_query_cache_store(array) + if store is None or slot >= len(store): + return None + payload = store[slot] + if not isinstance(payload, dict) or payload.get("version") != QUERY_CACHE_FORMAT_VERSION: + return None + try: + coords = _decode_coords_payload(payload) + except Exception: + return None + return coords + + +def _query_cache_entry_nbytes(coords: np.ndarray) -> int: + """Return the logical int64 position bytes used for persistent budget accounting.""" + return int(np.asarray(coords).size) * np.dtype(np.int64).itemsize + + +def _reset_persistent_query_cache_catalog(array: blosc2.NDArray, catalog: dict | None = None) -> dict: + """Drop persistent cache storage and return a fresh empty catalog preserving limits.""" + payload_path = _query_cache_payload_path(array) + _close_query_cache_store(payload_path) 
+ blosc2.remove_urlpath(payload_path) + + fresh = _default_query_cache_catalog(payload_path) + if catalog is not None: + fresh["max_entry_nbytes"] = int(catalog.get("max_entry_nbytes", QUERY_CACHE_MAX_ENTRY_NBYTES)) + fresh["max_mem_nbytes"] = int(catalog.get("max_mem_nbytes", QUERY_CACHE_MAX_MEM_NBYTES)) + fresh["max_persistent_nbytes"] = int( + catalog.get("max_persistent_nbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) + ) + _save_query_cache_catalog(array, fresh) + return fresh + + +def _persistent_cache_insert( + array: blosc2.NDArray, + digest: str, + coords: np.ndarray, + query_descriptor: dict, +) -> bool: + """Append *coords* to the persistent cache and update the catalog. + + Returns ``True`` on success, ``False`` if the entry is too large or the + persistent budget is exceeded. + """ + catalog = _load_query_cache_catalog(array) + payload_path = _query_cache_payload_path(array) + if catalog is None: + catalog = _default_query_cache_catalog(payload_path) + elif digest in catalog.get("entries", {}): + return True + + payload_mapping = _encode_coords_payload(coords) + nbytes = _query_cache_entry_nbytes(coords) + + max_entry = catalog.get("max_entry_nbytes", QUERY_CACHE_MAX_ENTRY_NBYTES) + if nbytes > max_entry: + return False + + max_persistent = catalog.get("max_persistent_nbytes", QUERY_CACHE_MAX_PERSISTENT_NBYTES) + current_persistent = int(catalog.get("persistent_nbytes", 0)) + if current_persistent + nbytes > max_persistent: + if nbytes > max_persistent: + return False + catalog = _reset_persistent_query_cache_catalog(array, catalog) + current_persistent = 0 + + store = _open_query_cache_store(array, create=True) + if store is None: + return False + + slot = len(store) + store.append(payload_mapping) + + catalog["entries"][digest] = { + "slot": slot, + "nbytes": nbytes, + "nrows": len(coords), + "dtype": payload_mapping["dtype"], + "query": query_descriptor, + } + catalog["persistent_nbytes"] = current_persistent + nbytes + catalog["next_slot"] = slot + 1 + 
_save_query_cache_catalog(array, catalog) + return True + + +# --------------------------------------------------------------------------- +# Stage 5 – Query cache invalidation +# --------------------------------------------------------------------------- + + +def _invalidate_query_cache(array: blosc2.NDArray) -> None: + """Drop the entire query cache for *array* (persistent file + hot cache).""" + scope = _query_cache_scope(array) + if not _is_persistent_array(array): + _hot_cache_clear(scope=scope) + return + payload_path = _query_cache_payload_path(array) + _close_query_cache_store(payload_path) + blosc2.remove_urlpath(payload_path) + with contextlib.suppress(KeyError, Exception): + del array.schunk.vlmeta[QUERY_CACHE_VLMETA_KEY] + _hot_cache_clear(scope=scope) + # Drop any cached mmap handle for this array's data file so a re-opened or + # extended array is not served from a stale mapping. + urlpath = getattr(array, "urlpath", None) + if urlpath is not None: + _GATHER_MMAP_HANDLES.pop(str(urlpath), None) + + +# --------------------------------------------------------------------------- +# Public helper: cached coordinate lookup (used by lazyexpr.py integration) +# --------------------------------------------------------------------------- + + +def get_cached_coords( + array: blosc2.NDArray, + expression: str, + tokens: list[str], + order: list[str] | None, +) -> np.ndarray | None: + """Return cached coordinates for *expression*/*tokens*/*order*, or ``None``.""" + owner = _query_cache_owner(array) + scope = _query_cache_scope(owner) + descriptor = _normalize_query_descriptor(expression, tokens, order) + digest = _query_cache_digest(descriptor) + # 1. In-process hot cache. + coords = _hot_cache_get(digest, scope=scope) + if coords is not None: + return coords + # 2. Persistent cache (persistent arrays only). 
+ if _is_persistent_array(owner): + coords = _persistent_cache_lookup(owner, digest) + if coords is not None: + _hot_cache_put(digest, coords, scope=scope) + return coords + return None + + +def store_cached_coords( + array: blosc2.NDArray, + expression: str, + tokens: list[str], + order: list[str] | None, + coords: np.ndarray, +) -> None: + """Store *coords* in both the hot cache and (if persistent) the payload store.""" + owner = _query_cache_owner(array) + scope = _query_cache_scope(owner) + descriptor = _normalize_query_descriptor(expression, tokens, order) + digest = _query_cache_digest(descriptor) + _hot_cache_put(digest, coords, scope=scope) + if _is_persistent_array(owner): + _persistent_cache_insert(owner, digest, coords, descriptor) + + +def _supported_index_dtype(dtype: np.dtype) -> bool: + return np.dtype(dtype).kind in {"b", "i", "u", "f", "m", "M"} + + +def _field_target_descriptor(field: str | None) -> dict: + return {"source": "field", "field": field} + + +def _expression_target_descriptor(expression: str, expression_key: str, dependencies: list[str]) -> dict: + return { + "source": "expression", + "expression": expression, + "expression_key": expression_key, + "dependencies": list(dependencies), + } + + +def _target_field(target: dict) -> str | None: + return target.get("field") if target.get("source") == "field" else None + + +def _field_dtype(array: blosc2.NDArray, field: str | None) -> np.dtype: + if field is None: + return np.dtype(array.dtype) + if array.dtype.fields is None: + raise TypeError("field indexes require a structured dtype") + if field not in array.dtype.fields: + raise ValueError(f"field {field!r} is not present in the dtype") + return np.dtype(array.dtype.fields[field][0]) + + +def _validate_index_target(array: blosc2.NDArray, field: str | None) -> np.dtype: + if not isinstance(array, blosc2.NDArray): + raise TypeError("indexes are only supported on NDArray") + if array.ndim != 1: + raise ValueError("indexes are only supported on 
1-D NDArray objects") + dtype = _field_dtype(array, field) + if not _supported_index_dtype(dtype): + raise TypeError(f"dtype {dtype} is not supported by the current index engine") + return dtype + + +class _OperandCanonicalizer(ast.NodeTransformer): + def __init__(self, operands: dict): + self.operands = operands + self.base: blosc2.NDArray | None = None + self.dependencies: list[str] = [] + self.valid = True + + def visit_Name(self, node: ast.Name) -> ast.AST: + operand = self.operands.get(node.id) + if operand is None: + return node + target = _operand_target(operand) + if target is None: + self.valid = False + return node + base, field = target + if self.base is None: + self.base = base + elif self.base is not base: + self.valid = False + return node + canonical = SELF_TARGET_NAME if field is None else field + self.dependencies.append(canonical) + return ast.copy_location(ast.Name(id=canonical, ctx=node.ctx), node) + + +def _normalize_expression_node( + node: ast.AST, operands: dict +) -> tuple[blosc2.NDArray, str, list[str]] | None: + canonicalizer = _OperandCanonicalizer(operands) + normalized = canonicalizer.visit( + ast.fix_missing_locations(ast.parse(ast.unparse(node), mode="eval")).body + ) + if not canonicalizer.valid or canonicalizer.base is None or not canonicalizer.dependencies: + return None + dependencies = list(dict.fromkeys(canonicalizer.dependencies)) + return canonicalizer.base, ast.unparse(normalized), dependencies + + +def _normalize_expression_target(expression: str, operands: dict) -> tuple[blosc2.NDArray, dict, np.dtype]: + try: + tree = ast.parse(expression, mode="eval") + except SyntaxError as exc: + raise ValueError("expression is not valid Python syntax") from exc + + normalized = _normalize_expression_node(tree.body, operands) + if normalized is None: + raise ValueError("expression indexes require operands from a single 1-D NDArray target") + base, expression_key, dependencies = normalized + if base.ndim != 1: + raise 
ValueError("expression indexes are only supported on 1-D NDArray objects") + target = _expression_target_descriptor(expression, expression_key, dependencies) + sample_stop = min(int(base.shape[0]), max(1, int(base.blocks[0]) if base.blocks else 1)) + sample = _slice_values_for_target(base, target, 0, sample_stop) + dtype = np.dtype(sample.dtype) + if sample.ndim != 1: + raise ValueError("expression indexes require expressions returning a 1-D scalar stream") + if not _supported_index_dtype(dtype): + raise TypeError(f"dtype {dtype} is not supported by the current index engine") + return base, target, dtype + + +def _sanitize_sidecar_root(urlpath: str | Path) -> tuple[Path, str]: + path = Path(urlpath) + suffix = "".join(path.suffixes) + root = path.name[: -len(suffix)] if suffix else path.name + return path, root + + +def _sidecar_path(array: blosc2.NDArray, token: str, kind: str, name: str) -> str: + path, root = _sanitize_sidecar_root(array.urlpath) + return str(path.with_name(f"{root}.__index__.{_sanitize_token(token)}.{kind}.{name}.b2nd")) + + +def _segment_len(array: blosc2.NDArray, level: str) -> int: + if level == "chunk": + return int(array.chunks[0]) + if level == "block": + return int(array.blocks[0]) + if level == "subblock": + return max(1, int(array.blocks[0]) // 8) + raise ValueError(f"unknown level {level!r}") + + +def _data_cache_key(array: blosc2.NDArray, token: str, category: str, name: str): + return (_array_key(array), token, category, name) + + +def _clear_cached_data(array: blosc2.NDArray, token: str) -> None: + prefix = (_array_key(array), token) + keys = [key for key in _DATA_CACHE if key[:2] == prefix] + for key in keys: + _DATA_CACHE.pop(key, None) + handle_keys = [key for key in _SIDECAR_HANDLE_CACHE if key[:2] == prefix] + for key in handle_keys: + _SIDECAR_HANDLE_CACHE.pop(key, None) + + +def _sidecar_handle_cache_key(array: blosc2.NDArray, token: str, category: str, name: str): + return (_array_key(array), token, category, name) + + +def 
_open_sidecar_handle(array: blosc2.NDArray, token: str, category: str, name: str, path: str | None): + cache_key = _sidecar_handle_cache_key(array, token, category, name) + cached = _SIDECAR_HANDLE_CACHE.get(cache_key) + if cached is not None: + return cached + if path is None: + raise RuntimeError("sidecar handle path is not available") + handle = blosc2.open(path) + _SIDECAR_HANDLE_CACHE[cache_key] = handle + return handle + + +def _operands_for_dependencies(values: np.ndarray, dependencies: list[str]) -> dict[str, np.ndarray]: + operands = {} + for dependency in dependencies: + if dependency == SELF_TARGET_NAME: + operands[dependency] = values + else: + operands[dependency] = values[dependency] + return operands + + +def _values_from_numpy_target(values: np.ndarray, target: dict) -> np.ndarray: + if target["source"] == "field": + field = target.get("field") + return values if field is None else values[field] + if target["source"] == "expression": + from .lazyexpr import ne_evaluate + + result = ne_evaluate( + target["expression_key"], _operands_for_dependencies(values, target["dependencies"]) + ) + return np.asarray(result) + raise ValueError(f"unsupported index target source {target['source']!r}") + + +def _values_for_target(array: blosc2.NDArray, target: dict) -> np.ndarray: + return _slice_values_for_target(array, target, 0, int(array.shape[0])) + + +def _slice_values_for_target(array: blosc2.NDArray, target: dict, start: int, stop: int) -> np.ndarray: + return _values_from_numpy_target(array[start:stop], target) + + +def _summary_dtype(dtype: np.dtype) -> np.dtype: + return np.dtype([("min", dtype), ("max", dtype), ("flags", np.uint8)]) + + +def _boundary_dtype(dtype: np.dtype) -> np.dtype: + return np.dtype([("start", dtype), ("end", dtype)]) + + +def _segment_summary(segment: np.ndarray, dtype: np.dtype): + flags = np.uint8(0) + if dtype.kind == "f": + valid = ~np.isnan(segment) + if not np.all(valid): + flags |= FLAG_HAS_NAN + if not np.any(valid): + 
flags |= FLAG_ALL_NAN + zero = np.zeros((), dtype=dtype)[()] + return zero, zero, flags + segment = segment[valid] + return segment.min(), segment.max(), flags + + +def _compute_segment_summaries(values: np.ndarray, dtype: np.dtype, segment_len: int) -> np.ndarray: + nsegments = math.ceil(values.shape[0] / segment_len) + summary_dtype = _summary_dtype(dtype) + summaries = np.empty(nsegments, dtype=summary_dtype) + + for idx in range(nsegments): + start = idx * segment_len + stop = min(start + segment_len, values.shape[0]) + segment = values[start:stop] + summaries[idx] = _segment_summary(segment, dtype) + return summaries + + +def _compute_sorted_boundaries(values: np.ndarray, dtype: np.dtype, segment_len: int) -> np.ndarray: + nsegments = math.ceil(values.shape[0] / segment_len) + boundaries = np.empty(nsegments, dtype=_boundary_dtype(dtype)) + + for idx in range(nsegments): + start = idx * segment_len + stop = min(start + segment_len, values.shape[0]) + segment = values[start:stop] + boundaries[idx] = (segment[0], segment[-1]) + return boundaries + + +def _compute_sorted_boundaries_from_sidecar( + path: str, dtype: np.dtype, length: int, segment_len: int +) -> np.ndarray: + nsegments = math.ceil(length / segment_len) + boundaries = np.empty(nsegments, dtype=_boundary_dtype(dtype)) + sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + start_value = np.empty(1, dtype=dtype) + end_value = np.empty(1, dtype=dtype) + for idx in range(nsegments): + start = idx * segment_len + stop = min(start + segment_len, length) + _read_ndarray_linear_span(sidecar, start, start_value) + _read_ndarray_linear_span(sidecar, stop - 1, end_value) + boundaries[idx] = (start_value[0], end_value[0]) + return boundaries + + +def _store_array_sidecar( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + name: str, + data: np.ndarray, + persistent: bool, + *, + chunks: tuple[int, ...] | None = None, + blocks: tuple[int, ...] 
| None = None, + cparams: dict | None = None, +) -> dict: + cache_key = _data_cache_key(array, token, category, name) + if persistent: + path = _sidecar_path(array, token, kind, f"{category}.{name}") + blosc2.remove_urlpath(path) + kwargs = {"urlpath": path, "mode": "w"} + if chunks is not None: + kwargs["chunks"] = chunks + if blocks is not None: + kwargs["blocks"] = blocks + if cparams is not None: + kwargs["cparams"] = cparams + blosc2.asarray(data, **kwargs) + if isinstance(data, np.memmap): + _DATA_CACHE.pop(cache_key, None) + else: + _DATA_CACHE[cache_key] = data + else: + path = None + _DATA_CACHE[cache_key] = np.array(data, copy=True) if isinstance(data, np.memmap) else data + return {"path": path, "dtype": data.dtype.descr if data.dtype.fields else data.dtype.str} + + +def _create_persistent_sidecar_handle( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + name: str, + length: int, + dtype: np.dtype, + *, + chunks: tuple[int, ...] | None = None, + blocks: tuple[int, ...] 
| None = None, + cparams: dict | None = None, +) -> tuple[blosc2.NDArray | None, dict]: + path = _sidecar_path(array, token, kind, f"{category}.{name}") + blosc2.remove_urlpath(path) + kwargs = {"urlpath": path, "mode": "w"} + if chunks is not None: + kwargs["chunks"] = chunks + if blocks is not None: + kwargs["blocks"] = blocks + if cparams is not None: + kwargs["cparams"] = cparams + if length == 0: + blosc2.asarray(np.empty(0, dtype=dtype), **kwargs) + return None, {"path": path, "dtype": dtype.descr if dtype.fields else dtype.str} + handle = blosc2.empty((length,), dtype=dtype, **kwargs) + return handle, {"path": path, "dtype": dtype.descr if dtype.fields else dtype.str} + + +def _normalize_index_cparams(cparams) -> blosc2.CParams | None: + if cparams is None: + return None + if isinstance(cparams, blosc2.CParams): + return cparams + return blosc2.CParams(**cparams) + + +def _plain_index_cparams(cparams: dict | blosc2.CParams | None) -> dict | None: + if cparams is None: + return None + + def _plain_value(value): + if isinstance(value, enum.Enum): + return value.value + if isinstance(value, dict): + return {key: _plain_value(item) for key, item in value.items()} + if isinstance(value, list | tuple): + return type(value)(_plain_value(item) for item in value) + return value + + if isinstance(cparams, blosc2.CParams): + cparams = asdict(cparams) + else: + cparams = cparams.copy() + return {key: _plain_value(value) for key, value in cparams.items()} + + +def _load_array_sidecar( + array: blosc2.NDArray, token: str, category: str, name: str, path: str | None +) -> np.ndarray: + cache_key = _data_cache_key(array, token, category, name) + cached = _DATA_CACHE.get(cache_key) + if cached is not None: + return cached + if path is None: + raise RuntimeError("in-memory index metadata is missing from the current process") + data = blosc2.open(path)[:] + _DATA_CACHE[cache_key] = data + return data + + +def _build_levels_descriptor( + array: blosc2.NDArray, + target: dict, + 
token: str, + kind: str, + dtype: np.dtype, + values: np.ndarray, + persistent: bool, + cparams: dict | None = None, +) -> dict: + levels = {} + for level in SEGMENT_LEVELS_BY_KIND[kind]: + segment_len = _segment_len(array, level) + summaries = _compute_segment_summaries(values, dtype, segment_len) + sidecar = _store_array_sidecar( + array, token, kind, "summary", level, summaries, persistent, cparams=cparams + ) + levels[level] = { + "segment_len": segment_len, + "nsegments": len(summaries), + "path": sidecar["path"], + "dtype": sidecar["dtype"], + } + return levels + + +def _build_levels_descriptor_ooc( + array: blosc2.NDArray, + target: dict, + token: str, + kind: str, + dtype: np.dtype, + persistent: bool, + cparams: dict | None = None, +) -> dict: + levels = {} + size = int(array.shape[0]) + summary_dtype = _summary_dtype(dtype) + for level in SEGMENT_LEVELS_BY_KIND[kind]: + segment_len = _segment_len(array, level) + nsegments = math.ceil(size / segment_len) + summaries = np.empty(nsegments, dtype=summary_dtype) + for idx in range(nsegments): + start = idx * segment_len + stop = min(start + segment_len, size) + summaries[idx] = _segment_summary(_slice_values_for_target(array, target, start, stop), dtype) + sidecar = _store_array_sidecar( + array, token, kind, "summary", level, summaries, persistent, cparams=cparams + ) + levels[level] = { + "segment_len": segment_len, + "nsegments": len(summaries), + "path": sidecar["path"], + "dtype": sidecar["dtype"], + } + return levels + + +def _sidecar_storage_geometry( + path: str | None, fallback_chunk_len: int, fallback_block_len: int +) -> tuple[int, int]: + if path is None: + return fallback_chunk_len, fallback_block_len + sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + return int(sidecar.chunks[0]), int(sidecar.blocks[0]) + + +def _rebuild_full_navigation_sidecars( + array: blosc2.NDArray, + token: str, + kind: str, + full: dict, + sorted_values: np.ndarray, + persistent: bool, + cparams: dict | None = 
None, +) -> None: + chunk_len, block_len = _sidecar_storage_geometry( + full.get("values_path"), int(array.chunks[0]), int(array.blocks[0]) + ) + l1 = _compute_sorted_boundaries(sorted_values, np.dtype(sorted_values.dtype), chunk_len) + l2 = _compute_sorted_boundaries(sorted_values, np.dtype(sorted_values.dtype), block_len) + l1_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l1", l1, persistent, cparams=cparams) + l2_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l2", l2, persistent, cparams=cparams) + full["l1_path"] = l1_sidecar["path"] + full["l2_path"] = l2_sidecar["path"] + full["sidecar_chunk_len"] = int(chunk_len) + full["sidecar_block_len"] = int(block_len) + + +def _rebuild_full_navigation_sidecars_from_path( + array: blosc2.NDArray, + token: str, + kind: str, + full: dict, + values_path: str, + dtype: np.dtype, + length: int, + persistent: bool, + cparams: dict | None = None, +) -> None: + chunk_len, block_len = _sidecar_storage_geometry(values_path, int(array.chunks[0]), int(array.blocks[0])) + l1 = _compute_sorted_boundaries_from_sidecar(values_path, dtype, length, chunk_len) + l2 = _compute_sorted_boundaries_from_sidecar(values_path, dtype, length, block_len) + l1_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l1", l1, persistent, cparams=cparams) + l2_sidecar = _store_array_sidecar(array, token, kind, "full_nav", "l2", l2, persistent, cparams=cparams) + full["l1_path"] = l1_sidecar["path"] + full["l2_path"] = l2_sidecar["path"] + full["sidecar_chunk_len"] = int(chunk_len) + full["sidecar_block_len"] = int(block_len) + full["l1_dtype"] = l1_sidecar["dtype"] + full["l2_dtype"] = l2_sidecar["dtype"] + + +def _stream_copy_sidecar_array( + source_path: Path | str, + dest_path: Path | str, + length: int, + dtype: np.dtype, + chunks: tuple[int, ...], + blocks: tuple[int, ...], + cparams: dict | None = None, +) -> None: + source = blosc2.open(str(source_path), mmap_mode=_INDEX_MMAP_MODE) + 
blosc2.remove_urlpath(str(dest_path)) + kwargs = {"chunks": chunks, "blocks": blocks, "urlpath": str(dest_path), "mode": "w"} + if cparams is not None: + kwargs["cparams"] = cparams + dest = blosc2.empty((length,), dtype=dtype, **kwargs) + chunk_len = int(dest.chunks[0]) + for start in range(0, length, chunk_len): + stop = min(start + chunk_len, length) + span = np.empty(stop - start, dtype=dtype) + _read_ndarray_linear_span(source, start, span) + dest[start:stop] = span + del source, dest + + +def _stream_copy_temp_run_to_full_sidecars( + array: blosc2.NDArray, + token: str, + kind: str, + full: dict, + run: SortedRun, + dtype: np.dtype, + persistent: bool, + tracker: TempRunTracker | None = None, + cparams: dict | None = None, +) -> None: + if not persistent: + raise ValueError("temp-run streaming only supports persistent runs") + + values_path = _sidecar_path(array, token, kind, "full.values") + positions_path = _sidecar_path(array, token, kind, "full.positions") + _remove_sidecar_path(values_path) + _remove_sidecar_path(positions_path) + _stream_copy_sidecar_array( + run.values_path, + values_path, + run.length, + dtype, + (int(array.chunks[0]),), + (int(array.blocks[0]),), + cparams, + ) + _stream_copy_sidecar_array( + run.positions_path, + positions_path, + run.length, + np.dtype(np.int64), + (int(array.chunks[0]),), + (int(array.blocks[0]),), + cparams, + ) + _tracker_register_delete(tracker, run.values_path, run.positions_path) + run.values_path.unlink(missing_ok=True) + run.positions_path.unlink(missing_ok=True) + full["values_path"] = values_path + full["positions_path"] = positions_path + full["runs"] = [] + full["next_run_id"] = 0 + _rebuild_full_navigation_sidecars_from_path( + array, token, kind, full, values_path, dtype, run.length, persistent, cparams + ) + + +def _build_full_descriptor( + array: blosc2.NDArray, + token: str, + kind: str, + values: np.ndarray, + persistent: bool, + cparams: dict | None = None, +) -> dict: + order = 
np.argsort(values, kind="stable") + positions = order.astype(np.int64, copy=False) + sorted_values = values[order] + values_sidecar = _store_array_sidecar( + array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams + ) + positions_sidecar = _store_array_sidecar( + array, token, kind, "full", "positions", positions, persistent, cparams=cparams + ) + full = { + "values_path": values_sidecar["path"], + "positions_path": positions_sidecar["path"], + "runs": [], + "next_run_id": 0, + } + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent, cparams) + return full + + +def _position_dtype(max_value: int) -> np.dtype: + if max_value <= np.iinfo(np.uint8).max: + return np.dtype(np.uint8) + if max_value <= np.iinfo(np.uint16).max: + return np.dtype(np.uint16) + if max_value <= np.iinfo(np.uint32).max: + return np.dtype(np.uint32) + return np.dtype(np.uint64) + + +def _resolve_ooc_mode(kind: str, in_mem: bool) -> bool: + if kind not in {"light", "medium", "full"}: + return False + return not in_mem + + +def _build_block_sorted_payload( + values: np.ndarray, block_len: int +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.dtype]: + nblocks = math.ceil(values.shape[0] / block_len) + position_dtype = _position_dtype(block_len - 1) + offsets = np.empty(nblocks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty_like(values) + positions = np.empty(values.shape[0], dtype=position_dtype) + cursor = 0 + + for block_id in range(nblocks): + start = block_id * block_len + stop = min(start + block_len, values.shape[0]) + block = values[start:stop] + order = np.argsort(block, kind="stable") + block_size = stop - start + next_cursor = cursor + block_size + sorted_values[cursor:next_cursor] = block[order] + positions[cursor:next_cursor] = order.astype(position_dtype, copy=False) + cursor = next_cursor + offsets[block_id + 1] = cursor + + return sorted_values, positions, offsets, position_dtype + + +def 
_build_reduced_descriptor( + array: blosc2.NDArray, + token: str, + kind: str, + values: np.ndarray, + optlevel: int, + persistent: bool, + cparams: dict | None = None, +) -> dict: + chunk_len = int(array.chunks[0]) + nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), chunk_len, optlevel) + sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload( + values, chunk_len, nav_segment_len, cparams + ) + l1 = _compute_sorted_boundaries(sorted_values, np.dtype(values.dtype), chunk_len) + reduced = _chunk_index_payload_storage( + array, + token, + kind, + "reduced", + "values", + sorted_values, + "positions", + positions, + offsets, + l1, + l2, + persistent, + chunk_len, + nav_segment_len, + cparams, + ) + reduced["position_dtype"] = positions.dtype.str + reduced["nav_segment_divisor"] = nav_segment_divisor + return reduced + + +def _segment_row_count(chunk_len: int, nav_segment_len: int) -> int: + return max(1, math.ceil(chunk_len / nav_segment_len)) + + +def _chunk_offsets(size: int, chunk_len: int) -> np.ndarray: + nchunks = math.ceil(size / chunk_len) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + if nchunks == 0: + return offsets + offsets[1:] = np.minimum(np.arange(1, nchunks + 1, dtype=np.int64) * chunk_len, size) + return offsets + + +def _index_build_threads(cparams: dict | blosc2.CParams | None = None) -> int: + if blosc2.IS_WASM: + return 1 + forced = os.getenv("BLOSC2_INDEX_BUILD_THREADS") + if forced is not None: + try: + forced_threads = int(forced) + except ValueError: + forced_threads = 1 + return _python_executor_threads(forced_threads) + if cparams is not None: + nthreads = cparams.nthreads if isinstance(cparams, blosc2.CParams) else cparams.get("nthreads") + else: + nthreads = None + if nthreads is not None: + try: + cparams_threads = int(nthreads) + except (TypeError, ValueError): + cparams_threads = 1 + return _python_executor_threads(cparams_threads) + return 
_python_executor_threads(int(getattr(blosc2, "nthreads", 1) or 1)) + + +def _sidecar_block_len(sidecar: dict, fallback_block_len: int) -> int: + path = sidecar.get("path") + if path is None: + return fallback_block_len + return int(blosc2.open(path).blocks[0]) + + +def _medium_nav_segment_divisor(optlevel: int) -> int: + if optlevel <= 1: + return 1 + if optlevel <= 3: + return 2 + if optlevel <= 6: + return 4 + return 8 + + +def _medium_nav_segment_len(block_len: int, chunk_len: int, optlevel: int) -> tuple[int, int]: + divisor = min(block_len, _medium_nav_segment_divisor(int(optlevel))) + max_segments_per_chunk = 2048 + chunk_floor = max(1, math.ceil(int(chunk_len) / max_segments_per_chunk)) + return max(1, block_len // divisor, chunk_floor), divisor + + +def _build_chunk_sorted_payload( + values: np.ndarray, + chunk_len: int, + nav_segment_len: int, + cparams: dict | None = None, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.dtype]: + size = values.shape[0] + nchunks = math.ceil(size / chunk_len) + position_dtype = _position_dtype(chunk_len - 1) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty_like(values) + positions = np.empty(size, dtype=position_dtype) + l1 = np.empty(nchunks, dtype=_boundary_dtype(values.dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(values.dtype)) + + cursor = 0 + thread_count = _index_build_threads(cparams) + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk = values[start:stop] + chunk_size = stop - start + next_cursor = cursor + chunk_size + chunk_sorted, chunk_positions = _sort_chunk_intra_chunk( + chunk, position_dtype, thread_count=thread_count + ) + sorted_values[cursor:next_cursor] = chunk_sorted + positions[cursor:next_cursor] = chunk_positions + offsets[chunk_id + 1] = next_cursor + l1[chunk_id] = 
(chunk_sorted[0], chunk_sorted[-1]) + + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = cursor + segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, next_cursor) + l2[row_start + segment_id] = (sorted_values[seg_start], sorted_values[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return sorted_values, positions, offsets, l2, position_dtype + + +def _build_chunk_sorted_payload_direct( + array: blosc2.NDArray, + target: dict, + dtype: np.dtype, + chunk_len: int, + nav_segment_len: int, + *, + payload_dtype: np.dtype | None = None, + aux_dtype: np.dtype | None = None, + value_transform=None, + aux_transform=None, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + size = int(array.shape[0]) + nchunks = math.ceil(size / chunk_len) + payload_dtype = np.dtype(dtype if payload_dtype is None else payload_dtype) + aux_dtype = np.dtype(_position_dtype(chunk_len - 1) if aux_dtype is None else aux_dtype) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + payload = np.empty(size, dtype=payload_dtype) + aux = np.empty(size, dtype=aux_dtype) + l1 = np.empty(nchunks, dtype=_boundary_dtype(payload_dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(payload_dtype)) + + cursor = 0 + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk = _slice_values_for_target(array, target, start, stop) + order = np.argsort(chunk, kind="stable") + chunk_size = stop - start + next_cursor = cursor + chunk_size + chunk_payload = chunk[order] + if value_transform is not None: + chunk_payload = value_transform(chunk_payload) + chunk_aux = 
order.astype(_position_dtype(chunk_len - 1), copy=False) + if aux_transform is not None: + chunk_aux = aux_transform(chunk_aux) + payload[cursor:next_cursor] = chunk_payload + aux[cursor:next_cursor] = chunk_aux + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (chunk_payload[0], chunk_payload[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_payload[seg_start], chunk_payload[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return payload, aux, offsets, l1, l2 + + +def _intra_chunk_run_ranges(chunk_size: int, thread_count: int) -> list[tuple[int, int]]: + if chunk_size <= 0: + return [] + run_count = max(1, min(thread_count, chunk_size)) + boundaries = np.linspace(0, chunk_size, run_count + 1, dtype=np.int64) + return [(int(boundaries[idx]), int(boundaries[idx + 1])) for idx in range(run_count)] + + +def _sort_chunk_run( + chunk: np.ndarray, run_start: int, run_stop: int, position_dtype: np.dtype +) -> tuple[np.ndarray, np.ndarray]: + run = chunk[run_start:run_stop] + try: + return indexing_ext.intra_chunk_sort_run(run, run_start, position_dtype) + except TypeError: + order = np.argsort(run, kind="stable") + return run[order], (order + run_start).astype(position_dtype, copy=False) + + +def _merge_sorted_run_pair( + left_values: np.ndarray, + left_positions: np.ndarray, + right_values: np.ndarray, + right_positions: np.ndarray, + dtype: np.dtype, + position_dtype: np.dtype, +) -> tuple[np.ndarray, np.ndarray]: + try: + merged_values, merged_positions = indexing_ext.intra_chunk_merge_sorted_slices( + left_values, left_positions, right_values, right_positions, position_dtype + ) + 
except TypeError: + merged_values, merged_positions = _merge_sorted_slices( + left_values, left_positions, right_values, right_positions, dtype + ) + return merged_values, merged_positions.astype(position_dtype, copy=False) + + +def _sort_chunk_intra_chunk( + chunk: np.ndarray, + position_dtype: np.dtype, + *, + thread_count: int | None = None, +) -> tuple[np.ndarray, np.ndarray]: + chunk_size = chunk.shape[0] + if chunk_size == 0: + return np.empty(0, dtype=chunk.dtype), np.empty(0, dtype=position_dtype) + if thread_count is None: + thread_count = _index_build_threads() + thread_count = max(1, min(int(thread_count), chunk_size)) + if thread_count <= 1: + order = np.argsort(chunk, kind="stable") + return chunk[order], order.astype(position_dtype, copy=False) + + def sort_run(run_range: tuple[int, int]) -> tuple[np.ndarray, np.ndarray]: + return _sort_chunk_run(chunk, run_range[0], run_range[1], position_dtype) + + run_ranges = _intra_chunk_run_ranges(chunk_size, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) as executor: + runs = list(executor.map(sort_run, run_ranges)) + + while len(runs) > 1: + pair_specs = [(runs[idx], runs[idx + 1]) for idx in range(0, len(runs) - 1, 2)] + + def merge_pair( + pair_spec: tuple[tuple[np.ndarray, np.ndarray], tuple[np.ndarray, np.ndarray]], + ) -> tuple[np.ndarray, np.ndarray]: + (left_values, left_positions), (right_values, right_positions) = pair_spec + return _merge_sorted_run_pair( + left_values, left_positions, right_values, right_positions, chunk.dtype, position_dtype + ) + + if pair_specs: + merge_workers = min(thread_count, len(pair_specs)) + if merge_workers <= 1: + merged_runs = [merge_pair(pair_spec) for pair_spec in pair_specs] + else: + with ThreadPoolExecutor(max_workers=merge_workers) as executor: + merged_runs = list(executor.map(merge_pair, pair_specs)) + else: + merged_runs = [] + if len(runs) % 2 == 1: + merged_runs.append(runs[-1]) + runs = merged_runs + + return runs[0] + + +def 
_build_reduced_chunk_payloads_intra_chunk( + array: blosc2.NDArray, + target: dict, + dtype: np.dtype, + chunk_len: int, + nav_segment_len: int, + cparams: dict | None = None, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + size = int(array.shape[0]) + nchunks = math.ceil(size / chunk_len) + position_dtype = _position_dtype(chunk_len - 1) + sorted_values = np.empty(size, dtype=dtype) + positions = np.empty(size, dtype=position_dtype) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + cursor = 0 + thread_count = _index_build_threads(cparams) + + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk_sorted, local_positions = _sort_chunk_intra_chunk( + _slice_values_for_target(array, target, start, stop), position_dtype, thread_count=thread_count + ) + chunk_size = stop - start + next_cursor = cursor + chunk_size + sorted_values[cursor:next_cursor] = chunk_sorted + positions[cursor:next_cursor] = local_positions + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (chunk_sorted[0], chunk_sorted[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_sorted[seg_start], chunk_sorted[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return sorted_values, positions, offsets, l1, l2 + + +def _chunk_index_payload_storage( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + 
payload_name: str, + payload: np.ndarray, + aux_name: str, + aux_payload: np.ndarray, + offsets: np.ndarray, + l1: np.ndarray, + l2: np.ndarray, + persistent: bool, + chunk_len: int, + nav_segment_len: int, + cparams: dict | None = None, +) -> dict: + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + payload_sidecar = _store_array_sidecar( + array, + token, + kind, + category, + payload_name, + payload, + persistent, + chunks=(chunk_len,), + blocks=(nav_segment_len,), + cparams=cparams, + ) + aux_sidecar = _store_array_sidecar( + array, + token, + kind, + category, + aux_name, + aux_payload, + persistent, + chunks=(chunk_len,), + blocks=(nav_segment_len,), + cparams=cparams, + ) + offsets_sidecar = _store_array_sidecar( + array, token, kind, category, "offsets", offsets, persistent, cparams=cparams + ) + l1_sidecar = _store_array_sidecar( + array, token, kind, f"{category}_nav", "l1", l1, persistent, cparams=cparams + ) + l2_sidecar = _store_array_sidecar( + array, + token, + kind, + f"{category}_nav", + "l2", + l2, + persistent, + chunks=(nsegments_per_chunk,), + blocks=(min(nsegments_per_chunk, max(1, nsegments_per_chunk)),), + cparams=cparams, + ) + return { + "layout": "chunk-local-v1", + "chunk_len": chunk_len, + "nav_segment_len": nav_segment_len, + "nsegments_per_chunk": nsegments_per_chunk, + "values_path": payload_sidecar["path"], + f"{aux_name}_path": aux_sidecar["path"], + "offsets_path": offsets_sidecar["path"], + "l1_path": l1_sidecar["path"], + "l2_path": l2_sidecar["path"], + } + + +def _prepare_chunk_index_payload_sidecars( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + payload_name: str, + payload_dtype: np.dtype, + aux_name: str, + aux_dtype: np.dtype, + size: int, + chunk_len: int, + nav_segment_len: int, + cparams: dict | None = None, +) -> tuple[blosc2.NDArray | None, dict, blosc2.NDArray | None, dict]: + payload_handle, payload_sidecar = _create_persistent_sidecar_handle( + array, + token, + kind, + 
category, + payload_name, + size, + payload_dtype, + chunks=(chunk_len,), + blocks=(nav_segment_len,), + cparams=cparams, + ) + aux_handle, aux_sidecar = _create_persistent_sidecar_handle( + array, + token, + kind, + category, + aux_name, + size, + aux_dtype, + chunks=(chunk_len,), + blocks=(nav_segment_len,), + cparams=cparams, + ) + return payload_handle, payload_sidecar, aux_handle, aux_sidecar + + +def _finalize_chunk_index_payload_storage( + array: blosc2.NDArray, + token: str, + kind: str, + category: str, + aux_name: str, + offsets: np.ndarray, + l1: np.ndarray, + l2: np.ndarray, + payload_sidecar: dict, + aux_sidecar: dict, + chunk_len: int, + nav_segment_len: int, + cparams: dict | None = None, +) -> dict: + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + offsets_sidecar = _store_array_sidecar( + array, token, kind, category, "offsets", offsets, True, cparams=cparams + ) + l1_sidecar = _store_array_sidecar(array, token, kind, f"{category}_nav", "l1", l1, True, cparams=cparams) + l2_sidecar = _store_array_sidecar( + array, + token, + kind, + f"{category}_nav", + "l2", + l2, + True, + chunks=(nsegments_per_chunk,), + blocks=(min(nsegments_per_chunk, max(1, nsegments_per_chunk)),), + cparams=cparams, + ) + return { + "layout": "chunk-local-v1", + "chunk_len": chunk_len, + "nav_segment_len": nav_segment_len, + "nsegments_per_chunk": nsegments_per_chunk, + "values_path": payload_sidecar["path"], + f"{aux_name}_path": aux_sidecar["path"], + "offsets_path": offsets_sidecar["path"], + "l1_path": l1_sidecar["path"], + "l2_path": l2_sidecar["path"], + } + + +def _build_reduced_descriptor_ooc( + array: blosc2.NDArray, + target: dict, + token: str, + kind: str, + dtype: np.dtype, + optlevel: int, + persistent: bool, + cparams: dict | None = None, +) -> dict: + if persistent: + size = int(array.shape[0]) + chunk_len = int(array.chunks[0]) + nav_segment_len, nav_segment_divisor = _medium_nav_segment_len( + int(array.blocks[0]), chunk_len, optlevel 
+ ) + nchunks = math.ceil(size / chunk_len) + position_dtype = _position_dtype(chunk_len - 1) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + values_handle = positions_handle = None + values_sidecar = positions_sidecar = None + try: + values_handle, values_sidecar, positions_handle, positions_sidecar = ( + _prepare_chunk_index_payload_sidecars( + array, + token, + kind, + "reduced", + "values", + dtype, + "positions", + position_dtype, + size, + chunk_len, + nav_segment_len, + cparams, + ) + ) + cursor = 0 + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk_size = stop - start + next_cursor = cursor + chunk_size + chunk_sorted, local_positions = _sort_chunk_intra_chunk( + _slice_values_for_target(array, target, start, stop), position_dtype + ) + if values_handle is not None: + values_handle[cursor:next_cursor] = chunk_sorted + if positions_handle is not None: + positions_handle[cursor:next_cursor] = local_positions + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (chunk_sorted[0], chunk_sorted[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_sorted[seg_start], chunk_sorted[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + del values_handle, positions_handle + reduced = _finalize_chunk_index_payload_storage( + array, + token, + kind, + "reduced", + "positions", + offsets, + l1, + l2, + values_sidecar, + 
positions_sidecar, + chunk_len, + nav_segment_len, + cparams, + ) + except Exception: + if values_sidecar is not None: + _remove_sidecar_path(values_sidecar["path"]) + if positions_sidecar is not None: + _remove_sidecar_path(positions_sidecar["path"]) + raise + reduced["position_dtype"] = position_dtype.str + reduced["nav_segment_divisor"] = nav_segment_divisor + return reduced + + chunk_len = int(array.chunks[0]) + nav_segment_len, nav_segment_divisor = _medium_nav_segment_len(int(array.blocks[0]), chunk_len, optlevel) + sorted_values, positions, offsets, l1, l2 = _build_reduced_chunk_payloads_intra_chunk( + array, target, dtype, chunk_len, nav_segment_len, cparams + ) + reduced = _chunk_index_payload_storage( + array, + token, + kind, + "reduced", + "values", + sorted_values, + "positions", + positions, + offsets, + l1, + l2, + persistent, + chunk_len, + nav_segment_len, + cparams, + ) + reduced["position_dtype"] = positions.dtype.str + reduced["nav_segment_divisor"] = nav_segment_divisor + return reduced + + +def _light_bucket_count(block_len: int) -> int: + return max(1, min(64, block_len)) + + +def _pack_bucket_mask(bucket_ids: np.ndarray) -> np.uint64: + mask = np.uint64(0) + for bucket_id in np.unique(bucket_ids): + mask |= np.uint64(1) << np.uint64(int(bucket_id)) + return mask + + +def _light_value_lossy_bits(dtype: np.dtype, optlevel: int) -> int: + dtype = np.dtype(dtype) + if dtype.kind in {"i", "u"} or dtype == np.dtype(np.float32) or dtype == np.dtype(np.float64): + max_bits = dtype.itemsize + else: + return 0 + return min(max(0, 9 - int(optlevel)), max_bits) + + +def _quantize_integer_array(values: np.ndarray, bits: int) -> np.ndarray: + if bits <= 0: + return values + dtype = np.dtype(values.dtype) + base_mask = np.iinfo(dtype).max if dtype.kind == "u" else -1 + mask = np.asarray(base_mask ^ ((1 << bits) - 1), dtype=dtype)[()] + quantized = values.copy() + np.bitwise_and(quantized, mask, out=quantized) + return quantized + + +def 
_quantize_integer_scalar(value, dtype: np.dtype, bits: int): + scalar = np.asarray(value, dtype=dtype)[()] + if bits <= 0: + return scalar + base_mask = np.iinfo(dtype).max if dtype.kind == "u" else -1 + mask = np.asarray(base_mask ^ ((1 << bits) - 1), dtype=dtype)[()] + return np.bitwise_and(scalar, mask, dtype=dtype) + + +def _float_order_uint_dtype(dtype: np.dtype) -> np.dtype: + if dtype == np.dtype(np.float32): + return np.dtype(np.uint32) + if dtype == np.dtype(np.float64): + return np.dtype(np.uint64) + raise TypeError(f"unsupported float dtype {dtype}") + + +def _ordered_uint_from_float(values: np.ndarray) -> np.ndarray: + dtype = np.dtype(values.dtype) + uint_dtype = _float_order_uint_dtype(dtype) + bits = values.view(uint_dtype).copy() + sign_mask = np.asarray(1 << (dtype.itemsize * 8 - 1), dtype=uint_dtype)[()] + negative = (bits & sign_mask) != 0 + bits[negative] = ~bits[negative] + bits[~negative] ^= sign_mask + return bits + + +def _float_from_ordered_uint(ordered: np.ndarray, dtype: np.dtype) -> np.ndarray: + uint_dtype = _float_order_uint_dtype(dtype) + bits = ordered.astype(uint_dtype, copy=True) + sign_mask = np.asarray(1 << (dtype.itemsize * 8 - 1), dtype=uint_dtype)[()] + positive = (bits & sign_mask) != 0 + bits[positive] ^= sign_mask + bits[~positive] = ~bits[~positive] + return bits.view(dtype) + + +def _quantize_float_array(values: np.ndarray, bits: int) -> np.ndarray: + if bits <= 0: + return values + quantized = values.copy() + finite = np.isfinite(quantized) + if not np.any(finite): + return quantized + ordered = _ordered_uint_from_float(quantized[finite]) + uint_dtype = ordered.dtype + mask = np.asarray(np.iinfo(uint_dtype).max ^ ((1 << bits) - 1), dtype=uint_dtype)[()] + np.bitwise_and(ordered, mask, out=ordered) + quantized[finite] = _float_from_ordered_uint(ordered, quantized.dtype) + return quantized + + +def _quantize_float_scalar(value, dtype: np.dtype, bits: int): + scalar = np.asarray(value, dtype=dtype)[()] + if bits <= 0 or not 
np.isfinite(scalar): + return scalar + ordered = _ordered_uint_from_float(np.asarray([scalar], dtype=dtype)) + uint_dtype = ordered.dtype + mask = np.asarray(np.iinfo(uint_dtype).max ^ ((1 << bits) - 1), dtype=uint_dtype)[()] + np.bitwise_and(ordered, mask, out=ordered) + return _float_from_ordered_uint(ordered, dtype)[0] + + +def _quantize_light_values_array(values: np.ndarray, bits: int) -> np.ndarray: + dtype = np.dtype(values.dtype) + if bits <= 0: + return values + if dtype.kind in {"i", "u"}: + return _quantize_integer_array(values, bits) + if dtype == np.dtype(np.float32) or dtype == np.dtype(np.float64): + return _quantize_float_array(values, bits) + return values + + +def _quantize_light_value_scalar(value, dtype: np.dtype, bits: int): + dtype = np.dtype(dtype) + if bits <= 0: + return np.asarray(value, dtype=dtype)[()] + if dtype.kind in {"i", "u"}: + return _quantize_integer_scalar(value, dtype, bits) + if dtype == np.dtype(np.float32) or dtype == np.dtype(np.float64): + return _quantize_float_scalar(value, dtype, bits) + return np.asarray(value, dtype=dtype)[()] + + +def _build_light_chunk_payloads( + array: blosc2.NDArray, + target: dict, + dtype: np.dtype, + chunk_len: int, + nav_segment_len: int, + value_lossy_bits: int, + bucket_len: int, + bucket_dtype: np.dtype, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + size = int(array.shape[0]) + nchunks = math.ceil(size / chunk_len) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty(size, dtype=dtype) + bucket_positions = np.empty(size, dtype=bucket_dtype) + l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, dtype=_boundary_dtype(dtype)) + position_dtype = _position_dtype(chunk_len - 1) + cursor = 0 + + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk = 
_slice_values_for_target(array, target, start, stop) + order = np.argsort(chunk, kind="stable") + chunk_size = stop - start + next_cursor = cursor + chunk_size + chunk_sorted = chunk[order] + stored_chunk_sorted = chunk_sorted + if value_lossy_bits > 0: + stored_chunk_sorted = _quantize_light_values_array(chunk_sorted, value_lossy_bits) + local_positions = order.astype(position_dtype, copy=False) + sorted_values[cursor:next_cursor] = stored_chunk_sorted + bucket_positions[cursor:next_cursor] = (local_positions // bucket_len).astype( + bucket_dtype, copy=False + ) + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (stored_chunk_sorted[0], stored_chunk_sorted[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_sorted[seg_start], chunk_sorted[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return sorted_values, bucket_positions, offsets, l1, l2 + + +def _build_light_chunk_payloads_intra_chunk( + array: blosc2.NDArray, + target: dict, + dtype: np.dtype, + chunk_len: int, + nav_segment_len: int, + value_lossy_bits: int, + bucket_len: int, + bucket_dtype: np.dtype, + cparams: dict | None = None, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray, np.ndarray]: + size = int(array.shape[0]) + nchunks = math.ceil(size / chunk_len) + offsets = np.empty(nchunks + 1, dtype=np.int64) + offsets[0] = 0 + sorted_values = np.empty(size, dtype=dtype) + bucket_positions = np.empty(size, dtype=bucket_dtype) + l1 = np.empty(nchunks, dtype=_boundary_dtype(dtype)) + nsegments_per_chunk = _segment_row_count(chunk_len, nav_segment_len) + l2 = np.empty(nchunks * nsegments_per_chunk, 
dtype=_boundary_dtype(dtype)) + position_dtype = _position_dtype(chunk_len - 1) + cursor = 0 + thread_count = _index_build_threads(cparams) + + for chunk_id in range(nchunks): + start = chunk_id * chunk_len + stop = min(start + chunk_len, size) + chunk_sorted, local_positions = _sort_chunk_intra_chunk( + _slice_values_for_target(array, target, start, stop), position_dtype, thread_count=thread_count + ) + chunk_size = stop - start + next_cursor = cursor + chunk_size + stored_chunk_sorted = chunk_sorted + if value_lossy_bits > 0: + stored_chunk_sorted = _quantize_light_values_array(chunk_sorted, value_lossy_bits) + sorted_values[cursor:next_cursor] = stored_chunk_sorted + bucket_positions[cursor:next_cursor] = (local_positions // bucket_len).astype( + bucket_dtype, copy=False + ) + offsets[chunk_id + 1] = next_cursor + if chunk_size > 0: + l1[chunk_id] = (stored_chunk_sorted[0], stored_chunk_sorted[-1]) + row_start = chunk_id * nsegments_per_chunk + segment_count = _segment_row_count(chunk_size, nav_segment_len) + for segment_id in range(segment_count): + seg_start = segment_id * nav_segment_len + seg_stop = min(seg_start + nav_segment_len, chunk_size) + l2[row_start + segment_id] = (chunk_sorted[seg_start], chunk_sorted[seg_stop - 1]) + for segment_id in range(segment_count, nsegments_per_chunk): + l2[row_start + segment_id] = l2[row_start + segment_count - 1] + cursor = next_cursor + + return sorted_values, bucket_positions, offsets, l1, l2 + + +def _build_light_descriptor( + array: blosc2.NDArray, + token: str, + kind: str, + values: np.ndarray, + optlevel: int, + persistent: bool, + cparams: dict | None = None, +) -> dict: + chunk_len = int(array.chunks[0]) + nav_segment_len = int(array.blocks[0]) + bucket_len = max(1, math.ceil(nav_segment_len / 64)) + bucket_count = math.ceil(chunk_len / bucket_len) + value_lossy_bits = _light_value_lossy_bits(values.dtype, optlevel) + sorted_values, positions, offsets, l2, _ = _build_chunk_sorted_payload( + values, chunk_len, 
nav_segment_len, cparams + ) + if value_lossy_bits > 0: + sorted_values = _quantize_light_values_array(sorted_values, value_lossy_bits) + bucket_dtype = _position_dtype(bucket_count - 1) + bucket_positions = (positions // bucket_len).astype(bucket_dtype, copy=False) + l1 = _compute_sorted_boundaries(sorted_values, np.dtype(sorted_values.dtype), chunk_len) + light = _chunk_index_payload_storage( + array, + token, + kind, + "light", + "values", + sorted_values, + "bucket_positions", + bucket_positions, + offsets, + l1, + l2, + persistent, + chunk_len, + nav_segment_len, + cparams, + ) + light["bucket_count"] = bucket_count + light["bucket_len"] = bucket_len + light["value_lossy_bits"] = value_lossy_bits + light["bucket_dtype"] = bucket_positions.dtype.str + return light + + +def _build_light_descriptor_ooc( + array: blosc2.NDArray, + target: dict, + token: str, + kind: str, + dtype: np.dtype, + optlevel: int, + persistent: bool, + cparams: dict | None = None, +) -> dict: + chunk_len = int(array.chunks[0]) + nav_segment_len = int(array.blocks[0]) + bucket_len = max(1, math.ceil(nav_segment_len / 64)) + bucket_count = math.ceil(chunk_len / bucket_len) + value_lossy_bits = _light_value_lossy_bits(dtype, optlevel) + bucket_dtype = _position_dtype(bucket_count - 1) + sorted_values, bucket_positions, offsets, l1, l2 = _build_light_chunk_payloads_intra_chunk( + array, + target, + dtype, + chunk_len, + nav_segment_len, + value_lossy_bits, + bucket_len, + bucket_dtype, + cparams, + ) + if persistent: + values_handle = bucket_handle = None + values_sidecar = bucket_sidecar = None + try: + values_handle, values_sidecar, bucket_handle, bucket_sidecar = ( + _prepare_chunk_index_payload_sidecars( + array, + token, + kind, + "light", + "values", + dtype, + "bucket_positions", + bucket_dtype, + len(sorted_values), + chunk_len, + nav_segment_len, + cparams, + ) + ) + if values_handle is not None: + values_handle[:] = sorted_values + if bucket_handle is not None: + bucket_handle[:] = 
bucket_positions + del values_handle, bucket_handle + light = _finalize_chunk_index_payload_storage( + array, + token, + kind, + "light", + "bucket_positions", + offsets, + l1, + l2, + values_sidecar, + bucket_sidecar, + chunk_len, + nav_segment_len, + cparams, + ) + except Exception: + if values_sidecar is not None: + _remove_sidecar_path(values_sidecar["path"]) + if bucket_sidecar is not None: + _remove_sidecar_path(bucket_sidecar["path"]) + raise + light["bucket_count"] = bucket_count + light["bucket_len"] = bucket_len + light["value_lossy_bits"] = value_lossy_bits + light["bucket_dtype"] = bucket_dtype.str + return light + + light = _chunk_index_payload_storage( + array, + token, + kind, + "light", + "values", + sorted_values, + "bucket_positions", + bucket_positions, + offsets, + l1, + l2, + persistent, + chunk_len, + nav_segment_len, + cparams, + ) + light["bucket_count"] = bucket_count + light["bucket_len"] = bucket_len + light["value_lossy_bits"] = value_lossy_bits + light["bucket_dtype"] = bucket_positions.dtype.str + return light + + +def _scalar_compare(left, right, dtype: np.dtype) -> int: + dtype = np.dtype(dtype) + if dtype.kind == "f": + left_nan = np.isnan(left) + right_nan = np.isnan(right) + if left_nan and right_nan: + return 0 + if left_nan: + return 1 + if right_nan: + return -1 + if left < right: + return -1 + if left > right: + return 1 + return 0 + + +def _pair_le(left_value, left_position: int, right_value, right_position: int, dtype: np.dtype) -> bool: + cmp = _scalar_compare(left_value, right_value, dtype) + if cmp < 0: + return True + if cmp > 0: + return False + return int(left_position) <= int(right_position) + + +def _pair_record_dtype(dtype: np.dtype) -> np.dtype: + return np.dtype([("value", dtype), ("position", np.int64)]) + + +def _pair_records(values: np.ndarray, positions: np.ndarray, dtype: np.dtype) -> np.ndarray: + records = np.empty(values.shape[0], dtype=_pair_record_dtype(dtype)) + records["value"] = values + 
records["position"] = positions + return records + + +def _merge_sorted_slices( + left_values: np.ndarray, + left_positions: np.ndarray, + right_values: np.ndarray, + right_positions: np.ndarray, + dtype: np.dtype, +) -> tuple[np.ndarray, np.ndarray]: + if left_values.size == 0: + return right_values, right_positions + if right_values.size == 0: + return left_values, left_positions + values = np.concatenate((left_values, right_values)) + positions = np.concatenate((left_positions, right_positions)) + order = np.lexsort((positions, values)) + return values[order], positions[order] + + +def _pair_searchsorted_right(values: np.ndarray, positions: np.ndarray, value, position: int) -> int: + records = _pair_records(values, positions, values.dtype) + needle = np.asarray((value, position), dtype=records.dtype)[()] + return int(np.searchsorted(records, needle, side="right")) + + +def _temp_run_storage_geometry( + length: int, dtype: np.dtype, buffer_items: int +) -> tuple[tuple[int], tuple[int]]: + chunk_items = max(1, min(length, buffer_items)) + target_block_bytes = 256 * 1024 + block_items = max(1, min(chunk_items, target_block_bytes // max(1, dtype.itemsize))) + return (chunk_items,), (block_items,) + + +def _path_disk_bytes(path: Path | str) -> int: + path = Path(path) + if not path.exists(): + return 0 + if path.is_file(): + return path.stat().st_size + return sum(entry.stat().st_size for entry in path.rglob("*") if entry.is_file()) + + +def _tracker_register_create(tracker: TempRunTracker | None, *paths: Path) -> None: + if tracker is None: + return + delta = sum(_path_disk_bytes(path) for path in paths) + tracker.current_disk_bytes += delta + tracker.total_written_bytes += delta + tracker.peak_disk_bytes = max(tracker.peak_disk_bytes, tracker.current_disk_bytes) + + +def _tracker_register_delete(tracker: TempRunTracker | None, *paths: Path) -> None: + if tracker is None: + return + delta = sum(_path_disk_bytes(path) for path in paths) + tracker.current_disk_bytes = 
max(0, tracker.current_disk_bytes - delta) + + +def _create_blosc2_temp_array( + path: Path, length: int, dtype: np.dtype, buffer_items: int, cparams: dict | None = None +): + chunks, blocks = _temp_run_storage_geometry(length, dtype, buffer_items) + if cparams is None: + cparams = blosc2.CParams(codec=blosc2.Codec.ZSTD, clevel=1) + return blosc2.empty( + (length,), + dtype=dtype, + chunks=chunks, + blocks=blocks, + urlpath=str(path), + mode="w", + cparams=cparams, + ) + + +def _read_ndarray_linear_span(array: blosc2.NDArray, start: int, out: np.ndarray) -> None: + if len(out) == 0: + return + chunk_len = int(array.chunks[0]) + cursor = int(start) + out_cursor = 0 + while out_cursor < len(out): + chunk_id = cursor // chunk_len + local_start = cursor % chunk_len + take = min(len(out) - out_cursor, chunk_len - local_start) + array.get_1d_span_numpy( + out[out_cursor : out_cursor + take], int(chunk_id), int(local_start), int(take) + ) + cursor += take + out_cursor += take + + +def _materialize_sorted_run( + values: np.ndarray, + positions: np.ndarray, + length: int, + value_dtype: np.dtype, + workdir: Path, + prefix: str, + tracker: TempRunTracker | None = None, + cparams: dict | None = None, +) -> SortedRun: + values_path = workdir / f"{prefix}.values.b2nd" + positions_path = workdir / f"{prefix}.positions.b2nd" + run_values = _create_blosc2_temp_array( + values_path, length, value_dtype, FULL_OOC_MERGE_BUFFER_ITEMS, cparams + ) + run_positions = _create_blosc2_temp_array( + positions_path, length, np.dtype(np.int64), FULL_OOC_MERGE_BUFFER_ITEMS, cparams + ) + run_values[:] = values + run_positions[:] = positions + del run_values, run_positions + _tracker_register_create(tracker, values_path, positions_path) + return SortedRun(values_path, positions_path, length) + + +def _copy_sidecar_to_temp_run( + path: str, + length: int, + dtype: np.dtype, + workdir: Path, + prefix: str, + tracker: TempRunTracker | None = None, + cparams: dict | None = None, +) -> Path: + 
out_path = workdir / f"{prefix}.b2nd" + sidecar = blosc2.open(path, mmap_mode=_INDEX_MMAP_MODE) + output = _create_blosc2_temp_array(out_path, length, dtype, FULL_OOC_MERGE_BUFFER_ITEMS, cparams) + chunk_len = int(sidecar.chunks[0]) + for chunk_id, start in enumerate(range(0, length, chunk_len)): + stop = min(start + chunk_len, length) + span = np.empty(stop - start, dtype=dtype) + sidecar.get_1d_span_numpy(span, chunk_id, 0, stop - start) + output[start:stop] = span + del output + _tracker_register_create(tracker, out_path) + return out_path + + +def _refill_run_buffer( + values_src, positions_src, cursor: int, buffer_items: int +) -> tuple[np.ndarray, np.ndarray, int]: + if cursor >= len(values_src): + values_dtype = values_src.dtype if hasattr(values_src, "dtype") else np.float64 + positions_dtype = positions_src.dtype if hasattr(positions_src, "dtype") else np.int64 + return np.empty(0, dtype=values_dtype), np.empty(0, dtype=positions_dtype), cursor + stop = min(cursor + buffer_items, len(values_src)) + if isinstance(values_src, np.ndarray): + return np.asarray(values_src[cursor:stop]), np.asarray(positions_src[cursor:stop]), stop + values = np.empty(stop - cursor, dtype=np.dtype(values_src.dtype)) + positions = np.empty(stop - cursor, dtype=np.dtype(positions_src.dtype)) + _read_ndarray_linear_span(values_src, cursor, values) + _read_ndarray_linear_span(positions_src, cursor, positions) + return values, positions, stop + + +def _merge_run_pair( + left: SortedRun, + right: SortedRun, + workdir: Path, + dtype: np.dtype, + merge_id: int, + buffer_items: int, + tracker: TempRunTracker | None = None, + cparams: dict | None = None, +) -> SortedRun: + left_values_mm = blosc2.open(str(left.values_path), mmap_mode=_INDEX_MMAP_MODE) + left_positions_mm = blosc2.open(str(left.positions_path), mmap_mode=_INDEX_MMAP_MODE) + right_values_mm = blosc2.open(str(right.values_path), mmap_mode=_INDEX_MMAP_MODE) + right_positions_mm = blosc2.open(str(right.positions_path), 
mmap_mode=_INDEX_MMAP_MODE) + + out_values_path = workdir / f"full_merge_values_{merge_id}.b2nd" + out_positions_path = workdir / f"full_merge_positions_{merge_id}.b2nd" + out_values = _create_blosc2_temp_array( + out_values_path, left.length + right.length, dtype, buffer_items, cparams + ) + out_positions = _create_blosc2_temp_array( + out_positions_path, left.length + right.length, np.dtype(np.int64), buffer_items, cparams + ) + + left_cursor = 0 + right_cursor = 0 + out_cursor = 0 + left_values = np.empty(0, dtype=dtype) + left_positions = np.empty(0, dtype=np.int64) + right_values = np.empty(0, dtype=dtype) + right_positions = np.empty(0, dtype=np.int64) + while True: + if left_values.size == 0: + left_values, left_positions, left_cursor = _refill_run_buffer( + left_values_mm, left_positions_mm, left_cursor, buffer_items + ) + if right_values.size == 0: + right_values, right_positions, right_cursor = _refill_run_buffer( + right_values_mm, right_positions_mm, right_cursor, buffer_items + ) + + if left_values.size == 0 and right_values.size == 0: + break + if left_values.size == 0: + take = right_values.size + out_values[out_cursor : out_cursor + take] = right_values + out_positions[out_cursor : out_cursor + take] = right_positions + out_cursor += take + right_values = np.empty(0, dtype=dtype) + right_positions = np.empty(0, dtype=np.int64) + continue + if right_values.size == 0: + take = left_values.size + out_values[out_cursor : out_cursor + take] = left_values + out_positions[out_cursor : out_cursor + take] = left_positions + out_cursor += take + left_values = np.empty(0, dtype=dtype) + left_positions = np.empty(0, dtype=np.int64) + continue + + if _pair_le(left_values[-1], left_positions[-1], right_values[-1], right_positions[-1], dtype): + left_cut = left_values.size + right_cut = _pair_searchsorted_right( + right_values, right_positions, left_values[-1], int(left_positions[-1]) + ) + else: + left_cut = _pair_searchsorted_right( + left_values, 
left_positions, right_values[-1], int(right_positions[-1]) + ) + right_cut = right_values.size + + merged_values, merged_positions = _merge_sorted_slices( + left_values[:left_cut], + left_positions[:left_cut], + right_values[:right_cut], + right_positions[:right_cut], + dtype, + ) + take = merged_values.size + out_values[out_cursor : out_cursor + take] = merged_values + out_positions[out_cursor : out_cursor + take] = merged_positions + out_cursor += take + left_values = left_values[left_cut:] + left_positions = left_positions[left_cut:] + right_values = right_values[right_cut:] + right_positions = right_positions[right_cut:] + + del out_values, out_positions + _tracker_register_create(tracker, out_values_path, out_positions_path) + del left_values_mm, left_positions_mm, right_values_mm, right_positions_mm + _tracker_register_delete( + tracker, left.values_path, left.positions_path, right.values_path, right.positions_path + ) + left.values_path.unlink(missing_ok=True) + left.positions_path.unlink(missing_ok=True) + right.values_path.unlink(missing_ok=True) + right.positions_path.unlink(missing_ok=True) + return SortedRun(out_values_path, out_positions_path, out_cursor) + + +def _build_full_descriptor_ooc( + array: blosc2.NDArray, + target: dict, + token: str, + kind: str, + dtype: np.dtype, + persistent: bool, + workdir: Path, + cparams: dict | None = None, +) -> dict: + size = int(array.shape[0]) + tracker = TempRunTracker() + if size == 0: + sorted_values = np.empty(0, dtype=dtype) + positions = np.empty(0, dtype=np.int64) + values_sidecar = _store_array_sidecar( + array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams + ) + positions_sidecar = _store_array_sidecar( + array, token, kind, "full", "positions", positions, persistent, cparams=cparams + ) + full = { + "values_path": values_sidecar["path"], + "positions_path": positions_sidecar["path"], + "runs": [], + "next_run_id": 0, + } + _rebuild_full_navigation_sidecars(array, token, 
kind, full, sorted_values, persistent, cparams) + return full + run_items = max(int(array.chunks[0]), min(size, FULL_OOC_RUN_ITEMS)) + runs = [] + for run_id, start in enumerate(range(0, size, run_items)): + stop = min(start + run_items, size) + values = _slice_values_for_target(array, target, start, stop) + positions = np.arange(start, stop, dtype=np.int64) + order = np.lexsort((positions, values)) + sorted_values = values[order] + sorted_positions = positions[order] + runs.append( + _materialize_sorted_run( + sorted_values, + sorted_positions, + stop - start, + dtype, + workdir, + f"full_run_{run_id}", + tracker, + cparams, + ) + ) + + merge_id = 0 + while len(runs) > 1: + next_runs = [] + for idx in range(0, len(runs), 2): + if idx + 1 >= len(runs): + next_runs.append(runs[idx]) + continue + next_runs.append( + _merge_run_pair( + runs[idx], + runs[idx + 1], + workdir, + dtype, + merge_id, + FULL_OOC_MERGE_BUFFER_ITEMS, + tracker, + cparams, + ) + ) + merge_id += 1 + runs = next_runs + + final_run = runs[0] + full = { + "values_path": None, + "positions_path": None, + "runs": [], + "next_run_id": 0, + "temp_backend": "blosc2", + "temp_peak_disk_bytes": tracker.peak_disk_bytes, + "temp_total_written_bytes": tracker.total_written_bytes, + } + if persistent: + _stream_copy_temp_run_to_full_sidecars( + array, token, kind, full, final_run, dtype, persistent, tracker, cparams + ) + else: + sorted_values = blosc2.open(str(final_run.values_path), mmap_mode=_INDEX_MMAP_MODE)[:] + positions = blosc2.open(str(final_run.positions_path), mmap_mode=_INDEX_MMAP_MODE)[:] + values_sidecar = _store_array_sidecar( + array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams + ) + positions_sidecar = _store_array_sidecar( + array, token, kind, "full", "positions", positions, persistent, cparams=cparams + ) + full["values_path"] = values_sidecar["path"] + full["positions_path"] = positions_sidecar["path"] + _rebuild_full_navigation_sidecars(array, token, kind, 
full, sorted_values, persistent, cparams) + del sorted_values, positions + _tracker_register_delete(tracker, final_run.values_path, final_run.positions_path) + final_run.values_path.unlink(missing_ok=True) + final_run.positions_path.unlink(missing_ok=True) + return full + + +def _build_descriptor( + array: blosc2.NDArray, + target: dict, + token: str, + kind: str, + optlevel: int, + persistent: bool, + ooc: bool, + name: str | None, + dtype: np.dtype, + levels: dict, + light: dict | None, + reduced: dict | None, + full: dict | None, + cparams: dict | None = None, +) -> dict: + return { + "name": name + or (target["expression"] if target["source"] == "expression" else _field_token(target.get("field"))), + "token": token, + "target": target.copy(), + "field": _target_field(target), + "kind": kind, + "version": INDEX_FORMAT_VERSION, + "optlevel": optlevel, + "persistent": persistent, + "ooc": ooc, + "stale": False, + "dtype": np.dtype(dtype).str, + "shape": tuple(array.shape), + "chunks": tuple(array.chunks), + "blocks": tuple(array.blocks), + "levels": levels, + "light": light, + "reduced": reduced, + "full": full, + "cparams": _plain_index_cparams(cparams), + } + + +def create_index( + array: blosc2.NDArray, + field: str | None = None, + kind: str = "light", + optlevel: int = 5, + persistent: bool | None = None, + in_mem: bool = False, + name: str | None = None, + **kwargs, +) -> dict: + cparams = _normalize_index_cparams(kwargs.pop("cparams", None)) + del kwargs + dtype = _validate_index_target(array, field) + target = _field_target_descriptor(field) + token = _target_token(target) + if kind not in SEGMENT_LEVELS_BY_KIND: + raise NotImplementedError(f"unsupported index kind {kind!r}") + if persistent is None: + persistent = _is_persistent_array(array) + use_ooc = _resolve_ooc_mode(kind, in_mem) + + if use_ooc and kind in {"light", "medium", "full"}: + levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent, cparams) + light = ( + 
_build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, cparams) + if kind == "light" + else None + ) + reduced = ( + _build_reduced_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, cparams) + if kind == "medium" + else None + ) + full = None + if kind == "full": + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: + full = _build_full_descriptor_ooc( + array, target, token, kind, dtype, persistent, Path(tmpdir), cparams + ) + descriptor = _build_descriptor( + array, + target, + token, + kind, + optlevel, + persistent, + True, + name, + dtype, + levels, + light, + reduced, + full, + cparams, + ) + else: + values = _values_for_target(array, target) + levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent, cparams) + light = ( + _build_light_descriptor(array, token, kind, values, optlevel, persistent, cparams) + if kind == "light" + else None + ) + reduced = ( + _build_reduced_descriptor(array, token, kind, values, optlevel, persistent, cparams) + if kind == "medium" + else None + ) + full = ( + _build_full_descriptor(array, token, kind, values, persistent, cparams) + if kind == "full" + else None + ) + descriptor = _build_descriptor( + array, + target, + token, + kind, + optlevel, + persistent, + False, + name, + dtype, + levels, + light, + reduced, + full, + cparams, + ) + + store = _load_store(array) + store["indexes"][token] = descriptor + _save_store(array, store) + return _copy_descriptor(descriptor) + + +def create_expr_index( + array: blosc2.NDArray, + expression: str, + *, + operands: dict | None = None, + kind: str = "light", + optlevel: int = 5, + persistent: bool | None = None, + in_mem: bool = False, + name: str | None = None, + **kwargs, +) -> dict: + cparams = _normalize_index_cparams(kwargs.pop("cparams", None)) + del kwargs + if operands is None: + operands = array.fields if array.dtype.fields is not None else {"value": array} + base, target, 
dtype = _normalize_expression_target(expression, operands) + if base is not array: + raise ValueError( + "expression index operands must resolve to the same array passed to create_expr_index()" + ) + if kind not in SEGMENT_LEVELS_BY_KIND: + raise NotImplementedError(f"unsupported index kind {kind!r}") + if persistent is None: + persistent = _is_persistent_array(array) + use_ooc = _resolve_ooc_mode(kind, in_mem) + token = _target_token(target) + + if use_ooc and kind in {"light", "medium", "full"}: + levels = _build_levels_descriptor_ooc(array, target, token, kind, dtype, persistent, cparams) + light = ( + _build_light_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, cparams) + if kind == "light" + else None + ) + reduced = ( + _build_reduced_descriptor_ooc(array, target, token, kind, dtype, optlevel, persistent, cparams) + if kind == "medium" + else None + ) + full = None + if kind == "full": + with tempfile.TemporaryDirectory(prefix="blosc2-index-ooc-") as tmpdir: + full = _build_full_descriptor_ooc( + array, target, token, kind, dtype, persistent, Path(tmpdir), cparams + ) + descriptor = _build_descriptor( + array, + target, + token, + kind, + optlevel, + persistent, + True, + name, + dtype, + levels, + light, + reduced, + full, + cparams, + ) + else: + values = _values_for_target(array, target) + levels = _build_levels_descriptor(array, target, token, kind, dtype, values, persistent, cparams) + light = ( + _build_light_descriptor(array, token, kind, values, optlevel, persistent, cparams) + if kind == "light" + else None + ) + reduced = ( + _build_reduced_descriptor(array, token, kind, values, optlevel, persistent, cparams) + if kind == "medium" + else None + ) + full = ( + _build_full_descriptor(array, token, kind, values, persistent, cparams) + if kind == "full" + else None + ) + descriptor = _build_descriptor( + array, + target, + token, + kind, + optlevel, + persistent, + False, + name, + dtype, + levels, + light, + reduced, + full, + 
cparams, + ) + + store = _load_store(array) + store["indexes"][token] = descriptor + _save_store(array, store) + return _copy_descriptor(descriptor) + + +def create_csindex(array: blosc2.NDArray, field: str | None = None, **kwargs) -> dict: + return create_index(array, field=field, kind="full", **kwargs) + + +def _resolve_index_token(store: dict, field: str | None, name: str | None) -> str: + token = None + if field is not None: + token = _field_token(field) + elif name is None and len(store["indexes"]) == 1: + token = next(iter(store["indexes"])) + if token is None: + for key, descriptor in store["indexes"].items(): + if descriptor.get("name") == name: + token = key + break + if token is None or token not in store["indexes"]: + raise KeyError("index not found") + return token + + +def iter_index_components(array: blosc2.NDArray, descriptor: dict): + for level in descriptor["levels"]: + level_info = descriptor["levels"][level] + yield IndexComponent(f"summary.{level}", "summary", level, level_info.get("path")) + + light = descriptor.get("light") + if light is not None: + yield IndexComponent("light.values", "light", "values", light.get("values_path")) + yield IndexComponent( + "light.bucket_positions", "light", "bucket_positions", light.get("bucket_positions_path") + ) + yield IndexComponent("light.offsets", "light", "offsets", light.get("offsets_path")) + yield IndexComponent("light_nav.l1", "light_nav", "l1", light.get("l1_path")) + yield IndexComponent("light_nav.l2", "light_nav", "l2", light.get("l2_path")) + + reduced = descriptor.get("reduced") + if reduced is not None: + yield IndexComponent("reduced.values", "reduced", "values", reduced.get("values_path")) + yield IndexComponent("reduced.positions", "reduced", "positions", reduced.get("positions_path")) + yield IndexComponent("reduced.offsets", "reduced", "offsets", reduced.get("offsets_path")) + yield IndexComponent("reduced_nav.l1", "reduced_nav", "l1", reduced.get("l1_path")) + yield 
IndexComponent("reduced_nav.l2", "reduced_nav", "l2", reduced.get("l2_path")) + + full = descriptor.get("full") + if full is not None: + yield IndexComponent("full.values", "full", "values", full.get("values_path")) + yield IndexComponent("full.positions", "full", "positions", full.get("positions_path")) + yield IndexComponent("full_nav.l1", "full_nav", "l1", full.get("l1_path")) + yield IndexComponent("full_nav.l2", "full_nav", "l2", full.get("l2_path")) + for run in full.get("runs", ()): + run_id = int(run["id"]) + yield IndexComponent( + f"full_run.{run_id}.values", + "full_run", + f"{run_id}.values", + run.get("values_path"), + ) + yield IndexComponent( + f"full_run.{run_id}.positions", + "full_run", + f"{run_id}.positions", + run.get("positions_path"), + ) + + +def _component_nbytes(array: blosc2.NDArray, descriptor: dict, component: IndexComponent) -> int: + if component.path is not None: + return int(blosc2.open(component.path, mmap_mode=_INDEX_MMAP_MODE).nbytes) + token = descriptor["token"] + return int(_load_array_sidecar(array, token, component.category, component.name, component.path).nbytes) + + +def _component_cbytes(array: blosc2.NDArray, descriptor: dict, component: IndexComponent) -> int: + if component.path is not None: + return int(blosc2.open(component.path, mmap_mode=_INDEX_MMAP_MODE).cbytes) + token = descriptor["token"] + sidecar = _load_array_sidecar(array, token, component.category, component.name, component.path) + kwargs = {} + cparams = descriptor.get("cparams") + if cparams is not None: + kwargs["cparams"] = cparams + return int(blosc2.asarray(sidecar, **kwargs).cbytes) + + +class Index(Mapping): + def __init__(self, array: blosc2.NDArray, token: str): + self._array = array + self._token = token + + def _descriptor(self) -> dict: + return _descriptor_for_token(self._array, self._token) + + @property + def descriptor(self) -> dict: + return _copy_descriptor_for_token(self._array, self._token) + + @property + def kind(self) -> str: + 
return self._descriptor()["kind"] + + @property + def field(self) -> str | None: + return self._descriptor()["field"] + + @property + def name(self) -> str | None: + return self._descriptor()["name"] + + @property + def target(self) -> dict: + return self.descriptor["target"] + + @property + def persistent(self) -> bool: + return bool(self._descriptor()["persistent"]) + + @property + def stale(self) -> bool: + return bool(self._descriptor()["stale"]) + + @property + def nbytes(self) -> int: + descriptor = self._descriptor() + return sum( + _component_nbytes(self._array, descriptor, component) + for component in iter_index_components(self._array, descriptor) + ) + + @property + def cbytes(self) -> int: + descriptor = self._descriptor() + return sum( + _component_cbytes(self._array, descriptor, component) + for component in iter_index_components(self._array, descriptor) + ) + + @property + def cratio(self) -> float: + cbytes = self.cbytes + if cbytes == 0: + return math.inf + return self.nbytes / cbytes + + def drop(self) -> None: + drop_index(self._array, field=self.field, name=self.name) + + def rebuild(self) -> Index: + rebuild_index(self._array, field=self.field, name=self.name) + return self + + def compact(self) -> Index: + compact_index(self._array, field=self.field, name=self.name) + return self + + def __getitem__(self, key): + return self.descriptor[key] + + def __iter__(self): + return iter(self.descriptor) + + def __len__(self) -> int: + return len(self.descriptor) + + def __repr__(self) -> str: + try: + descriptor = self._descriptor() + except KeyError: + return "Index()" + return ( + f"Index(kind={descriptor['kind']!r}, field={descriptor['field']!r}, " + f"name={descriptor['name']!r}, stale={descriptor['stale']!r})" + ) + + +def _remove_sidecar_path(path: str | None) -> None: + if path: + blosc2.remove_urlpath(path) + + +def _drop_descriptor_sidecars(descriptor: dict) -> None: + for level_info in descriptor["levels"].values(): + 
_remove_sidecar_path(level_info["path"]) + if descriptor.get("light") is not None: + _remove_sidecar_path(descriptor["light"]["values_path"]) + _remove_sidecar_path(descriptor["light"]["bucket_positions_path"]) + _remove_sidecar_path(descriptor["light"]["offsets_path"]) + _remove_sidecar_path(descriptor["light"].get("l1_path")) + _remove_sidecar_path(descriptor["light"].get("l2_path")) + if descriptor.get("reduced") is not None: + _remove_sidecar_path(descriptor["reduced"]["values_path"]) + _remove_sidecar_path(descriptor["reduced"]["positions_path"]) + _remove_sidecar_path(descriptor["reduced"]["offsets_path"]) + _remove_sidecar_path(descriptor["reduced"].get("l1_path")) + _remove_sidecar_path(descriptor["reduced"].get("l2_path")) + if descriptor.get("full") is not None: + _remove_sidecar_path(descriptor["full"]["values_path"]) + _remove_sidecar_path(descriptor["full"]["positions_path"]) + _remove_sidecar_path(descriptor["full"].get("l1_path")) + _remove_sidecar_path(descriptor["full"].get("l2_path")) + for run in descriptor["full"].get("runs", ()): + _remove_sidecar_path(run.get("values_path")) + _remove_sidecar_path(run.get("positions_path")) + + +def _replace_levels_descriptor(array: blosc2.NDArray, descriptor: dict, kind: str, persistent: bool) -> None: + size = int(array.shape[0]) + target = descriptor["target"] + token = descriptor["token"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) + for level, level_info in descriptor["levels"].items(): + segment_len = int(level_info["segment_len"]) + start = 0 + summaries = _compute_segment_summaries( + _slice_values_for_target(array, target, start, size), np.dtype(descriptor["dtype"]), segment_len + ) + sidecar = _store_array_sidecar( + array, token, kind, "summary", level, summaries, persistent, cparams=cparams + ) + level_info["path"] = sidecar["path"] + level_info["dtype"] = sidecar["dtype"] + level_info["nsegments"] = len(summaries) + + +def _replace_levels_descriptor_tail( + array: 
blosc2.NDArray, descriptor: dict, kind: str, old_size: int, persistent: bool +) -> None: + target = descriptor["target"] + token = descriptor["token"] + dtype = np.dtype(descriptor["dtype"]) + new_size = int(array.shape[0]) + cparams = _normalize_index_cparams(descriptor.get("cparams")) + for level, level_info in descriptor["levels"].items(): + segment_len = int(level_info["segment_len"]) + start_segment = old_size // segment_len + prefix = _load_level_summaries(array, descriptor, level)[:start_segment] + tail_start = start_segment * segment_len + tail_values = _slice_values_for_target(array, target, tail_start, new_size) + tail_summaries = _compute_segment_summaries(tail_values, dtype, segment_len) + summaries = np.concatenate((prefix, tail_summaries)) if len(prefix) else tail_summaries + sidecar = _store_array_sidecar( + array, token, kind, "summary", level, summaries, persistent, cparams=cparams + ) + level_info["path"] = sidecar["path"] + level_info["dtype"] = sidecar["dtype"] + level_info["nsegments"] = len(summaries) + + +def _replace_reduced_descriptor_tail( + array: blosc2.NDArray, descriptor: dict, old_size: int, persistent: bool +) -> None: + del old_size + target = descriptor["target"] + reduced = descriptor["reduced"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) + for key in ("values_path", "positions_path", "offsets_path", "l1_path", "l2_path"): + _remove_sidecar_path(reduced.get(key)) + if descriptor.get("ooc", False): + rebuilt = _build_reduced_descriptor_ooc( + array, + target, + descriptor["token"], + descriptor["kind"], + np.dtype(descriptor["dtype"]), + descriptor["optlevel"], + persistent, + cparams, + ) + else: + rebuilt = _build_reduced_descriptor( + array, + descriptor["token"], + descriptor["kind"], + _values_for_target(array, target), + descriptor["optlevel"], + persistent, + cparams, + ) + descriptor["reduced"] = rebuilt + + +def _replace_light_descriptor_tail( + array: blosc2.NDArray, descriptor: dict, old_size: int, 
persistent: bool +) -> None: + del old_size + target = descriptor["target"] + light = descriptor["light"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) + for key in ("values_path", "bucket_positions_path", "offsets_path", "l1_path", "l2_path"): + _remove_sidecar_path(light.get(key)) + if descriptor.get("ooc", False): + rebuilt = _build_light_descriptor_ooc( + array, + target, + descriptor["token"], + descriptor["kind"], + np.dtype(descriptor["dtype"]), + descriptor["optlevel"], + persistent, + cparams, + ) + else: + rebuilt = _build_light_descriptor( + array, + descriptor["token"], + descriptor["kind"], + _values_for_target(array, target), + descriptor["optlevel"], + persistent, + cparams, + ) + descriptor["light"] = rebuilt + + +def _replace_full_descriptor( + array: blosc2.NDArray, + descriptor: dict, + sorted_values: np.ndarray, + positions: np.ndarray, + persistent: bool, +) -> None: + kind = descriptor["kind"] + token = descriptor["token"] + full = descriptor["full"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) + for run in full.get("runs", ()): + _remove_sidecar_path(run.get("values_path")) + _remove_sidecar_path(run.get("positions_path")) + _remove_sidecar_path(full.get("l1_path")) + _remove_sidecar_path(full.get("l2_path")) + _clear_cached_data(array, token) + values_sidecar = _store_array_sidecar( + array, token, kind, "full", "values", sorted_values, persistent, cparams=cparams + ) + positions_sidecar = _store_array_sidecar( + array, token, kind, "full", "positions", positions, persistent, cparams=cparams + ) + full["values_path"] = values_sidecar["path"] + full["positions_path"] = positions_sidecar["path"] + full["runs"] = [] + full["next_run_id"] = 0 + _rebuild_full_navigation_sidecars(array, token, kind, full, sorted_values, persistent, cparams) + + +def _replace_full_descriptor_from_paths( + array: blosc2.NDArray, + descriptor: dict, + values_path: Path, + positions_path: Path, + length: int, +) -> None: + kind = 
descriptor["kind"] + token = descriptor["token"] + full = descriptor["full"] + persistent = descriptor["persistent"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) + if not persistent: + raise ValueError("path-based full replacement requires persistent indexes") + for run in full.get("runs", ()): + _remove_sidecar_path(run.get("values_path")) + _remove_sidecar_path(run.get("positions_path")) + _remove_sidecar_path(full.get("l1_path")) + _remove_sidecar_path(full.get("l2_path")) + _clear_cached_data(array, token) + final_values_path = _sidecar_path(array, token, kind, "full.values") + final_positions_path = _sidecar_path(array, token, kind, "full.positions") + _remove_sidecar_path(final_values_path) + _remove_sidecar_path(final_positions_path) + _stream_copy_sidecar_array( + values_path, + final_values_path, + length, + np.dtype(descriptor["dtype"]), + (int(array.chunks[0]),), + (int(array.blocks[0]),), + cparams, + ) + _stream_copy_sidecar_array( + positions_path, + final_positions_path, + length, + np.dtype(np.int64), + (int(array.chunks[0]),), + (int(array.blocks[0]),), + cparams, + ) + values_path.unlink(missing_ok=True) + positions_path.unlink(missing_ok=True) + full["values_path"] = final_values_path + full["positions_path"] = final_positions_path + full["runs"] = [] + full["next_run_id"] = 0 + _rebuild_full_navigation_sidecars_from_path( + array, + token, + kind, + full, + final_values_path, + np.dtype(descriptor["dtype"]), + length, + persistent, + cparams, + ) + + +def _store_full_run_descriptor( + array: blosc2.NDArray, + descriptor: dict, + run_id: int, + sorted_values: np.ndarray, + positions: np.ndarray, +) -> dict: + kind = descriptor["kind"] + token = descriptor["token"] + persistent = descriptor["persistent"] + cparams = _normalize_index_cparams(descriptor.get("cparams")) + values_sidecar = _store_array_sidecar( + array, token, kind, "full_run", f"{run_id}.values", sorted_values, persistent, cparams=cparams + ) + positions_sidecar = 
_store_array_sidecar( + array, token, kind, "full_run", f"{run_id}.positions", positions, persistent, cparams=cparams + ) + return { + "id": run_id, + "length": len(sorted_values), + "values_path": values_sidecar["path"], + "positions_path": positions_sidecar["path"], + } + + +def _append_full_descriptor( + array: blosc2.NDArray, descriptor: dict, old_size: int, appended_values: np.ndarray +) -> None: + full = descriptor.get("full") + if full is None: + raise RuntimeError("full index metadata is not available") + appended_positions = np.arange(old_size, old_size + len(appended_values), dtype=np.int64) + order = np.lexsort((appended_positions, appended_values)) + run_id = int(full.get("next_run_id", 0)) + run = _store_full_run_descriptor( + array, + descriptor, + run_id, + appended_values[order], + appended_positions[order], + ) + runs = list(full.get("runs", ())) + runs.append(run) + full["runs"] = runs + full["next_run_id"] = run_id + 1 + _clear_full_merge_cache(array, descriptor["token"]) + + +def append_to_indexes(array: blosc2.NDArray, old_size: int, appended_values: np.ndarray) -> None: + store = _load_store(array) + if not store["indexes"]: + return + + for descriptor in store["indexes"].values(): + kind = descriptor["kind"] + persistent = descriptor["persistent"] + target = descriptor["target"] + target_values = _values_from_numpy_target(appended_values, target) + if descriptor.get("stale", False): + continue + if kind == "full": + _append_full_descriptor(array, descriptor, old_size, target_values) + elif kind == "medium": + _replace_reduced_descriptor_tail(array, descriptor, old_size, persistent) + elif kind == "light": + _replace_light_descriptor_tail(array, descriptor, old_size, persistent) + _replace_levels_descriptor_tail(array, descriptor, kind, old_size, persistent) + descriptor["shape"] = tuple(array.shape) + descriptor["chunks"] = tuple(array.chunks) + descriptor["blocks"] = tuple(array.blocks) + descriptor["stale"] = False + _save_store(array, 
store) + _invalidate_query_cache(array) + + +def drop_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> None: + store = _load_store(array) + token = _resolve_index_token(store, field, name) + descriptor = store["indexes"][token] + _clear_cached_data(array, descriptor["token"]) + descriptor = store["indexes"].pop(token) + _save_store(array, store) + _drop_descriptor_sidecars(descriptor) + _invalidate_query_cache(array) + + +def rebuild_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> dict: + store = _load_store(array) + token = _resolve_index_token(store, field, name) + descriptor = store["indexes"][token] + drop_index(array, field=descriptor["field"], name=descriptor["name"]) + if descriptor["target"]["source"] == "expression": + operands = array.fields if array.dtype.fields is not None else {SELF_TARGET_NAME: array} + return create_expr_index( + array, + descriptor["target"]["expression_key"], + operands=operands, + kind=descriptor["kind"], + optlevel=descriptor["optlevel"], + persistent=descriptor["persistent"], + in_mem=not descriptor.get("ooc", False), + name=descriptor["name"], + ) + return create_index( + array, + field=descriptor["field"], + kind=descriptor["kind"], + optlevel=descriptor["optlevel"], + persistent=descriptor["persistent"], + in_mem=not descriptor.get("ooc", False), + name=descriptor["name"], + ) + + +def _full_compaction_runs(array: blosc2.NDArray, descriptor: dict, workdir: Path) -> list[SortedRun]: + full = descriptor["full"] + dtype = np.dtype(descriptor["dtype"]) + token = descriptor["token"] + runs = [] + if full["values_path"] is not None and full["positions_path"] is not None: + length = int(array.shape[0]) - sum(int(run["length"]) for run in full.get("runs", ())) + base_values_path = _copy_sidecar_to_temp_run( + full["values_path"], length, dtype, workdir, "compact_base_values" + ) + base_positions_path = _copy_sidecar_to_temp_run( + full["positions_path"], length, 
np.dtype(np.int64), workdir, "compact_base_positions" + ) + runs.append(SortedRun(base_values_path, base_positions_path, length)) + else: + values = _load_array_sidecar(array, token, "full", "values", full["values_path"]) + positions = _load_array_sidecar(array, token, "full", "positions", full["positions_path"]) + runs.append(_materialize_sorted_run(values, positions, len(values), dtype, workdir, "compact_base")) + + for run in full.get("runs", ()): + run_length = int(run["length"]) + run_id = int(run["id"]) + if run["values_path"] is not None and run["positions_path"] is not None: + run_values_path = _copy_sidecar_to_temp_run( + run["values_path"], run_length, dtype, workdir, f"run_{run_id}_values" + ) + run_positions_path = _copy_sidecar_to_temp_run( + run["positions_path"], run_length, np.dtype(np.int64), workdir, f"run_{run_id}_positions" + ) + runs.append(SortedRun(run_values_path, run_positions_path, run_length)) + continue + run_values, run_positions = _load_full_run_arrays(array, descriptor, run) + runs.append( + _materialize_sorted_run(run_values, run_positions, run_length, dtype, workdir, f"run_{run_id}") + ) + return runs + + +def compact_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> dict: + store = _load_store(array) + token = _resolve_index_token(store, field, name) + descriptor = store["indexes"][token] + if descriptor["kind"] != "full": + raise NotImplementedError("compact_index() is currently only implemented for full indexes") + if descriptor.get("stale", False): + raise RuntimeError("cannot compact a stale index; rebuild it first") + + full = descriptor["full"] + if not full.get("runs"): + if full.get("l1_path") is None or full.get("l2_path") is None: + sorted_values, positions = _load_full_arrays(array, descriptor) + _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) + _clear_full_merge_cache(array, descriptor["token"]) + _save_store(array, store) + 
_invalidate_query_cache(array) + return _copy_descriptor(descriptor) + + dtype = np.dtype(descriptor["dtype"]) + with tempfile.TemporaryDirectory(prefix="blosc2-index-compact-") as tmpdir: + workdir = Path(tmpdir) + runs = _full_compaction_runs(array, descriptor, workdir) + merge_id = 0 + while len(runs) > 1: + next_runs = [] + for idx in range(0, len(runs), 2): + if idx + 1 >= len(runs): + next_runs.append(runs[idx]) + continue + next_runs.append( + _merge_run_pair( + runs[idx], runs[idx + 1], workdir, dtype, merge_id, FULL_OOC_MERGE_BUFFER_ITEMS + ) + ) + merge_id += 1 + runs = next_runs + final_run = runs[0] + if descriptor["persistent"]: + _replace_full_descriptor_from_paths( + array, descriptor, final_run.values_path, final_run.positions_path, final_run.length + ) + else: + sorted_values = blosc2.open(str(final_run.values_path), mmap_mode=_INDEX_MMAP_MODE)[:] + positions = blosc2.open(str(final_run.positions_path), mmap_mode=_INDEX_MMAP_MODE)[:] + _replace_full_descriptor(array, descriptor, sorted_values, positions, descriptor["persistent"]) + del sorted_values, positions + final_run.values_path.unlink(missing_ok=True) + final_run.positions_path.unlink(missing_ok=True) + + _clear_full_merge_cache(array, descriptor["token"]) + _save_store(array, store) + _invalidate_query_cache(array) + return _copy_descriptor(descriptor) + + +def get_indexes(array: blosc2.NDArray) -> list[Index]: + store = _load_store(array) + return [Index(array, key) for key in sorted(store["indexes"])] + + +def get_index(array: blosc2.NDArray, field: str | None = None, name: str | None = None) -> Index: + store = _load_store(array) + token = _resolve_index_token(store, field, name) + return Index(array, token) + + +def mark_indexes_stale(array: blosc2.NDArray) -> None: + store = _load_store(array) + if not store["indexes"]: + return + changed = False + for descriptor in store["indexes"].values(): + if not descriptor.get("stale", False): + descriptor["stale"] = True + changed = True + if 
changed: + _save_store(array, store) + _invalidate_query_cache(array) + + +def _descriptor_for(array: blosc2.NDArray, field: str | None) -> dict | None: + return _descriptor_for_target(array, _field_target_descriptor(field)) + + +def _descriptor_for_target(array: blosc2.NDArray, target: dict) -> dict | None: + descriptor = _load_store(array)["indexes"].get(_target_token(target)) + if descriptor is None or descriptor.get("stale", False): + return None + if descriptor.get("version") != INDEX_FORMAT_VERSION: + return None + if descriptor.get("kind") == "light": + light = descriptor.get("light", {}) + if light.get("layout") != "chunk-local-v1" or "values_path" not in light: + return None + if descriptor.get("kind") == "medium": + reduced = descriptor.get("reduced", {}) + if reduced.get("layout") != "chunk-local-v1" or "values_path" not in reduced: + return None + if tuple(descriptor.get("shape", ())) != tuple(array.shape): + return None + if tuple(descriptor.get("chunks", ())) != tuple(array.chunks): + return None + return descriptor + + +def _load_level_summaries(array: blosc2.NDArray, descriptor: dict, level: str) -> np.ndarray: + level_info = descriptor["levels"][level] + return _load_array_sidecar(array, descriptor["token"], "summary", level, level_info["path"]) + + +def _full_merge_cache_key(array: blosc2.NDArray, token: str, name: str): + return _data_cache_key(array, token, "full_merged", name) + + +def _clear_full_merge_cache(array: blosc2.NDArray, token: str) -> None: + _DATA_CACHE.pop(_full_merge_cache_key(array, token, "values"), None) + _DATA_CACHE.pop(_full_merge_cache_key(array, token, "positions"), None) + + +def _load_full_run_arrays( + array: blosc2.NDArray, descriptor: dict, run: dict +) -> tuple[np.ndarray, np.ndarray]: + run_id = int(run["id"]) + token = descriptor["token"] + values = _load_array_sidecar(array, token, "full_run", f"{run_id}.values", run["values_path"]) + positions = _load_array_sidecar(array, token, "full_run", 
f"{run_id}.positions", run["positions_path"]) + return values, positions + + +def _load_full_navigation_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: + full = descriptor.get("full") + if full is None: + raise RuntimeError("full index metadata is not available") + l1_path = full.get("l1_path") + l2_path = full.get("l2_path") + if l1_path is None or l2_path is None: + raise RuntimeError("full index navigation metadata is not available") + token = descriptor["token"] + l1 = _load_array_sidecar(array, token, "full_nav", "l1", l1_path) + l2 = _load_array_sidecar(array, token, "full_nav", "l2", l2_path) + return l1, l2 + + +def _load_full_sidecar_handles(array: blosc2.NDArray, descriptor: dict): + full = descriptor.get("full") + if full is None: + raise RuntimeError("full index metadata is not available") + token = descriptor["token"] + values_sidecar = _open_sidecar_handle(array, token, "full_handle", "values", full["values_path"]) + positions_sidecar = _open_sidecar_handle( + array, token, "full_handle", "positions", full["positions_path"] + ) + return values_sidecar, positions_sidecar + + +def _load_full_run_sidecar_handles(array: blosc2.NDArray, descriptor: dict, run: dict): + run_id = int(run["id"]) + token = descriptor["token"] + values_sidecar = _open_sidecar_handle( + array, token, "full_run_handle", f"{run_id}.values", run["values_path"] + ) + positions_sidecar = _open_sidecar_handle( + array, token, "full_run_handle", f"{run_id}.positions", run["positions_path"] + ) + return values_sidecar, positions_sidecar + + +def _load_full_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: + full = descriptor.get("full") + if full is None: + raise RuntimeError("full index metadata is not available") + token = descriptor["token"] + runs = full.get("runs", ()) + if runs: + cached_values = _DATA_CACHE.get(_full_merge_cache_key(array, token, "values")) + cached_positions = 
_DATA_CACHE.get(_full_merge_cache_key(array, token, "positions")) + if cached_values is not None and cached_positions is not None: + return cached_values, cached_positions + + values = _load_array_sidecar(array, token, "full", "values", full["values_path"]) + positions = _load_array_sidecar(array, token, "full", "positions", full["positions_path"]) + if runs: + dtype = np.dtype(descriptor["dtype"]) + merged_values = values + merged_positions = positions + for run in runs: + run_values, run_positions = _load_full_run_arrays(array, descriptor, run) + merged_values, merged_positions = _merge_sorted_slices( + merged_values, merged_positions, run_values, run_positions, dtype + ) + _DATA_CACHE[_full_merge_cache_key(array, token, "values")] = merged_values + _DATA_CACHE[_full_merge_cache_key(array, token, "positions")] = merged_positions + return merged_values, merged_positions + return values, positions + + +def _load_reduced_arrays( + array: blosc2.NDArray, descriptor: dict +) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + reduced = descriptor.get("reduced") + if reduced is None: + raise RuntimeError("reduced index metadata is not available") + values = _load_array_sidecar(array, descriptor["token"], "reduced", "values", reduced["values_path"]) + positions = _load_array_sidecar( + array, descriptor["token"], "reduced", "positions", reduced["positions_path"] + ) + offsets = _load_array_sidecar(array, descriptor["token"], "reduced", "offsets", reduced["offsets_path"]) + return values, positions, offsets + + +def _load_reduced_navigation_arrays( + array: blosc2.NDArray, descriptor: dict +) -> tuple[np.ndarray, np.ndarray]: + reduced = descriptor.get("reduced") + if reduced is None: + raise RuntimeError("reduced index metadata is not available") + token = descriptor["token"] + l1 = _load_array_sidecar(array, token, "reduced_nav", "l1", reduced["l1_path"]) + l2 = _load_array_sidecar(array, token, "reduced_nav", "l2", reduced["l2_path"]) + return l1, l2 + + +def 
_load_reduced_l1_array(array: blosc2.NDArray, descriptor: dict) -> np.ndarray: + reduced = descriptor.get("reduced") + if reduced is None: + raise RuntimeError("reduced index metadata is not available") + token = descriptor["token"] + return _load_array_sidecar(array, token, "reduced_nav", "l1", reduced["l1_path"]) + + +def _load_reduced_sidecar_handles(array: blosc2.NDArray, descriptor: dict): + reduced = descriptor.get("reduced") + if reduced is None: + raise RuntimeError("reduced index metadata is not available") + token = descriptor["token"] + values_sidecar = _open_sidecar_handle(array, token, "reduced_handle", "values", reduced["values_path"]) + positions_sidecar = _open_sidecar_handle( + array, token, "reduced_handle", "positions", reduced["positions_path"] + ) + l2_sidecar = _open_sidecar_handle(array, token, "reduced_nav_handle", "l2", reduced["l2_path"]) + return values_sidecar, positions_sidecar, l2_sidecar + + +def _load_light_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray, np.ndarray]: + light = descriptor.get("light") + if light is None: + raise RuntimeError("light index metadata is not available") + values = _load_array_sidecar(array, descriptor["token"], "light", "values", light["values_path"]) + positions = _load_array_sidecar( + array, descriptor["token"], "light", "bucket_positions", light["bucket_positions_path"] + ) + offsets = _load_array_sidecar(array, descriptor["token"], "light", "offsets", light["offsets_path"]) + return values, positions, offsets + + +def _load_light_navigation_arrays(array: blosc2.NDArray, descriptor: dict) -> tuple[np.ndarray, np.ndarray]: + light = descriptor.get("light") + if light is None: + raise RuntimeError("light index metadata is not available") + token = descriptor["token"] + l1 = _load_array_sidecar(array, token, "light_nav", "l1", light["l1_path"]) + l2 = _load_array_sidecar(array, token, "light_nav", "l2", light["l2_path"]) + return l1, l2 + + +def _load_light_l1_array(array: 
blosc2.NDArray, descriptor: dict) -> np.ndarray: + light = descriptor.get("light") + if light is None: + raise RuntimeError("light index metadata is not available") + token = descriptor["token"] + return _load_array_sidecar(array, token, "light_nav", "l1", light["l1_path"]) + + +def _load_light_sidecar_handles(array: blosc2.NDArray, descriptor: dict): + light = descriptor.get("light") + if light is None: + raise RuntimeError("light index metadata is not available") + token = descriptor["token"] + values_sidecar = _open_sidecar_handle(array, token, "light_handle", "values", light["values_path"]) + bucket_sidecar = _open_sidecar_handle( + array, token, "light_handle", "bucket_positions", light["bucket_positions_path"] + ) + l2_sidecar = _open_sidecar_handle(array, token, "light_nav_handle", "l2", light["l2_path"]) + return values_sidecar, bucket_sidecar, l2_sidecar + + +def _normalize_scalar(value, dtype: np.dtype): + if isinstance(value, np.generic): + return value.item() + if dtype.kind == "f" and isinstance(value, float) and np.isnan(value): + raise ValueError("NaN comparisons are not indexable") + return np.asarray(value, dtype=dtype)[()] + + +def _candidate_units_from_summary(summaries: np.ndarray, op: str, value, dtype: np.dtype) -> np.ndarray: + mins = summaries["min"] + maxs = summaries["max"] + flags = summaries["flags"] + valid = (flags & FLAG_ALL_NAN) == 0 + value = _normalize_scalar(value, dtype) + if op == "==": + return valid & (mins <= value) & (value <= maxs) + if op == "<": + return valid & (mins < value) + if op == "<=": + return valid & (mins <= value) + if op == ">": + return valid & (maxs > value) + if op == ">=": + return valid & (maxs >= value) + raise ValueError(f"unsupported comparison operator {op!r}") + + +def _intervals_from_sorted(values: np.ndarray, op: str, value, dtype: np.dtype) -> list[tuple[int, int]]: + value = _normalize_scalar(value, dtype) + if op == "==": + lo = np.searchsorted(values, value, side="left") + hi = 
np.searchsorted(values, value, side="right") + elif op == "<": + lo = 0 + hi = np.searchsorted(values, value, side="left") + elif op == "<=": + lo = 0 + hi = np.searchsorted(values, value, side="right") + elif op == ">": + lo = np.searchsorted(values, value, side="right") + hi = len(values) + elif op == ">=": + lo = np.searchsorted(values, value, side="left") + hi = len(values) + else: + raise ValueError(f"unsupported comparison operator {op!r}") + return [] if lo >= hi else [(int(lo), int(hi))] + + +def _operand_target(operand) -> tuple[blosc2.NDArray, str | None] | None: + if isinstance(operand, blosc2.NDField): + return operand.ndarr, operand.field + if isinstance(operand, blosc2.NDArray): + return operand, None + return None + + +def _literal_value(node: ast.AST): + if isinstance(node, ast.Constant): + return node.value + if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.USub): + value = _literal_value(node.operand) + if isinstance(value, bool): + raise ValueError("boolean negation is not a scalar literal here") + return -value + if isinstance(node, ast.UnaryOp) and isinstance(node.op, ast.UAdd): + return _literal_value(node.operand) + raise ValueError("node is not a supported scalar literal") + + +def _flip_operator(op: str) -> str: + return {"<": ">", "<=": ">=", ">": "<", ">=": "<=", "==": "=="}[op] + + +def _compare_operator(node: ast.AST) -> str | None: + if isinstance(node, ast.Eq): + return "==" + if isinstance(node, ast.Lt): + return "<" + if isinstance(node, ast.LtE): + return "<=" + if isinstance(node, ast.Gt): + return ">" + if isinstance(node, ast.GtE): + return ">=" + return None + + +def _compare_target_from_node(node: ast.AST, operands: dict) -> tuple[blosc2.NDArray, dict] | None: + if isinstance(node, ast.Name): + operand = operands.get(node.id) + target = _operand_target(operand) if operand is not None else None + if target is None: + return None + base, field = target + if base.ndim != 1: + return None + return base, 
_field_target_descriptor(field) + + normalized = _normalize_expression_node(node, operands) + if normalized is None: + return None + base, expression_key, dependencies = normalized + return base, _expression_target_descriptor(ast.unparse(node), expression_key, dependencies) + + +def _target_from_compare( + node: ast.Compare, operands: dict +) -> tuple[blosc2.NDArray, dict, str, object] | None: + if len(node.ops) != 1 or len(node.comparators) != 1: + return None + op = _compare_operator(node.ops[0]) + if op is None: + return None + + try: + left_target = _compare_target_from_node(node.left, operands) + right_target = _compare_target_from_node(node.comparators[0], operands) + if left_target is not None: + value = _literal_value(node.comparators[0]) + elif right_target is not None: + value = _literal_value(node.left) + op = _flip_operator(op) + else: + return None + except ValueError: + return None + + base, target = left_target if left_target is not None else right_target + return base, target, op, value + + +def _finest_level(descriptor: dict) -> str: + level_names = tuple(descriptor["levels"]) + return level_names[-1] + + +def _plan_segment_compare(node: ast.Compare, operands: dict) -> SegmentPredicatePlan | None: + target = _target_from_compare(node, operands) + if target is None: + return None + base, target_info, op, value = target + descriptor = _descriptor_for_target(base, target_info) + if descriptor is None: + return None + level = _finest_level(descriptor) + level_info = descriptor["levels"][level] + dtype = np.dtype(descriptor["dtype"]) + try: + summaries = _load_level_summaries(base, descriptor, level) + candidate_units = _candidate_units_from_summary(summaries, op, value, dtype) + except (RuntimeError, ValueError, TypeError): + return None + return SegmentPredicatePlan( + base=base, + candidate_units=candidate_units, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + level=level, + segment_len=level_info["segment_len"], 
+ ) + + +def _same_segment_space(left: SegmentPredicatePlan, right: SegmentPredicatePlan) -> bool: + return ( + left.base is right.base + and left.level == right.level + and left.segment_len == right.segment_len + and left.candidate_units.shape == right.candidate_units.shape + ) + + +def _merge_segment_plans( + left: SegmentPredicatePlan, right: SegmentPredicatePlan, op: str +) -> SegmentPredicatePlan | None: + if not _same_segment_space(left, right): + return None + if op == "and": + candidate_units = left.candidate_units & right.candidate_units + else: + candidate_units = left.candidate_units | right.candidate_units + return SegmentPredicatePlan( + base=left.base, + candidate_units=candidate_units, + descriptor=left.descriptor, + target=left.target, + field=left.field, + level=left.level, + segment_len=left.segment_len, + ) + + +def _plan_segment_boolop(node: ast.BoolOp, operands: dict) -> SegmentPredicatePlan | None: + op = "and" if isinstance(node.op, ast.And) else "or" if isinstance(node.op, ast.Or) else None + if op is None: + return None + plans = [_plan_segment_node(value, operands) for value in node.values] + if op == "and": + plans = [plan for plan in plans if plan is not None] + if not plans: + return None + elif any(plan is None for plan in plans): + return None + + plan = plans[0] + for other in plans[1:]: + merged = _merge_segment_plans(plan, other, op) + if merged is None: + return None + plan = merged + return plan + + +def _plan_segment_bitop(node: ast.BinOp, operands: dict) -> SegmentPredicatePlan | None: + if isinstance(node.op, ast.BitAnd): + op = "and" + elif isinstance(node.op, ast.BitOr): + op = "or" + else: + return None + + left = _plan_segment_node(node.left, operands) + right = _plan_segment_node(node.right, operands) + if op == "and": + if left is None: + return right + if right is None: + return left + return _merge_segment_plans(left, right, op) + if left is None or right is None: + return None + return _merge_segment_plans(left, 
right, op) + + +def _plan_segment_node(node: ast.AST, operands: dict) -> SegmentPredicatePlan | None: + if isinstance(node, ast.Compare): + return _plan_segment_compare(node, operands) + if isinstance(node, ast.BoolOp): + return _plan_segment_boolop(node, operands) + if isinstance(node, ast.BinOp): + return _plan_segment_bitop(node, operands) + return None + + +def _plan_exact_compare(node: ast.Compare, operands: dict) -> ExactPredicatePlan | None: + target = _target_from_compare(node, operands) + if target is None: + return None + base, target_info, op, value = target + descriptor = _descriptor_for_target(base, target_info) + if descriptor is None or descriptor.get("kind") not in {"light", "medium", "full"}: + return None + try: + value = _normalize_scalar(value, np.dtype(descriptor["dtype"])) + except (RuntimeError, ValueError, TypeError): + return None + if op == "==": + return ExactPredicatePlan( + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + lower=value, + lower_inclusive=True, + upper=value, + upper_inclusive=True, + ) + if op == ">": + return ExactPredicatePlan( + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + lower=value, + lower_inclusive=False, + ) + if op == ">=": + return ExactPredicatePlan( + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + lower=value, + lower_inclusive=True, + ) + if op == "<": + return ExactPredicatePlan( + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + upper=value, + upper_inclusive=False, + ) + if op == "<=": + return ExactPredicatePlan( + base=base, + descriptor=descriptor, + target=target_info, + field=_target_field(target_info), + upper=value, + upper_inclusive=True, + ) + return None + + +def _same_base(left: ExactPredicatePlan, right: ExactPredicatePlan) -> bool: + return left.base is right.base and left.descriptor["token"] == 
right.descriptor["token"] + + +def _merge_lower_bound( + left: object | None, left_inclusive: bool, right: object | None, right_inclusive: bool +) -> tuple[object | None, bool]: + if left is None: + return right, right_inclusive + if right is None: + return left, left_inclusive + if left < right: + return right, right_inclusive + if left > right: + return left, left_inclusive + return left, left_inclusive and right_inclusive + + +def _merge_upper_bound( + left: object | None, left_inclusive: bool, right: object | None, right_inclusive: bool +) -> tuple[object | None, bool]: + if left is None: + return right, right_inclusive + if right is None: + return left, left_inclusive + if left < right: + return left, left_inclusive + if left > right: + return right, right_inclusive + return left, left_inclusive and right_inclusive + + +def _merge_exact_plans( + left: ExactPredicatePlan, right: ExactPredicatePlan, op: str +) -> ExactPredicatePlan | None: + if op != "and" or not _same_base(left, right): + return None + lower, lower_inclusive = _merge_lower_bound( + left.lower, left.lower_inclusive, right.lower, right.lower_inclusive + ) + upper, upper_inclusive = _merge_upper_bound( + left.upper, left.upper_inclusive, right.upper, right.upper_inclusive + ) + return ExactPredicatePlan( + base=left.base, + descriptor=left.descriptor, + target=left.target, + field=left.field, + lower=lower, + lower_inclusive=lower_inclusive, + upper=upper, + upper_inclusive=upper_inclusive, + ) + + +def _plan_exact_conjunction(node: ast.AST, operands: dict) -> list[ExactPredicatePlan] | None: + if isinstance(node, ast.Compare): + plan = _plan_exact_compare(node, operands) + return None if plan is None else [plan] + if isinstance(node, ast.BoolOp): + if not isinstance(node.op, ast.And): + return None + plans = [] + for value in node.values: + subplans = _plan_exact_conjunction(value, operands) + if subplans is None: + return None + plans.extend(subplans) + return plans + if isinstance(node, 
ast.BinOp):
+        if not isinstance(node.op, ast.BitAnd):
+            return None
+        left = _plan_exact_conjunction(node.left, operands)
+        right = _plan_exact_conjunction(node.right, operands)
+        if left is None or right is None:
+            return None
+        return left + right
+    return None
+
+
+def _plan_exact_boolop(node: ast.BoolOp, operands: dict) -> ExactPredicatePlan | None:
+    """Merge all `and`-joined comparison plans into one plan; None if any part is unplannable."""
+    if not isinstance(node.op, ast.And):
+        return None
+    plans = [_plan_exact_node(value, operands) for value in node.values]
+    if any(plan is None for plan in plans):
+        return None
+    plan = plans[0]
+    for other in plans[1:]:
+        merged = _merge_exact_plans(plan, other, "and")
+        if merged is None:
+            return None
+        plan = merged
+    return plan
+
+
+def _plan_exact_bitop(node: ast.BinOp, operands: dict) -> ExactPredicatePlan | None:
+    """Plan a bitwise-and (`&`) of two plannable sub-expressions as a merged exact plan."""
+    if not isinstance(node.op, ast.BitAnd):
+        return None
+    left = _plan_exact_node(node.left, operands)
+    right = _plan_exact_node(node.right, operands)
+    if left is None or right is None:
+        return None
+    return _merge_exact_plans(left, right, "and")
+
+
+def _plan_exact_node(node: ast.AST, operands: dict) -> ExactPredicatePlan | None:
+    """Dispatch on AST node type (Compare / BoolOp / BinOp); None for anything else."""
+    if isinstance(node, ast.Compare):
+        return _plan_exact_compare(node, operands)
+    if isinstance(node, ast.BoolOp):
+        return _plan_exact_boolop(node, operands)
+    if isinstance(node, ast.BinOp):
+        return _plan_exact_bitop(node, operands)
+    return None
+
+
+def _range_is_empty(plan: ExactPredicatePlan) -> bool:
+    """Return True when the plan's (lower, upper) bounds cannot match any value."""
+    if plan.lower is None or plan.upper is None:
+        return False
+    if plan.lower < plan.upper:
+        return False
+    if plan.lower > plan.upper:
+        return True
+    # lower == upper: only matchable when both ends are inclusive.
+    return not (plan.lower_inclusive and plan.upper_inclusive)
+
+
+def _candidate_units_from_exact_plan(
+    summaries: np.ndarray, dtype: np.dtype, plan: ExactPredicatePlan
+) -> np.ndarray:
+    """Boolean mask of summary units whose range may intersect the plan's bounds."""
+    candidate_units = np.ones(len(summaries), dtype=bool)
+    if plan.lower is not None:
+        lower_op = ">=" if plan.lower_inclusive else ">"
+        candidate_units &= _candidate_units_from_summary(summaries, lower_op, plan.lower, dtype)
+    if plan.upper is not None:
+        upper_op = "<=" if plan.upper_inclusive else "<"
+        candidate_units &= _candidate_units_from_summary(summaries, upper_op, plan.upper, dtype)
+    return candidate_units
+
+
+def _search_bounds(values: np.ndarray, plan: ExactPredicatePlan) -> tuple[int, int]:
+    """Binary-search sorted *values* for the plan's matching [lo, hi) slice.
+
+    Uses the Cython helper when available; falls back to numpy searchsorted
+    when it raises TypeError for the given arguments.
+    """
+    try:
+        return indexing_ext.index_search_bounds(
+            values, plan.lower, plan.lower_inclusive, plan.upper, plan.upper_inclusive
+        )
+    except TypeError:
+        lo = 0
+        hi = len(values)
+        if plan.lower is not None:
+            side = "left" if plan.lower_inclusive else "right"
+            lo = int(np.searchsorted(values, plan.lower, side=side))
+        if plan.upper is not None:
+            side = "right" if plan.upper_inclusive else "left"
+            hi = int(np.searchsorted(values, plan.upper, side=side))
+        return lo, hi
+
+
+def _candidate_units_from_boundaries(boundaries: np.ndarray, plan: ExactPredicatePlan) -> np.ndarray:
+    """Boolean mask of boundary records ([start, end] per unit) that may overlap the plan's range."""
+    if len(boundaries) == 0:
+        return np.zeros(0, dtype=bool)
+    starts = boundaries["start"]
+    ends = boundaries["end"]
+    candidate = np.ones(len(boundaries), dtype=bool)
+    if plan.lower is not None:
+        candidate &= ends >= plan.lower if plan.lower_inclusive else ends > plan.lower
+    if plan.upper is not None:
+        candidate &= starts <= plan.upper if plan.upper_inclusive else starts < plan.upper
+    return candidate
+
+
+def _full_runs_need_bounded_fallback(descriptor: dict) -> bool:
+    """True when the full index has enough runs (or run items) to warrant the bounded path."""
+    full = descriptor.get("full")
+    if full is None:
+        return False
+    runs = tuple(full.get("runs", ()))
+    if not runs:
+        return False
+    if len(runs) >= FULL_RUN_BOUNDED_FALLBACK_RUNS:
+        return True
+    return sum(int(run["length"]) for run in runs) >= FULL_RUN_BOUNDED_FALLBACK_ITEMS
+
+
+def _full_query_mode_override() -> str:
+    """Read BLOSC2_FULL_EXACT_QUERY_MODE; anything outside the known modes maps to "auto"."""
+    mode = os.getenv("BLOSC2_FULL_EXACT_QUERY_MODE", "auto").strip().lower()
+    if mode not in {"auto", "selective-ooc", "whole-load"}:
+        return "auto"
+    return mode
+
+
+def _contiguous_true_runs(mask: np.ndarray) -> list[tuple[int, int]]:
+    """Return [start, stop) index runs of consecutive True entries in *mask*."""
+    true_ids = np.flatnonzero(mask)
+    if len(true_ids) == 0:
+        return []
+    breaks = np.nonzero(np.diff(true_ids) != 1)[0] + 1
+    runs = []
+    start = 0
+    for stop in (*breaks, len(true_ids)):
+        part = true_ids[start:stop]
+        runs.append((int(part[0]), int(part[-1]) + 1))
+        start = stop
+    return runs
+
+
+def _sorted_chunk_boundaries_from_handle(
+    array: blosc2.NDArray,
+    token: str,
+    category: str,
+    name: str,
+    values_sidecar,
+    dtype: np.dtype,
+) -> np.ndarray:
+    """Build (and cache in _DATA_CACHE) per-chunk (first, last) value boundaries for a sorted sidecar."""
+    cache_key = _data_cache_key(array, token, category, name)
+    cached = _DATA_CACHE.get(cache_key)
+    if cached is not None:
+        return cached
+
+    size = int(values_sidecar.shape[0])
+    chunk_len = int(values_sidecar.chunks[0])
+    nchunks = math.ceil(size / chunk_len)
+    boundaries = np.empty(nchunks, dtype=_boundary_dtype(dtype))
+    start_value = np.empty(1, dtype=dtype)
+    end_value = np.empty(1, dtype=dtype)
+    for chunk_id in range(nchunks):
+        chunk_start = chunk_id * chunk_len
+        chunk_stop = min(chunk_start + chunk_len, size)
+        # Read only the first and last element of each chunk.
+        values_sidecar.get_1d_span_numpy(start_value, chunk_id, 0, 1)
+        values_sidecar.get_1d_span_numpy(end_value, chunk_id, chunk_stop - chunk_start - 1, 1)
+        boundaries[chunk_id] = (start_value[0], end_value[0])
+    _DATA_CACHE[cache_key] = boundaries
+    return boundaries
+
+
+def _full_supports_selective_ooc_lookup(array: blosc2.NDArray, descriptor: dict) -> bool:
+    """True when a persistent, run-free full index has all sidecar paths and block reads available."""
+    full = descriptor.get("full")
+    if full is None or full.get("runs"):
+        return False
+    if not descriptor.get("persistent", False):
+        return False
+    if full.get("values_path") is None or full.get("positions_path") is None:
+        return False
+    if full.get("l1_path") is None or full.get("l2_path") is None:
+        return False
+    try:
+        values_sidecar, positions_sidecar = _load_full_sidecar_handles(array, descriptor)
+    except Exception:
+        return False
+    return (
+        _supports_block_reads(array)
+        and _supports_block_reads(values_sidecar)
+        and _supports_block_reads(positions_sidecar)
+    )
+
+
+def _exact_positions_from_sorted_chunks(
+    values_sidecar,
+    positions_sidecar,
+    boundaries: np.ndarray,
+    plan: ExactPredicatePlan,
+    chunk_len: int,
+    dtype: np.dtype,
+) -> np.ndarray:
+    """Collect matching positions chunk-by-chunk from sorted value/position sidecars."""
+    candidate_chunks = _candidate_units_from_boundaries(boundaries, plan)
+    if not np.any(candidate_chunks):
+        return np.empty(0, dtype=np.int64)
+
+    parts = []
+    size = int(values_sidecar.shape[0])
+    for chunk_id in np.flatnonzero(candidate_chunks):
+        chunk_start = int(chunk_id) * chunk_len
+        chunk_stop = min(chunk_start + chunk_len, size)
+        span_items = chunk_stop - chunk_start
+        span_values = np.empty(span_items, dtype=dtype)
+        values_sidecar.get_1d_span_numpy(span_values, int(chunk_id), 0, span_items)
+        lo, hi = _search_bounds(span_values, plan)
+        if lo >= hi:
+            continue
+        matched = np.empty(hi - lo, dtype=np.int64)
+        positions_sidecar.get_1d_span_numpy(matched, int(chunk_id), lo, hi - lo)
+        parts.append(matched)
+
+    if not parts:
+        return np.empty(0, dtype=np.int64)
+    return np.concatenate(parts) if len(parts) > 1 else parts[0]
+
+
+def _exact_positions_from_compact_full_base(
+    array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan
+) -> np.ndarray:
+    """Selective out-of-core lookup over the compact full index using L1/L2 navigation."""
+    full = descriptor["full"]
+    l1, l2 = _load_full_navigation_arrays(array, descriptor)
+    candidate_chunks = _candidate_units_from_boundaries(l1, plan)
+    if not np.any(candidate_chunks):
+        return np.empty(0, dtype=np.int64)
+
+    candidate_blocks = _candidate_units_from_boundaries(l2, plan)
+    if not np.any(candidate_blocks):
+        return np.empty(0, dtype=np.int64)
+
+    values_sidecar, positions_sidecar = _load_full_sidecar_handles(array, descriptor)
+    dtype = np.dtype(descriptor["dtype"])
+    chunk_len = int(full["sidecar_chunk_len"])
+    block_len = int(full["sidecar_block_len"])
+    size = int(values_sidecar.shape[0])
+    parts = []
+    span_count = 0
+
+    for chunk_id in np.flatnonzero(candidate_chunks):
+        chunk_start = int(chunk_id) * chunk_len
+        chunk_stop = min(chunk_start + chunk_len, size)
+        first_block = chunk_start // block_len
+        nblocks = math.ceil((chunk_stop - chunk_start) / block_len)
+        block_mask = np.asarray(candidate_blocks[first_block : first_block + 
nblocks], dtype=bool)
+        if not np.any(block_mask):
+            continue
+        span_runs = _contiguous_true_runs(block_mask)
+        span_count += len(span_runs)
+        if span_count > FULL_SELECTIVE_OOC_MAX_SPANS:
+            # Caller catches RuntimeError and falls back to the whole-load path.
+            raise RuntimeError("too many candidate spans for selective full lookup")
+
+        for block_start_idx, block_stop_idx in span_runs:
+            span_start = chunk_start + block_start_idx * block_len
+            span_stop = min(chunk_start + block_stop_idx * block_len, chunk_stop)
+            local_start = span_start - chunk_start
+            span_items = span_stop - span_start
+            span_values = np.empty(span_items, dtype=dtype)
+            values_sidecar.get_1d_span_numpy(span_values, int(chunk_id), local_start, span_items)
+            lo, hi = _search_bounds(span_values, plan)
+            if lo >= hi:
+                continue
+            matched = np.empty(hi - lo, dtype=np.int64)
+            positions_sidecar.get_1d_span_numpy(matched, int(chunk_id), local_start + lo, hi - lo)
+            parts.append(matched)
+
+    if not parts:
+        return np.empty(0, dtype=np.int64)
+    positions = np.concatenate(parts) if len(parts) > 1 else parts[0]
+    return np.sort(positions.astype(np.int64, copy=False), kind="stable")
+
+
+def _exact_positions_from_full_runs_bounded(
+    array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan
+) -> np.ndarray:
+    """Resolve positions from the base full index plus its append runs, searching each part separately."""
+    full = descriptor["full"]
+    dtype = np.dtype(descriptor["dtype"])
+    parts = []
+
+    # Query the base (run-free) part of the index first.
+    base_descriptor = descriptor.copy()
+    base_full = full.copy()
+    base_full["runs"] = []
+    base_descriptor["full"] = base_full
+    if _full_supports_selective_ooc_lookup(array, base_descriptor):
+        base_positions = _exact_positions_from_compact_full_base(array, base_descriptor, plan)
+        if len(base_positions):
+            parts.append(base_positions)
+    else:
+        base_values = _load_array_sidecar(array, descriptor["token"], "full", "values", full["values_path"])
+        base_positions = _load_array_sidecar(
+            array, descriptor["token"], "full", "positions", full["positions_path"]
+        )
+        lo, hi = _search_bounds(base_values, plan)
+        if lo < hi:
+            parts.append(base_positions[lo:hi].astype(np.int64, copy=False))
+
+    for run in full.get("runs", ()):
+        if run.get("values_path") is None or run.get("positions_path") is None:
+            # No sidecar files: load run arrays in-memory and slice directly.
+            run_values, raw_run_positions = _load_full_run_arrays(array, descriptor, run)
+            lo, hi = _search_bounds(run_values, plan)
+            run_positions = (
+                np.empty(0, dtype=np.int64)
+                if lo >= hi
+                else raw_run_positions[lo:hi].astype(np.int64, copy=False)
+            )
+        else:
+            run_values_sidecar, run_positions_sidecar = _load_full_run_sidecar_handles(
+                array, descriptor, run
+            )
+            chunk_boundaries = _sorted_chunk_boundaries_from_handle(
+                array,
+                descriptor["token"],
+                "full_run_bounds",
+                f"{int(run['id'])}.chunks",
+                run_values_sidecar,
+                dtype,
+            )
+            run_positions = _exact_positions_from_sorted_chunks(
+                run_values_sidecar,
+                run_positions_sidecar,
+                chunk_boundaries,
+                plan,
+                int(run_values_sidecar.chunks[0]),
+                dtype,
+            )
+        if len(run_positions):
+            parts.append(run_positions)
+
+    if not parts:
+        return np.empty(0, dtype=np.int64)
+    positions = np.concatenate(parts) if len(parts) > 1 else parts[0]
+    return np.sort(positions.astype(np.int64, copy=False), kind="stable")
+
+
+def _exact_positions_from_full_selective_ooc(
+    array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan
+) -> np.ndarray:
+    """Alias for the compact-base selective out-of-core lookup."""
+    return _exact_positions_from_compact_full_base(array, descriptor, plan)
+
+
+def _exact_positions_from_full(
+    array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan
+) -> np.ndarray:
+    """Resolve exact positions for a full index.
+
+    Tries (unless the env override forces "whole-load") the bounded runs path,
+    then the selective out-of-core path, before falling back to loading the
+    whole sorted value/position arrays.
+    """
+    if _range_is_empty(plan):
+        return np.empty(0, dtype=np.int64)
+    mode = _full_query_mode_override()
+    if mode != "whole-load" and _full_runs_need_bounded_fallback(descriptor):
+        return _exact_positions_from_full_runs_bounded(array, descriptor, plan)
+    if mode != "whole-load" and _full_supports_selective_ooc_lookup(array, descriptor):
+        try:
+            return _exact_positions_from_full_selective_ooc(array, descriptor, plan)
+        except RuntimeError:
+            # Too many candidate spans -- fall through to the whole-load path.
+            pass
+    sorted_values, positions = _load_full_arrays(array, descriptor)
+    lo, hi = _search_bounds(sorted_values, plan)
+    if lo >= hi:
+        return np.empty(0, dtype=np.int64)
+    return np.sort(positions[lo:hi], kind="stable")
+
+
+def _chunk_nav_supports_selective_ooc_lookup(array: blosc2.NDArray, descriptor: dict, kind: str) -> bool:
+    """True when a persistent chunk-local-v1 light/medium index has every sidecar needed for OOC reads."""
+    if descriptor.get("kind") != kind or not descriptor.get("persistent", False):
+        return False
+    meta = descriptor.get("light" if kind == "light" else "reduced")
+    if meta is None or meta.get("layout") != "chunk-local-v1":
+        return False
+    required_paths = ("values_path", "l1_path", "l2_path")
+    if any(meta.get(name) is None for name in required_paths):
+        return False
+    if kind == "light":
+        if meta.get("bucket_positions_path") is None:
+            return False
+        try:
+            values_sidecar, bucket_sidecar, l2_sidecar = _load_light_sidecar_handles(array, descriptor)
+        except Exception:
+            return False
+        return (
+            _supports_block_reads(array)
+            and _supports_block_reads(values_sidecar)
+            and _supports_block_reads(bucket_sidecar)
+            and _supports_block_reads(l2_sidecar)
+        )
+    if meta.get("positions_path") is None:
+        return False
+    try:
+        values_sidecar, positions_sidecar, l2_sidecar = _load_reduced_sidecar_handles(array, descriptor)
+    except Exception:
+        return False
+    return (
+        _supports_block_reads(array)
+        and _supports_block_reads(values_sidecar)
+        and _supports_block_reads(positions_sidecar)
+        and _supports_block_reads(l2_sidecar)
+    )
+
+
+def _chunk_nav_candidate_runs(
+    l2_row: np.ndarray, segment_count: int, plan: ExactPredicatePlan
+) -> tuple[list[tuple[int, int]], int]:
+    """Return the single contiguous candidate segment run plus its segment count (empty when no match)."""
+    segment_lo, segment_hi = _sorted_boundary_search_bounds(l2_row[:segment_count], plan)
+    if segment_lo >= segment_hi:
+        return [], 0
+    return [(segment_lo, segment_hi)], segment_hi - segment_lo
+
+
+def _index_query_thread_count(task_count: int) -> int:
+    """Pick a thread count for *task_count* tasks; 1 on WASM or for small workloads."""
+    if blosc2.IS_WASM:
+        return 1
+    if task_count < INDEX_QUERY_MIN_CHUNKS_PER_THREAD:
+        return 1
+    configured_threads = int(getattr(blosc2, "nthreads", 1) or 1)
+    return _python_executor_threads(min(configured_threads, task_count // INDEX_QUERY_MIN_CHUNKS_PER_THREAD))
+
+
+def _chunk_batches(chunk_ids: np.ndarray, thread_count: int) -> list[np.ndarray]:
+    """Split *chunk_ids* into roughly equal batches, one per thread."""
+    if thread_count <= 1 or len(chunk_ids) == 0:
+        return [chunk_ids]
+    batch_size = max(1, math.ceil(len(chunk_ids) / thread_count))
+    return [chunk_ids[start : start + batch_size] for start in range(0, len(chunk_ids), batch_size)]
+
+
+def _downstream_query_thread_count(task_count: int, plan: IndexPlan) -> int:
+    """Force single-threaded downstream work for chunk-nav-ooc plans; otherwise normal sizing."""
+    if plan.lookup_path == "chunk-nav-ooc":
+        return 1
+    return _index_query_thread_count(task_count)
+
+
+def _merge_position_batches(position_batches: list[np.ndarray]) -> np.ndarray:
+    """Concatenate position batches; empty int64 array when there are none."""
+    if not position_batches:
+        return np.empty(0, dtype=np.int64)
+    return np.concatenate(position_batches) if len(position_batches) > 1 else position_batches[0]
+
+
+def _run_position_batches(chunk_ids: np.ndarray, thread_count: int, process_batch) -> tuple[np.ndarray, int]:
+    """Run *process_batch* over chunk batches (threaded when thread_count > 1) and merge results."""
+    if thread_count <= 1:
+        return process_batch(chunk_ids)
+    batches = _chunk_batches(chunk_ids, thread_count)
+    position_batches = []
+    total_candidate_segments = 0
+    with ThreadPoolExecutor(max_workers=thread_count) as executor:
+        for positions_part, batch_candidate_segments in executor.map(process_batch, batches):
+            total_candidate_segments += batch_candidate_segments
+            if len(positions_part) > 0:
+                position_batches.append(positions_part)
+    return _merge_position_batches(position_batches), total_candidate_segments
+
+
+def _light_batch_result_dtype(where_x) -> np.dtype:
+    """Result dtype for light-path gathers, delegated to _where_output_dtype."""
+    return _where_output_dtype(where_x)
+
+
+def _light_worker_source(where_x):
+    """Per-worker handle for *where_x*: a fresh mmap-opened handle when block reads and a urlpath exist."""
+    if _supports_block_reads(where_x) and getattr(where_x, "urlpath", None) is not None:
+        return blosc2.open(str(where_x.urlpath), mmap_mode=_INDEX_MMAP_MODE)
+    return where_x
+
+
+def _gather_mmap_source(where_x):
+    """Return a cached mmap handle for *where_x* for use in repeated gather operations.
+
+    On Windows mmap is disabled (see ``_INDEX_MMAP_MODE``), so the original handle
+    is returned unchanged.
+ """ + if _INDEX_MMAP_MODE is None: + return where_x + urlpath = getattr(where_x, "urlpath", None) + if not _supports_block_reads(where_x) or urlpath is None: + return where_x + urlpath = str(urlpath) + handle = _GATHER_MMAP_HANDLES.get(urlpath) + if handle is None: + handle = blosc2.open(urlpath, mmap_mode=_INDEX_MMAP_MODE) + _GATHER_MMAP_HANDLES[urlpath] = handle + return handle + + +def _light_match_from_span(span: np.ndarray, plan: IndexPlan) -> np.ndarray: + if plan.target is not None and plan.target.get("source") == "expression": + field_values = _values_from_numpy_target(span, plan.target) + else: + field_values = span if plan.field is None else span[plan.field] + match = np.ones(len(field_values), dtype=bool) + if plan.lower is not None: + match &= field_values >= plan.lower if plan.lower_inclusive else field_values > plan.lower + if plan.upper is not None: + match &= field_values <= plan.upper if plan.upper_inclusive else field_values < plan.upper + return match + + +def _process_light_chunk_batch( + chunk_ids: np.ndarray, + where_x, + plan: IndexPlan, + total_len: int, + chunk_len: int, + return_positions: bool = False, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: + value_parts = [] + position_parts = [] + local_where_x = _light_worker_source(where_x) + for chunk_id in chunk_ids: + bucket_mask = plan.bucket_masks[int(chunk_id)] + chunk_start = int(chunk_id) * plan.chunk_len + chunk_stop = min(chunk_start + plan.chunk_len, total_len) + for run_start, run_stop in _contiguous_true_runs(np.asarray(bucket_mask, dtype=bool)): + start = chunk_start + run_start * plan.bucket_len + stop = min(chunk_start + run_stop * plan.bucket_len, chunk_stop) + if start >= stop: + continue + if _supports_block_reads(local_where_x): + span = np.empty(stop - start, dtype=local_where_x.dtype) + base_chunk_id = start // chunk_len + local_start = start - base_chunk_id * chunk_len + local_where_x.get_1d_span_numpy(span, base_chunk_id, local_start, stop - start) + else: + span = 
local_where_x[start:stop] + match = _light_match_from_span(span, plan) + if np.any(match): + value_parts.append(np.require(span[match], requirements="C")) + if return_positions: + position_parts.append(np.flatnonzero(match).astype(np.int64, copy=False) + start) + if return_positions: + return _merge_value_position_batches(value_parts, position_parts, _light_batch_result_dtype(where_x)) + if not value_parts: + return np.empty(0, dtype=_light_batch_result_dtype(where_x)) + return np.concatenate(value_parts) if len(value_parts) > 1 else value_parts[0] + + +def _merge_result_batches(parts: list[np.ndarray], dtype: np.dtype) -> np.ndarray: + parts = [part for part in parts if len(part) > 0] + if not parts: + return np.empty(0, dtype=dtype) + return np.concatenate(parts) if len(parts) > 1 else parts[0] + + +def _merge_value_position_batches( + value_batches: list[np.ndarray], position_batches: list[np.ndarray], dtype: np.dtype +) -> tuple[np.ndarray, np.ndarray]: + return _merge_result_batches(value_batches, dtype), _merge_position_batches(position_batches) + + +def _merge_segment_query_batches( + parts: list[np.ndarray] | list[tuple[np.ndarray, np.ndarray]], + dtype: np.dtype, + *, + return_positions: bool, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: + if return_positions: + value_batches = [] + position_batches = [] + for values, positions in parts: + if len(values) > 0: + value_batches.append(values) + if len(positions) > 0: + position_batches.append(positions) + return _merge_value_position_batches(value_batches, position_batches, dtype) + + value_batches = [part for part in parts if len(part) > 0] + if value_batches: + return np.concatenate(value_batches) if len(value_batches) > 1 else value_batches[0] + return np.empty(0, dtype=dtype) + + +def _process_segment_query_batch( + units: np.ndarray, + expression: str, + operands: dict, + ne_args: dict, + where: dict, + plan: IndexPlan, + result_dtype: np.dtype, + return_positions: bool, +) -> np.ndarray | 
tuple[np.ndarray, np.ndarray]: + from .lazyexpr import _get_result, ne_evaluate + from .utils import get_chunk_operands + + chunk_operands = {} + value_parts = [] + position_parts = [] + for unit in units: + start = int(unit) * plan.segment_len + stop = min(start + plan.segment_len, plan.base.shape[0]) + cslice = (slice(start, stop, 1),) + get_chunk_operands(operands, cslice, chunk_operands, plan.base.shape) + if return_positions: + match = ne_evaluate(expression, chunk_operands, **ne_args) + if np.any(match): + value_parts.append(np.require(chunk_operands["_where_x"][match], requirements="C")) + absolute = np.arange(start, stop, dtype=np.int64) + position_parts.append(absolute[match]) + else: + result, _ = _get_result(expression, chunk_operands, ne_args, where) + if len(result) > 0: + value_parts.append(np.require(result, requirements="C")) + if return_positions: + return _merge_value_position_batches(value_parts, position_parts, result_dtype) + return _merge_result_batches(value_parts, result_dtype) + + +def _reduced_positions_from_cython_batches( + candidate_chunk_ids: np.ndarray, thread_count: int, process_batch +) -> tuple[np.ndarray, int]: + return _run_position_batches(candidate_chunk_ids, thread_count, process_batch) + + +def _reduced_positions_from_python_batches( + candidate_chunk_ids: np.ndarray, thread_count: int, process_batch +) -> tuple[list[np.ndarray], int]: + if thread_count <= 1: + return process_batch(candidate_chunk_ids) + parts = [] + total_candidate_segments = 0 + batches = _chunk_batches(candidate_chunk_ids, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) as executor: + for batch_parts, batch_candidate_segments in executor.map(process_batch, batches): + total_candidate_segments += batch_candidate_segments + parts.extend(batch_parts) + return parts, total_candidate_segments + + +def _sorted_boundary_search_bounds(boundaries: np.ndarray, plan: ExactPredicatePlan) -> tuple[int, int]: + if len(boundaries) == 0: + return 0, 0 + 
starts = boundaries["start"] + ends = boundaries["end"] + try: + lo, hi = indexing_ext.index_search_boundary_bounds( + starts, ends, plan.lower, plan.lower_inclusive, plan.upper, plan.upper_inclusive + ) + except TypeError: + lo = 0 + hi = len(boundaries) + if plan.lower is not None: + lo = int(np.searchsorted(ends, plan.lower, side="left" if plan.lower_inclusive else "right")) + if plan.upper is not None: + hi = int(np.searchsorted(starts, plan.upper, side="right" if plan.upper_inclusive else "left")) + if lo < 0: + lo = 0 + if hi > len(boundaries): + hi = len(boundaries) + return lo, hi + + +def _light_search_plan( + plan: ExactPredicatePlan, dtype: np.dtype, value_lossy_bits: int +) -> ExactPredicatePlan: + if value_lossy_bits <= 0 or plan.lower is None: + return plan + if dtype.kind in {"i", "u"}: + next_lower = plan.lower if plan.lower_inclusive else min(int(plan.lower) + 1, np.iinfo(dtype).max) + else: + next_lower = ( + plan.lower + if plan.lower_inclusive + else np.nextafter(np.asarray(plan.lower, dtype=dtype)[()], np.inf) + ) + return ExactPredicatePlan( + base=plan.base, + descriptor=plan.descriptor, + target=plan.target, + field=plan.field, + lower=_quantize_light_value_scalar(next_lower, dtype, value_lossy_bits), + lower_inclusive=True, + upper=plan.upper, + upper_inclusive=plan.upper_inclusive, + ) + + +def _bucket_masks_from_light_chunk_nav_ooc( + array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan +) -> tuple[np.ndarray, int, int]: + light = descriptor["light"] + offsets = _load_array_sidecar(array, descriptor["token"], "light", "offsets", light["offsets_path"]) + l1 = _load_light_l1_array(array, descriptor) + candidate_chunks = _candidate_units_from_boundaries(l1, plan) + bucket_masks = np.zeros((len(l1), int(light["bucket_count"])), dtype=bool) + if not np.any(candidate_chunks): + return bucket_masks, 0, 0 + + values_sidecar, bucket_sidecar, l2_sidecar = _load_light_sidecar_handles(array, descriptor) + dtype = 
np.dtype(descriptor["dtype"])
+    chunk_len = int(light["chunk_len"])
+    nav_segment_len = int(light["nav_segment_len"])
+    nsegments_per_chunk = int(light["nsegments_per_chunk"])
+    bucket_dtype = np.dtype(light.get("bucket_dtype", np.uint16))
+    value_lossy_bits = int(light.get("value_lossy_bits", 0))
+    search_plan = _light_search_plan(plan, dtype, value_lossy_bits)
+    total_candidate_segments = 0
+    candidate_chunk_ids = np.flatnonzero(candidate_chunks).astype(np.intp, copy=False)
+
+    def process_batch(chunk_ids: np.ndarray) -> tuple[list[tuple[int, np.ndarray]], int]:
+        # Worker: opens its own sidecar handles so batches can run on separate threads.
+        if len(chunk_ids) == 0:
+            return [], 0
+        batch_values = blosc2.open(light["values_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_buckets = blosc2.open(light["bucket_positions_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_l2 = blosc2.open(light["l2_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_results = []
+        batch_candidate_segments = 0
+        # Scratch buffers reused across chunks within the batch.
+        l2_row = np.empty(nsegments_per_chunk, dtype=_boundary_dtype(dtype))
+        span_values = np.empty(chunk_len, dtype=dtype)
+        bucket_ids = np.empty(chunk_len, dtype=bucket_dtype)
+        for chunk_id in chunk_ids:
+            chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id])
+            segment_count = _segment_row_count(chunk_items, nav_segment_len)
+            batch_l2.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk)
+            segment_runs, candidate_segments = _chunk_nav_candidate_runs(l2_row, segment_count, plan)
+            batch_candidate_segments += candidate_segments
+            if not segment_runs:
+                continue
+            matched_buckets = np.zeros(int(light["bucket_count"]), dtype=bool)
+            for seg_start_idx, seg_stop_idx in segment_runs:
+                local_start = seg_start_idx * nav_segment_len
+                local_stop = min(seg_stop_idx * nav_segment_len, chunk_items)
+                span_items = local_stop - local_start
+                values_view = span_values[:span_items]
+                batch_values.get_1d_span_numpy(values_view, int(chunk_id), local_start, span_items)
+                lo, hi = _search_bounds(values_view, search_plan)
+                if lo >= hi:
+                    continue
+                bucket_view = bucket_ids[: hi - lo]
+                batch_buckets.get_1d_span_numpy(bucket_view, int(chunk_id), local_start + lo, hi - lo)
+                matched_buckets[bucket_view.astype(np.intp, copy=False)] = True
+            if np.any(matched_buckets):
+                batch_results.append((int(chunk_id), matched_buckets))
+        return batch_results, batch_candidate_segments
+
+    thread_count = _index_query_thread_count(len(candidate_chunk_ids))
+    if thread_count <= 1:
+        batch_results, total_candidate_segments = process_batch(candidate_chunk_ids)
+        for chunk_id, matched_buckets in batch_results:
+            bucket_masks[chunk_id] = matched_buckets
+    else:
+        batches = _chunk_batches(candidate_chunk_ids, thread_count)
+        with ThreadPoolExecutor(max_workers=thread_count) as executor:
+            for batch_results, batch_candidate_segments in executor.map(process_batch, batches):
+                total_candidate_segments += batch_candidate_segments
+                for chunk_id, matched_buckets in batch_results:
+                    bucket_masks[chunk_id] = matched_buckets
+
+    return bucket_masks, int(np.count_nonzero(candidate_chunks)), total_candidate_segments
+
+
+def _exact_positions_from_reduced_chunk_nav_ooc(
+    array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan
+) -> tuple[np.ndarray, int, int]:
+    """Out-of-core exact positions for a reduced chunk-nav index.
+
+    Tries the Cython batch collector first; a TypeError falls back to the
+    pure-Python per-segment scan below.
+    """
+    reduced = descriptor["reduced"]
+    offsets = _load_array_sidecar(array, descriptor["token"], "reduced", "offsets", reduced["offsets_path"])
+    l1 = _load_reduced_l1_array(array, descriptor)
+    candidate_chunks = _candidate_units_from_boundaries(l1, plan)
+    if not np.any(candidate_chunks):
+        return np.empty(0, dtype=np.int64), 0, 0
+
+    dtype = np.dtype(descriptor["dtype"])
+    chunk_len = int(reduced["chunk_len"])
+    nav_segment_len = int(reduced["nav_segment_len"])
+    nsegments_per_chunk = int(reduced["nsegments_per_chunk"])
+    local_position_dtype = np.dtype(reduced.get("position_dtype", np.uint32))
+    candidate_chunk_ids = np.flatnonzero(candidate_chunks).astype(np.intp, copy=False)
+    l2_boundary_dtype = _boundary_dtype(dtype)
+
+    def process_cython_batch(chunk_ids: np.ndarray) -> tuple[np.ndarray, int]:
+        if len(chunk_ids) == 0:
+            return np.empty(0, dtype=np.int64), 0
+        batch_values = blosc2.open(reduced["values_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_positions = blosc2.open(reduced["positions_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_l2 = blosc2.open(reduced["l2_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_l2_row = np.empty(nsegments_per_chunk, dtype=l2_boundary_dtype)
+        batch_span_values = np.empty(chunk_len, dtype=dtype)
+        batch_local_positions = np.empty(chunk_len, dtype=local_position_dtype)
+        return indexing_ext.index_collect_reduced_chunk_nav_positions(
+            offsets,
+            chunk_ids,
+            batch_values,
+            batch_positions,
+            batch_l2,
+            batch_l2_row,
+            batch_span_values,
+            batch_local_positions,
+            chunk_len,
+            nav_segment_len,
+            nsegments_per_chunk,
+            plan.lower,
+            plan.lower_inclusive,
+            plan.upper,
+            plan.upper_inclusive,
+        )
+
+    try:
+        thread_count = _index_query_thread_count(len(candidate_chunk_ids))
+        positions, total_candidate_segments = _reduced_positions_from_cython_batches(
+            candidate_chunk_ids, thread_count, process_cython_batch
+        )
+        if len(positions) == 0:
+            return np.empty(0, dtype=np.int64), int(candidate_chunk_ids.size), total_candidate_segments
+        return np.sort(positions, kind="stable"), int(candidate_chunk_ids.size), total_candidate_segments
+    except TypeError:
+        pass
+
+    def process_batch(chunk_ids: np.ndarray) -> tuple[list[np.ndarray], int]:
+        # Pure-Python fallback mirroring the Cython collector.
+        if len(chunk_ids) == 0:
+            return [], 0
+        batch_values = blosc2.open(reduced["values_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_positions = blosc2.open(reduced["positions_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_l2 = blosc2.open(reduced["l2_path"], mmap_mode=_INDEX_MMAP_MODE)
+        batch_parts = []
+        batch_candidate_segments = 0
+        l2_row = np.empty(nsegments_per_chunk, dtype=l2_boundary_dtype)
+        span_values = np.empty(chunk_len, dtype=dtype)
+        local_positions = np.empty(chunk_len, dtype=local_position_dtype)
+        for chunk_id in chunk_ids:
+            chunk_items = int(offsets[chunk_id + 1] - offsets[chunk_id])
+            segment_count = _segment_row_count(chunk_items, nav_segment_len)
+            batch_l2.get_1d_span_numpy(l2_row, int(chunk_id), 0, nsegments_per_chunk)
+            segment_runs, candidate_segments = _chunk_nav_candidate_runs(l2_row, segment_count, plan)
+            batch_candidate_segments += candidate_segments
+            if not segment_runs:
+                continue
+            for seg_start_idx, seg_stop_idx in segment_runs:
+                local_start = seg_start_idx * nav_segment_len
+                local_stop = min(seg_stop_idx * nav_segment_len, chunk_items)
+                span_items = local_stop - local_start
+                values_view = span_values[:span_items]
+                batch_values.get_1d_span_numpy(values_view, int(chunk_id), local_start, span_items)
+                lo, hi = _search_bounds(values_view, plan)
+                if lo >= hi:
+                    continue
+                positions_view = local_positions[: hi - lo]
+                batch_positions.get_1d_span_numpy(positions_view, int(chunk_id), local_start + lo, hi - lo)
+                batch_parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False))
+        return batch_parts, batch_candidate_segments
+
+    thread_count = _index_query_thread_count(len(candidate_chunk_ids))
+    parts, total_candidate_segments = _reduced_positions_from_python_batches(
+        candidate_chunk_ids, thread_count, process_batch
+    )
+
+    if not parts:
+        return np.empty(0, dtype=np.int64), int(candidate_chunk_ids.size), total_candidate_segments
+    positions = np.concatenate(parts) if len(parts) > 1 else parts[0]
+    return (
+        np.sort(positions, kind="stable"),
+        int(candidate_chunk_ids.size),
+        total_candidate_segments,
+    )
+
+
+def _bit_count_sum(masks: np.ndarray) -> int:
+    """Count set bits: nonzero count for bool arrays, per-element popcount sum otherwise."""
+    if masks.dtype == bool:
+        return int(np.count_nonzero(masks))
+    return sum(int(mask).bit_count() for mask in masks.tolist())
+
+
+def _bucket_masks_from_light(
+    array: blosc2.NDArray, descriptor: dict, plan: ExactPredicatePlan
+) -> tuple[np.ndarray, int, int]:
+    """Compute (bucket_masks, candidate_chunk_count, candidate_segment_count) for a light index."""
+    if _range_is_empty(plan):
+        return np.empty((0, 0), dtype=bool), 0, 0
+
+    if _chunk_nav_supports_selective_ooc_lookup(array, descriptor, "light"):
+        return 
_bucket_masks_from_light_chunk_nav_ooc(array, descriptor, plan)
+
+    summaries = _load_level_summaries(array, descriptor, "chunk")
+    dtype = np.dtype(descriptor["dtype"])
+    candidate_chunks = _candidate_units_from_exact_plan(summaries, dtype, plan)
+    light = descriptor["light"]
+    chunk_len = int(light["chunk_len"])
+    bucket_count = int(light["bucket_count"])
+    bucket_masks = np.zeros((len(summaries), bucket_count), dtype=bool)
+    if not np.any(candidate_chunks):
+        return bucket_masks, 0, 0
+
+    sorted_values, bucket_positions, offsets = _load_light_arrays(array, descriptor)
+    value_lossy_bits = int(light.get("value_lossy_bits", 0))
+    nav_segment_len = int(light["nav_segment_len"])
+    nsegments_per_chunk = int(light["nsegments_per_chunk"])
+    l2 = _load_light_navigation_arrays(array, descriptor)[1]
+    total_candidate_segments = 0
+
+    for chunk_id in np.flatnonzero(candidate_chunks):
+        start = int(offsets[chunk_id])
+        stop = int(offsets[chunk_id + 1])
+        chunk_values = sorted_values[start:stop]
+        row_start = int(chunk_id) * nsegments_per_chunk
+        row_stop = row_start + _segment_row_count(min(chunk_len, stop - start), nav_segment_len)
+        segment_mask = _candidate_units_from_boundaries(l2[row_start:row_stop], plan)
+        total_candidate_segments += int(np.count_nonzero(segment_mask))
+        if not np.any(segment_mask):
+            continue
+
+        if value_lossy_bits > 0:
+            # Lossy-quantized values: widen an exclusive lower bound to the next
+            # representable value, then quantize it (same logic as _light_search_plan).
+            if plan.lower is not None:
+                if dtype.kind in {"i", "u"}:
+                    if plan.lower_inclusive:
+                        next_lower = plan.lower
+                    else:
+                        max_value = np.iinfo(dtype).max
+                        next_lower = min(int(plan.lower) + 1, max_value)
+                else:
+                    if plan.lower_inclusive:
+                        next_lower = plan.lower
+                    else:
+                        next_lower = np.nextafter(np.asarray(plan.lower, dtype=dtype)[()], np.inf)
+                lower = _quantize_light_value_scalar(next_lower, dtype, value_lossy_bits)
+                lower_inclusive = True
+            else:
+                lower = None
+                lower_inclusive = True
+            search_plan = ExactPredicatePlan(
+                base=plan.base,
+                descriptor=plan.descriptor,
+                target=plan.target,
+                field=plan.field,
+                lower=lower,
+                lower_inclusive=lower_inclusive,
+                upper=plan.upper,
+                upper_inclusive=plan.upper_inclusive,
+            )
+            lo, hi = _search_bounds(chunk_values, search_plan)
+        else:
+            lo, hi = _search_bounds(chunk_values, plan)
+        if lo >= hi:
+            continue
+        bucket_masks[
+            int(chunk_id), np.unique(bucket_positions[start + lo : start + hi].astype(np.int64))
+        ] = True
+    return bucket_masks, int(np.count_nonzero(candidate_chunks)), total_candidate_segments
+
+
+def _exact_positions_from_reduced(
+    array: blosc2.NDArray, descriptor: dict, dtype: np.dtype, plan: ExactPredicatePlan
+) -> tuple[np.ndarray, int, int]:
+    """Exact positions for a reduced (medium) index, preferring the OOC chunk-nav path."""
+    if _range_is_empty(plan):
+        return np.empty(0, dtype=np.int64), 0, 0
+
+    if _chunk_nav_supports_selective_ooc_lookup(array, descriptor, "medium"):
+        return _exact_positions_from_reduced_chunk_nav_ooc(array, descriptor, plan)
+
+    summaries = _load_level_summaries(array, descriptor, "chunk")
+    candidate_chunks = _candidate_units_from_exact_plan(summaries, dtype, plan)
+    if not np.any(candidate_chunks):
+        return np.empty(0, dtype=np.int64), 0, 0
+
+    sorted_values, local_positions, offsets = _load_reduced_arrays(array, descriptor)
+    chunk_len = int(descriptor["reduced"]["chunk_len"])
+    nav_segment_len = int(descriptor["reduced"]["nav_segment_len"])
+    nsegments_per_chunk = int(descriptor["reduced"]["nsegments_per_chunk"])
+    l2 = _load_reduced_navigation_arrays(array, descriptor)[1]
+    parts = []
+    total_candidate_segments = 0
+    for chunk_id in np.flatnonzero(candidate_chunks):
+        start = int(offsets[chunk_id])
+        stop = int(offsets[chunk_id + 1])
+        chunk_values = sorted_values[start:stop]
+        row_start = int(chunk_id) * nsegments_per_chunk
+        row_stop = row_start + _segment_row_count(min(chunk_len, stop - start), nav_segment_len)
+        segment_mask = _candidate_units_from_boundaries(l2[row_start:row_stop], plan)
+        total_candidate_segments += int(np.count_nonzero(segment_mask))
+        if not np.any(segment_mask):
+            continue
+        lo, hi = _search_bounds(chunk_values, plan)
+        if lo >= hi:
+            continue
+        local = local_positions[start + lo : start + hi].astype(np.int64, copy=False)
+        parts.append(chunk_id * chunk_len + local)
+
+    if not parts:
+        return np.empty(0, dtype=np.int64), int(np.count_nonzero(candidate_chunks)), total_candidate_segments
+    merged = np.concatenate(parts) if len(parts) > 1 else parts[0]
+    return np.sort(merged, kind="stable"), int(np.count_nonzero(candidate_chunks)), total_candidate_segments
+
+
+def _exact_positions_from_plan(plan: ExactPredicatePlan) -> np.ndarray | None:
+    """Dispatch to the full or medium exact-position resolver; None when the kind is unsupported."""
+    kind = plan.descriptor["kind"]
+    if kind == "full":
+        return _exact_positions_from_full(plan.base, plan.descriptor, plan)
+    if kind == "medium":
+        return _exact_positions_from_reduced(
+            plan.base, plan.descriptor, np.dtype(plan.descriptor["dtype"]), plan
+        )[0]
+    return None
+
+
+def _multi_exact_positions(plans: list[ExactPredicatePlan]) -> tuple[blosc2.NDArray, np.ndarray] | None:
+    """Intersect exact position sets across plans; all plans must share the same base array.
+
+    Plans targeting the same index token are merged with "and" before resolving.
+    Returns None when any plan is incompatible or unresolvable.
+    """
+    if not plans:
+        return None
+    base = plans[0].base
+    merged_by_target: dict[str, ExactPredicatePlan] = {}
+    for plan in plans:
+        if plan.base is not base:
+            return None
+        key = plan.descriptor["token"]
+        current = merged_by_target.get(key)
+        if current is None:
+            merged_by_target[key] = plan
+            continue
+        merged = _merge_exact_plans(current, plan, "and")
+        if merged is None:
+            return None
+        merged_by_target[key] = merged
+
+    exact_arrays = []
+    for plan in merged_by_target.values():
+        positions = _exact_positions_from_plan(plan)
+        if positions is None:
+            return None
+        exact_arrays.append(np.asarray(positions, dtype=np.int64))
+
+    result = exact_arrays[0]
+    for other in exact_arrays[1:]:
+        result = np.intersect1d(result, other, assume_unique=False)
+    return base, result
+
+
+def _plan_multi_exact_query(plans: list[ExactPredicatePlan]) -> IndexPlan | None:
+    """Build an IndexPlan from intersected multi-field exact positions; None when not beneficial."""
+    multi_exact = _multi_exact_positions(plans)
+    if multi_exact is None:
+        return None
+    base, exact_positions = multi_exact
+    if len(exact_positions) >= int(base.shape[0]):
+        # Selecting everything: an index lookup brings no benefit.
+        return None
+    descriptor = _copy_descriptor(plans[0].descriptor)
+    lookup_path = None
+    if descriptor["kind"] == "medium":
+        lookup_path = (
+            "chunk-nav-ooc"
+            if _chunk_nav_supports_selective_ooc_lookup(base, descriptor, "medium")
+            else "chunk-nav"
+        )
+    return IndexPlan(
+        True,
+        "multi-field exact indexes selected",
+        descriptor=descriptor,
+        base=base,
+        target=plans[0].descriptor.get("target"),
+        field=None,
+        level="exact",
+        total_units=int(base.shape[0]),
+        selected_units=len(exact_positions),
+        exact_positions=exact_positions,
+        lookup_path=lookup_path,
+    )
+
+
+def _plan_single_exact_query(exact_plan: ExactPredicatePlan) -> IndexPlan:
+    kind = exact_plan.descriptor["kind"]
+    if kind == "full":
+        exact_positions = _exact_positions_from_full(exact_plan.base, exact_plan.descriptor, exact_plan)
+        return IndexPlan(
+            True,
+            f"{kind} exact index selected",
+            descriptor=_copy_descriptor(exact_plan.descriptor),
+            base=exact_plan.base,
+            target=exact_plan.descriptor.get("target"),
+            field=exact_plan.field,
+            level=kind,
+            total_units=exact_plan.base.shape[0],
+            selected_units=len(exact_positions),
+            exact_positions=exact_positions,
+        )
+    if kind == "medium":
+        dtype = np.dtype(exact_plan.descriptor["dtype"])
+        exact_positions, candidate_chunks, candidate_nav_segments = _exact_positions_from_reduced(
+            exact_plan.base, exact_plan.descriptor, dtype, exact_plan
+        )
+        return IndexPlan(
+            True,
+            f"{kind} exact index selected",
+            descriptor=_copy_descriptor(exact_plan.descriptor),
+            base=exact_plan.base,
+            target=exact_plan.descriptor.get("target"),
+            field=exact_plan.field,
+            level=kind,
+            total_units=exact_plan.base.shape[0],
+            selected_units=len(exact_positions),
+            exact_positions=exact_positions,
+            chunk_len=int(exact_plan.descriptor["reduced"]["chunk_len"]),
+            candidate_chunks=candidate_chunks,
+            candidate_nav_segments=candidate_nav_segments,
+            lookup_path="chunk-nav-ooc"
+            if _chunk_nav_supports_selective_ooc_lookup(exact_plan.base, exact_plan.descriptor, "medium")
+            else "chunk-nav",
+ ) + bucket_masks, candidate_chunks, candidate_nav_segments = _bucket_masks_from_light( + exact_plan.base, exact_plan.descriptor, exact_plan + ) + light = exact_plan.descriptor["light"] + total_units = bucket_masks.size + selected_units = _bit_count_sum(bucket_masks) + if selected_units < total_units: + return IndexPlan( + True, + "light approximate-order index selected", + descriptor=_copy_descriptor(exact_plan.descriptor), + base=exact_plan.base, + target=exact_plan.descriptor.get("target"), + field=exact_plan.field, + level=kind, + total_units=total_units, + selected_units=selected_units, + bucket_masks=bucket_masks, + bucket_len=int(light["bucket_len"]), + chunk_len=int(light["chunk_len"]), + lower=exact_plan.lower, + lower_inclusive=exact_plan.lower_inclusive, + upper=exact_plan.upper, + upper_inclusive=exact_plan.upper_inclusive, + candidate_chunks=candidate_chunks, + candidate_nav_segments=candidate_nav_segments, + lookup_path="chunk-nav-ooc" + if _chunk_nav_supports_selective_ooc_lookup(exact_plan.base, exact_plan.descriptor, "light") + else "chunk-nav", + ) + return IndexPlan(False, "available exact index does not prune any units for this predicate") + + +def plan_query(expression: str, operands: dict, where: dict | None, *, use_index: bool = True) -> IndexPlan: + if not use_index: + return IndexPlan(False, "index usage disabled for this query") + if where is None or len(where) != 1: + return IndexPlan(False, "indexing is only available for where(x) style filtering") + + try: + tree = ast.parse(expression, mode="eval") + except SyntaxError: + return IndexPlan(False, "expression is not valid Python syntax for planning") + + exact_terms = _plan_exact_conjunction(tree.body, operands) + if exact_terms is not None and len(exact_terms) > 1: + multi_exact_plan = _plan_multi_exact_query(exact_terms) + if multi_exact_plan is not None: + return multi_exact_plan + + exact_plan = _plan_exact_node(tree.body, operands) + if exact_plan is not None: + exact_query_plan = 
_plan_single_exact_query(exact_plan) + if exact_query_plan.usable: + return exact_query_plan + + segment_plan = _plan_segment_node(tree.body, operands) + if segment_plan is None: + return IndexPlan(False, "no usable index was found for this predicate") + + total_units = len(segment_plan.candidate_units) + selected_units = int(np.count_nonzero(segment_plan.candidate_units)) + if selected_units == total_units: + return IndexPlan( + False, + "available index does not prune any units for this predicate", + descriptor=_copy_descriptor(segment_plan.descriptor), + base=segment_plan.base, + target=segment_plan.descriptor.get("target"), + field=segment_plan.field, + level=segment_plan.level, + segment_len=segment_plan.segment_len, + candidate_units=segment_plan.candidate_units, + total_units=total_units, + selected_units=selected_units, + ) + + return IndexPlan( + True, + f"{segment_plan.level} summaries selected", + descriptor=_copy_descriptor(segment_plan.descriptor), + base=segment_plan.base, + target=segment_plan.descriptor.get("target"), + field=segment_plan.field, + level=segment_plan.level, + segment_len=segment_plan.segment_len, + candidate_units=segment_plan.candidate_units, + total_units=total_units, + selected_units=selected_units, + ) + + +def _where_output_dtype(where_x) -> np.dtype: + return where_x.dtype if hasattr(where_x, "dtype") else np.asarray(where_x).dtype + + +def evaluate_segment_query( + expression: str, + operands: dict, + ne_args: dict, + where: dict, + plan: IndexPlan, + *, + return_positions: bool = False, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: + if plan.base is None or plan.candidate_units is None or plan.segment_len is None: + raise ValueError("segment evaluation requires a segment-based plan") + + candidate_units = np.flatnonzero(plan.candidate_units).astype(np.intp, copy=False) + result_dtype = _where_output_dtype(where["_where_x"]) + + thread_count = _downstream_query_thread_count(len(candidate_units), plan) + if thread_count <= 
1: + parts = [ + _process_segment_query_batch( + candidate_units, + expression, + operands, + ne_args, + where, + plan, + result_dtype, + return_positions=return_positions, + ) + ] + else: + batches = _chunk_batches(candidate_units, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) as executor: + parts = list( + executor.map( + _process_segment_query_batch, + batches, + [expression] * len(batches), + [operands] * len(batches), + [ne_args] * len(batches), + [where] * len(batches), + [plan] * len(batches), + [result_dtype] * len(batches), + [return_positions] * len(batches), + ) + ) + + return _merge_segment_query_batches(parts, result_dtype, return_positions=return_positions) + + +def evaluate_light_query( + expression: str, + operands: dict, + ne_args: dict, + where: dict, + plan: IndexPlan, + *, + return_positions: bool = False, +) -> np.ndarray | tuple[np.ndarray, np.ndarray]: + del expression, operands, ne_args + + if plan.base is None or plan.bucket_masks is None or plan.chunk_len is None or plan.bucket_len is None: + raise ValueError("light evaluation requires bucket masks and chunk geometry") + + total_len = int(plan.base.shape[0]) + chunk_len = int(plan.base.chunks[0]) + where_x = where["_where_x"] + candidate_chunk_ids = np.flatnonzero(np.any(plan.bucket_masks, axis=1)).astype(np.intp, copy=False) + result_dtype = _where_output_dtype(where["_where_x"]) + + thread_count = _downstream_query_thread_count(len(candidate_chunk_ids), plan) + if thread_count <= 1: + parts = [ + _process_light_chunk_batch( + candidate_chunk_ids, where_x, plan, total_len, chunk_len, return_positions + ) + ] + else: + batches = _chunk_batches(candidate_chunk_ids, thread_count) + with ThreadPoolExecutor(max_workers=thread_count) as executor: + parts = list( + executor.map( + _process_light_chunk_batch, + batches, + [where_x] * len(batches), + [plan] * len(batches), + [total_len] * len(batches), + [chunk_len] * len(batches), + [return_positions] * len(batches), + ) + ) + 
+ if return_positions: + value_batches = [] + position_batches = [] + for values, positions in parts: + if len(values) > 0: + value_batches.append(values) + if len(positions) > 0: + position_batches.append(positions) + return _merge_value_position_batches(value_batches, position_batches, result_dtype) + + return _merge_result_batches(parts, result_dtype) + + +def _gather_positions(where_x, positions: np.ndarray) -> np.ndarray: + if len(positions) == 0: + return np.empty(0, dtype=_where_output_dtype(where_x)) + + positions = np.asarray(positions, dtype=np.int64) + breaks = np.nonzero(np.diff(positions) != 1)[0] + 1 + runs = np.split(positions, breaks) + parts = [] + for run in runs: + start = int(run[0]) + stop = int(run[-1]) + 1 + parts.append(where_x[start:stop]) + return np.concatenate(parts) if len(parts) > 1 else parts[0] + + +def _gather_positions_by_chunk(where_x, positions: np.ndarray, chunk_len: int) -> np.ndarray: + if len(positions) == 0: + return np.empty(0, dtype=_where_output_dtype(where_x)) + + positions = np.asarray(positions, dtype=np.int64) + output = np.empty(len(positions), dtype=_where_output_dtype(where_x)) + chunk_ids = positions // chunk_len + breaks = np.nonzero(np.diff(chunk_ids) != 0)[0] + 1 + start_idx = 0 + for stop_idx in (*breaks, len(positions)): + chunk_positions = positions[start_idx:stop_idx] + chunk_id = int(chunk_ids[start_idx]) + chunk_start = chunk_id * chunk_len + chunk_stop = chunk_start + chunk_len + chunk_values = where_x[chunk_start:chunk_stop] + local_positions = chunk_positions - chunk_start + output[start_idx:stop_idx] = chunk_values[local_positions] + start_idx = stop_idx + return output + + +def _supports_block_reads(where_x) -> bool: + return isinstance(where_x, blosc2.NDArray) and hasattr(where_x, "get_1d_span_numpy") + + +def _gather_positions_by_block( + where_x, positions: np.ndarray, chunk_len: int, block_len: int, total_len: int +) -> np.ndarray: + if len(positions) == 0: + return np.empty(0, 
dtype=_where_output_dtype(where_x)) + if not _supports_block_reads(where_x): + return _gather_positions_by_chunk(where_x, positions, chunk_len) + + positions = np.asarray(positions, dtype=np.int64) + output = np.empty(len(positions), dtype=_where_output_dtype(where_x)) + chunk_ids = positions // chunk_len + chunk_breaks = np.nonzero(np.diff(chunk_ids) != 0)[0] + 1 + chunk_start_idx = 0 + for chunk_stop_idx in (*chunk_breaks, len(positions)): + chunk_positions = positions[chunk_start_idx:chunk_stop_idx] + chunk_id = int(chunk_ids[chunk_start_idx]) + chunk_origin = chunk_id * chunk_len + local_positions = chunk_positions - chunk_origin + if np.any(np.diff(local_positions) < 0): + order = np.argsort(local_positions, kind="stable") + sorted_local_positions = local_positions[order] + else: + order = None + sorted_local_positions = local_positions + + sorted_output = ( + output[chunk_start_idx:chunk_stop_idx] + if order is None + else np.empty(len(chunk_positions), dtype=output.dtype) + ) + block_ids = sorted_local_positions // block_len + block_breaks = np.nonzero(np.diff(block_ids) != 0)[0] + 1 + block_start_idx = 0 + for block_stop_idx in (*block_breaks, len(sorted_local_positions)): + block_positions = sorted_local_positions[block_start_idx:block_stop_idx] + span_start = int(block_positions[0]) + span_stop = int(block_positions[-1]) + 1 + span_items = span_stop - span_start + span_values = np.empty(span_items, dtype=output.dtype) + where_x.get_1d_span_numpy(span_values, chunk_id, span_start, span_items) + sorted_output[block_start_idx:block_stop_idx] = span_values[block_positions - span_start] + block_start_idx = block_stop_idx + + if order is None: + output[chunk_start_idx:chunk_stop_idx] = sorted_output + else: + inverse = np.empty(len(order), dtype=np.intp) + inverse[order] = np.arange(len(order), dtype=np.intp) + output[chunk_start_idx:chunk_stop_idx] = sorted_output[inverse] + chunk_start_idx = chunk_stop_idx + return output + + +def evaluate_full_query(where: 
def _normalize_primary_order_target(array: blosc2.NDArray, order: str | None) -> tuple[dict, str | None]:
    """Resolve the primary sort key into a target descriptor.

    Returns ``(target_descriptor, field_name)``; ``field_name`` is only set
    when ``order`` names a plain struct field of ``array``.  Expression-based
    keys are parsed against the array (or its fields) and yield ``None`` as
    the field.  Raises ``ValueError`` when an expression resolves to a
    different base array.
    """
    if order is None:
        return _field_target_descriptor(None), None

    fields = array.dtype.fields
    if fields is not None and order in fields:
        # Plain field name: no expression parsing needed.
        return _field_target_descriptor(order), order

    # Treat the order key as an expression over the array (or its fields).
    operands = {SELF_TARGET_NAME: array} if fields is None else array.fields
    base, target, _ = _normalize_expression_target(order, operands)
    if base is not array:
        raise ValueError("ordered expressions must resolve to the target array")
    return target, None


def _full_run_count(descriptor: dict | None) -> int:
    """Number of pre-merged runs recorded in a ``full`` index descriptor."""
    if descriptor is None:
        return 0
    full_meta = descriptor.get("full")
    if full_meta is None:
        return 0
    return len(full_meta.get("runs", ()))
+ return "whole-load" + if ( + descriptor.get("persistent") + and descriptor["full"].get("l1_path") + and descriptor["full"].get("l2_path") + ): + return "compact-selective-ooc" + return "in-memory" + + +def _normalize_order_fields( + array: blosc2.NDArray, order: str | list[str] | None +) -> tuple[dict, list[str | None]]: + if order is None: + if array.dtype.fields is None: + return _field_target_descriptor(None), [None] + return _field_target_descriptor(array.dtype.names[0]), list(array.dtype.names) + if isinstance(order, list): + fields = list(order) + else: + fields = [order] + primary_target, primary_field = _normalize_primary_order_target(array, fields[0]) + normalized_order = [primary_field if primary_field is not None else fields[0]] + if len(fields) > 1: + if array.dtype.fields is None: + raise ValueError("secondary order keys are only supported for structured arrays") + for field in fields[1:]: + if field not in array.dtype.fields: + raise ValueError(f"field {field!r} is not present in the dtype") + normalized_order.extend(fields[1:]) + return primary_target, normalized_order + + +def is_expression_order(array: blosc2.NDArray, order: str | list[str] | None) -> bool: + if order is None: + return False + primary = order[0] if isinstance(order, list) else order + try: + target, _ = _normalize_primary_order_target(array, primary) + except (TypeError, ValueError): + return False + return target["source"] == "expression" + + +def plan_array_order( + array: blosc2.NDArray, order: str | list[str] | None = None, *, require_full: bool = False +) -> OrderedIndexPlan: + try: + primary_target, order_fields = _normalize_order_fields(array, order) + except (TypeError, ValueError) as exc: + return OrderedIndexPlan(False, str(exc)) + primary_field = _target_field(primary_target) + descriptor = _full_descriptor_for_order(array, primary_target) + if descriptor is None: + if require_full: + label = primary_field if primary_field is not None else 
def _positions_in_input_order(
    positions: np.ndarray, start: int | None, stop: int | None, step: int | None
) -> np.ndarray:
    """Apply a ``[start:stop:step]`` window to already-ordered positions."""
    effective_step = 1 if step is None else step
    if effective_step == 0:
        raise ValueError("step cannot be zero")
    return positions[slice(start, stop, effective_step)]


def _full_descriptor_for_order(array: blosc2.NDArray, target: dict) -> dict | None:
    """Return the matching ``full`` index descriptor for ``target``, if any."""
    descriptor = _descriptor_for_target(array, target)
    if descriptor is not None and descriptor.get("kind") == "full":
        return descriptor
    return None


def _equal_primary_values(left, right, dtype: np.dtype) -> bool:
    """True when two primary-key values compare equal under ``dtype`` rules."""
    return _scalar_compare(left, right, dtype) == 0


def _refine_secondary_order(
    array: blosc2.NDArray,
    positions: np.ndarray,
    primary_values: np.ndarray,
    primary_dtype: np.dtype,
    secondary_fields: list[str],
) -> np.ndarray:
    """Stable-resort primary-key ties by the secondary fields.

    ``positions`` and ``primary_values`` are assumed to be already sorted by
    the primary key; only runs of equal primary values are re-ordered, so the
    overall primary order is preserved.
    """
    if not secondary_fields or len(positions) <= 1:
        return positions

    refined = positions.copy()
    n = len(refined)
    run_start = 0
    while run_start < n:
        run_stop = run_start + 1
        # Grow the run while the primary key stays equal.
        while run_stop < n and _equal_primary_values(
            primary_values[run_start], primary_values[run_stop], primary_dtype
        ):
            run_stop += 1
        if run_stop - run_start > 1:
            tied = refined[run_start:run_stop]
            tied_rows = array[tied]
            order_within = np.argsort(tied_rows, order=secondary_fields, kind="stable")
            refined[run_start:run_stop] = tied[order_within]
        run_start = run_stop
    return refined
np.ndarray, order_fields: list[str | None] +) -> np.ndarray: + sorted_values, sorted_positions = _load_full_arrays(array, descriptor) + if len(exact_positions) == len(sorted_positions): + selected_positions = np.asarray(sorted_positions, dtype=np.int64) + selected_values = np.asarray(sorted_values) + else: + keep = np.zeros(int(array.shape[0]), dtype=bool) + keep[np.asarray(exact_positions, dtype=np.int64)] = True + mask = keep[sorted_positions] + selected_positions = np.asarray(sorted_positions[mask], dtype=np.int64) + selected_values = np.asarray(sorted_values[mask]) + + secondary_fields = [field for field in order_fields[1:] if field is not None] + if secondary_fields: + selected_positions = _refine_secondary_order( + array, selected_positions, selected_values, np.dtype(descriptor["dtype"]), secondary_fields + ) + return selected_positions + + +def ordered_indices( + array: blosc2.NDArray, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + require_full: bool = False, +) -> np.ndarray | None: + ordered_plan = plan_array_order(array, order=order, require_full=require_full) + if not ordered_plan.usable: + if require_full: + raise ValueError(ordered_plan.reason) + return None + order_fields = ordered_plan.order_fields + descriptor = ordered_plan.descriptor + positions = _ordered_positions_from_exact_positions( + array, descriptor, np.arange(int(array.shape[0]), dtype=np.int64), order_fields + ) + return _positions_in_input_order(positions, start, stop, step) + + +def plan_ordered_query( + expression: str, operands: dict, where: dict, order: str | list[str] +) -> OrderedIndexPlan: + if len(where) != 1: + return OrderedIndexPlan(False, "ordered index reuse is only available for where(x) style filtering") + base = where["_where_x"] + if not isinstance(base, blosc2.NDArray) or base.ndim != 1: + return OrderedIndexPlan(False, "ordered index reuse requires a 1-D NDArray target") + + 
base_order_plan = plan_array_order(base, order=order, require_full=False) + if not base_order_plan.usable: + return base_order_plan + + filter_plan = plan_query(expression, operands, where, use_index=True) + if not filter_plan.usable: + return OrderedIndexPlan( + False, f"ordered access cannot reuse an index because filtering does not: {filter_plan.reason}" + ) + if filter_plan.base is not base or filter_plan.exact_positions is None: + return OrderedIndexPlan( + False, "ordered access currently requires exact row positions from filtering" + ) + + return OrderedIndexPlan( + True, + "ordered access will reuse a full index after exact filtering", + descriptor=base_order_plan.descriptor, + base=base, + field=base_order_plan.field, + order_fields=base_order_plan.order_fields, + total_rows=int(base.shape[0]), + selected_rows=len(filter_plan.exact_positions), + secondary_refinement=base_order_plan.secondary_refinement, + ) + + +def ordered_query_indices( + expression: str, + operands: dict, + where: dict, + order: str | list[str], + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, +) -> np.ndarray | None: + ordered_plan = plan_ordered_query(expression, operands, where, order) + if not ordered_plan.usable: + return None + base = ordered_plan.base + order_fields = ordered_plan.order_fields + descriptor = ordered_plan.descriptor + + plan = plan_query(expression, operands, where, use_index=True) + + positions = _ordered_positions_from_exact_positions(base, descriptor, plan.exact_positions, order_fields) + return _positions_in_input_order(positions, start, stop, step) + + +def read_sorted( + array: blosc2.NDArray, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + require_full: bool = False, +) -> np.ndarray | None: + positions = ordered_indices( + array, order=order, start=start, stop=stop, step=step, require_full=require_full + ) + if positions is None: + return 
def iter_sorted(
    array: blosc2.NDArray,
    order: str | list[str] | None = None,
    *,
    start: int | None = None,
    stop: int | None = None,
    step: int | None = None,
    batch_size: int | None = None,
) -> "Iterator":
    """Yield the elements of ``array`` in sorted order, one element at a time.

    A full index for the primary order key is required
    (``require_full=True``): :func:`ordered_indices` raises ``ValueError``
    when no such index exists.  Elements are gathered in batches of
    ``batch_size`` positions (defaulting to one block worth) to bound memory
    usage.

    NOTE(review): this is a generator; the previous ``-> np.ndarray``
    annotation was incorrect.  A string annotation is used so no new import
    is needed.

    Raises ``ValueError`` when ``batch_size`` is not positive.
    """
    positions = ordered_indices(array, order=order, start=start, stop=stop, step=step, require_full=True)
    if batch_size is None:
        batch_size = max(1, int(array.blocks[0]))
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    # Hoist loop-invariant geometry reads out of the batch loop.
    chunk_len = int(array.chunks[0])
    block_len = int(array.blocks[0])
    total_len = int(array.shape[0])
    for idx in range(0, len(positions), batch_size):
        batch = _gather_positions_by_block(
            array, positions[idx : idx + batch_size], chunk_len, block_len, total_len
        )
        yield from batch


def will_use_index(expr) -> bool:
    """Return True when the lazy expression ``expr`` can be served by an index.

    Ordered queries are planned through :func:`plan_ordered_query`; plain
    filters through :func:`plan_query`.
    """
    where = getattr(expr, "_where_args", None)
    order = getattr(expr, "_order", None)
    if order is not None:
        return plan_ordered_query(expr.expression, expr.operands, where, order).usable
    return plan_query(expr.expression, expr.operands, where).usable
+ """ + where = getattr(expr, "_where_args", None) + order = getattr(expr, "_order", None) + if order is not None: + ordered_plan = plan_ordered_query(expr.expression, expr.operands, where, order) + filter_plan = plan_query(expr.expression, expr.operands, where) + return { + "will_use_index": ordered_plan.usable, + "reason": ordered_plan.reason, + "target": None if ordered_plan.descriptor is None else ordered_plan.descriptor.get("target"), + "field": ordered_plan.field, + "kind": None if ordered_plan.descriptor is None else ordered_plan.descriptor["kind"], + "level": "full" if ordered_plan.usable else None, + "ordered_access": True, + "order": ordered_plan.order_fields, + "secondary_refinement": ordered_plan.secondary_refinement, + "candidate_units": ordered_plan.selected_rows, + "total_units": ordered_plan.total_rows, + "candidate_chunks": ordered_plan.selected_rows, + "total_chunks": ordered_plan.total_rows, + "exact_rows": ordered_plan.selected_rows if ordered_plan.usable else None, + "filter_reason": filter_plan.reason, + "filter_level": filter_plan.level, + "full_runs": _full_run_count(ordered_plan.descriptor), + "lookup_path": _full_lookup_path(ordered_plan.descriptor, ordered=True), + "descriptor": ordered_plan.descriptor, + } + + plan = plan_query(expr.expression, expr.operands, where) + return { + "will_use_index": plan.usable, + "reason": plan.reason, + "target": None if plan.descriptor is None else plan.descriptor.get("target"), + "field": plan.field, + "kind": None if plan.descriptor is None else plan.descriptor["kind"], + "level": plan.level, + "ordered_access": False, + "order": None, + "secondary_refinement": False, + "candidate_units": plan.selected_units, + "total_units": plan.total_units, + "candidate_chunks": plan.candidate_chunks if plan.candidate_chunks else plan.selected_units, + "total_chunks": plan.total_units, + "candidate_nav_segments": plan.candidate_nav_segments or None, + "candidate_base_spans": plan.candidate_base_spans or None, + 
"exact_rows": None if plan.exact_positions is None else len(plan.exact_positions), + "full_runs": _full_run_count(plan.descriptor), + "lookup_path": plan.lookup_path or _full_lookup_path(plan.descriptor, ordered=False), + "descriptor": plan.descriptor, + } diff --git a/src/blosc2/indexing_ext.pyx b/src/blosc2/indexing_ext.pyx new file mode 100644 index 00000000..2be99330 --- /dev/null +++ b/src/blosc2/indexing_ext.pyx @@ -0,0 +1,2195 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### +# cython: boundscheck=False, wraparound=False, initializedcheck=False + +import numpy as np +cimport numpy as np +import cython + +from libc.stdint cimport int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t, uint32_t, uint64_t + + +ctypedef fused sort_float_t: + np.float32_t + np.float64_t + + +ctypedef fused sort_ordered_t: + np.int8_t + np.int16_t + np.int32_t + np.int64_t + np.uint8_t + np.uint16_t + np.uint32_t + np.uint64_t + + +cdef inline bint _le_float_pair( + sort_float_t left_value, + uint64_t left_position, + sort_float_t right_value, + uint64_t right_position, +) noexcept nogil: + cdef bint left_nan = left_value != left_value + cdef bint right_nan = right_value != right_value + if left_nan: + if right_nan: + return left_position <= right_position + return False + if right_nan: + return True + if left_value < right_value: + return True + if left_value > right_value: + return False + return left_position <= right_position + + +cdef inline bint _le_ordered_pair( + sort_ordered_t left_value, + uint64_t left_position, + sort_ordered_t right_value, + uint64_t right_position, +) noexcept nogil: + if left_value < right_value: + return True + if left_value > right_value: + return False + return left_position <= right_position + + +@cython.boundscheck(False) 
+@cython.wraparound(False) +cdef void _stable_mergesort_float( + sort_float_t[:] values, + uint64_t[:] positions, + sort_float_t[:] tmp_values, + uint64_t[:] tmp_positions, +) noexcept nogil: + cdef Py_ssize_t n = values.shape[0] + cdef Py_ssize_t width = 1 + cdef Py_ssize_t start + cdef Py_ssize_t mid + cdef Py_ssize_t stop + cdef Py_ssize_t left + cdef Py_ssize_t right + cdef Py_ssize_t out + cdef sort_float_t[:] src_values = values + cdef uint64_t[:] src_positions = positions + cdef sort_float_t[:] dst_values = tmp_values + cdef uint64_t[:] dst_positions = tmp_positions + cdef sort_float_t[:] swap_values + cdef uint64_t[:] swap_positions + cdef bint in_original = True + while width < n: + start = 0 + while start < n: + mid = start + width + if mid > n: + mid = n + stop = start + 2 * width + if stop > n: + stop = n + left = start + right = mid + out = start + while left < mid and right < stop: + if _le_float_pair( + src_values[left], src_positions[left], src_values[right], src_positions[right] + ): + dst_values[out] = src_values[left] + dst_positions[out] = src_positions[left] + left += 1 + else: + dst_values[out] = src_values[right] + dst_positions[out] = src_positions[right] + right += 1 + out += 1 + while left < mid: + dst_values[out] = src_values[left] + dst_positions[out] = src_positions[left] + left += 1 + out += 1 + while right < stop: + dst_values[out] = src_values[right] + dst_positions[out] = src_positions[right] + right += 1 + out += 1 + start = stop + swap_values = src_values + src_values = dst_values + dst_values = swap_values + swap_positions = src_positions + src_positions = dst_positions + dst_positions = swap_positions + in_original = not in_original + width <<= 1 + if not in_original: + for start in range(n): + values[start] = src_values[start] + positions[start] = src_positions[start] + + +@cython.boundscheck(False) +@cython.wraparound(False) +cdef void _stable_mergesort_ordered( + sort_ordered_t[:] values, + uint64_t[:] positions, + 
sort_ordered_t[:] tmp_values, + uint64_t[:] tmp_positions, +) noexcept nogil: + cdef Py_ssize_t n = values.shape[0] + cdef Py_ssize_t width = 1 + cdef Py_ssize_t start + cdef Py_ssize_t mid + cdef Py_ssize_t stop + cdef Py_ssize_t left + cdef Py_ssize_t right + cdef Py_ssize_t out + cdef sort_ordered_t[:] src_values = values + cdef uint64_t[:] src_positions = positions + cdef sort_ordered_t[:] dst_values = tmp_values + cdef uint64_t[:] dst_positions = tmp_positions + cdef sort_ordered_t[:] swap_values + cdef uint64_t[:] swap_positions + cdef bint in_original = True + while width < n: + start = 0 + while start < n: + mid = start + width + if mid > n: + mid = n + stop = start + 2 * width + if stop > n: + stop = n + left = start + right = mid + out = start + while left < mid and right < stop: + if _le_ordered_pair( + src_values[left], src_positions[left], src_values[right], src_positions[right] + ): + dst_values[out] = src_values[left] + dst_positions[out] = src_positions[left] + left += 1 + else: + dst_values[out] = src_values[right] + dst_positions[out] = src_positions[right] + right += 1 + out += 1 + while left < mid: + dst_values[out] = src_values[left] + dst_positions[out] = src_positions[left] + left += 1 + out += 1 + while right < stop: + dst_values[out] = src_values[right] + dst_positions[out] = src_positions[right] + right += 1 + out += 1 + start = stop + swap_values = src_values + src_values = dst_values + dst_values = swap_values + swap_positions = src_positions + src_positions = dst_positions + dst_positions = swap_positions + in_original = not in_original + width <<= 1 + if not in_original: + for start in range(n): + values[start] = src_values[start] + positions[start] = src_positions[start] + + +cdef tuple _intra_chunk_sort_run_float32(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype): + cdef np.ndarray[np.float32_t, ndim=1] sorted_values = np.array(values, copy=True, order="C") + cdef np.ndarray[np.uint64_t, ndim=1] positions = 
# float64 specialization of the intra-chunk run sort; see the float32 variant
# for the overall contract.
cdef tuple _intra_chunk_sort_run_float64(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.float64_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.float64_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.float64_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.float64_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_float(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)


# int8 specialization of the intra-chunk run sort (ordered-comparison path).
cdef tuple _intra_chunk_sort_run_int8(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.int8_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.int8_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.int8_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.int8_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)
# int16 specialization of the intra-chunk run sort (ordered-comparison path).
cdef tuple _intra_chunk_sort_run_int16(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.int16_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.int16_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.int16_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.int16_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)


# int32 specialization of the intra-chunk run sort (ordered-comparison path).
cdef tuple _intra_chunk_sort_run_int32(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.int32_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.int32_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.int32_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.int32_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)
# int64 specialization of the intra-chunk run sort (ordered-comparison path).
# Also serves datetime64/timedelta64 via an int64 view in the dispatcher.
cdef tuple _intra_chunk_sort_run_int64(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.int64_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.int64_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.int64_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.int64_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)


# uint8 specialization of the intra-chunk run sort (ordered-comparison path).
# Also serves bool via a uint8 view in the dispatcher.
cdef tuple _intra_chunk_sort_run_uint8(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.uint8_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.uint8_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.uint8_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.uint8_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)
# uint16 specialization of the intra-chunk run sort (ordered-comparison path).
cdef tuple _intra_chunk_sort_run_uint16(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.uint16_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.uint16_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.uint16_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.uint16_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)


# uint32 specialization of the intra-chunk run sort (ordered-comparison path).
cdef tuple _intra_chunk_sort_run_uint32(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.uint32_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.uint32_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.uint32_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.uint32_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)
# uint64 specialization of the intra-chunk run sort (ordered-comparison path).
cdef tuple _intra_chunk_sort_run_uint64(np.ndarray values, Py_ssize_t run_start, np.dtype position_dtype):
    cdef np.ndarray[np.uint64_t, ndim=1] sorted_values = np.array(values, copy=True, order="C")
    cdef np.ndarray[np.uint64_t, ndim=1] positions = np.empty(sorted_values.shape[0], dtype=np.uint64)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_values = np.empty_like(sorted_values)
    cdef np.ndarray[np.uint64_t, ndim=1] tmp_positions = np.empty_like(positions)
    cdef np.uint64_t[:] sorted_values_mv = sorted_values
    cdef np.uint64_t[:] positions_mv = positions
    cdef np.uint64_t[:] tmp_values_mv = tmp_values
    cdef np.uint64_t[:] tmp_positions_mv = tmp_positions
    cdef Py_ssize_t idx
    with nogil:
        # Fill absolute positions, then sort (value, position) pairs stably.
        for idx in range(sorted_values.shape[0]):
            positions[idx] = (run_start + idx)
        _stable_mergesort_ordered(sorted_values_mv, positions_mv, tmp_values_mv, tmp_positions_mv)
    return sorted_values, positions.astype(position_dtype, copy=False)


def intra_chunk_sort_run(np.ndarray values, Py_ssize_t run_start, object position_dtype):
    """Stable-sort one chunk run and return ``(sorted_values, positions)``.

    Dispatches on ``values.dtype`` to the typed specialization.  bool is sorted
    through a uint8 view (and viewed back), and datetime64/timedelta64 through
    an int64 view.  ``positions`` holds each sorted element's original absolute
    index (``run_start + offset``), cast to ``position_dtype``.

    Raises ``TypeError`` for unsupported dtypes.

    NOTE(review): the datetime/timedelta path compares raw int64 values, so
    NaT (INT64_MIN) sorts first rather than last as NumPy does — confirm this
    is the intended ordering for the callers.
    """
    cdef np.dtype dtype = values.dtype
    cdef np.dtype pos_dtype = np.dtype(position_dtype)
    if dtype == np.dtype(np.float32):
        return _intra_chunk_sort_run_float32(values, run_start, pos_dtype)
    if dtype == np.dtype(np.float64):
        return _intra_chunk_sort_run_float64(values, run_start, pos_dtype)
    if dtype == np.dtype(np.int8):
        return _intra_chunk_sort_run_int8(values, run_start, pos_dtype)
    if dtype == np.dtype(np.int16):
        return _intra_chunk_sort_run_int16(values, run_start, pos_dtype)
    if dtype == np.dtype(np.int32):
        return _intra_chunk_sort_run_int32(values, run_start, pos_dtype)
    if dtype == np.dtype(np.int64):
        return _intra_chunk_sort_run_int64(values, run_start, pos_dtype)
    if dtype == np.dtype(np.uint8) or dtype == np.dtype(np.bool_):
        # bool shares the uint8 path: same item size, same ordering.
        sorted_values, positions = _intra_chunk_sort_run_uint8(values.view(np.uint8), run_start, pos_dtype)
        if dtype == np.dtype(np.bool_):
            return sorted_values.view(np.bool_), positions
        return sorted_values, positions
    if dtype == np.dtype(np.uint16):
        return _intra_chunk_sort_run_uint16(values, run_start, pos_dtype)
    if dtype == np.dtype(np.uint32):
        return _intra_chunk_sort_run_uint32(values, run_start, pos_dtype)
    if dtype == np.dtype(np.uint64):
        return _intra_chunk_sort_run_uint64(values, run_start, pos_dtype)
    if dtype.kind in {"m", "M"}:
        # timedelta64/datetime64: sort the underlying int64 representation.
        sorted_values, positions = _intra_chunk_sort_run_int64(values.view(np.int64), run_start, pos_dtype)
        return sorted_values.view(dtype), positions
    raise TypeError("unsupported dtype for intra_chunk_sort_run")
# Linear (single-pass) merge of two sorted float (value, position) sequences
# into pre-allocated output views; out_* must hold left_n + right_n elements.
# Comparison/tie-breaking is delegated to _le_float_pair.  Runs without the GIL.
cdef void _linear_merge_float(
    sort_float_t[:] left_values,
    uint64_t[:] left_positions,
    sort_float_t[:] right_values,
    uint64_t[:] right_positions,
    sort_float_t[:] out_values,
    uint64_t[:] out_positions,
) noexcept nogil:
    cdef Py_ssize_t left = 0
    cdef Py_ssize_t right = 0
    cdef Py_ssize_t out = 0
    cdef Py_ssize_t left_n = left_values.shape[0]
    cdef Py_ssize_t right_n = right_values.shape[0]
    while left < left_n and right < right_n:
        if _le_float_pair(left_values[left], left_positions[left], right_values[right], right_positions[right]):
            out_values[out] = left_values[left]
            out_positions[out] = left_positions[left]
            left += 1
        else:
            out_values[out] = right_values[right]
            out_positions[out] = right_positions[right]
            right += 1
        out += 1
    # Drain whichever side still has elements.
    while left < left_n:
        out_values[out] = left_values[left]
        out_positions[out] = left_positions[left]
        left += 1
        out += 1
    while right < right_n:
        out_values[out] = right_values[right]
        out_positions[out] = right_positions[right]
        right += 1
        out += 1


# Same linear merge for totally-ordered (integer-like) dtypes, delegating the
# comparison to _le_ordered_pair.
cdef void _linear_merge_ordered(
    sort_ordered_t[:] left_values,
    uint64_t[:] left_positions,
    sort_ordered_t[:] right_values,
    uint64_t[:] right_positions,
    sort_ordered_t[:] out_values,
    uint64_t[:] out_positions,
) noexcept nogil:
    cdef Py_ssize_t left = 0
    cdef Py_ssize_t right = 0
    cdef Py_ssize_t out = 0
    cdef Py_ssize_t left_n = left_values.shape[0]
    cdef Py_ssize_t right_n = right_values.shape[0]
    while left < left_n and right < right_n:
        if _le_ordered_pair(
            left_values[left], left_positions[left], right_values[right], right_positions[right]
        ):
            out_values[out] = left_values[left]
            out_positions[out] = left_positions[left]
            left += 1
        else:
            out_values[out] = right_values[right]
            out_positions[out] = right_positions[right]
            right += 1
        out += 1
    # Drain whichever side still has elements.
    while left < left_n:
        out_values[out] = left_values[left]
        out_positions[out] = left_positions[left]
        left += 1
        out += 1
    while right < right_n:
        out_values[out] = right_values[right]
        out_positions[out] = right_positions[right]
        right += 1
        out += 1
# float32 specialization: linearly merge two already-sorted (values, positions)
# slices into freshly allocated outputs; positions are normalized to uint64 for
# the merge and cast to `position_dtype` on return.
cdef tuple _intra_chunk_merge_float32(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.float32_t, ndim=1] merged_values = np.empty(total, dtype=np.float32)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.float32_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.float32_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.float32_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_float(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)
# float64 specialization of the sorted-slice merge; see the float32 variant.
cdef tuple _intra_chunk_merge_float64(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.float64_t, ndim=1] merged_values = np.empty(total, dtype=np.float64)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.float64_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.float64_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.float64_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_float(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)


# int8 specialization of the sorted-slice merge (ordered-comparison path).
cdef tuple _intra_chunk_merge_int8(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.int8_t, ndim=1] merged_values = np.empty(total, dtype=np.int8)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.int8_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.int8_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.int8_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_ordered(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)
# int16 specialization of the sorted-slice merge (ordered-comparison path).
cdef tuple _intra_chunk_merge_int16(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.int16_t, ndim=1] merged_values = np.empty(total, dtype=np.int16)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.int16_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.int16_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.int16_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_ordered(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)


# int32 specialization of the sorted-slice merge (ordered-comparison path).
cdef tuple _intra_chunk_merge_int32(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.int32_t, ndim=1] merged_values = np.empty(total, dtype=np.int32)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.int32_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.int32_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.int32_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_ordered(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)
# int64 specialization of the sorted-slice merge (ordered-comparison path).
# Also serves datetime64/timedelta64 via an int64 view in the dispatcher.
cdef tuple _intra_chunk_merge_int64(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.int64_t, ndim=1] merged_values = np.empty(total, dtype=np.int64)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.int64_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.int64_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.int64_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_ordered(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)


# uint8 specialization of the sorted-slice merge (ordered-comparison path).
# Also serves bool via a uint8 view in the dispatcher.
cdef tuple _intra_chunk_merge_uint8(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.uint8_t, ndim=1] merged_values = np.empty(total, dtype=np.uint8)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.uint8_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.uint8_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.uint8_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_ordered(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)
# uint16 specialization of the sorted-slice merge (ordered-comparison path).
cdef tuple _intra_chunk_merge_uint16(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.uint16_t, ndim=1] merged_values = np.empty(total, dtype=np.uint16)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.uint16_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.uint16_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.uint16_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_ordered(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)


# uint32 specialization of the sorted-slice merge (ordered-comparison path).
cdef tuple _intra_chunk_merge_uint32(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.uint32_t, ndim=1] merged_values = np.empty(total, dtype=np.uint32)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.uint32_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.uint32_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.uint32_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_ordered(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)
# uint64 specialization of the sorted-slice merge (ordered-comparison path).
cdef tuple _intra_chunk_merge_uint64(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, np.dtype position_dtype
):
    cdef Py_ssize_t total = left_values.shape[0] + right_values.shape[0]
    cdef np.ndarray[np.uint64_t, ndim=1] merged_values = np.empty(total, dtype=np.uint64)
    cdef np.ndarray[np.uint64_t, ndim=1] merged_positions = np.empty(total, dtype=np.uint64)
    cdef np.uint64_t[:] left_values_mv = left_values
    cdef np.uint64_t[:] left_positions_mv = np.asarray(left_positions, dtype=np.uint64)
    cdef np.uint64_t[:] right_values_mv = right_values
    cdef np.uint64_t[:] right_positions_mv = np.asarray(right_positions, dtype=np.uint64)
    cdef np.uint64_t[:] merged_values_mv = merged_values
    cdef np.uint64_t[:] merged_positions_mv = merged_positions
    with nogil:
        _linear_merge_ordered(
            left_values_mv, left_positions_mv, right_values_mv, right_positions_mv, merged_values_mv, merged_positions_mv
        )
    return merged_values, merged_positions.astype(position_dtype, copy=False)


def intra_chunk_merge_sorted_slices(
    np.ndarray left_values, np.ndarray left_positions, np.ndarray right_values, np.ndarray right_positions, object position_dtype
):
    """Merge two sorted (values, positions) slices of the same dtype.

    Dispatches on the value dtype to the typed merge specialization and
    returns ``(merged_values, merged_positions)`` with positions cast to
    ``position_dtype``.  bool is merged through a uint8 view, and
    datetime64/timedelta64 through an int64 view (viewed back on return).

    Raises ``TypeError`` when the two value dtypes differ or the dtype is
    unsupported.
    """
    cdef np.dtype dtype = left_values.dtype
    cdef np.dtype pos_dtype = np.dtype(position_dtype)
    if dtype != right_values.dtype:
        raise TypeError("left_values and right_values must have the same dtype")
    if dtype == np.dtype(np.float32):
        return _intra_chunk_merge_float32(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.float64):
        return _intra_chunk_merge_float64(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.int8):
        return _intra_chunk_merge_int8(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.int16):
        return _intra_chunk_merge_int16(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.int32):
        return _intra_chunk_merge_int32(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.int64):
        return _intra_chunk_merge_int64(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.uint8):
        return _intra_chunk_merge_uint8(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.uint16):
        return _intra_chunk_merge_uint16(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.uint32):
        return _intra_chunk_merge_uint32(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.uint64):
        return _intra_chunk_merge_uint64(left_values, left_positions, right_values, right_positions, pos_dtype)
    if dtype == np.dtype(np.bool_):
        # bool shares the uint8 path: same item size, same ordering.
        merged_values, merged_positions = _intra_chunk_merge_uint8(
            left_values.view(np.uint8), left_positions, right_values.view(np.uint8), right_positions, pos_dtype
        )
        return merged_values.view(np.bool_), merged_positions
    if dtype.kind in {"m", "M"}:
        # timedelta64/datetime64: merge the underlying int64 representation.
        merged_values, merged_positions = _intra_chunk_merge_int64(
            left_values.view(np.int64), left_positions, right_values.view(np.int64), right_positions, pos_dtype
        )
        return merged_values.view(dtype), merged_positions
    raise TypeError("unsupported dtype for intra_chunk_merge_sorted_slices")
# bisect_left over an ascending float32 view: index of the first element that
# is not less than `target`.
cdef inline Py_ssize_t _search_left_float32(np.float32_t[:] values, np.float32_t target) noexcept nogil:
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef Py_ssize_t mid
    while lo < hi:
        # Overflow-safe midpoint.
        mid = lo + ((hi - lo) >> 1)
        if values[mid] < target:
            lo = mid + 1
        else:
            hi = mid
    return lo
# bisect_right over an ascending float32 view: index of the first element
# strictly greater than `target`.
cdef inline Py_ssize_t _search_right_float32(np.float32_t[:] values, np.float32_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        # Overflow-safe midpoint.
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending float64 view.
cdef inline Py_ssize_t _search_left_float64(np.float64_t[:] values, np.float64_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_right over an ascending float64 view.
cdef inline Py_ssize_t _search_right_float64(np.float64_t[:] values, np.float64_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending int8 view.
cdef inline Py_ssize_t _search_left_int8(np.int8_t[:] values, np.int8_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_right over an ascending int8 view.
cdef inline Py_ssize_t _search_right_int8(np.int8_t[:] values, np.int8_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending int16 view.
cdef inline Py_ssize_t _search_left_int16(np.int16_t[:] values, np.int16_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low
# bisect_right over an ascending int16 view.
cdef inline Py_ssize_t _search_right_int16(np.int16_t[:] values, np.int16_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        # Overflow-safe midpoint.
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending int32 view.
cdef inline Py_ssize_t _search_left_int32(np.int32_t[:] values, np.int32_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_right over an ascending int32 view.
cdef inline Py_ssize_t _search_right_int32(np.int32_t[:] values, np.int32_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending int64 view.
cdef inline Py_ssize_t _search_left_int64(np.int64_t[:] values, np.int64_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_right over an ascending int64 view.
cdef inline Py_ssize_t _search_right_int64(np.int64_t[:] values, np.int64_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending uint8 view.
cdef inline Py_ssize_t _search_left_uint8(np.uint8_t[:] values, np.uint8_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low
# bisect_right over an ascending uint8 view.
cdef inline Py_ssize_t _search_right_uint8(np.uint8_t[:] values, np.uint8_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        # Overflow-safe midpoint.
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending uint16 view.
cdef inline Py_ssize_t _search_left_uint16(np.uint16_t[:] values, np.uint16_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_right over an ascending uint16 view.
cdef inline Py_ssize_t _search_right_uint16(np.uint16_t[:] values, np.uint16_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending uint32 view.
cdef inline Py_ssize_t _search_left_uint32(np.uint32_t[:] values, np.uint32_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_right over an ascending uint32 view.
cdef inline Py_ssize_t _search_right_uint32(np.uint32_t[:] values, np.uint32_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_left over an ascending uint64 view.
cdef inline Py_ssize_t _search_left_uint64(np.uint64_t[:] values, np.uint64_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] < target:
            low = middle + 1
        else:
            high = middle
    return low


# bisect_right over an ascending uint64 view.
cdef inline Py_ssize_t _search_right_uint64(np.uint64_t[:] values, np.uint64_t target) noexcept nogil:
    cdef Py_ssize_t low = 0
    cdef Py_ssize_t high = values.shape[0]
    cdef Py_ssize_t middle
    while low < high:
        middle = low + ((high - low) >> 1)
        if values[middle] <= target:
            low = middle + 1
        else:
            high = middle
    return low
# Half-open index window [lo, hi) of the float32 `values` (sorted ascending)
# that satisfies the optional lower/upper bounds; inclusivity picks between
# bisect_left and bisect_right.  `lower`/`upper` are Python-level numbers or
# None (unbounded side).
# NOTE(review): `lower_v = lower` narrows a Python float to float32 before the
# search; the rounding could shift a bound that is not float32-representable —
# confirm callers only pass representable bounds.
cdef inline tuple _search_bounds_float32_impl(
    np.float32_t[:] values,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef np.float32_t lower_v
    cdef np.float32_t upper_v
    if lower is not None:
        lower_v = lower
        lo = _search_left_float32(values, lower_v) if lower_inclusive else _search_right_float32(values, lower_v)
    if upper is not None:
        upper_v = upper
        hi = _search_right_float32(values, upper_v) if upper_inclusive else _search_left_float32(values, upper_v)
    return int(lo), int(hi)


# Same as _search_bounds_float32_impl but for per-chunk [start, end] boundary
# arrays: the lower bound is searched in `ends` and the upper bound in
# `starts`, selecting every chunk whose value range can intersect the bounds.
cdef inline tuple _search_boundary_bounds_float32_impl(
    np.float32_t[:] starts,
    np.float32_t[:] ends,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = starts.shape[0]
    cdef np.float32_t lower_v
    cdef np.float32_t upper_v
    if lower is not None:
        lower_v = lower
        lo = _search_left_float32(ends, lower_v) if lower_inclusive else _search_right_float32(ends, lower_v)
    if upper is not None:
        upper_v = upper
        hi = _search_right_float32(starts, upper_v) if upper_inclusive else _search_left_float32(starts, upper_v)
    return int(lo), int(hi)


# float64 variant of _search_bounds_float32_impl.
cdef inline tuple _search_bounds_float64_impl(
    np.float64_t[:] values,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef np.float64_t lower_v
    cdef np.float64_t upper_v
    if lower is not None:
        lower_v = lower
        lo = _search_left_float64(values, lower_v) if lower_inclusive else _search_right_float64(values, lower_v)
    if upper is not None:
        upper_v = upper
        hi = _search_right_float64(values, upper_v) if upper_inclusive else _search_left_float64(values, upper_v)
    return int(lo), int(hi)
upper_v) if upper_inclusive else _search_left_float64(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_float64_impl( + np.float64_t[:] starts, + np.float64_t[:] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef np.float64_t lower_v + cdef np.float64_t upper_v + if lower is not None: + lower_v = lower + lo = _search_left_float64(ends, lower_v) if lower_inclusive else _search_right_float64(ends, lower_v) + if upper is not None: + upper_v = upper + hi = _search_right_float64(starts, upper_v) if upper_inclusive else _search_left_float64(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_int8_impl( + np.int8_t[:] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef int lower_i + cdef int upper_i + cdef np.int8_t lower_v + cdef np.int8_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 127: + lo = hi + elif lower_i >= -128: + lower_v = lower_i + lo = _search_left_int8(values, lower_v) if lower_inclusive else _search_right_int8(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -128: + hi = 0 + elif upper_i <= 127: + upper_v = upper_i + hi = _search_right_int8(values, upper_v) if upper_inclusive else _search_left_int8(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_int8_impl( + np.int8_t[:] starts, + np.int8_t[:] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef int lower_i + cdef int upper_i + cdef np.int8_t lower_v + cdef np.int8_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 127: + lo = hi + elif lower_i >= -128: + lower_v = lower_i + lo = 
_search_left_int8(ends, lower_v) if lower_inclusive else _search_right_int8(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -128: + hi = 0 + elif upper_i <= 127: + upper_v = upper_i + hi = _search_right_int8(starts, upper_v) if upper_inclusive else _search_left_int8(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_bounds_int16_impl( + np.int16_t[:] values, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = values.shape[0] + cdef int lower_i + cdef int upper_i + cdef np.int16_t lower_v + cdef np.int16_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 32767: + lo = hi + elif lower_i >= -32768: + lower_v = lower_i + lo = _search_left_int16(values, lower_v) if lower_inclusive else _search_right_int16(values, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -32768: + hi = 0 + elif upper_i <= 32767: + upper_v = upper_i + hi = _search_right_int16(values, upper_v) if upper_inclusive else _search_left_int16(values, upper_v) + return int(lo), int(hi) + + +cdef inline tuple _search_boundary_bounds_int16_impl( + np.int16_t[:] starts, + np.int16_t[:] ends, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef Py_ssize_t lo = 0 + cdef Py_ssize_t hi = starts.shape[0] + cdef int lower_i + cdef int upper_i + cdef np.int16_t lower_v + cdef np.int16_t upper_v + if lower is not None: + lower_i = int(lower) + if lower_i > 32767: + lo = hi + elif lower_i >= -32768: + lower_v = lower_i + lo = _search_left_int16(ends, lower_v) if lower_inclusive else _search_right_int16(ends, lower_v) + if upper is not None: + upper_i = int(upper) + if upper_i < -32768: + hi = 0 + elif upper_i <= 32767: + upper_v = upper_i + hi = _search_right_int16(starts, upper_v) if upper_inclusive else _search_left_int16(starts, upper_v) + return int(lo), int(hi) + + +cdef inline tuple 
_search_bounds_int32_impl(
    np.int32_t[:] values,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # int32 range search; out-of-range bounds resolved without touching data.
    # Fix (all functions in this group): clamp so hi >= lo always holds;
    # previously an out-of-order or range-straddling request returned an
    # inverted pair and callers computing hi - lo saw a negative count.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef long long lower_i
    cdef long long upper_i
    cdef np.int32_t lower_v
    cdef np.int32_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 2147483647:
            lo = hi
        elif lower_i >= -2147483648:
            lower_v = lower_i
            lo = _search_left_int32(values, lower_v) if lower_inclusive else _search_right_int32(values, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < -2147483648:
            hi = 0
        elif upper_i <= 2147483647:
            upper_v = upper_i
            hi = _search_right_int32(values, upper_v) if upper_inclusive else _search_left_int32(values, upper_v)
    if hi < lo:
        hi = lo  # normalize inverted interval to empty
    return int(lo), int(hi)


cdef inline tuple _search_boundary_bounds_int32_impl(
    np.int32_t[:] starts,
    np.int32_t[:] ends,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # int32 boundary-array variant (lower searched in ends, upper in starts).
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = starts.shape[0]
    cdef long long lower_i
    cdef long long upper_i
    cdef np.int32_t lower_v
    cdef np.int32_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 2147483647:
            lo = hi
        elif lower_i >= -2147483648:
            lower_v = lower_i
            lo = _search_left_int32(ends, lower_v) if lower_inclusive else _search_right_int32(ends, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < -2147483648:
            hi = 0
        elif upper_i <= 2147483647:
            upper_v = upper_i
            hi = _search_right_int32(starts, upper_v) if upper_inclusive else _search_left_int32(starts, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_bounds_int64_impl(
    np.int64_t[:] values,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # int64 variant.  The bound is kept as a Python int so values outside
    # the int64 domain can be compared without C overflow.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.int64_t lower_v
    cdef np.int64_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 9223372036854775807:
            lo = hi
        elif lower_i >= -9223372036854775808:
            lower_v = lower_i
            lo = _search_left_int64(values, lower_v) if lower_inclusive else _search_right_int64(values, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < -9223372036854775808:
            hi = 0
        elif upper_i <= 9223372036854775807:
            upper_v = upper_i
            hi = _search_right_int64(values, upper_v) if upper_inclusive else _search_left_int64(values, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_boundary_bounds_int64_impl(
    np.int64_t[:] starts,
    np.int64_t[:] ends,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # int64 boundary-array variant.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = starts.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.int64_t lower_v
    cdef np.int64_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 9223372036854775807:
            lo = hi
        elif lower_i >= -9223372036854775808:
            lower_v = lower_i
            lo = _search_left_int64(ends, lower_v) if lower_inclusive else _search_right_int64(ends, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < -9223372036854775808:
            hi = 0
        elif upper_i <= 9223372036854775807:
            upper_v = upper_i
            hi = _search_right_int64(starts, upper_v) if upper_inclusive else _search_left_int64(starts, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_bounds_uint8_impl(
    np.uint8_t[:] values,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # uint8 variant; bound kept as a Python int (may be negative or > 255).
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.uint8_t lower_v
    cdef np.uint8_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 255:
            lo = hi
        elif lower_i >= 0:
            lower_v = lower_i
            lo = _search_left_uint8(values, lower_v) if lower_inclusive else _search_right_uint8(values, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < 0:
            hi = 0
        elif upper_i <= 255:
            upper_v = upper_i
            hi = _search_right_uint8(values, upper_v) if upper_inclusive else _search_left_uint8(values, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_boundary_bounds_uint8_impl(
    np.uint8_t[:] starts,
    np.uint8_t[:] ends,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # uint8 boundary-array variant.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = starts.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.uint8_t lower_v
    cdef np.uint8_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 255:
            lo = hi
        elif lower_i >= 0:
            lower_v = lower_i
            lo = _search_left_uint8(ends, lower_v) if lower_inclusive else _search_right_uint8(ends, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < 0:
            hi = 0
        elif upper_i <= 255:
            upper_v = upper_i
            hi = _search_right_uint8(starts, upper_v) if upper_inclusive else _search_left_uint8(starts, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_bounds_uint16_impl(
    np.uint16_t[:] values,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # uint16 variant.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.uint16_t lower_v
    cdef np.uint16_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 65535:
            lo = hi
        elif lower_i >= 0:
            lower_v = lower_i
            lo = _search_left_uint16(values, lower_v) if lower_inclusive else _search_right_uint16(values, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < 0:
            hi = 0
        elif upper_i <= 65535:
            upper_v = upper_i
            hi = _search_right_uint16(values, upper_v) if upper_inclusive else _search_left_uint16(values, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_boundary_bounds_uint16_impl(
    np.uint16_t[:] starts,
    np.uint16_t[:] ends,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # uint16 boundary-array variant.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = starts.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.uint16_t lower_v
    cdef np.uint16_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 65535:
            lo = hi
        elif lower_i >= 0:
            lower_v = lower_i
            lo = _search_left_uint16(ends, lower_v) if lower_inclusive else _search_right_uint16(ends, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < 0:
            hi = 0
        elif upper_i <= 65535:
            upper_v = upper_i
            hi = _search_right_uint16(starts, upper_v) if upper_inclusive else _search_left_uint16(starts, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_bounds_uint32_impl(
    np.uint32_t[:] values,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # uint32 variant.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.uint32_t lower_v
    cdef np.uint32_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 4294967295:
            lo = hi
        elif lower_i >= 0:
            lower_v = lower_i
            lo = _search_left_uint32(values, lower_v) if lower_inclusive else _search_right_uint32(values, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < 0:
            hi = 0
        elif upper_i <= 4294967295:
            upper_v = upper_i
            hi = _search_right_uint32(values, upper_v) if upper_inclusive else _search_left_uint32(values, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_boundary_bounds_uint32_impl(
    np.uint32_t[:] starts,
    np.uint32_t[:] ends,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # uint32 boundary-array variant.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = starts.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.uint32_t lower_v
    cdef np.uint32_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 4294967295:
            lo = hi
        elif lower_i >= 0:
            lower_v = lower_i
            lo = _search_left_uint32(ends, lower_v) if lower_inclusive else _search_right_uint32(ends, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < 0:
            hi = 0
        elif upper_i <= 4294967295:
            upper_v = upper_i
            hi = _search_right_uint32(starts, upper_v) if upper_inclusive else _search_left_uint32(starts, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_bounds_uint64_impl(
    np.uint64_t[:] values,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # uint64 variant; Python-int bound handles values beyond 2**63.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = values.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.uint64_t lower_v
    cdef np.uint64_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 18446744073709551615:
            lo = hi
        elif lower_i >= 0:
            lower_v = lower_i
            lo = _search_left_uint64(values, lower_v) if lower_inclusive else _search_right_uint64(values, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < 0:
            hi = 0
        elif upper_i <= 18446744073709551615:
            upper_v = upper_i
            hi = _search_right_uint64(values, upper_v) if upper_inclusive else _search_left_uint64(values, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


cdef inline tuple _search_boundary_bounds_uint64_impl(
    np.uint64_t[:] starts,
    np.uint64_t[:] ends,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    # uint64 boundary-array variant.
    cdef Py_ssize_t lo = 0
    cdef Py_ssize_t hi = starts.shape[0]
    cdef object lower_i
    cdef object upper_i
    cdef np.uint64_t lower_v
    cdef np.uint64_t upper_v
    if lower is not None:
        lower_i = int(lower)
        if lower_i > 18446744073709551615:
            lo = hi
        elif lower_i >= 0:
            lower_v = lower_i
            lo = _search_left_uint64(ends, lower_v) if lower_inclusive else _search_right_uint64(ends, lower_v)
    if upper is not None:
        upper_i = int(upper)
        if upper_i < 0:
            hi = 0
        elif upper_i <= 18446744073709551615:
            upper_v = upper_i
            hi = _search_right_uint64(starts, upper_v) if upper_inclusive else _search_left_uint64(starts, upper_v)
    if hi < lo:
        hi = lo
    return int(lo), int(hi)


def
index_search_bounds(np.ndarray values, object lower, bint lower_inclusive, object upper, bint upper_inclusive):
    """Return (lo, hi) half-open index bounds of the sorted 1-d `values`
    satisfying the optional lower/upper bound, dispatching on dtype.

    Raises TypeError for unsupported dtypes; the offending dtype is named
    in the message (previously the message gave no hint which dtype failed).
    """
    cdef np.dtype dtype = values.dtype
    if dtype == np.dtype(np.float32):
        return _search_bounds_float32_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.float64):
        return _search_bounds_float64_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.int8):
        return _search_bounds_int8_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.int16):
        return _search_bounds_int16_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.int32):
        return _search_bounds_int32_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.int64):
        return _search_bounds_int64_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.uint8):
        return _search_bounds_uint8_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.uint16):
        return _search_bounds_uint16_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.uint32):
        return _search_bounds_uint32_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.uint64):
        return _search_bounds_uint64_impl(values, lower, lower_inclusive, upper, upper_inclusive)
    raise TypeError(f"unsupported dtype for index_search_bounds: {dtype}")


def index_search_boundary_bounds(
    np.ndarray starts,
    np.ndarray ends,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    """Return (lo, hi) half-open bounds over the parallel segment boundary
    arrays `starts`/`ends`, dispatching on dtype.

    Raises:
        TypeError: the two dtypes differ, or the dtype is unsupported.
        ValueError: `starts` and `ends` differ in length (new check;
            previously a mismatch silently produced inconsistent bounds).
    """
    cdef np.dtype dtype = starts.dtype
    if dtype != ends.dtype:
        raise TypeError("starts and ends must have the same dtype")
    if starts.shape[0] != ends.shape[0]:
        raise ValueError("starts and ends must have the same length")
    if dtype == np.dtype(np.float32):
        return _search_boundary_bounds_float32_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.float64):
        return _search_boundary_bounds_float64_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.int8):
        return _search_boundary_bounds_int8_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.int16):
        return _search_boundary_bounds_int16_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.int32):
        return _search_boundary_bounds_int32_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.int64):
        return _search_boundary_bounds_int64_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.uint8):
        return _search_boundary_bounds_uint8_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.uint16):
        return _search_boundary_bounds_uint16_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.uint32):
        return _search_boundary_bounds_uint32_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    if dtype == np.dtype(np.uint64):
        return _search_boundary_bounds_uint64_impl(starts, ends, lower, lower_inclusive, upper, upper_inclusive)
    raise TypeError(f"unsupported dtype for index_search_boundary_bounds: {dtype}")


cdef tuple _collect_chunk_positions_float32(
    np.ndarray[np.int64_t, ndim=1] offsets,
    np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids,
    object values_sidecar,
    object positions_sidecar,
    object l2_sidecar,
    np.ndarray l2_row,
    np.ndarray[np.float32_t, ndim=1] span_values,
    np.ndarray local_positions,
    int64_t chunk_len,
    int32_t nav_segment_len,
    int32_t nsegments_per_chunk,
    object lower,
    bint lower_inclusive,
    object upper,
    bint upper_inclusive,
):
    cdef np.ndarray[np.float32_t, ndim=1] starts = l2_row["start"]
    cdef np.ndarray[np.float32_t, ndim=1] ends = l2_row["end"]
    cdef Py_ssize_t idx
    cdef int64_t chunk_id
    cdef int64_t chunk_items
    cdef int32_t segment_count
    cdef int seg_lo
    cdef int seg_hi
cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_float32_impl( + starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive + ) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_float32_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_float64( + np.ndarray[np.int64_t, ndim=1] offsets, + np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, + object positions_sidecar, + object l2_sidecar, + np.ndarray l2_row, + np.ndarray[np.float64_t, ndim=1] span_values, + np.ndarray local_positions, + int64_t chunk_len, + int32_t nav_segment_len, + int32_t nsegments_per_chunk, + object lower, + bint lower_inclusive, + object upper, + 
bint upper_inclusive, +): + cdef np.ndarray[np.float64_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.float64_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_float64_impl( + starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive + ) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_float64_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_int8( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object 
values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.int8_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.int8_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.int8_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_int8_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_int8_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), 
total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_int16( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.int16_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.int16_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.int16_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_int16_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_int16_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = 
local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_int32( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.int32_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.int32_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.int32_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_int32_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view 
= span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_int32_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_int64( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.int64_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.int64_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.int64_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_int64_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, 
upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_int64_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_uint8( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.uint8_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.uint8_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.uint8_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = 
((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_uint8_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_uint8_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_uint16( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.uint16_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.uint16_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.uint16_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int 
total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_uint16_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_uint16_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_uint32( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.uint32_t, ndim=1] span_values, np.ndarray local_positions, + int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.uint32_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.uint32_t, ndim=1] ends = 
l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_uint32_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_uint32_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +cdef tuple _collect_chunk_positions_uint64( + np.ndarray[np.int64_t, ndim=1] offsets, np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, object positions_sidecar, object l2_sidecar, np.ndarray l2_row, + np.ndarray[np.uint64_t, ndim=1] span_values, np.ndarray local_positions, + 
int64_t chunk_len, int32_t nav_segment_len, int32_t nsegments_per_chunk, + object lower, bint lower_inclusive, object upper, bint upper_inclusive, +): + cdef np.ndarray[np.uint64_t, ndim=1] starts = l2_row["start"] + cdef np.ndarray[np.uint64_t, ndim=1] ends = l2_row["end"] + cdef Py_ssize_t idx + cdef int64_t chunk_id + cdef int64_t chunk_items + cdef int32_t segment_count + cdef int seg_lo + cdef int seg_hi + cdef int64_t local_start + cdef int64_t local_stop + cdef int32_t span_items + cdef int lo + cdef int hi + cdef int total_candidate_segments = 0 + cdef list parts = [] + cdef np.ndarray values_view + cdef np.ndarray positions_view + for idx in range(candidate_chunk_ids.shape[0]): + chunk_id = candidate_chunk_ids[idx] + chunk_items = offsets[chunk_id + 1] - offsets[chunk_id] + segment_count = ((chunk_items + nav_segment_len - 1) // nav_segment_len) + l2_sidecar.get_1d_span_numpy(l2_row, chunk_id, 0, nsegments_per_chunk) + seg_lo, seg_hi = _search_boundary_bounds_uint64_impl(starts[:segment_count], ends[:segment_count], lower, lower_inclusive, upper, upper_inclusive) + total_candidate_segments += seg_hi - seg_lo + if seg_lo >= seg_hi: + continue + local_start = seg_lo * nav_segment_len + local_stop = min(seg_hi * nav_segment_len, chunk_items) + span_items = (local_stop - local_start) + values_view = span_values[:span_items] + values_sidecar.get_1d_span_numpy(values_view, chunk_id, local_start, span_items) + lo, hi = _search_bounds_uint64_impl(values_view, lower, lower_inclusive, upper, upper_inclusive) + if lo >= hi: + continue + positions_view = local_positions[: hi - lo] + positions_sidecar.get_1d_span_numpy(positions_view, chunk_id, (local_start + lo), hi - lo) + parts.append(chunk_id * chunk_len + positions_view.astype(np.int64, copy=False)) + if not parts: + return np.empty(0, dtype=np.int64), total_candidate_segments + return (np.concatenate(parts) if len(parts) > 1 else parts[0]), total_candidate_segments + + +def 
index_collect_reduced_chunk_nav_positions( + np.ndarray[np.int64_t, ndim=1] offsets, + np.ndarray[np.intp_t, ndim=1] candidate_chunk_ids, + object values_sidecar, + object positions_sidecar, + object l2_sidecar, + np.ndarray l2_row, + np.ndarray span_values, + np.ndarray local_positions, + int64_t chunk_len, + int32_t nav_segment_len, + int32_t nsegments_per_chunk, + object lower, + bint lower_inclusive, + object upper, + bint upper_inclusive, +): + cdef np.dtype dtype = span_values.dtype + if dtype == np.dtype(np.float32): + return _collect_chunk_positions_float32( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.float64): + return _collect_chunk_positions_float64( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.int8): + return _collect_chunk_positions_int8( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.int16): + return _collect_chunk_positions_int16( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.int32): + return _collect_chunk_positions_int32( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == 
np.dtype(np.int64): + return _collect_chunk_positions_int64( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.uint8): + return _collect_chunk_positions_uint8( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.uint16): + return _collect_chunk_positions_uint16( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.uint32): + return _collect_chunk_positions_uint32( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + if dtype == np.dtype(np.uint64): + return _collect_chunk_positions_uint64( + offsets, candidate_chunk_ids, values_sidecar, positions_sidecar, l2_sidecar, l2_row, + span_values, local_positions, chunk_len, nav_segment_len, nsegments_per_chunk, + lower, lower_inclusive, upper, upper_inclusive + ) + raise TypeError("unsupported dtype for index_collect_reduced_chunk_nav_positions") diff --git a/src/blosc2/lazyexpr.py b/src/blosc2/lazyexpr.py index 63d656bc..5e53a97b 100644 --- a/src/blosc2/lazyexpr.py +++ b/src/blosc2/lazyexpr.py @@ -1758,6 +1758,7 @@ def slices_eval( # noqa: C901 ne_args = {} chunks = kwargs.get("chunks") where: dict | None = kwargs.pop("_where_args", None) + use_index = kwargs.pop("_use_index", True) _indices = kwargs.pop("_indices", False) if _indices and (not where or len(where) != 1): raise 
NotImplementedError("Indices can only be used with one where condition") @@ -1823,7 +1824,10 @@ def slices_eval( # noqa: C901 # Get the dtype of the array to sort dtype_ = operands["_where_x"].dtype # Now, use only the fields that are necessary for the sorting - dtype_ = np.dtype([(f, dtype_[f]) for f in _order]) + if dtype_.fields is not None and all(f in dtype_.fields for f in _order): + dtype_ = np.dtype([(f, dtype_[f]) for f in _order]) + else: + dtype_ = np.dtype(np.int64) # Iterate over the operands and get the chunks chunk_operands = {} @@ -1836,6 +1840,83 @@ def slices_eval( # noqa: C901 if 0 not in chunks else np.asarray(shape) ) + index_plan = None + if where is not None and len(where) == 1 and use_index and _slice == (): + from . import indexing + + _cache_array = where["_where_x"] + _cache_tokens = [indexing.SELF_TARGET_NAME] + + # --- Ordered path --- + if _order is not None: + ordered_plan = indexing.plan_ordered_query(expression, operands, where, _order) + if ordered_plan.usable: + cached_coords = indexing.get_cached_coords(_cache_array, expression, _cache_tokens, _order) + if cached_coords is not None: + return cached_coords + ordered_positions = indexing.ordered_query_indices(expression, operands, where, _order) + if ordered_positions is not None: + indexing.store_cached_coords( + _cache_array, expression, _cache_tokens, _order, ordered_positions + ) + return ordered_positions + elif indexing.is_expression_order(where["_where_x"], _order): + raise ValueError("expression order requires a matching full expression index") + + # --- Indices-only path (.indices().compute()) --- + if _indices and _order is None: + cached_coords = indexing.get_cached_coords(_cache_array, expression, _cache_tokens, None) + if cached_coords is not None: + return cached_coords + + # --- Value-returning path (arr[cond][:]) — cache check before plan_query --- + _cache_urlpath = getattr(_cache_array, "urlpath", None) or getattr( + getattr(_cache_array, "ndarr", None), 
"urlpath", None + ) + if not _indices and _order is None: + cached_coords = indexing.get_cached_coords(_cache_array, expression, _cache_tokens, None) + if cached_coords is not None: + cached_plan = indexing.IndexPlan( + usable=True, reason="cache-hit", base=_cache_array, exact_positions=cached_coords + ) + return indexing.evaluate_full_query(where, cached_plan) + + index_plan = indexing.plan_query(expression, operands, where, use_index=use_index) + + if _indices and _order is None and index_plan.usable: + if index_plan.exact_positions is not None: + coords = np.asarray(index_plan.exact_positions, dtype=np.int64) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return coords + if index_plan.bucket_masks is not None: + _, coords = indexing.evaluate_light_query( + expression, operands, ne_args, where, index_plan, return_positions=True + ) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return coords + if index_plan.candidate_units is not None and index_plan.segment_len is not None: + _, coords = indexing.evaluate_segment_query( + expression, operands, ne_args, where, index_plan, return_positions=True + ) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return coords + if index_plan.usable and not (_indices or _order): + if index_plan.exact_positions is not None: + coords = np.asarray(index_plan.exact_positions, dtype=np.int64) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return indexing.evaluate_full_query(where, index_plan) + if index_plan.bucket_masks is not None: + result, coords = indexing.evaluate_light_query( + expression, operands, ne_args, where, index_plan, return_positions=True + ) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return result + if index_plan.candidate_units is not None and index_plan.segment_len is not None: + result, coords = 
indexing.evaluate_segment_query( + expression, operands, ne_args, where, index_plan, return_positions=True + ) + indexing.store_cached_coords(_cache_array, expression, _cache_tokens, None, coords) + return result for chunk_slice in intersecting_chunks: # Check whether current cslice intersects with _slice @@ -1851,6 +1932,15 @@ def slices_eval( # noqa: C901 offset = tuple(s.start for s in cslice) # offset for the udf cslice_shape = tuple(s.stop - s.start for s in cslice) len_chunk = math.prod(cslice_shape) + if ( + index_plan is not None + and index_plan.usable + and index_plan.level == "chunk" + and not index_plan.candidate_units[nchunk] + ): + if _indices or _order: + leninputs += len_chunk + continue # get local index of part of out that is to be updated cslice_subidx = ( ndindex.ndindex(cslice).as_subindex(_slice).raw @@ -3687,6 +3777,39 @@ def sort(self, order: str | list[str] | None = None) -> blosc2.LazyArray: lazy_expr._order = order return lazy_expr + def will_use_index(self) -> bool: + """Return whether the current lazy query can use an index.""" + from . import indexing + + return indexing.will_use_index(self) + + def explain(self) -> dict: + """Explain how this lazy query will be executed. + + Returns a dictionary describing the planner decision for the current + query. Typical fields include whether an index will be used, the chosen + index kind and level, candidate counts, and the lookup path selected + for ``full`` indexes. + + Returns: + dict: Query planning metadata for the current expression. + + Examples: + >>> import numpy as np + >>> import blosc2 + >>> arr = blosc2.asarray(np.arange(10)) + >>> _ = arr.create_index(kind="full") + >>> expr = blosc2.lazyexpr("(a >= 3) & (a < 6)", {"a": arr}).where(arr) + >>> info = expr.explain() + >>> info["will_use_index"] + True + >>> info["kind"] + 'full' + """ + from . 
import indexing + + return indexing.explain_query(self) + def compute( self, item=(), @@ -3735,6 +3858,7 @@ def compute( "_indices", "_order", "_ne_args", + "_use_index", "dtype", "shape", "fp_accuracy", diff --git a/src/blosc2/ndarray.py b/src/blosc2/ndarray.py index c972e03a..a571152b 100644 --- a/src/blosc2/ndarray.py +++ b/src/blosc2/ndarray.py @@ -13,6 +13,7 @@ import tempfile from abc import abstractmethod from collections import OrderedDict, namedtuple +from collections.abc import Mapping from functools import reduce from itertools import product from typing import TYPE_CHECKING, Any, NamedTuple, Protocol, runtime_checkable @@ -143,6 +144,41 @@ def __getitem__(self, key: Any) -> Any: ... +class FieldsAccessor(Mapping): + """Read-only mapping of structured field views.""" + + def __init__(self, field_views: dict[str, Any]): + self._field_views = field_views + + def __getitem__(self, key: str) -> Any: + return self._field_views[key] + + def __iter__(self) -> Iterator[str]: + return iter(self._field_views) + + def __len__(self) -> int: + return len(self._field_views) + + def __setitem__(self, key: str, value: object) -> None: + raise TypeError(f'assign through the field view, e.g. 
array.fields["{key}"][:] = values') + + def copy(self) -> dict[str, Any]: + return dict(self._field_views) + + def __or__(self, other: object) -> dict[str, Any]: + if not isinstance(other, Mapping): + return NotImplemented + return self.copy() | dict(other) + + def __ror__(self, other: object) -> dict[str, Any]: + if not isinstance(other, Mapping): + return NotImplemented + return dict(other) | self.copy() + + def __repr__(self) -> str: + return repr(self._field_views) + + def is_documented_by(original): def wrapper(target): target.__doc__ = original.__doc__ @@ -3695,10 +3731,11 @@ def __init__(self, **kwargs): base = kwargs.pop("_base", None) super().__init__(kwargs["_array"], base=base) # Accessor to fields - self._fields = {} + field_views = {} if self.dtype.fields: for field in self.dtype.fields: - self._fields[field] = NDField(self, field) + field_views[field] = NDField(self, field) + self._fields = FieldsAccessor(field_views) @property def cparams(self) -> blosc2.CParams: @@ -3747,14 +3784,14 @@ def vlmeta(self) -> dict: return self.schunk.vlmeta @property - def fields(self) -> dict: + def fields(self) -> Mapping[str, NDField]: """ - Dictionary with the fields of the structured array. + Read-only mapping with the fields of the structured array. Returns ------- - fields: dict - A dictionary with the fields of the structured array. + fields: Mapping + A read-only mapping with the fields of the structured array. See Also -------- @@ -3770,6 +3807,8 @@ def fields(self) -> dict: >>> sa = blosc2.zeros(shape, dtype=dtype) >>> # Check that fields are equal >>> assert sa.fields['a'] == sa.fields['b'] + >>> # Assign through the field view + >>> sa.fields['a'][:] = 1 """ return self._fields @@ -4409,14 +4448,22 @@ def __setitem__( _slice = ndindex.ndindex(()).expand(self.shape) # just get whole array else: # do nothing return self - return self._get_set_findex_default(_slice, value=value) + result = self._get_set_findex_default(_slice, value=value) + from . 
import indexing + + indexing.mark_indexes_stale(self) + return result start, stop, step, none_mask = get_ndarray_start_stop(self.ndim, key_, self.shape) if step != (1,) * self.ndim: # handle non-unit or negative steps if np.any(none_mask): raise ValueError("Cannot mix non-unit steps and None indexing for __setitem__.") - return self._get_set_nonunit_steps((start, stop, step, mask), value=value) + result = self._get_set_nonunit_steps((start, stop, step, mask), value=value) + from . import indexing + + indexing.mark_indexes_stale(self) + return result shape = [sp - st for sp, st in zip(stop, start, strict=False)] if isinstance(value, blosc2.Operand): # handles SimpleProxy, NDArray, LazyExpr etc. @@ -4431,7 +4478,11 @@ def __setitem__( # when using complex functions (e.g. conj) with real arrays value = value.real.astype(self.dtype) - return super().set_slice((start, stop), value) + result = super().set_slice((start, stop), value) + from . import indexing + + indexing.mark_indexes_stale(self) + return result def __iter__(self): """Iterate over the (outer) elements of the array. @@ -4705,6 +4756,237 @@ def save(self, urlpath: str, contiguous=True, **kwargs: Any) -> None: super().copy(self.dtype, cparams=asdict(self.cparams), **kwargs) + def create_index( + self, + field: str | None = None, + kind: str = "light", + optlevel: int = 5, + persistent: bool | None = None, + in_mem: bool = False, + name: str | None = None, + **kwargs: Any, + ) -> dict: + """Create an index for a 1-D array or structured field. + + Parameters + ---------- + field : str or None, optional + Field to index for structured dtypes. Use ``None`` to index the + array values for plain 1-D arrays. Structured arrays require an + explicit field name. + kind : {"ultralight", "light", "medium", "full"}, optional + Index tier to build. 
Use ``light`` or ``medium`` for faster/lighter + filter-oriented indexes, and ``full`` when exact ordered access via + ``sort(order=...)``, ``indices(order=...)``, or ``itersorted(...)`` + should reuse the index directly. + optlevel : int, optional + Optimization level for index payload construction. + persistent : bool or None, optional + Whether index sidecars should be persisted. If ``None``, this follows whether the base array is persistent. + in_mem : bool, optional + Force the in-memory builder. When set to ``True``, index creation materializes the indexed field in RAM and + may allocate additional temporary arrays for sorting, permutations, and block payloads. For large datasets + this can require substantially more memory than the final index itself, so the default is ``False`` and + uses the out-of-core builders for ``light``, ``medium``, and ``full``. + name : str or None, optional + Optional logical label stored in the descriptor. Index identity is + still driven by the target field, so creating another index on the + same field replaces the previous one. + kwargs : dict, optional + Keyword arguments forwarded to the index builder. At the moment the + supported option is ``cparams``. Pass ``cparams`` to control the + compression settings used for index sidecars, including + ``codec``, ``clevel``, and ``nthreads``. If provided, + ``cparams["nthreads"]`` becomes the default build-thread count for + intra-chunk sorting unless ``BLOSC2_INDEX_BUILD_THREADS`` overrides + it. + + Notes + ----- + The current indexing model supports one active index target per field. + Append operations keep compatible indexes current, while general + mutation and resize operations mark indexes as stale until rebuild. + + Chunk-local index creation uses parallel intra-chunk sorting by default. + Set ``BLOSC2_INDEX_BUILD_THREADS=1`` to disable parallel sorting. 
If + ``cparams`` is provided in ``kwargs``, its ``nthreads`` value becomes + the default build-thread count unless + ``BLOSC2_INDEX_BUILD_THREADS`` overrides it. + """ + from . import indexing + + return indexing.create_index( + self, + field=field, + kind=kind, + optlevel=optlevel, + persistent=persistent, + in_mem=in_mem, + name=name, + **kwargs, + ) + + def create_csindex(self, field: str | None = None, **kwargs: Any) -> dict: + """Create a fully sorted index for a 1-D array or structured field. + + This is a convenience wrapper for ``create_index(kind="full")`` and is + the required index tier for direct ordered reuse in + ``sort(order=...)``, ``indices(order=...)``, and ``itersorted(...)``. + """ + from . import indexing + + return indexing.create_csindex(self, field=field, **kwargs) + + def create_expr_index( + self, + expression: str, + *, + operands: dict | None = None, + kind: str = "light", + optlevel: int = 3, + persistent: bool | None = None, + in_mem: bool = False, + name: str | None = None, + **kwargs: Any, + ) -> dict: + """Create an index on a derived 1-D expression stream. + + Parameters + ---------- + expression : str + Deterministic scalar expression to materialize and index. Structured + arrays typically use field names directly, such as ``"abs(x)"`` or + ``"a + b"``. For plain 1-D arrays, provide ``operands`` explicitly + or use the default ``"value"`` name. + operands : dict or None, optional + Operand mapping used for normalization and evaluation. When omitted, + structured arrays default to ``self.fields`` and plain arrays use + ``{"value": self}``. + kind, optlevel, persistent, in_mem, name + Same meaning as in :meth:`create_index`. Setting ``in_mem=True`` + materializes the derived expression stream in RAM and can allocate + additional temporary arrays for sorting and block payloads, so the + default remains ``False`` and uses the out-of-core builders for + ``light``, ``medium``, and ``full``. 
+ kwargs : dict, optional + Keyword arguments forwarded to the index builder. At the moment the + supported option is ``cparams``. Pass ``cparams`` to control the + compression settings used for index sidecars, including + ``codec``, ``clevel``, and ``nthreads``. If provided, + ``cparams["nthreads"]`` becomes the default build-thread count for + intra-chunk sorting unless ``BLOSC2_INDEX_BUILD_THREADS`` overrides + it. + + Notes + ----- + Expression indexes are matched by normalized expression identity. The + current implementation supports one active index target per normalized + expression key. + + Chunk-local index creation uses parallel intra-chunk sorting by default. + Set ``BLOSC2_INDEX_BUILD_THREADS=1`` to disable parallel sorting. If + ``cparams`` is provided in ``kwargs``, its ``nthreads`` value becomes + the default build-thread count unless + ``BLOSC2_INDEX_BUILD_THREADS`` overrides it. + """ + from . import indexing + + return indexing.create_expr_index( + self, + expression, + operands=operands, + kind=kind, + optlevel=optlevel, + persistent=persistent, + in_mem=in_mem, + name=name, + **kwargs, + ) + + def drop_index(self, field: str | None = None, name: str | None = None) -> None: + """Drop an index by field or optional descriptor label.""" + from . import indexing + + indexing.drop_index(self, field=field, name=name) + + def rebuild_index(self, field: str | None = None, name: str | None = None) -> dict: + """Rebuild an index by field or optional descriptor label.""" + from . import indexing + + return indexing.rebuild_index(self, field=field, name=name) + + def compact_index(self, field: str | None = None, name: str | None = None) -> dict: + """Compact a ``full`` index by merging its compact base and append runs. + + Parameters + ---------- + field : str or None, optional + Structured field identifying the target ``full`` index. Use + ``None`` to compact the value index for a plain 1-D array. 
+ name : str or None, optional + Optional logical index label. When omitted and the array has a + single index, that index is selected automatically. + + Returns + ------- + out : dict + The updated index descriptor after compaction. + + Notes + ----- + This is currently implemented only for ``kind="full"`` indexes. It is + a structural maintenance operation: the compact base sidecars and any + pending append runs are merged into one compact ``full.values`` sidecar + and one compact ``full.positions`` sidecar. For persistent indexes, the + compact lookup metadata is rebuilt as part of the process and + ``full["runs"]`` becomes empty afterwards. + + Compaction does not change query results. It is useful after many + append operations, where ``full`` maintenance stays cheap on append by + recording sorted runs but later queries may still have extra work until + the index is consolidated explicitly. + + Examples + -------- + >>> import blosc2 + >>> import numpy as np + >>> dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + >>> data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + >>> arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + >>> _ = arr.create_index(field="id", kind="full") + >>> _ = arr.append(np.array([(0, 100), (3, 101)], dtype=dtype)) + >>> len(arr.indexes[0]["full"]["runs"]) + 1 + >>> compacted = arr.compact_index("id") + >>> compacted["full"]["runs"] + [] + """ + from . import indexing + + return indexing.compact_index(self, field=field, name=name) + + def index(self, field: str | None = None, name: str | None = None) -> blosc2.indexing.Index: + """Return a live view over one index. + + Parameters + ---------- + field : str or None, optional + Structured field identifying the target index. Use ``None`` for the + value index on a plain 1-D array. + name : str or None, optional + Optional logical index label. When omitted and the array has a + single index, that index is selected automatically. + """ + from . 
import indexing + + return indexing.get_index(self, field=field, name=name) + + @property + def indexes(self) -> list[blosc2.indexing.Index]: + from . import indexing + + return indexing.get_indexes(self) + def resize(self, newshape: tuple | list) -> None: """Change the shape of the array by growing or shrinking one or more dimensions. @@ -4745,6 +5027,57 @@ def resize(self, newshape: tuple | list) -> None: ) blosc2_ext.check_access_mode(self.schunk.urlpath, self.schunk.mode) super().resize(newshape) + from . import indexing + + indexing.mark_indexes_stale(self) + + def append(self, values: object) -> int: + """Append values to a 1-D array and keep indexes current when possible. + + Parameters + ---------- + values : object + Values to append. Scalars append one element; array-like inputs must be + compatible with ``self.dtype`` and flatten to one dimension. + + Returns + ------- + out : int + The new length of the array. + + Notes + ----- + Appending to indexed arrays updates the index sidecars as part of the + append path. For ``full`` indexes this extends the sorted payload + incrementally; for ``light`` and ``medium`` only the affected tail + segments and block payloads are recomputed. General slice updates and + resizes outside ``append()`` still mark indexes as stale. 
+ """ + if self.ndim != 1: + raise ValueError("append() is only supported for 1-D arrays") + if 0 in self.chunks or 0 in self.blocks: + raise ValueError("Cannot append to arrays with zero-sized chunks or blocks") + + blosc2_ext.check_access_mode(self.schunk.urlpath, self.schunk.mode) + + appended = np.asarray(values, dtype=self.dtype) + if appended.ndim == 0: + appended = appended.reshape(1) + elif appended.ndim != 1: + appended = appended.reshape(-1) + if appended.dtype != self.dtype: + appended = appended.astype(self.dtype, copy=False) + if len(appended) == 0: + return int(self.shape[0]) + + old_size = int(self.shape[0]) + super().resize((old_size + len(appended),)) + super().set_slice(([old_size], [old_size + len(appended)]), appended) + + from . import indexing + + indexing.append_to_indexes(self, old_size, appended) + return int(self.shape[0]) def slice(self, key: int | slice | Sequence[slice], **kwargs: Any) -> NDArray: """Get a (multidimensional) slice as a new :ref:`NDArray`. @@ -4867,16 +5200,52 @@ def indices(self, order: str | list[str] | None = None, **kwargs: Any) -> NDArra This is only valid for 1-dim structured arrays. + When the primary order key has a matching ``full`` index, the ordered + positions are produced directly from that index. Secondary keys refine + ties after the primary indexed order and the traversal is ascending and + stable. + See full documentation in :func:`indices`. """ return indices(self, order, **kwargs) + def itersorted( + self, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + batch_size: int | None = None, + ) -> Iterator[np.generic | np.void]: + """Iterate array values following a matching full index order. + + Parameters + ---------- + order : str, list of str, optional + Sort order to iterate. The first field must have an associated + ``full`` index. 
Traversal is ascending and stable; if only the + primary key is indexed, secondary keys refine ties after the primary + indexed order. + start, stop, step : int or None, optional + Optional slice applied to the ordered sequence before iteration. + batch_size : int or None, optional + Internal prefetch size used when reading ordered rows. Larger values + reduce read overhead at the cost of more temporary memory. + """ + return itersorted(self, order, start=start, stop=stop, step=step, batch_size=batch_size) + def sort(self, order: str | list[str] | None = None, **kwargs: Any) -> NDArray: """ Return a sorted array following the specified order, or the order of the fields. This is only valid for 1-dim structured arrays. + When the primary order key has a matching ``full`` index, the ordered + rows are gathered directly from that index. Secondary keys refine ties + after the primary indexed order and the traversal is ascending and + stable. + See full documentation in :func:`sort`. """ return sort(self, order, **kwargs) @@ -5958,6 +6327,32 @@ def save(array: NDArray, urlpath: str, contiguous=True, **kwargs: Any) -> None: array.save(urlpath, contiguous, **kwargs) +def _ndarray_asarray_requires_copy( + array: NDArray, dtype: np.dtype, chunks, blocks, user_kwargs: dict[str, Any] +) -> bool: + if np.dtype(dtype) != np.dtype(array.dtype): + return True + if "chunks" in user_kwargs and tuple(chunks) != tuple(array.chunks): + return True + if "blocks" in user_kwargs and tuple(blocks) != tuple(array.blocks): + return True + + copy_keys = { + "cparams", + "dparams", + "meta", + "urlpath", + "contiguous", + "mode", + "mmap_mode", + "initial_mapping_size", + "storage", + "out", + "_chunksize_reduc_factor", + } + return builtins.any(key in user_kwargs for key in copy_keys) + + def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: Any) -> NDArray: """Convert the `array` to an `NDArray`. 
@@ -5969,7 +6364,8 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: copy: bool | None, optional Whether to copy the input. If True, the function copies. If False, raise a ValueError if copy is necessary. If None and - input is NDArray, avoid copy by returning lazyexpr. + input is NDArray, return the original array when no dtype, + partition, or storage-related changes are requested. Default: None. kwargs: dict, optional @@ -5977,8 +6373,9 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: Returns ------- - out: :ref:`NDArray` or :ref:`LazyExpr` - An new NDArray or LazyExpr made of :paramref:`array`. + out: :ref:`NDArray` + A new :ref:`NDArray` made of :paramref:`array`, or the original + array when a copy is not required. Notes ----- @@ -5996,7 +6393,11 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: >>> a = np.arange(0, np.prod(shape), dtype=np.int64).reshape(shape) >>> # Create a NDArray from a NumPy array >>> nda = blosc2.asarray(a) + >>> # NDArray inputs are returned as-is unless a copy is requested + >>> blosc2.asarray(nda) is nda + True """ + user_kwargs = kwargs.copy() # Convert scalars to numpy array casting = kwargs.pop("casting", "unsafe") if casting != "unsafe": @@ -6004,7 +6405,7 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: if not hasattr(array, "shape"): array = np.asarray(array) # defaults if dtype=None dtype_ = blosc2.proxy.convert_dtype(array.dtype) - dtype = kwargs.pop("dtype", dtype_) # check if dtype provided + dtype = blosc2.proxy.convert_dtype(kwargs.pop("dtype", dtype_)) # check if dtype provided kwargs = _check_ndarray_kwargs(**kwargs) chunks = kwargs.pop("chunks", None) blocks = kwargs.pop("blocks", None) @@ -6016,9 +6417,17 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: if blocks is None and hasattr(array, "blocks") and isinstance(array.blocks, tuple | list): blocks = 
array.blocks - copy = True if copy is None and not isinstance(array, NDArray) else copy + requires_copy = isinstance(array, NDArray) and _ndarray_asarray_requires_copy( + array, dtype, chunks, blocks, user_kwargs + ) + if copy is None: + copy = not isinstance(array, NDArray) or requires_copy + elif copy is False and requires_copy: + raise ValueError( + "Cannot satisfy dtype, partition, or storage changes with copy=False for NDArray input." + ) if copy: - chunks, blocks = compute_chunks_blocks(array.shape, chunks, blocks, dtype_, **kwargs) + chunks, blocks = compute_chunks_blocks(array.shape, chunks, blocks, dtype, **kwargs) # Fast path for small arrays. This is not too expensive in terms of memory consumption. shape = array.shape small_size = 2**24 # 16 MB @@ -6033,7 +6442,7 @@ def asarray(array: Sequence | blosc2.Array, copy: bool | None = None, **kwargs: return blosc2_ext.asarray(array, chunks, blocks, **kwargs) # Create the empty array - ndarr = empty(shape, dtype_, chunks=chunks, blocks=blocks, **kwargs) + ndarr = empty(shape, dtype, chunks=chunks, blocks=blocks, **kwargs) behaved = are_partitions_behaved(shape, chunks, blocks) # Get the coordinates of the chunks @@ -6216,21 +6625,39 @@ def indices(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: The (structured) array to be sorted. order: str, list of str, optional Specifies which fields to compare first, second, etc. A single - field can be specified as a string. Not all fields need to be - specified, only the ones by which the array is to be sorted. - If None, the array is not sorted. + field can be specified as a string. The primary order key may also be + an indexed expression such as ``"abs(x)"`` when a matching ``full`` + expression index exists. Not all fields need to be specified, only the + ones by which the array is to be sorted. If None, the array is not sorted. kwargs: Any, optional Keyword arguments that are supported by the :func:`empty` constructor. 
Returns ------- out: :ref:`NDArray` - The sorted array. + The ordered logical positions. + + Notes + ----- + If the primary order key has a matching ``full`` field or expression index, + the positions are returned directly from that index in ascending stable + order. Secondary keys refine ties after the primary indexed order. + Field-based orders without a matching full index fall back to a + scan-plus-sort path. """ if not order: # Shortcut for this relatively rare case return arange(array.shape[0], dtype=np.int64) + if isinstance(array, blosc2.NDArray): + from . import indexing + + ordered = indexing.ordered_indices(array, order=order) + if ordered is not None: + return blosc2.asarray(ordered, **kwargs) + if indexing.is_expression_order(array, order): + raise ValueError("expression order requires a matching full expression index") + # Create a lazy array to access the sort machinery there # This is a bit of a hack, but it is the simplest way to do it # (the sorting mechanism in LazyExpr should be improved to avoid this) @@ -6251,8 +6678,10 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An The (structured) array to be sorted. order: str, list of str, optional Specifies which fields to compare first, second, etc. A single - field can be specified as a string. Not all fields need to be - specified, only the ones by which the array is to be sorted. + field can be specified as a string. The primary order key may also be + an indexed expression such as ``"abs(x)"`` when a matching ``full`` + expression index exists. Not all fields need to be specified, only the + ones by which the array is to be sorted. kwargs: Any, optional Keyword arguments that are supported by the :func:`empty` constructor. @@ -6260,10 +6689,26 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An ------- out: :ref:`NDArray` The sorted array. 
+ + Notes + ----- + If the primary order key has a matching ``full`` field or expression index, + rows are gathered directly in ascending stable index order. Secondary keys + refine ties after the primary indexed order. Field-based orders without a + matching full index fall back to a scan-plus-sort path. """ if not order: return array + if isinstance(array, blosc2.NDArray): + from . import indexing + + ordered = indexing.read_sorted(array, order=order) + if ordered is not None: + return blosc2.asarray(ordered, **kwargs) + if indexing.is_expression_order(array, order): + raise ValueError("expression order requires a matching full expression index") + # Create a lazy array to access the sort machinery there # This is a bit of a hack, but it is the simplest way to do it # (the sorting mechanism in LazyExpr should be improved to avoid this) @@ -6272,6 +6717,44 @@ def sort(array: blosc2.Array, order: str | list[str] | None = None, **kwargs: An return larr.sort(order).compute(**kwargs) +def itersorted( + array: blosc2.Array, + order: str | list[str] | None = None, + *, + start: int | None = None, + stop: int | None = None, + step: int | None = None, + batch_size: int | None = None, +) -> Iterator[np.generic | np.void]: + """ + Iterate array values following a matching full index order. + + Parameters + ---------- + array : :ref:`blosc2.Array` + The array to iterate. + order : str, list of str, optional + Specifies which fields define the ordered traversal. The first field + must have an associated ``full`` index. + start, stop, step : int or None, optional + Optional slice applied to the ordered sequence before iteration. + batch_size : int or None, optional + Internal prefetch size used during iteration. + + Notes + ----- + This requires a matching ``full`` index on the primary order key. The + iteration order is ascending and stable. Secondary keys refine ties after + the primary indexed order. 
+ """ + if not isinstance(array, blosc2.NDArray): + raise TypeError("itersorted() is only supported on NDArray") + + from . import indexing + + return indexing.iter_sorted(array, order=order, start=start, stop=stop, step=step, batch_size=batch_size) + + # Class for dealing with fields in an NDArray # This will allow to access fields by name in the dtype of the NDArray class NDField(Operand): diff --git a/src/blosc2/storage.py b/src/blosc2/storage.py index 0015aea9..5863b145 100644 --- a/src/blosc2/storage.py +++ b/src/blosc2/storage.py @@ -13,7 +13,7 @@ def default_nthreads(): - return blosc2.nthreads + return 1 if blosc2.IS_WASM else blosc2.nthreads def default_filters(): diff --git a/tests/ndarray/test_indexing.py b/tests/ndarray/test_indexing.py new file mode 100644 index 00000000..2829fc18 --- /dev/null +++ b/tests/ndarray/test_indexing.py @@ -0,0 +1,1892 @@ +####################################################################### +# Copyright (c) 2019-present, Blosc Development Team +# All rights reserved. 
+# +# SPDX-License-Identifier: BSD-3-Clause +####################################################################### +import gc +import math +from pathlib import Path + +import numpy as np +import pytest + +import blosc2 +import blosc2.indexing as indexing + + +@pytest.mark.parametrize("kind", ["ultralight", "light", "medium", "full"]) +def test_scalar_index_matches_scan(kind): + data = np.arange(200_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(10_000,), blocks=(2_000,)) + descriptor = arr.create_index(kind=kind) + + assert descriptor["kind"] == kind + assert descriptor["field"] is None + assert descriptor["target"] == {"source": "field", "field": None} + assert len(arr.indexes) == 1 + + expr = ((arr >= 120_000) & (arr < 125_000)).where(arr) + assert expr.will_use_index() is True + explanation = expr.explain() + assert explanation["candidate_units"] < explanation["total_units"] or explanation["level"] == "full" + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data >= 120_000) & (data < 125_000)]) + + +@pytest.mark.parametrize("kind", ["ultralight", "light", "medium", "full"]) +def test_structured_field_index_matches_scan(kind): + dtype = np.dtype([("id", np.int64), ("payload", np.float64)]) + data = np.empty(120_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + data["payload"] = np.linspace(0, 1, data.shape[0], dtype=np.float64) + + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) + descriptor = arr.create_index(field="id", kind=kind) + assert descriptor["target"] == {"source": "field", "field": "id"} + + expr = blosc2.lazyexpr("(id >= 48_000) & (id < 51_000)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, 
data[(data["id"] >= 48_000) & (data["id"] < 51_000)]) + + +def test_module_level_will_use_index_matches_lazyexpr_method(): + import blosc2.indexing as indexing + + indexed = blosc2.asarray(np.arange(100_000, dtype=np.int64), chunks=(10_000,), blocks=(2_000,)) + indexed.create_index(kind="medium") + indexed_expr = ((indexed >= 48_000) & (indexed < 51_000)).where(indexed) + + plain = blosc2.asarray(np.arange(100_000, dtype=np.int64), chunks=(10_000,), blocks=(2_000,)) + plain_expr = ((plain >= 48_000) & (plain < 51_000)).where(plain) + + assert indexing.will_use_index(indexed_expr) is True + assert indexed_expr.will_use_index() is True + assert indexing.will_use_index(indexed_expr) == indexed_expr.will_use_index() + + assert indexing.will_use_index(plain_expr) is False + assert plain_expr.will_use_index() is False + assert indexing.will_use_index(plain_expr) == plain_expr.will_use_index() + + +def test_index_accessor_exposes_live_view_and_sizes(): + import blosc2.indexing as indexing + + arr = blosc2.asarray(np.arange(1_000, dtype=np.int64), chunks=(250,), blocks=(50,)) + arr.create_index(kind="medium") + + idx = arr.index() + assert isinstance(idx, indexing.Index) + assert idx.kind == "medium" + assert idx.field is None + assert idx.name == "__self__" + assert idx.target == {"source": "field", "field": None} + assert idx.persistent is False + assert idx.stale is False + assert idx["kind"] == "medium" + assert idx["target"]["field"] is None + assert idx.nbytes > 0 + assert idx.cbytes > 0 + assert idx.cratio == pytest.approx(idx.nbytes / idx.cbytes) + + arr[:3] = -1 + assert idx.stale is True + + rebuilt = idx.rebuild() + assert rebuilt is idx + assert idx.stale is False + + idx.drop() + assert arr.indexes == [] + with pytest.raises(KeyError): + _ = idx.kind + + +def test_index_accessor_compact_updates_live_view(tmp_path): + path = tmp_path / "index_accessor_compact.b2nd" + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 9), (1, 8), (2, 7), 
(1, 6)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + idx = arr.index("a") + assert idx.kind == "full" + assert idx.persistent is True + assert idx.nbytes > 0 + assert idx.cbytes > 0 + + arr.append(np.array([(0, 100), (3, 101)], dtype=dtype)) + assert len(idx["full"]["runs"]) == 1 + + compacted = idx.compact() + assert compacted is idx + assert idx["full"]["runs"] == [] + + reopened = blosc2.open(path, mode="a") + assert reopened.index("a")["full"]["runs"] == [] + + +def test_gather_positions_by_block_avoids_whole_chunk_fallback_for_multi_block_reads(monkeypatch): + import blosc2.indexing as indexing + + class FakeSource: + def __init__(self, data, chunk_len): + self.data = np.asarray(data) + self.dtype = self.data.dtype + self.chunk_len = chunk_len + self.slice_reads = 0 + self.span_reads = [] + + def __getitem__(self, key): + self.slice_reads += 1 + return self.data[key] + + def get_1d_span_numpy(self, out, nchunk, start, nitems): + self.span_reads.append((int(nchunk), int(start), int(nitems))) + base = int(nchunk) * self.chunk_len + int(start) + out[:] = self.data[base : base + int(nitems)] + + chunk_len = 10 + block_len = 4 + data = np.arange(40, dtype=np.int64) + positions = np.array([1, 5, 7, 12, 19], dtype=np.int64) + source = FakeSource(data, chunk_len) + + monkeypatch.setattr(indexing, "_supports_block_reads", lambda _: True) + + gathered = indexing._gather_positions_by_block(source, positions, chunk_len, block_len, len(data)) + + np.testing.assert_array_equal(gathered, data[positions]) + assert source.slice_reads == 0 + assert source.span_reads == [(0, 1, 1), (0, 5, 3), (1, 2, 1), (1, 9, 1)] + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_random_field_index_matches_scan(kind): + rng = np.random.default_rng(0) + dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.zeros(150_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], 
dtype=np.int64) + rng.shuffle(data["id"]) + + arr = blosc2.asarray(data, chunks=(15_000,), blocks=(3_000,)) + arr.create_index(field="id", kind=kind) + + expr = blosc2.lazyexpr("(id >= 70_000) & (id < 71_200)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data["id"] >= 70_000) & (data["id"] < 71_200)]) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_random_field_point_query_matches_scan(kind): + rng = np.random.default_rng(1) + dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.zeros(200_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + rng.shuffle(data["id"]) + + arr = blosc2.asarray(data, chunks=(20_000,), blocks=(4_000,)) + arr.create_index(field="id", kind=kind) + + expr = blosc2.lazyexpr("(id >= 123_456) & (id < 123_457)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data["id"] >= 123_456) & (data["id"] < 123_457)]) + + +@pytest.mark.parametrize( + "dtype", + [ + np.int8, + np.int16, + np.int32, + np.int64, + np.uint8, + np.uint16, + np.uint32, + np.uint64, + np.float32, + np.float64, + ], +) +def test_medium_numeric_dtype_query_matches_scan(dtype): + values = np.arange(2_000, dtype=dtype) + if np.issubdtype(dtype, np.floating): + values = values / dtype(10) + + arr = blosc2.asarray(values, chunks=(500,), blocks=(100,)) + arr.create_index(kind="medium") + + query_value = values[137].item() + indexed = arr[arr == query_value].compute()[:] + expected = values[values == query_value] + + np.testing.assert_array_equal(indexed, expected) + + +@pytest.mark.parametrize("dtype", [np.int32, np.uint32, np.float32, 
np.float64]) +def test_light_numeric_dtype_query_matches_scan(dtype): + values = np.arange(2_000, dtype=dtype) + if np.issubdtype(dtype, np.floating): + values = values / dtype(10) + + arr = blosc2.asarray(values, chunks=(500,), blocks=(100,)) + arr.create_index(kind="light") + + lower = values[137].item() + upper = values[163].item() + indexed = arr[(arr >= lower) & (arr < upper)].compute()[:] + expected = values[(values >= lower) & (values < upper)] + + np.testing.assert_array_equal(indexed, expected) + + +def test_numeric_unsupported_dtype_fallback_matches_scan(): + values = (np.arange(2_000, dtype=np.float16) / np.float16(10)).astype(np.float16) + + arr = blosc2.asarray(values, chunks=(500,), blocks=(100,)) + arr.create_index(kind="medium") + + query_value = values[137].item() + indexed = arr[arr == query_value].compute()[:] + expected = values[values == query_value] + + np.testing.assert_array_equal(indexed, expected) + + +def test_light_lossy_integer_values_match_scan(): + rng = np.random.default_rng(2) + dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.zeros(180_000, dtype=dtype) + data["id"] = np.arange(-90_000, 90_000, dtype=np.int64) + rng.shuffle(data["id"]) + + arr = blosc2.asarray(data, chunks=(18_000,), blocks=(3_000,)) + descriptor = arr.create_index(field="id", kind="light", optlevel=0) + + assert descriptor["light"]["value_lossy_bits"] == 8 + + expr = blosc2.lazyexpr("(id >= -123) & (id < 456)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data["id"] >= -123) & (data["id"] < 456)]) + + +def test_light_lossy_float_values_match_scan(): + rng = np.random.default_rng(3) + dtype = np.dtype([("x", np.float64), ("payload", np.float32)]) + data = np.zeros(160_000, dtype=dtype) + data["x"] = np.linspace(-5000.0, 5000.0, data.shape[0], 
dtype=np.float64) + rng.shuffle(data["x"]) + + arr = blosc2.asarray(data, chunks=(16_000,), blocks=(4_000,)) + descriptor = arr.create_index(field="x", kind="light", optlevel=0) + + assert descriptor["light"]["value_lossy_bits"] == 8 + + expr = blosc2.lazyexpr("(x >= -12.5) & (x < 17.25)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, data[(data["x"] >= -12.5) & (data["x"] < 17.25)]) + + +def test_ultralight_threaded_downstream_order_matches_scan(monkeypatch): + dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + data = np.zeros(240_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + data["payload"] = np.arange(data.shape[0], dtype=np.int32) + + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) + arr.create_index(field="id", kind="ultralight") + + indexing = __import__("blosc2.indexing", fromlist=["INDEX_QUERY_MIN_CHUNKS_PER_THREAD"]) + monkeypatch.setattr(indexing, "INDEX_QUERY_MIN_CHUNKS_PER_THREAD", 1) + monkeypatch.setattr(blosc2, "nthreads", 4) + + expr = blosc2.lazyexpr("(id >= 60_000) & (id < 180_000)", arr.fields).where(arr) + explanation = expr.explain() + + assert explanation["will_use_index"] is True + assert explanation["level"] == "chunk" + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = data[(data["id"] >= 60_000) & (data["id"] < 180_000)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + +def test_light_threaded_downstream_order_matches_scan(monkeypatch): + dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + data = np.zeros(240_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + data["payload"] = np.arange(data.shape[0], dtype=np.int32) + + arr = blosc2.asarray(data, chunks=(12_000,), 
blocks=(3_000,)) + arr.create_index(field="id", kind="light", in_mem=True) + + indexing = __import__("blosc2.indexing", fromlist=["INDEX_QUERY_MIN_CHUNKS_PER_THREAD"]) + monkeypatch.setattr(indexing, "INDEX_QUERY_MIN_CHUNKS_PER_THREAD", 1) + monkeypatch.setattr(blosc2, "nthreads", 4) + + expr = blosc2.lazyexpr("(id >= 60_000) & (id < 180_000)", arr.fields).where(arr) + explanation = expr.explain() + + assert explanation["will_use_index"] is True + assert explanation["lookup_path"] == "chunk-nav" + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = data[(data["id"] >= 60_000) & (data["id"] < 180_000)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_persistent_index_survives_reopen(tmp_path, kind): + path = tmp_path / "indexed_array.b2nd" + data = np.arange(80_000, dtype=np.int64) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(8_000,), blocks=(2_000,)) + descriptor = arr.create_index(kind=kind) + + if kind == "light": + assert descriptor["light"]["values_path"] is not None + elif kind == "medium": + assert descriptor["reduced"]["values_path"] is not None + else: + assert descriptor["full"]["values_path"] is not None + + del arr + reopened = blosc2.open(path, mode="a") + assert len(reopened.indexes) == 1 + if kind == "light": + assert reopened.indexes[0]["light"]["values_path"] == descriptor["light"]["values_path"] + elif kind == "medium": + assert reopened.indexes[0]["reduced"]["values_path"] == descriptor["reduced"]["values_path"] + else: + assert reopened.indexes[0]["full"]["values_path"] == descriptor["full"]["values_path"] + + expr = (reopened >= 72_000).where(reopened) + assert expr.will_use_index() is True + np.testing.assert_array_equal(expr.compute()[:], data[data >= 72_000]) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def 
test_default_ooc_persistent_index_matches_scan_and_rebuilds(tmp_path, kind): + path = tmp_path / f"indexed_ooc_{kind}.b2nd" + rng = np.random.default_rng(7) + dtype = np.dtype([("id", np.int64), ("payload", np.float32)]) + data = np.zeros(240_000, dtype=dtype) + data["id"] = np.arange(data.shape[0], dtype=np.int64) + rng.shuffle(data["id"]) + + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(24_000,), blocks=(4_000,)) + descriptor = arr.create_index(field="id", kind=kind) + + assert descriptor["ooc"] is True + + del arr + reopened = blosc2.open(path, mode="a") + assert reopened.indexes[0]["ooc"] is True + + expr = blosc2.lazyexpr("(id >= 123_456) & (id < 124_321)", reopened.fields).where(reopened) + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = data[(data["id"] >= 123_456) & (data["id"] < 124_321)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + rebuilt = reopened.rebuild_index() + assert rebuilt["ooc"] is True + + +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_persistent_chunk_local_ooc_builds_do_not_use_temp_memmap(tmp_path, kind): + path = tmp_path / f"persistent_no_memmap_{kind}.b2nd" + data = np.arange(120_000, dtype=np.int64) + indexing = __import__("blosc2.indexing", fromlist=["_segment_row_count"]) + assert not hasattr(indexing, "_open_temp_memmap") + + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(12_000,), blocks=(2_000,)) + descriptor = arr.create_index(kind=kind) + + assert descriptor["ooc"] is True + meta = descriptor["light"] if kind == "light" else descriptor["reduced"] + assert meta["values_path"] is not None + + del arr + reopened = blosc2.open(path, mode="a") + expr = ((reopened >= 55_000) & (reopened < 55_010)).where(reopened) + np.testing.assert_array_equal(expr.compute()[:], data[(data >= 55_000) & (data < 55_010)]) + + +@pytest.mark.parametrize("kind", ["light", "medium"]) +def 
test_in_memory_chunk_local_ooc_builds_do_not_use_temp_memmap(kind): + data = np.arange(120_000, dtype=np.int64) + indexing = __import__("blosc2.indexing", fromlist=["_segment_row_count"]) + assert not hasattr(indexing, "_open_temp_memmap") + + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(2_000,)) + descriptor = arr.create_index(kind=kind) + + assert descriptor["ooc"] is True + expr = ((arr >= 55_000) & (arr < 55_010)).where(arr) + np.testing.assert_array_equal(expr.compute()[:], data[(data >= 55_000) & (data < 55_010)]) + + +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_chunk_local_index_descriptor_and_lookup_path(tmp_path, kind): + path = tmp_path / f"chunk_local_{kind}.b2nd" + rng = np.random.default_rng(11) + data = np.arange(240_000, dtype=np.int64) + rng.shuffle(data) + + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(24_000,), blocks=(4_000,)) + descriptor = arr.create_index(kind=kind) + meta = descriptor["light"] if kind == "light" else descriptor["reduced"] + + assert meta["layout"] == "chunk-local-v1" + assert meta["chunk_len"] == arr.chunks[0] + expected_nav_len = ( + arr.blocks[0] if kind == "light" else max(arr.blocks[0] // 4, math.ceil(arr.chunks[0] / 2048)) + ) + assert meta["nav_segment_len"] == expected_nav_len + assert meta["l1_path"] is not None + assert meta["l2_path"] is not None + + if kind == "medium": + assert meta["nav_segment_divisor"] == 4 + + del arr + reopened = blosc2.open(path, mode="a") + expr = (reopened == 123_456).where(reopened) + explanation = expr.explain() + + assert explanation["lookup_path"] == "chunk-nav-ooc" + assert explanation["candidate_nav_segments"] is not None + np.testing.assert_array_equal(expr.compute()[:], data[data == 123_456]) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_small_default_index_builder_uses_ooc(kind): + data = np.arange(100_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(10_000,), blocks=(2_000,)) + + descriptor = 
arr.create_index(kind=kind) + + assert descriptor["ooc"] is True + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_in_mem_override_disables_ooc_builder(kind): + data = np.arange(120_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(12_000,), blocks=(3_000,)) + + descriptor = arr.create_index(kind=kind, in_mem=True) + + assert descriptor["ooc"] is False + + +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_chunk_local_ooc_intra_chunk_build_uses_thread_pool_when_threads_forced(monkeypatch, kind): + if blosc2.IS_WASM: + pytest.skip("wasm32 does not use Python thread pools for index building") + data = np.arange(48_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(48_000,), blocks=(1_500,)) + indexing = __import__("blosc2.indexing", fromlist=["ThreadPoolExecutor"]) + observed_workers = [] + + class FakeExecutor: + def __init__(self, *, max_workers): + observed_workers.append(max_workers) + + def __enter__(self): + return self + + def __exit__(self, exc_type, exc, tb): + return False + + def map(self, fn, iterable): + return [fn(item) for item in iterable] + + monkeypatch.setenv("BLOSC2_INDEX_BUILD_THREADS", "2") + monkeypatch.setattr(indexing, "ThreadPoolExecutor", FakeExecutor) + + descriptor = arr.create_index(kind=kind) + + assert descriptor["ooc"] is True + assert observed_workers + assert observed_workers[0] == 2 + + +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_in_memory_chunk_local_build_uses_cparams_nthreads(monkeypatch, kind): + if blosc2.IS_WASM: + pytest.skip("wasm32 does not use Python thread pools for index building") + data = np.arange(48_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(48_000,), blocks=(1_500,)) + indexing = __import__("blosc2.indexing", fromlist=["ThreadPoolExecutor"]) + observed_workers = [] + + class FakeExecutor: + def __init__(self, *, max_workers): + observed_workers.append(max_workers) + + def __enter__(self): + return self + + def __exit__(self, 
exc_type, exc, tb): + return False + + def map(self, fn, iterable): + return [fn(item) for item in iterable] + + monkeypatch.setattr(indexing, "ThreadPoolExecutor", FakeExecutor) + + descriptor = arr.create_index(kind=kind, in_mem=True, cparams=blosc2.CParams(nthreads=2)) + + assert descriptor["ooc"] is False + assert observed_workers + assert observed_workers[0] == 2 + + +@pytest.mark.parametrize("kind", ["light", "medium"]) +def test_persistent_chunk_local_sidecars_use_cparams(tmp_path, kind): + path = tmp_path / f"persistent_cparams_{kind}.b2nd" + data = np.arange(48_000, dtype=np.int64) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(12_000,), blocks=(2_000,)) + cparams = blosc2.CParams(codec=blosc2.Codec.LZ4, clevel=2, nthreads=3) + + descriptor = arr.create_index(kind=kind, cparams=cparams) + meta = descriptor["light"] if kind == "light" else descriptor["reduced"] + aux_key = "bucket_positions_path" if kind == "light" else "positions_path" + + values_sidecar = blosc2.open(meta["values_path"]) + aux_sidecar = blosc2.open(meta[aux_key]) + + for sidecar in (values_sidecar, aux_sidecar): + assert sidecar.cparams.codec == blosc2.Codec.LZ4 + assert sidecar.cparams.clevel == 2 + + +def test_intra_chunk_sort_run_matches_numpy_stable_order(): + indexing_ext = __import__("blosc2.indexing_ext", fromlist=["intra_chunk_sort_run"]) + values = np.array([4.0, np.nan, 2.0, 2.0, np.nan, 1.0, 4.0], dtype=np.float64) + + sorted_values, positions = indexing_ext.intra_chunk_sort_run(values, 0, np.dtype(np.uint16)) + + order = np.argsort(values, kind="stable") + np.testing.assert_array_equal(sorted_values, values[order]) + np.testing.assert_array_equal(positions, order.astype(np.uint16, copy=False)) + + +def test_intra_chunk_merge_sorted_slices_matches_lexsort_merge(): + indexing_ext = __import__("blosc2.indexing_ext", fromlist=["intra_chunk_merge_sorted_slices"]) + left_values = np.array([1.0, 2.0, 2.0, np.nan], dtype=np.float64) + left_positions = np.array([0, 2, 3, 
6], dtype=np.uint16) + right_values = np.array([1.0, 2.0, 3.0, np.nan], dtype=np.float64) + right_positions = np.array([1, 4, 5, 7], dtype=np.uint16) + + merged_values, merged_positions = indexing_ext.intra_chunk_merge_sorted_slices( + left_values, left_positions, right_values, right_positions, np.dtype(np.uint16) + ) + + all_values = np.concatenate((left_values, right_values)) + all_positions = np.concatenate((left_positions, right_positions)) + order = np.lexsort((all_positions, all_values)) + np.testing.assert_array_equal(merged_values, all_values[order]) + np.testing.assert_array_equal(merged_positions, all_positions[order]) + + +def test_mutation_marks_index_stale_and_rebuild_restores_it(): + data = np.arange(50_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(5_000,), blocks=(1_000,)) + arr.create_index(kind="full") + + arr[:25] = -1 + assert arr.indexes[0]["stale"] is True + + expr = (arr < 0).where(arr) + assert expr.will_use_index() is False + np.testing.assert_array_equal(expr.compute()[:], np.full(25, -1, dtype=np.int64)) + + rebuilt = arr.rebuild_index() + assert rebuilt["stale"] is False + assert expr.will_use_index() is True + + +def test_full_index_reuses_primary_order_for_indices_and_sort(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array( + [(2, 9), (1, 8), (2, 7), (1, 6), (2, 5), (1, 4), (2, 3), (1, 2), (2, 1), (1, 0)], + dtype=dtype, + ) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_csindex("a") + + np.testing.assert_array_equal(arr.indices(order=["a", "b"])[:], np.argsort(data, order=["a", "b"])) + np.testing.assert_array_equal(arr.sort(order=["a", "b"])[:], np.sort(data, order=["a", "b"])) + + +def test_filtered_ordered_queries_support_cross_field_exact_indexes(): + dtype = np.dtype([("a", np.int64), ("b", np.int64), ("payload", np.int32)]) + data = np.array( + [ + (2, 9, 10), + (1, 8, 11), + (2, 7, 12), + (1, 6, 13), + (2, 5, 14), + (1, 4, 15), + (2, 3, 16), + (1, 2, 17), + (2, 1, 18), + 
(1, 0, 19), + ], + dtype=dtype, + ) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_csindex("a") + arr.create_csindex("b") + + expr = blosc2.lazyexpr("(a >= 1) & (a < 3) & (b >= 2) & (b < 8)", arr.fields).where(arr) + mask = (data["a"] >= 1) & (data["a"] < 3) & (data["b"] >= 2) & (data["b"] < 8) + expected_indices = np.where(mask)[0] + expected_order = np.argsort(data[mask], order=["a", "b"]) + + np.testing.assert_array_equal( + expr.indices(order=["a", "b"]).compute()[:], expected_indices[expected_order] + ) + np.testing.assert_array_equal( + expr.sort(order=["a", "b"]).compute()[:], np.sort(data[mask], order=["a", "b"]) + ) + + explained = expr.sort(order=["a", "b"]).explain() + assert explained["will_use_index"] is True + assert explained["ordered_access"] is True + assert explained["field"] == "a" + assert explained["target"] == {"source": "field", "field": "a"} + assert explained["secondary_refinement"] is True + assert explained["filter_reason"] == "multi-field exact indexes selected" + + +def test_itersorted_matches_numpy_sorted_order(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array( + [(3, 2), (1, 9), (2, 4), (1, 3), (3, 1), (2, 6), (1, 5), (2, 0), (3, 8), (1, 7)], + dtype=dtype, + ) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_csindex("a") + + rows = np.array(list(arr.itersorted(order=["a", "b"], batch_size=3)), dtype=dtype) + np.testing.assert_array_equal(rows, np.sort(data, order=["a", "b"])) + + +def test_ordered_explain_reports_missing_full_index(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 2), (1, 9), (2, 4), (1, 3)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_index(field="b", kind="medium") + + expr = blosc2.lazyexpr("b >= 0", arr.fields).where(arr).sort(order="a") + explained = expr.explain() + + assert explained["will_use_index"] is False + assert explained["ordered_access"] is True + assert 
explained["reason"] == "no matching full index was found for ordered access" + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_append_keeps_index_current(kind): + rng = np.random.default_rng(4) + dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + data = np.zeros(32, dtype=dtype) + data["id"] = np.arange(32, dtype=np.int64) + rng.shuffle(data["id"]) + data["payload"] = np.arange(32, dtype=np.int32) + + arr = blosc2.asarray(data, chunks=(8,), blocks=(4,)) + arr.create_index(field="id", kind=kind) + + appended = np.array([(33, 100), (35, 101), (34, 102), (32, 103)], dtype=dtype) + all_data = np.concatenate((data, appended)) + arr.append(appended) + + assert arr.indexes[0]["stale"] is False + + expr = blosc2.lazyexpr("(id >= 31) & (id < 36)", arr.fields).where(arr) + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = all_data[(all_data["id"] >= 31) & (all_data["id"] < 36)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + +def test_append_keeps_full_index_sorted_access_current(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(2, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + appended = np.array([(0, 100), (3, 101), (1, 5)], dtype=dtype) + arr.append(appended) + + expected = np.sort(np.concatenate((data, appended)), order=["a", "b"]) + np.testing.assert_array_equal(arr.sort(order=["a", "b"])[:], expected) + + +def test_repeated_appends_keep_full_index_current(): + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + batches = [ + np.array([(0, 100), (3, 101)], dtype=dtype), + np.array([(2, 102), (1, 103), (4, 104)], dtype=dtype), + ] + expected = data + for nrun, batch in 
enumerate(batches, start=1): + arr.append(batch) + expected = np.concatenate((expected, batch)) + assert len(arr.indexes[0]["full"]["runs"]) == nrun + + expr = blosc2.lazyexpr("(a >= 1) & (a < 4)", arr.fields).where(arr) + assert expr.will_use_index() is True + + expected_mask = (expected["a"] >= 1) & (expected["a"] < 4) + np.testing.assert_array_equal(arr.sort(order=["a", "b"])[:], np.sort(expected, order=["a", "b"])) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def test_persistent_full_index_runs_survive_reopen(tmp_path): + path = tmp_path / "full_index_runs.b2nd" + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + batch1 = np.array([(0, 100), (3, 101)], dtype=dtype) + batch2 = np.array([(2, 102), (1, 103), (4, 104)], dtype=dtype) + expected = np.concatenate((data, batch1, batch2)) + arr.append(batch1) + arr.append(batch2) + + del arr + reopened = blosc2.open(path, mode="a") + assert len(reopened.indexes[0]["full"]["runs"]) == 2 + + expr = blosc2.lazyexpr("(a >= 1) & (a < 4)", reopened.fields).where(reopened) + expected_mask = (expected["a"] >= 1) & (expected["a"] < 4) + np.testing.assert_array_equal(reopened.sort(order=["a", "b"])[:], np.sort(expected, order=["a", "b"])) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def test_persistent_compact_full_exact_query_avoids_whole_sidecar_load(monkeypatch, tmp_path): + path = tmp_path / "full_selective_ooc.b2nd" + rng = np.random.default_rng(12) + data = np.arange(120_000, dtype=np.int64) + rng.shuffle(data) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(12_000,), blocks=(2_000,)) + arr.create_csindex() + + del arr + reopened = blosc2.open(path, mode="a") + indexing = __import__("blosc2.indexing", fromlist=["_load_array_sidecar"]) + original_load = 
indexing._load_array_sidecar + + def guarded_load(array, token, category, name, sidecar_path): + if category == "full" and name in {"values", "positions"}: + raise AssertionError("compact full exact lookup should not whole-load full sidecars") + return original_load(array, token, category, name, sidecar_path) + + monkeypatch.setattr(indexing, "_load_array_sidecar", guarded_load) + + expr = ((reopened >= 50_000) & (reopened < 50_010)).where(reopened) + explained = expr.explain() + assert explained["lookup_path"] == "compact-selective-ooc" + np.testing.assert_array_equal(expr.compute()[:], data[(data >= 50_000) & (data < 50_010)]) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_expression_index_matches_scan(kind): + rng = np.random.default_rng(9) + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.zeros(150_000, dtype=dtype) + data["x"] = np.arange(-75_000, 75_000, dtype=np.int64) + rng.shuffle(data["x"]) + data["payload"] = np.arange(data.shape[0], dtype=np.int32) + + arr = blosc2.asarray(data, chunks=(15_000,), blocks=(3_000,)) + descriptor = arr.create_expr_index("abs(x)", kind=kind) + + assert descriptor["target"]["source"] == "expression" + assert descriptor["target"]["expression_key"] == "abs(x)" + assert descriptor["target"]["dependencies"] == ["x"] + + expr = blosc2.lazyexpr("(abs(x) >= 123) & (abs(x) < 456)", arr.fields).where(arr) + assert expr.will_use_index() is True + + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = data[(np.abs(data["x"]) >= 123) & (np.abs(data["x"]) < 456)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + +def test_full_expression_index_reuses_ordered_access(): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array( + [(-8, 0), (5, 1), (-2, 2), (11, 3), (3, 4), (-3, 5), (2, 6), (-5, 7)], + dtype=dtype, + ) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + 
arr.create_expr_index("abs(x)", kind="full", name="abs_x") + + expected_positions = np.argsort(np.abs(data["x"]), kind="stable") + np.testing.assert_array_equal(arr.indices(order="abs(x)")[:], expected_positions) + np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], data[expected_positions]) + + expr = blosc2.lazyexpr("(abs(x) >= 2) & (abs(x) < 8)", arr.fields).where(arr) + mask = (np.abs(data["x"]) >= 2) & (np.abs(data["x"]) < 8) + filtered_positions = np.where(mask)[0] + filtered_order = np.argsort(np.abs(data["x"][mask]), kind="stable") + np.testing.assert_array_equal( + expr.indices(order="abs(x)").compute()[:], filtered_positions[filtered_order] + ) + np.testing.assert_array_equal( + expr.sort(order="abs(x)").compute()[:], data[filtered_positions[filtered_order]] + ) + + explained = expr.sort(order="abs(x)").explain() + assert explained["will_use_index"] is True + assert explained["ordered_access"] is True + assert explained["target"]["source"] == "expression" + assert explained["target"]["expression_key"] == "abs(x)" + + +def test_persistent_expression_index_survives_reopen(tmp_path): + path = tmp_path / "expr_indexed_array.b2nd" + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.zeros(80_000, dtype=dtype) + data["x"] = np.arange(-40_000, 40_000, dtype=np.int64) + data["payload"] = np.arange(data.shape[0], dtype=np.int32) + + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(8_000,), blocks=(2_000,)) + descriptor = arr.create_expr_index("abs(x)", kind="medium") + + del arr + reopened = blosc2.open(path, mode="a") + assert reopened.indexes[0]["target"]["source"] == "expression" + assert reopened.indexes[0]["target"]["expression_key"] == "abs(x)" + assert reopened.indexes[0]["reduced"]["values_path"] == descriptor["reduced"]["values_path"] + + expr = blosc2.lazyexpr("(abs(x) >= 777) & (abs(x) < 999)", reopened.fields).where(reopened) + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + 
np.testing.assert_array_equal(indexed, scanned) + + +@pytest.mark.parametrize("kind", ["light", "medium", "full"]) +def test_append_keeps_expression_index_current(kind): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array([(-10, 0), (7, 1), (-3, 2), (1, 3), (-6, 4), (9, 5)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind=kind) + + appended = np.array([(-4, 6), (12, 7), (-11, 8), (5, 9)], dtype=dtype) + all_data = np.concatenate((data, appended)) + arr.append(appended) + + assert arr.indexes[0]["stale"] is False + + expr = blosc2.lazyexpr("(abs(x) >= 4) & (abs(x) < 12)", arr.fields).where(arr) + indexed = expr.compute()[:] + scanned = expr.compute(_use_index=False)[:] + expected = all_data[(np.abs(all_data["x"]) >= 4) & (np.abs(all_data["x"]) < 12)] + + np.testing.assert_array_equal(indexed, scanned) + np.testing.assert_array_equal(indexed, expected) + + if kind == "full": + expected_positions = np.argsort(np.abs(all_data["x"]), kind="stable") + np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], all_data[expected_positions]) + + +def test_repeated_appends_keep_full_expression_index_current(): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array([(-10, 0), (7, 1), (-3, 2), (1, 3)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind="full") + + batches = [ + np.array([(-4, 4), (12, 5)], dtype=dtype), + np.array([(-11, 6), (5, 7)], dtype=dtype), + ] + expected = data + for nrun, batch in enumerate(batches, start=1): + arr.append(batch) + expected = np.concatenate((expected, batch)) + assert len(arr.indexes[0]["full"]["runs"]) == nrun + + expr = blosc2.lazyexpr("(abs(x) >= 4) & (abs(x) < 12)", arr.fields).where(arr) + expected_mask = (np.abs(expected["x"]) >= 4) & (np.abs(expected["x"]) < 12) + expected_positions = np.argsort(np.abs(expected["x"]), kind="stable") + 
np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], expected[expected_positions]) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def test_compact_full_index_clears_runs_and_preserves_results(tmp_path): + path = tmp_path / "compact_full_runs.b2nd" + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(3, 9), (1, 8), (2, 7), (1, 6)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(2,), blocks=(2,)) + arr.create_csindex("a") + + batch1 = np.array([(0, 100), (3, 101)], dtype=dtype) + batch2 = np.array([(2, 102), (1, 103), (4, 104)], dtype=dtype) + expected = np.concatenate((data, batch1, batch2)) + arr.append(batch1) + arr.append(batch2) + + before = arr.indexes[0] + assert len(before["full"]["runs"]) == 2 + run_paths = [(run["values_path"], run["positions_path"]) for run in before["full"]["runs"]] + + compacted = arr.compact_index("a") + assert compacted["kind"] == "full" + assert compacted["full"]["runs"] == [] + assert compacted["full"]["l1_path"] is not None + assert compacted["full"]["l2_path"] is not None + + del arr + reopened = blosc2.open(path, mode="a") + assert reopened.indexes[0]["full"]["runs"] == [] + for values_path, positions_path in run_paths: + with pytest.raises(FileNotFoundError): + blosc2.open(values_path) + with pytest.raises(FileNotFoundError): + blosc2.open(positions_path) + + expr = blosc2.lazyexpr("(a >= 1) & (a < 4)", reopened.fields).where(reopened) + explained = expr.explain() + assert explained["full_runs"] == 0 + assert explained["lookup_path"] == "compact-selective-ooc" + expected_mask = (expected["a"] >= 1) & (expected["a"] < 4) + np.testing.assert_array_equal(reopened.sort(order=["a", "b"])[:], np.sort(expected, order=["a", "b"])) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def test_compact_full_expression_index_preserves_results(): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = 
np.array([(-10, 0), (7, 1), (-3, 2), (1, 3)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(2,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind="full") + + batch1 = np.array([(-4, 4), (12, 5)], dtype=dtype) + batch2 = np.array([(-11, 6), (5, 7)], dtype=dtype) + expected = np.concatenate((data, batch1, batch2)) + arr.append(batch1) + arr.append(batch2) + + compacted = arr.compact_index() + assert compacted["full"]["runs"] == [] + + expr = blosc2.lazyexpr("(abs(x) >= 4) & (abs(x) < 12)", arr.fields).where(arr) + expected_mask = (np.abs(expected["x"]) >= 4) & (np.abs(expected["x"]) < 12) + expected_positions = np.argsort(np.abs(expected["x"]), kind="stable") + np.testing.assert_array_equal(arr.sort(order="abs(x)")[:], expected[expected_positions]) + np.testing.assert_array_equal(expr.compute()[:], expected[expected_mask]) + + +def test_persistent_large_run_full_query_uses_bounded_fallback(monkeypatch, tmp_path): + path = tmp_path / "large_run_fallback.b2nd" + dtype = np.dtype([("id", np.int64), ("payload", np.int32)]) + data = np.array([(10, 0), (20, 1), (30, 2), (40, 3)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(4,), blocks=(2,)) + arr.create_index(field="id", kind="full") + + for run in range(8): + batch = np.array([(100 + run, 10 + run)], dtype=dtype) + arr.append(batch) + + del arr + reopened = blosc2.open(path, mode="a") + indexing = __import__("blosc2.indexing", fromlist=["_load_full_arrays"]) + + def guarded_load_full_arrays(*args, **kwargs): + raise AssertionError("large-run bounded fallback should avoid _load_full_arrays") + + monkeypatch.setattr(indexing, "_load_full_arrays", guarded_load_full_arrays) + expr = blosc2.lazyexpr("(id >= 103) & (id <= 106)", reopened.fields).where(reopened) + explained = expr.explain() + assert explained["lookup_path"] == "run-bounded-ooc" + snapshot = reopened[:] + expected = snapshot[(snapshot["id"] >= 103) & (snapshot["id"] <= 106)] + np.testing.assert_array_equal(expr.compute()[:], 
expected) + + +def test_large_run_full_expression_query_uses_bounded_fallback(monkeypatch): + dtype = np.dtype([("x", np.int64), ("payload", np.int32)]) + data = np.array([(-10, 0), (7, 1), (-3, 2), (1, 3)], dtype=dtype) + arr = blosc2.asarray(data, chunks=(4,), blocks=(2,)) + arr.create_expr_index("abs(x)", kind="full") + + for run, value in enumerate(range(20, 28)): + arr.append(np.array([(value, 10 + run)], dtype=dtype)) + + indexing = __import__("blosc2.indexing", fromlist=["_load_full_arrays"]) + + def guarded_load_full_arrays(*args, **kwargs): + raise AssertionError("large-run bounded fallback should avoid _load_full_arrays") + + monkeypatch.setattr(indexing, "_load_full_arrays", guarded_load_full_arrays) + expr = blosc2.lazyexpr("(abs(x) >= 22) & (abs(x) <= 25)", arr.fields).where(arr) + explained = expr.explain() + assert explained["lookup_path"] == "run-bounded-ooc" + snapshot = arr[:] + expected = snapshot[(np.abs(snapshot["x"]) >= 22) & (np.abs(snapshot["x"]) <= 25)] + np.testing.assert_array_equal(expr.compute()[:], expected) + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _make_persistent_array(tmpdir, n=50_000): + """Create a persistent structured NDArray with a full index.""" + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + data = np.empty(n, dtype=dtype) + data["id"] = np.arange(n, dtype=np.int64) + data["val"] = np.linspace(0, 1, n, dtype=np.float32) + urlpath = str(Path(tmpdir) / "arr.b2nd") + arr = blosc2.asarray(data, chunks=(5_000,), blocks=(1_000,), urlpath=urlpath, mode="w") + arr.create_index(field="id", kind="full") + return arr, urlpath + + +def _make_scalar_persistent_array(tmpdir, n=50_000): + """Create a persistent 1-D int64 NDArray with a full index.""" + data = np.arange(n, dtype=np.int64) + urlpath = str(Path(tmpdir) / "scalar.b2nd") + arr = blosc2.asarray(data, chunks=(5_000,), 
blocks=(1_000,), urlpath=urlpath, mode="w") + arr.create_index(kind="full") + return arr, urlpath + + +def _clear_caches(): + """Clear all in-process index and query caches between tests.""" + indexing._hot_cache_clear() + indexing._QUERY_CACHE_STORE_HANDLES.clear() + indexing._PERSISTENT_INDEXES.clear() + + +# --------------------------------------------------------------------------- +# Stage 2 – Cache key normalization +# --------------------------------------------------------------------------- + + +def test_canonical_digest_is_stable(): + """The same query always hashes to the same digest.""" + d1 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], None) + d2 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], None) + assert indexing._query_cache_digest(d1) == indexing._query_cache_digest(d2) + + +def test_canonical_digest_differs_on_expression_change(): + d1 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], None) + d2 = indexing._normalize_query_descriptor("(id >= 3) & (id < 7)", ["__self__"], None) + assert indexing._query_cache_digest(d1) != indexing._query_cache_digest(d2) + + +def test_canonical_digest_differs_on_order_change(): + d1 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], None) + d2 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], ["id"]) + assert indexing._query_cache_digest(d1) != indexing._query_cache_digest(d2) + + +def test_canonical_digest_preserves_order_field_sequence(): + d1 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], ["a", "b"]) + d2 = indexing._normalize_query_descriptor("(id >= 3) & (id < 6)", ["__self__"], ["b", "a"]) + assert indexing._query_cache_digest(d1) != indexing._query_cache_digest(d2) + + +def test_ast_normalization_ignores_whitespace(): + """ast.unparse normalizes whitespace so queries match regardless of spacing.""" + d1 = 
indexing._normalize_query_descriptor("(id>=3)&(id<6)", ["__self__"], None) + d2 = indexing._normalize_query_descriptor("( id >= 3 ) & ( id < 6 )", ["__self__"], None) + assert indexing._query_cache_digest(d1) == indexing._query_cache_digest(d2) + + +# --------------------------------------------------------------------------- +# Stage 3 – Payload encode / decode +# --------------------------------------------------------------------------- + + +def test_encode_decode_roundtrip_u4(): + coords = np.array([0, 5, 100, 200], dtype=np.int64) + payload = indexing._encode_coords_payload(coords) + assert payload["dtype"] == " 131072); expect oldest evicted. + entry_size = 100 + for i in range(165): + coords = np.arange(entry_size, dtype=np.int64) + indexing._hot_cache_put(f"key{i}", coords) + + # First keys should have been evicted. + assert indexing._hot_cache_get("key0") is None + # Most recent keys should still be present. + assert indexing._hot_cache_get("key164") is not None + assert indexing._HOT_CACHE_BYTES <= indexing.QUERY_CACHE_MAX_MEM_NBYTES + + +def test_hot_cache_clear(): + _clear_caches() + indexing._hot_cache_put("k1", np.array([1, 2, 3], dtype=np.int64)) + indexing._hot_cache_clear() + assert indexing._hot_cache_get("k1") is None + assert indexing._HOT_CACHE_BYTES == 0 + + +# --------------------------------------------------------------------------- +# Stage 4 – End-to-end: cache miss then hit (in-memory array, hot cache only) +# --------------------------------------------------------------------------- + + +def test_in_memory_array_hot_cache_hit(): + """A second identical .indices().compute() reuses the hot cache.""" + _clear_caches() + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + data = np.empty(30_000, dtype=dtype) + data["id"] = np.arange(30_000, dtype=np.int64) + data["val"] = np.zeros(30_000, dtype=np.float32) + arr = blosc2.asarray(data, chunks=(3_000,), blocks=(600,)) + arr.create_index(field="id", kind="full") + + expr = 
blosc2.lazyexpr("(id >= 10_000) & (id < 15_000)", arr.fields).where(arr) + result1 = expr.indices().compute() + + assert indexing._HOT_CACHE_BYTES > 0, "hot cache should be populated after first query" + + result2 = expr.indices().compute() + np.testing.assert_array_equal(result1, result2) + + +# --------------------------------------------------------------------------- +# Stage 4 – Persistent cache: cross-session hit +# --------------------------------------------------------------------------- + + +def test_persistent_cache_survives_reopen(tmp_path): + """After reopening the array the persistent cache should serve the result.""" + arr, urlpath = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 20_000) & (id < 25_000)", arr.fields).where(arr) + result1 = expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists(), "persistent payload store should be created" + + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + # Re-open the array in a fresh process-local state. 
+ _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + result2 = blosc2.lazyexpr("(id >= 20_000) & (id < 25_000)", arr2.fields).where(arr2).indices().compute() + + np.testing.assert_array_equal(result1, result2) + + +def test_persistent_cache_not_created_for_non_persistent_array(): + _clear_caches() + data = np.arange(10_000, dtype=np.int64) + arr = blosc2.asarray(data, chunks=(1_000,), blocks=(200,)) + arr.create_index(kind="full") + result = indexing._persistent_cache_lookup(arr, "any_digest") + assert result is None + + +# --------------------------------------------------------------------------- +# Stage 3 – Per-entry logical-byte size limit +# --------------------------------------------------------------------------- + + +def test_persistent_entry_size_limit_rejected(tmp_path): + """Entries whose logical int64 position bytes exceed the entry limit must not be stored.""" + arr, _ = _make_persistent_array(tmp_path, n=50_000) + _clear_caches() + + # 10k coordinates imply 80 KB of logical int64 positions and should exceed the 64 KB limit. 
+ rng = np.random.default_rng(42) + coords = np.sort(rng.choice(50_000, size=10_000, replace=False)).astype(np.int64) + + entry_nbytes = indexing._query_cache_entry_nbytes(coords) + assert entry_nbytes > indexing.QUERY_CACHE_MAX_ENTRY_NBYTES, ( + f"test setup error: logical size {entry_nbytes} must exceed " + f"{indexing.QUERY_CACHE_MAX_ENTRY_NBYTES} for this test to be meaningful" + ) + + descriptor = indexing._normalize_query_descriptor("(id >= 0) & (id < 50000)", ["__self__"], None) + digest = indexing._query_cache_digest(descriptor) + + result = indexing._persistent_cache_insert(arr, digest, coords, descriptor) + assert result is False, "oversized entry must be rejected" + + +def test_persistent_cache_overflow_nukes_persistent_entries_and_keeps_newest(tmp_path, monkeypatch): + arr, urlpath = _make_persistent_array(tmp_path, n=8_000) + _clear_caches() + + rng = np.random.default_rng(123) + payloads = [] + for i in range(3): + coords = np.sort(rng.choice(8_000, size=256, replace=False)).astype(np.int64) + descriptor = indexing._normalize_query_descriptor( + f"(id >= {i}) & (id < {i + 1})", ["__self__"], None + ) + digest = indexing._query_cache_digest(descriptor) + nbytes = indexing._query_cache_entry_nbytes(coords) + payloads.append((digest, descriptor, coords, nbytes)) + + budget = max(payloads[0][3] + payloads[1][3], payloads[1][3] + payloads[2][3]) + monkeypatch.setattr(indexing, "QUERY_CACHE_MAX_PERSISTENT_NBYTES", budget) + + for digest, descriptor, coords, _ in payloads: + assert indexing._persistent_cache_insert(arr, digest, coords, descriptor) is True + + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert catalog["max_persistent_nbytes"] == budget + assert set(catalog["entries"]) == {payloads[2][0]} + assert catalog["entries"][payloads[2][0]]["slot"] == 0 + assert catalog["next_slot"] == 1 + assert catalog["persistent_nbytes"] == payloads[2][3] + + assert indexing._persistent_cache_lookup(arr, payloads[0][0]) is None 
+ assert indexing._persistent_cache_lookup(arr, payloads[1][0]) is None + np.testing.assert_array_equal(indexing._persistent_cache_lookup(arr, payloads[2][0]), payloads[2][2]) + + _clear_caches() + reopened = blosc2.open(urlpath, mode="r") + assert indexing._persistent_cache_lookup(reopened, payloads[1][0]) is None + np.testing.assert_array_equal( + indexing._persistent_cache_lookup(reopened, payloads[2][0]), payloads[2][2] + ) + + +def test_persistent_cache_overflow_preserves_hot_cache(tmp_path, monkeypatch): + arr, _ = _make_persistent_array(tmp_path, n=8_000) + _clear_caches() + + coords1 = np.arange(0, 256, dtype=np.int64) + coords2 = np.arange(256, 512, dtype=np.int64) + expr1 = "(id >= 0) & (id < 256)" + expr2 = "(id >= 256) & (id < 512)" + + budget = indexing._query_cache_entry_nbytes(coords1) + monkeypatch.setattr(indexing, "QUERY_CACHE_MAX_PERSISTENT_NBYTES", budget) + + indexing.store_cached_coords(arr, expr1, [indexing.SELF_TARGET_NAME], None, coords1) + indexing.store_cached_coords(arr, expr2, [indexing.SELF_TARGET_NAME], None, coords2) + + assert ( + indexing._persistent_cache_lookup( + arr, + indexing._query_cache_digest( + indexing._normalize_query_descriptor(expr1, [indexing.SELF_TARGET_NAME], None) + ), + ) + is None + ) + np.testing.assert_array_equal( + indexing.get_cached_coords(arr, expr1, [indexing.SELF_TARGET_NAME], None), coords1 + ) + np.testing.assert_array_equal( + indexing.get_cached_coords(arr, expr2, [indexing.SELF_TARGET_NAME], None), coords2 + ) + + +# --------------------------------------------------------------------------- +# Stage 5 – Invalidation +# --------------------------------------------------------------------------- + + +def test_invalidation_on_drop_index(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert 
Path(payload_path).exists() + + arr.drop_index() + assert not Path(payload_path).exists(), "payload file should be removed after drop_index" + assert indexing._HOT_CACHE_BYTES == 0 + assert indexing._load_query_cache_catalog(arr) is None + + +def test_invalidation_on_rebuild_index(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists() + + arr.rebuild_index() + assert not Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES == 0 + + +def test_invalidation_on_compact_index(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + arr.compact_index() + assert not Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES == 0 + + +def test_invalidation_on_mark_indexes_stale(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists() + + indexing.mark_indexes_stale(arr) + assert not Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES == 0 + + +def test_invalidation_on_append(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr.indices().compute() + + payload_path = indexing._query_cache_payload_path(arr) + assert Path(payload_path).exists() + + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + extra = np.empty(1_000, dtype=dtype) + extra["id"] = np.arange(50_000, 51_000, dtype=np.int64) + extra["val"] = 
np.zeros(1_000, dtype=np.float32) + arr.append(extra) + # append calls append_to_indexes which calls _invalidate_query_cache. + assert not Path(payload_path).exists() + assert indexing._HOT_CACHE_BYTES == 0 + + +# --------------------------------------------------------------------------- +# Stage 4 – Ordered-coordinate query caching +# --------------------------------------------------------------------------- + + +def test_ordered_query_indices_cached(tmp_path): + """Ordered .indices(order=...).compute() results are cached and reused.""" + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + lazy = blosc2.lazyexpr("(id >= 10_000) & (id < 20_000)", arr.fields).where(arr) + result1 = lazy.indices(order="id").compute() + + assert indexing._HOT_CACHE_BYTES > 0 + + _clear_caches() + arr2 = blosc2.open(arr.urlpath, mode="r") + result2 = ( + blosc2.lazyexpr("(id >= 10_000) & (id < 20_000)", arr2.fields) + .where(arr2) + .indices(order="id") + .compute() + ) + + np.testing.assert_array_equal(result1, result2) + + +def test_ordered_query_cache_distinguishes_order_sequences(tmp_path): + path = tmp_path / "ordered_sequences.b2nd" + dtype = np.dtype([("a", np.int64), ("b", np.int64)]) + data = np.array([(1, 2), (1, 1), (2, 1), (2, 2)], dtype=dtype) + arr = blosc2.asarray(data, urlpath=path, mode="w", chunks=(4,), blocks=(2,)) + arr.create_index(field="a", kind="full") + arr.create_index(field="b", kind="full") + _clear_caches() + + expr = blosc2.lazyexpr("(a >= 1)", arr.fields).where(arr) + ordered_ab = expr.indices(order=["a", "b"]).compute()[:] + ordered_ba = expr.indices(order=["b", "a"]).compute()[:] + + np.testing.assert_array_equal(ordered_ab, np.argsort(data, order=["a", "b"])) + np.testing.assert_array_equal(ordered_ba, np.argsort(data, order=["b", "a"])) + + +# --------------------------------------------------------------------------- +# Stage 4 – Multiple distinct queries stored in same array cache +# 
--------------------------------------------------------------------------- + + +def test_multiple_distinct_queries_in_same_cache(tmp_path): + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr1 = blosc2.lazyexpr("(id >= 5_000) & (id < 10_000)", arr.fields).where(arr) + expr2 = blosc2.lazyexpr("(id >= 20_000) & (id < 25_000)", arr.fields).where(arr) + + r1 = expr1.indices().compute() + r2 = expr2.indices().compute() + + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 2 + + # Verify both results are consistent with scan. + dtype = arr.dtype + data = arr[:] + np.testing.assert_array_equal(r1, np.where((data["id"] >= 5_000) & (data["id"] < 10_000))[0]) + np.testing.assert_array_equal(r2, np.where((data["id"] >= 20_000) & (data["id"] < 25_000))[0]) + + +# --------------------------------------------------------------------------- +# Stage 4 – In-memory (hot cache only) for structured array query +# --------------------------------------------------------------------------- + + +def test_hot_cache_avoids_recompute(tmp_path): + """Second call returns cached result without re-planning the index.""" + arr, _ = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 10_000) & (id < 12_000)", arr.fields).where(arr) + result1 = expr.indices().compute() + hot_bytes_after_first = indexing._HOT_CACHE_BYTES + assert hot_bytes_after_first > 0 + + result2 = expr.indices().compute() + # Hot cache should not have grown (same digest, same entry). 
+ assert hot_bytes_after_first == indexing._HOT_CACHE_BYTES + np.testing.assert_array_equal(result1, result2) + + +# --------------------------------------------------------------------------- +# Value-path (arr[cond][:]) caching for persistent arrays +# --------------------------------------------------------------------------- + + +def test_value_path_cache_hit_persistent(tmp_path): + """arr[cond][:] on a persistent full-indexed array caches coords and serves warm calls.""" + arr, urlpath = _make_persistent_array(tmp_path) + _clear_caches() + + cond = blosc2.lazyexpr("(id >= 10_000) & (id < 12_000)", arr.fields) + result1 = arr[cond][:] + + # After first call, cache should have an entry. + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + # Warm call: serve from cache. + _clear_caches() # only clears hot cache; persistent VLArray remains + arr2 = blosc2.open(urlpath, mode="r") + cond2 = blosc2.lazyexpr("(id >= 10_000) & (id < 12_000)", arr2.fields) + result2 = arr2[cond2][:] + + np.testing.assert_array_equal(result1, result2) + # Verify against scan. 
+ data = arr[:] + expected = data[(data["id"] >= 10_000) & (data["id"] < 12_000)] + np.testing.assert_array_equal(result1, expected) + + +# =========================================================================== +# In-memory vs on-disk cache scenarios (value path: arr[cond][:]) +# =========================================================================== + +# --------------------------------------------------------------------------- +# Shared helpers +# --------------------------------------------------------------------------- + + +def _make_structured_array(tmpdir=None, n=20_000, kind="full"): + """Create a structured NDArray (persistent if tmpdir, in-memory otherwise).""" + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + data = np.empty(n, dtype=dtype) + data["id"] = np.arange(n, dtype=np.int64) + data["val"] = np.linspace(0.0, 1.0, n, dtype=np.float32) + kwargs = {} + if tmpdir is not None: + kwargs["urlpath"] = str(Path(tmpdir) / f"arr_{kind}.b2nd") + kwargs["mode"] = "w" + arr = blosc2.asarray(data, chunks=(2_000,), blocks=(500,), **kwargs) + arr.create_index(field="id", kind=kind) + return arr + + +def _make_scalar_array(tmpdir=None, n=20_000, kind="full"): + """Create a 1-D int64 NDArray (persistent if tmpdir, in-memory otherwise).""" + data = np.arange(n, dtype=np.int64) + kwargs = {} + if tmpdir is not None: + kwargs["urlpath"] = str(Path(tmpdir) / f"scalar_{kind}.b2nd") + kwargs["mode"] = "w" + arr = blosc2.asarray(data, chunks=(2_000,), blocks=(500,), **kwargs) + arr.create_index(kind=kind) + return arr + + +def _value_query(arr, lo=5_000, hi=7_000): + """Run arr[cond][:] and return the values.""" + cond = blosc2.lazyexpr(f"(id >= {lo}) & (id < {hi})", arr.fields) + return arr[cond][:] + + +def _scalar_value_query(arr, lo=5_000, hi=7_000): + """Run arr[cond][:] for a scalar (non-structured) array.""" + cond = (arr >= lo) & (arr < hi) + return arr[cond][:] + + +# 
--------------------------------------------------------------------------- +# In-memory arrays – value path +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("kind", ["ultralight", "full", "medium", "light"]) +def test_inmem_value_path_correct(kind): + """In-memory value-path queries return correct results for all index kinds.""" + arr = _make_structured_array(kind=kind) + _clear_caches() + + result = _value_query(arr) + data = arr[:] + expected = data[(data["id"] >= 5_000) & (data["id"] < 7_000)] + np.testing.assert_array_equal(result, expected) + + +@pytest.mark.parametrize("kind", ["ultralight", "full", "medium", "light"]) +def test_inmem_value_path_repeated_calls_stable(kind): + """Repeated in-memory value-path calls on the same object are stable.""" + arr = _make_structured_array(kind=kind) + _clear_caches() + + r1 = _value_query(arr) + r2 = _value_query(arr) + np.testing.assert_array_equal(r1, r2) + + +@pytest.mark.parametrize("kind", ["ultralight", "full", "medium", "light"]) +def test_inmem_value_path_hot_cache_hit(kind): + """Second in-memory arr[cond][:] call should reuse the scoped hot cache.""" + arr = _make_structured_array(kind=kind) + _clear_caches() + + r1 = _value_query(arr) + hot_before = indexing._HOT_CACHE_BYTES + assert hot_before > 0 + + r2 = _value_query(arr) + assert hot_before == indexing._HOT_CACHE_BYTES + np.testing.assert_array_equal(r1, r2) + + +def test_inmem_value_path_no_cross_array_contamination(): + """Different in-memory arrays with the same expression never share cache entries. + + This guards against the Python id() address-reuse bug: when array A is GC'd + and array B reuses the same address, a stale hot-cache hit must not occur. 
+ """ + # int32 array: values 0..19999; query value 137 → exactly 1 match + arr_i32 = blosc2.asarray(np.arange(20_000, dtype=np.int32), chunks=(2_000,), blocks=(500,)) + arr_i32.create_index(kind="full") + _clear_caches() + cond_i32 = arr_i32 == np.int32(137) + r1 = arr_i32[cond_i32][:] + assert len(r1) == 1, "int32 query should find exactly 1 match" + + # GC the first array so Python may reuse its id() + del arr_i32, cond_i32 + gc.collect() + + # uint8 array with same values 0..19999 (wraps every 256): 137 matches 78 times + arr_u8 = blosc2.asarray(np.arange(20_000, dtype=np.uint8), chunks=(2_000,), blocks=(500,)) + arr_u8.create_index(kind="full") + cond_u8 = arr_u8 == np.uint8(137) + r2 = arr_u8[cond_u8][:] + expected_count = int(np.sum(np.arange(20_000, dtype=np.uint8) == 137)) + assert len(r2) == expected_count, f"Expected {expected_count} matches for uint8==137, got {len(r2)}" + + +# --------------------------------------------------------------------------- +# On-disk arrays – value path +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("kind", ["ultralight", "full", "medium", "light"]) +def test_ondisk_value_path_correct(tmp_path, kind): + """On-disk value-path queries return correct results for all index kinds.""" + arr = _make_structured_array(tmp_path, kind=kind) + _clear_caches() + + result = _value_query(arr) + data = arr[:] + expected = data[(data["id"] >= 5_000) & (data["id"] < 7_000)] + np.testing.assert_array_equal(result, expected) + + +def test_ondisk_value_path_full_warm_hits_cache(tmp_path): + """After the first on-disk full-index value query, warm calls use the cache.""" + arr = _make_structured_array(tmp_path, kind="full") + urlpath = arr.urlpath + _clear_caches() + + # Cold call – populates persistent cache + r1 = _value_query(arr) + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + # Warm call after clearing hot 
cache (simulates a new process re-opening the file) + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + r2 = _value_query(arr2) + np.testing.assert_array_equal(r1, r2) + + +@pytest.mark.parametrize("kind", ["ultralight", "light"]) +def test_ondisk_value_path_non_exact_warm_hits_cache(tmp_path, kind): + """Ultralight/light on-disk value queries should populate the coordinate cache.""" + arr = _make_structured_array(tmp_path, kind=kind) + urlpath = arr.urlpath + _clear_caches() + + r1 = _value_query(arr) + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + r2 = _value_query(arr2) + + np.testing.assert_array_equal(r1, r2) + + +@pytest.mark.parametrize("kind", ["medium", "light"]) +def test_ondisk_value_path_non_full_correct(tmp_path, kind): + """Light/medium on-disk value queries are correct.""" + arr = _make_structured_array(tmp_path, kind=kind) + _clear_caches() + + r1 = _value_query(arr) + r2 = _value_query(arr) # repeated call + data = arr[:] + expected = data[(data["id"] >= 5_000) & (data["id"] < 7_000)] + np.testing.assert_array_equal(r1, expected) + np.testing.assert_array_equal(r2, expected) + + +# --------------------------------------------------------------------------- +# On-disk arrays – indices path (.indices().compute()) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("kind", ["full"]) +def test_ondisk_indices_path_warm_hits_cache(tmp_path, kind): + """After the first on-disk .indices().compute(), warm calls use the cache.""" + arr = _make_structured_array(tmp_path, kind=kind) + urlpath = arr.urlpath + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr.fields).where(arr) + r1 = expr.indices().compute() + + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + expr2 = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", 
arr2.fields).where(arr2) + r2 = expr2.indices().compute() + + np.testing.assert_array_equal(r1, r2) + # Verify against scan. + data = arr[:] + expected = np.where((data["id"] >= 5_000) & (data["id"] < 7_000))[0] + np.testing.assert_array_equal(r1, expected) + + +# --------------------------------------------------------------------------- +# In-memory arrays – indices path (.indices().compute()) +# --------------------------------------------------------------------------- + + +def test_inmem_indices_path_hot_cache_hit(): + """Second .indices().compute() call on an in-memory array is served from hot cache.""" + arr = _make_structured_array(kind="full") + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr.fields).where(arr) + r1 = expr.indices().compute() + hot_before = indexing._HOT_CACHE_BYTES + + r2 = expr.indices().compute() + assert hot_before == indexing._HOT_CACHE_BYTES # no new entry added + np.testing.assert_array_equal(r1, r2) + + data = arr[:] + expected = np.where((data["id"] >= 5_000) & (data["id"] < 7_000))[0] + np.testing.assert_array_equal(r1, expected) + + +def test_inmem_indices_cache_entries_are_dropped_on_gc(): + arr = _make_structured_array(kind="full") + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 5_000) & (id < 7_000)", arr.fields).where(arr) + result = expr.indices().compute() + assert result.shape[0] == 2_000 + assert indexing._HOT_CACHE_BYTES > 0 + + del expr, result, arr + gc.collect() + + assert indexing._HOT_CACHE_BYTES == 0 + assert indexing._HOT_CACHE == {} + + +def test_ondisk_indices_path_no_cross_array_hot_cache_contamination(tmp_path): + dtype = np.dtype([("id", np.int64), ("val", np.float32)]) + data1 = np.empty(1_000, dtype=dtype) + data2 = np.empty(1_000, dtype=dtype) + data1["id"] = np.arange(1_000, dtype=np.int64) + data2["id"] = np.arange(1_000, dtype=np.int64) + 1_000 + data1["val"] = 0 + data2["val"] = 0 + + arr1 = blosc2.asarray(data1, urlpath=tmp_path / "arr1.b2nd", mode="w", 
chunks=(200,), blocks=(50,)) + arr2 = blosc2.asarray(data2, urlpath=tmp_path / "arr2.b2nd", mode="w", chunks=(200,), blocks=(50,)) + arr1.create_index(field="id", kind="full") + arr2.create_index(field="id", kind="full") + _clear_caches() + + expr1 = blosc2.lazyexpr("(id >= 10) & (id < 20)", arr1.fields).where(arr1) + expr2 = blosc2.lazyexpr("(id >= 10) & (id < 20)", arr2.fields).where(arr2) + + r1 = expr1.indices().compute()[:] + r2 = expr2.indices().compute()[:] + + np.testing.assert_array_equal(r1, np.arange(10, 20, dtype=np.int64)) + assert r2.size == 0 + + +def test_ondisk_empty_indices_result_cached(tmp_path): + arr, urlpath = _make_persistent_array(tmp_path) + _clear_caches() + + expr = blosc2.lazyexpr("(id >= 60_000) & (id < 61_000)", arr.fields).where(arr) + result1 = expr.indices().compute()[:] + assert result1.size == 0 + + catalog = indexing._load_query_cache_catalog(arr) + assert catalog is not None + assert len(catalog["entries"]) == 1 + + _clear_caches() + arr2 = blosc2.open(urlpath, mode="r") + result2 = ( + blosc2.lazyexpr("(id >= 60_000) & (id < 61_000)", arr2.fields).where(arr2).indices().compute()[:] + ) + assert result2.size == 0 diff --git a/tests/ndarray/test_ndarray.py b/tests/ndarray/test_ndarray.py index b557bb65..cbdf1af7 100644 --- a/tests/ndarray/test_ndarray.py +++ b/tests/ndarray/test_ndarray.py @@ -103,6 +103,31 @@ def test_asarray(a): np.testing.assert_allclose(a, b[:]) +def test_asarray_ndarray_persists_copy_when_urlpath_requested(tmp_path): + array = blosc2.asarray(np.arange(10, dtype=np.int64), chunks=(5,), blocks=(2,)) + path = tmp_path / "persisted_copy.b2nd" + + persisted = blosc2.asarray(array, urlpath=path, mode="w") + + assert persisted is not array + assert persisted.urlpath == str(path) + assert path.exists() + np.testing.assert_array_equal(persisted[:], array[:]) + + +def test_asarray_ndarray_copies_for_dtype_changes_and_rejects_copy_false(tmp_path): + array = blosc2.asarray(np.arange(10, dtype=np.int64), chunks=(5,), 
blocks=(2,)) + + cast = blosc2.asarray(array, dtype=np.float32) + + assert cast is not array + assert cast.dtype == np.float32 + np.testing.assert_allclose(cast[:], array[:].astype(np.float32)) + + with pytest.raises(ValueError, match="copy=False"): + blosc2.asarray(array, urlpath=tmp_path / "persisted_copy_false.b2nd", mode="w", copy=False) + + def test_ndarray_info_has_human_sizes(): array = blosc2.asarray(np.arange(16, dtype=np.int32)) @@ -115,6 +140,21 @@ def test_ndarray_info_has_human_sizes(): assert "cbytes" in text +def test_fields_assignment_requires_field_view_slice(): + dtype = np.dtype([("id", np.float64), ("payload", np.int32)]) + array = blosc2.zeros(4, dtype=dtype) + + with pytest.raises( + TypeError, match=r'assign through the field view, e\.g\. array\.fields\["id"\]\[:\] = values' + ): + array.fields["id"] = np.arange(4, dtype=np.float64) + + np.testing.assert_array_equal(array[:], np.zeros(4, dtype=dtype)) + + array.fields["id"][:] = np.arange(4, dtype=np.float64) + np.testing.assert_array_equal(array.fields["id"][:], np.arange(4, dtype=np.float64)) + + @pytest.mark.parametrize( ("shape", "newshape", "chunks", "blocks"), [