diff --git a/changes/3712.misc.md b/changes/3712.misc.md
new file mode 100644
index 0000000000..8fa2f2d2f7
--- /dev/null
+++ b/changes/3712.misc.md
@@ -0,0 +1 @@
+Added benchmarks for Morton order computation in sharded arrays.
diff --git a/tests/benchmarks/test_indexing.py b/tests/benchmarks/test_indexing.py
index 9ca0d8e1af..dff2269dcb 100644
--- a/tests/benchmarks/test_indexing.py
+++ b/tests/benchmarks/test_indexing.py
@@ -50,3 +50,218 @@ def test_slice_indexing(
     data[:] = 1
 
     benchmark(getitem, data, indexer)
+
+
+# Benchmark for Morton order optimization with power-of-2 shards
+# Morton order is used internally by the sharding codec for chunk iteration
+morton_shards = (
+    (16,) * 3,  # With 2x2x2 chunks: 8x8x8 = 512 chunks per shard
+    (32,) * 3,  # With 2x2x2 chunks: 16x16x16 = 4096 chunks per shard
+)
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
+@pytest.mark.parametrize("shards", morton_shards, ids=str)
+def test_sharded_morton_indexing(
+    store: Store,
+    shards: tuple[int, ...],
+    benchmark: BenchmarkFixture,
+) -> None:
+    """Benchmark sharded array indexing with power-of-2 chunks per shard.
+
+    This benchmark exercises the Morton order iteration path in the sharding
+    codec, which benefits from the hypercube and vectorization optimizations.
+    The Morton order cache is cleared before each iteration to measure the
+    full computation cost.
+    """
+    from zarr.core.indexing import _morton_order
+
+    # Create an array where each shard contains many small chunks,
+    # e.g. shards=(32,32,32) with chunks=(2,2,2) means 16x16x16 = 4096 chunks per shard
+    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
+    chunks = (2,) * 3  # Small chunks to maximize chunks per shard
+
+    data = create_array(
+        store=store,
+        shape=shape,
+        dtype="uint8",
+        chunks=chunks,
+        shards=shards,
+        compressors=None,
+        filters=None,
+        fill_value=0,
+    )
+
+    data[:] = 1
+    # Read one full shard to exercise Morton order iteration
+    indexer = (slice(shards[0]),) * 3
+
+    def read_with_cache_clear() -> None:
+        _morton_order.cache_clear()
+        getitem(data, indexer)
+
+    benchmark(read_with_cache_clear)
+
+
+# Benchmark with larger chunks_per_shard to make the Morton order impact more visible
+large_morton_shards = (
+    (32,) * 3,  # With 1x1x1 chunks: 32x32x32 = 32768 chunks per shard
+)
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
+@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
+def test_sharded_morton_indexing_large(
+    store: Store,
+    shards: tuple[int, ...],
+    benchmark: BenchmarkFixture,
+) -> None:
+    """Benchmark sharded array indexing with large chunks_per_shard.
+
+    Uses 1x1x1 chunks to maximize chunks_per_shard (32^3 = 32768), making
+    the Morton order computation a more significant portion of total time.
+    The Morton order cache is cleared before each iteration.
+    """
+    from zarr.core.indexing import _morton_order
+
+    # 1x1x1 chunks means chunks_per_shard equals shard shape
+    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
+    chunks = (1,) * 3  # 1x1x1 chunks: chunks_per_shard = shards
+
+    data = create_array(
+        store=store,
+        shape=shape,
+        dtype="uint8",
+        chunks=chunks,
+        shards=shards,
+        compressors=None,
+        filters=None,
+        fill_value=0,
+    )
+
+    data[:] = 1
+    # Read one full shard
+    indexer = (slice(shards[0]),) * 3
+
+    def read_with_cache_clear() -> None:
+        _morton_order.cache_clear()
+        getitem(data, indexer)
+
+    benchmark(read_with_cache_clear)
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
+@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
+def test_sharded_morton_single_chunk(
+    store: Store,
+    shards: tuple[int, ...],
+    benchmark: BenchmarkFixture,
+) -> None:
+    """Benchmark reading a single chunk from a large shard.
+
+    This isolates the Morton order computation overhead by minimizing I/O.
+    Reading one chunk from a shard with 32^3 = 32768 chunks still requires
+    computing the full Morton order, making the optimization impact clear.
+    The Morton order cache is cleared before each iteration.
+    """
+    from zarr.core.indexing import _morton_order
+
+    # 1x1x1 chunks means chunks_per_shard equals shard shape
+    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
+    chunks = (1,) * 3  # 1x1x1 chunks: chunks_per_shard = shards
+
+    data = create_array(
+        store=store,
+        shape=shape,
+        dtype="uint8",
+        chunks=chunks,
+        shards=shards,
+        compressors=None,
+        filters=None,
+        fill_value=0,
+    )
+
+    data[:] = 1
+    # Read only a single chunk (1x1x1) from the shard
+    indexer = (slice(1),) * 3
+
+    def read_with_cache_clear() -> None:
+        _morton_order.cache_clear()
+        getitem(data, indexer)
+
+    benchmark(read_with_cache_clear)
+
+
+# Benchmark for morton_order_iter directly (no I/O)
+morton_iter_shapes = (
+    (8, 8, 8),  # 512 elements
+    (16, 16, 16),  # 4096 elements
+    (32, 32, 32),  # 32768 elements
+)
+
+
+@pytest.mark.parametrize("shape", morton_iter_shapes, ids=str)
+def test_morton_order_iter(
+    shape: tuple[int, ...],
+    benchmark: BenchmarkFixture,
+) -> None:
+    """Benchmark morton_order_iter directly without I/O.
+
+    This isolates the Morton order computation to measure the
+    optimization impact without array read/write overhead.
+    The cache is cleared before each iteration.
+    """
+    from zarr.core.indexing import _morton_order, morton_order_iter
+
+    def compute_morton_order() -> None:
+        _morton_order.cache_clear()
+        # Consume the iterator to force computation
+        list(morton_order_iter(shape))
+
+    benchmark(compute_morton_order)
+
+
+@pytest.mark.parametrize("store", ["memory"], indirect=["store"])
+@pytest.mark.parametrize("shards", large_morton_shards, ids=str)
+def test_sharded_morton_write_single_chunk(
+    store: Store,
+    shards: tuple[int, ...],
+    benchmark: BenchmarkFixture,
+) -> None:
+    """Benchmark writing a single chunk to a large shard.
+
+    This is the clearest end-to-end demonstration of the Morton order
+    optimization. Writing a single chunk to a shard with 32^3 = 32768 chunks
+    requires computing the full Morton order, but minimizes I/O overhead.
+
+    Expected improvement: ~160ms, in line with the ~178ms speedup in the
+    Morton computation itself. The Morton order cache is cleared before
+    each iteration.
+    """
+    import numpy as np
+
+    from zarr.core.indexing import _morton_order
+
+    # 1x1x1 chunks means chunks_per_shard equals shard shape
+    shape = tuple(s * 2 for s in shards)  # 2 shards per dimension
+    chunks = (1,) * 3  # 1x1x1 chunks: chunks_per_shard = shards
+
+    data = create_array(
+        store=store,
+        shape=shape,
+        dtype="uint8",
+        chunks=chunks,
+        shards=shards,
+        compressors=None,
+        filters=None,
+        fill_value=0,
+    )
+
+    # Write data for a single chunk
+    write_data = np.ones((1, 1, 1), dtype="uint8")
+    indexer = (slice(1), slice(1), slice(1))
+
+    def write_with_cache_clear() -> None:
+        _morton_order.cache_clear()
+        data[indexer] = write_data
+
+    benchmark(write_with_cache_clear)
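
Note for reviewers: every benchmark in this patch targets zarr's internal `morton_order_iter` and `_morton_order` helpers. For background, Morton (Z-order) iteration visits a grid by interleaving the bits of each coordinate, which keeps spatially adjacent chunks close together in the iteration sequence. The sketch below is a minimal illustration of that bit-interleaving idea under stated assumptions, not zarr's implementation: `morton_iter_sketch` and `_decode_morton` are hypothetical names, and the sketch assumes equal power-of-2 extents per dimension (a hypercube), matching the shapes benchmarked above.

# Illustrative sketch only -- not zarr's implementation.
from collections.abc import Iterator


def _decode_morton(z: int, ndim: int) -> tuple[int, ...]:
    """De-interleave the bits of z into ndim grid coordinates."""
    coords = [0] * ndim
    bit = 0
    while z >> (bit * ndim):  # stop once all bits of z are consumed
        for dim in range(ndim):
            coords[dim] |= ((z >> (bit * ndim + dim)) & 1) << bit
        bit += 1
    return tuple(coords)


def morton_iter_sketch(shape: tuple[int, ...]) -> Iterator[tuple[int, ...]]:
    """Yield every cell of a power-of-2 hypercube grid in Morton order."""
    total = 1
    for extent in shape:
        total *= extent
    for z in range(total):
        yield _decode_morton(z, len(shape))


# Example: list(morton_iter_sketch((2, 2))) -> [(0, 0), (1, 0), (0, 1), (1, 1)]

Because the computed order is typically cached, each benchmark round calls `_morton_order.cache_clear()` first; otherwise only the first round would pay the computation cost these benchmarks are trying to measure.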