From 18317433cc3d90ba614336d7b52347696ee68fd4 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 11:13:54 +0000 Subject: [PATCH 1/7] Add aggregate max divan benchmark Adds a divan benchmark exercising the min/max aggregation over primitive arrays (i32/i64/f64, with and without nulls) so we can measure and inspect the codegen of the max reduction path. Signed-off-by: Joe Isaacs --- vortex-array/Cargo.toml | 4 + vortex-array/benches/aggregate_max.rs | 109 ++++++++++++++++++++++++++ 2 files changed, 113 insertions(+) create mode 100644 vortex-array/benches/aggregate_max.rs diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index 78a3bb0481f..beceaafe454 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -97,6 +97,10 @@ serde_json = { workspace = true } serde_test = { workspace = true } vortex-array = { path = ".", features = ["_test-harness", "table-display"] } +[[bench]] +name = "aggregate_max" +harness = false + [[bench]] name = "cast_primitive" harness = false diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs new file mode 100644 index 00000000000..f59f0c87ee0 --- /dev/null +++ b/vortex-array/benches/aggregate_max.rs @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use divan::Bencher; +use rand::prelude::*; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; + +fn main() { + divan::main(); +} + +const N: usize = 1_000_000; + +fn gen_i32(null_density: f64, seed: u64) -> Vec> { + let mut rng = StdRng::seed_from_u64(seed); + (0..N) + .map(|_| { + if rng.random_bool(null_density) { + None + } else { + Some(rng.random::()) + } + }) + .collect() +} + +fn gen_i64(null_density: f64, seed: u64) -> Vec> { + let mut rng = StdRng::seed_from_u64(seed); + (0..N) + .map(|_| { + if rng.random_bool(null_density) { + None + } else { + Some(rng.random::()) + } + }) + .collect() +} + +fn gen_f64(null_density: f64, seed: u64) -> Vec> { + let mut rng = StdRng::seed_from_u64(seed); + (0..N) + .map(|_| { + if rng.random_bool(null_density) { + None + } else { + Some(rng.random::()) + } + }) + .collect() +} + +#[divan::bench] +fn max_i32_all_valid(bencher: Bencher) { + let data = gen_i32(0.0, 1); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +#[divan::bench] +fn max_i32_half_null(bencher: Bencher) { + let data = gen_i32(0.5, 2); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +#[divan::bench] +fn max_i64_all_valid(bencher: Bencher) { + let data = gen_i64(0.0, 3); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +#[divan::bench] +fn max_f64_all_valid(bencher: Bencher) { + let data = gen_f64(0.0, 4); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +#[divan::bench] +fn max_f64_half_null(bencher: Bencher) { + let data = gen_f64(0.5, 5); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} From e9a31adb651a36e7b446fc00bdf21600bbf5ef37 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 12:26:10 +0000 Subject: [PATCH 2/7] Vectorize integer min/max for all-valid arrays The all-valid primitive min/max path used `itertools::minmax_by` with a `total_compare` closure preceded by a NaN filter, which the autovectorizer could not lower to packed min/max, leaving a scalar cmov reduction. Route the all-true mask case for integer ptypes through a plain reduction. Integers have no NaNs, so the NaN filter is unnecessary and LLVM vectorizes the loop (pmaxub/pmaxsw, and pcmpgtd-based blends for i32/i64). Floats keep the existing NaN-aware path. Benchmarked over 1M elements: i32 all-valid ~2.93ms -> ~0.36ms, i64 ~3.02ms -> ~0.55ms. Signed-off-by: Joe Isaacs --- .../src/aggregate_fn/fns/min_max/primitive.rs | 33 ++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs index 3470ba3728d..4d1b4242ef2 100644 --- a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs +++ b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs @@ -41,7 +41,16 @@ where .validity()? .execute_mask(array.as_ref().len(), ctx)? { - Mask::AllTrue(_) => compute_min_max(array.as_slice::().iter()), + Mask::AllTrue(_) => { + let slice = array.as_slice::(); + // Integers have no NaNs, so a plain min/max reduction is correct and, unlike the + // `itertools::minmax_by` + NaN-filter path, autovectorizes to packed min/max. + if T::PTYPE.is_int() { + integer_min_max(slice) + } else { + compute_min_max(slice.iter()) + } + } Mask::AllFalse(_) => None, Mask::Values(v) => compute_min_max( array @@ -54,6 +63,28 @@ where ) } +fn integer_min_max(slice: &[T]) -> Option +where + T: NativePType, + PValue: From, +{ + let (&first, rest) = slice.split_first()?; + let mut min = first; + let mut max = first; + for &v in rest { + if v.is_lt(min) { + min = v; + } + if v.is_gt(max) { + max = v; + } + } + Some(MinMaxResult { + min: Scalar::primitive(min, NonNullable), + max: Scalar::primitive(max, NonNullable), + }) +} + fn compute_min_max<'a, T>(iter: impl Iterator) -> Option where T: NativePType, From ac47a14a577dd66c9f95cdd4e9e83c2b3b63c1d2 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 13:18:31 +0000 Subject: [PATCH 3/7] Simplify aggregate max benchmark to one bench per type Keep a single all-valid bench for i32, i64, and f64 instead of the per-type all-valid/half-null pairs. Signed-off-by: Joe Isaacs --- vortex-array/benches/aggregate_max.rs | 82 ++++----------------------- 1 file changed, 12 insertions(+), 70 deletions(-) diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs index f59f0c87ee0..7fd3541fc89 100644 --- a/vortex-array/benches/aggregate_max.rs +++ b/vortex-array/benches/aggregate_max.rs @@ -14,61 +14,12 @@ fn main() { const N: usize = 1_000_000; -fn gen_i32(null_density: f64, seed: u64) -> Vec> { - let mut rng = StdRng::seed_from_u64(seed); - (0..N) - .map(|_| { - if rng.random_bool(null_density) { - None - } else { - Some(rng.random::()) - } - }) - .collect() -} - -fn gen_i64(null_density: f64, seed: u64) -> Vec> { - let mut rng = StdRng::seed_from_u64(seed); - (0..N) - .map(|_| { - if rng.random_bool(null_density) { - None - } else { - Some(rng.random::()) - } - }) - .collect() -} - -fn gen_f64(null_density: f64, seed: u64) -> Vec> { - let mut rng = StdRng::seed_from_u64(seed); - (0..N) - .map(|_| { - if rng.random_bool(null_density) { - None - } else { - Some(rng.random::()) - } - }) - .collect() -} - -#[divan::bench] -fn max_i32_all_valid(bencher: Bencher) { - let data = gen_i32(0.0, 1); - bencher - .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) - .bench_refs(|a| { - a.statistics() - .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) - }); -} - #[divan::bench] -fn max_i32_half_null(bencher: Bencher) { - let data = gen_i32(0.5, 2); +fn max_i32(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(1); + let data: Vec = (0..N).map(|_| rng.random::()).collect(); bencher - .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) @@ -76,10 +27,11 @@ fn max_i32_half_null(bencher: Bencher) { } #[divan::bench] -fn max_i64_all_valid(bencher: Bencher) { - let data = gen_i64(0.0, 3); +fn max_i64(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(2); + let data: Vec = (0..N).map(|_| rng.random::()).collect(); bencher - .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) @@ -87,21 +39,11 @@ fn max_i64_all_valid(bencher: Bencher) { } #[divan::bench] -fn max_f64_all_valid(bencher: Bencher) { - let data = gen_f64(0.0, 4); - bencher - .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) - .bench_refs(|a| { - a.statistics() - .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) - }); -} - -#[divan::bench] -fn max_f64_half_null(bencher: Bencher) { - let data = gen_f64(0.5, 5); +fn max_f64(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(3); + let data: Vec = (0..N).map(|_| rng.random::()).collect(); bencher - .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) From 08abd6abac3ee066b4af84a425fd8abe5492a4cf Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 13:19:32 +0000 Subject: [PATCH 4/7] Reduce aggregate max benchmark array length to 100k Signed-off-by: Joe Isaacs --- vortex-array/benches/aggregate_max.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs index 7fd3541fc89..39e7734f19d 100644 --- a/vortex-array/benches/aggregate_max.rs +++ b/vortex-array/benches/aggregate_max.rs @@ -12,7 +12,7 @@ fn main() { divan::main(); } -const N: usize = 1_000_000; +const N: usize = 100_000; #[divan::bench] fn max_i32(bencher: Bencher) { From 8b98b5db7b7588f1b340fe46193841acfae3debd Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 22 May 2026 18:01:34 +0000 Subject: [PATCH 5/7] Vectorize integer sum via chunked widening accumulator The all-valid integer sum did a per-element `checked_add`, whose overflow early-return branch blocked autovectorization, leaving a scalar loop. Sum narrower-than-64-bit integers in chunks of 65536 into a widened 64-bit accumulator with no per-element check: a chunk of <64-bit values cannot overflow the 64-bit accumulator (2^16 * (2^32-1) < 2^64), so only one checked add per chunk is needed. This lets the inner loop vectorize to packed widening adds (paddq + unpck). 64-bit inputs keep the per-element checked path since a chunk of 64-bit values could itself overflow. This observes overflow at chunk boundaries rather than per element, so a signed sum whose running total transiently leaves i64 range but ends in range now returns the true total instead of null. The final result is unchanged whenever the existing per-batch combine did not already overflow. Benchmarked over 100k elements: sum_i32 ~19us, sum_u32 ~15us, sum_i64 ~51us. Signed-off-by: Joe Isaacs --- vortex-array/Cargo.toml | 4 ++ vortex-array/benches/aggregate_sum.rs | 52 ++++++++++++++ .../src/aggregate_fn/fns/sum/primitive.rs | 72 ++++++++++++++----- 3 files changed, 112 insertions(+), 16 deletions(-) create mode 100644 vortex-array/benches/aggregate_sum.rs diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml index beceaafe454..666a23c02c4 100644 --- a/vortex-array/Cargo.toml +++ b/vortex-array/Cargo.toml @@ -101,6 +101,10 @@ vortex-array = { path = ".", features = ["_test-harness", "table-display"] } name = "aggregate_max" harness = false +[[bench]] +name = "aggregate_sum" +harness = false + [[bench]] name = "cast_primitive" harness = false diff --git a/vortex-array/benches/aggregate_sum.rs b/vortex-array/benches/aggregate_sum.rs new file mode 100644 index 00000000000..d52961355c2 --- /dev/null +++ b/vortex-array/benches/aggregate_sum.rs @@ -0,0 +1,52 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use divan::Bencher; +use rand::prelude::*; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::expr::stats::Stat; + +fn main() { + divan::main(); +} + +const N: usize = 100_000; + +#[divan::bench] +fn sum_i32(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(1); + let data: Vec = (0..N).map(|_| rng.random_range(-1000..1000)).collect(); + bencher + .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +#[divan::bench] +fn sum_u32(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(2); + let data: Vec = (0..N).map(|_| rng.random_range(0..2000)).collect(); + bencher + .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +#[divan::bench] +fn sum_i64(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(3); + let data: Vec = (0..N).map(|_| rng.random_range(-1000..1000)).collect(); + bencher + .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + }); +} diff --git a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs index 19c414b8204..b78156d6639 100644 --- a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs +++ b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs @@ -2,6 +2,7 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors use itertools::Itertools; +use num_traits::AsPrimitive; use num_traits::ToPrimitive; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -13,8 +14,14 @@ use super::checked_add_i64; use super::checked_add_u64; use crate::ExecutionCtx; use crate::arrays::PrimitiveArray; +use crate::dtype::NativePType; +use crate::dtype::PType; use crate::match_each_native_ptype; +/// Number of elements summed without an overflow check. Chosen so that a chunk of values narrower +/// than 64 bits cannot overflow the 64-bit accumulator: `2^16 * (2^32 - 1) < 2^64`. +const SUM_CHUNK: usize = 1 << 16; + pub(super) fn accumulate_primitive( inner: &mut SumState, p: &PrimitiveArray, @@ -31,27 +38,13 @@ pub(super) fn accumulate_primitive( fn accumulate_primitive_all(inner: &mut SumState, p: &PrimitiveArray) -> VortexResult { match inner { SumState::Unsigned(acc) => match_each_native_ptype!(p.ptype(), - unsigned: |T| { - for &v in p.as_slice::() { - if checked_add_u64(acc, v.to_u64().vortex_expect("unsigned to u64")) { - return Ok(true); - } - } - Ok(false) - }, + unsigned: |T| { Ok(sum_unsigned_all(acc, p.as_slice::())) }, signed: |_T| { vortex_panic!("unsigned sum state with signed input") }, floating: |_T| { vortex_panic!("unsigned sum state with float input") } ), SumState::Signed(acc) => match_each_native_ptype!(p.ptype(), unsigned: |_T| { vortex_panic!("signed sum state with unsigned input") }, - signed: |T| { - for &v in p.as_slice::() { - if checked_add_i64(acc, v.to_i64().vortex_expect("signed to i64")) { - return Ok(true); - } - } - Ok(false) - }, + signed: |T| { Ok(sum_signed_all(acc, p.as_slice::())) }, floating: |_T| { vortex_panic!("signed sum state with float input") } ), SumState::Float(acc) => match_each_native_ptype!(p.ptype(), @@ -70,6 +63,53 @@ fn accumulate_primitive_all(inner: &mut SumState, p: &PrimitiveArray) -> VortexR } } +/// Sum all values into a `u64` accumulator. For types narrower than 64 bits, values are summed in +/// chunks of [`SUM_CHUNK`] with a single checked add per chunk, which lets the inner loop vectorize +/// to packed widening adds. `u64` input keeps a per-element checked add since a chunk of `u64`s +/// could itself overflow. Returns `true` on overflow. +fn sum_unsigned_all(acc: &mut u64, slice: &[T]) -> bool +where + T: NativePType + AsPrimitive, +{ + if T::PTYPE == PType::U64 { + for &v in slice { + if checked_add_u64(acc, v.as_()) { + return true; + } + } + return false; + } + for chunk in slice.chunks(SUM_CHUNK) { + let chunk_sum: u64 = chunk.iter().map(|&v| v.as_()).sum(); + if checked_add_u64(acc, chunk_sum) { + return true; + } + } + false +} + +/// Signed counterpart of [`sum_unsigned_all`]. +fn sum_signed_all(acc: &mut i64, slice: &[T]) -> bool +where + T: NativePType + AsPrimitive, +{ + if T::PTYPE == PType::I64 { + for &v in slice { + if checked_add_i64(acc, v.as_()) { + return true; + } + } + return false; + } + for chunk in slice.chunks(SUM_CHUNK) { + let chunk_sum: i64 = chunk.iter().map(|&v| v.as_()).sum(); + if checked_add_i64(acc, chunk_sum) { + return true; + } + } + false +} + fn accumulate_primitive_valid( inner: &mut SumState, p: &PrimitiveArray, From 6a44f511adb5747829494fb058ae063b3d041a35 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 23 May 2026 11:47:45 +0000 Subject: [PATCH 6/7] Vectorize integer sum and min/max over nullable arrays The with-nulls paths for primitive sum and min/max walked the values zipped with a per-element validity bit, which kept them scalar even though their all-valid counterparts already autovectorize to packed widening adds (sum) and packed min/max. Drive both null paths from the validity mask's contiguous `[start, end)` runs (`Mask::slices`, computed once and cached). Each run is fully valid, so it reuses the existing vectorized all-valid reduction: sum folds each run through the chunked widening accumulator; min/max folds the native per-run integer min/max, with floats chaining the runs through the NaN-filtering reduction. Results are unchanged. To support the fold, integer min/max now returns native `(min, max)` (`integer_min_max_raw`) which both the all-valid and run paths reduce before building the result scalar. Benchmarked over 100k i32 elements (added nullable bench cases): - clustered nulls: sum 106us -> 29us, max 159us -> 35us - scattered ~50% nulls: no regression (sum 584us -> 530us, max 606us -> 593us) Signed-off-by: Joe Isaacs --- vortex-array/benches/aggregate_max.rs | 36 ++++++++++++++ vortex-array/benches/aggregate_sum.rs | 38 ++++++++++++++ .../src/aggregate_fn/fns/min_max/mod.rs | 26 ++++++++++ .../src/aggregate_fn/fns/min_max/primitive.rs | 49 ++++++++++++++----- .../src/aggregate_fn/fns/sum/primitive.rs | 49 ++++++++++++++----- 5 files changed, 175 insertions(+), 23 deletions(-) diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs index 39e7734f19d..f5a10b3ae76 100644 --- a/vortex-array/benches/aggregate_max.rs +++ b/vortex-array/benches/aggregate_max.rs @@ -49,3 +49,39 @@ fn max_f64(bencher: Bencher) { .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) }); } + +// Clustered nulls: long valid runs broken up by null blocks (run-based path's best case). +#[divan::bench] +fn max_i32_nulls_clustered(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(4); + let data: Vec> = (0..N) + .map(|i| { + if (i / 64) % 10 == 0 { + None + } else { + Some(rng.random::()) + } + }) + .collect(); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +// Scattered nulls: ~50% random nulls producing many short runs (run-based path's worst case). +#[divan::bench] +fn max_i32_nulls_scattered(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(5); + let data: Vec> = (0..N) + .map(|_| rng.random_bool(0.5).then(|| rng.random::())) + .collect(); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + }); +} diff --git a/vortex-array/benches/aggregate_sum.rs b/vortex-array/benches/aggregate_sum.rs index d52961355c2..f298230788d 100644 --- a/vortex-array/benches/aggregate_sum.rs +++ b/vortex-array/benches/aggregate_sum.rs @@ -50,3 +50,41 @@ fn sum_i64(bencher: Bencher) { .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) }); } + +// Clustered nulls: long runs of valid values broken up by occasional null blocks. This is the +// case the run-based valid path is expected to accelerate. +#[divan::bench] +fn sum_i32_nulls_clustered(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(4); + let data: Vec> = (0..N) + .map(|i| { + if (i / 64) % 10 == 0 { + None + } else { + Some(rng.random_range(-1000..1000)) + } + }) + .collect(); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + }); +} + +// Scattered nulls: ~50% nulls placed at random, producing many short runs. This is the worst case +// for a run-based valid path, used to guard against regressions versus a per-element loop. +#[divan::bench] +fn sum_i32_nulls_scattered(bencher: Bencher) { + let mut rng = StdRng::seed_from_u64(5); + let data: Vec> = (0..N) + .map(|_| rng.random_bool(0.5).then(|| rng.random_range(-1000..1000))) + .collect(); + bencher + .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) + .bench_refs(|a| { + a.statistics() + .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + }); +} diff --git a/vortex-array/src/aggregate_fn/fns/min_max/mod.rs b/vortex-array/src/aggregate_fn/fns/min_max/mod.rs index 03b10e32301..b9663baa84d 100644 --- a/vortex-array/src/aggregate_fn/fns/min_max/mod.rs +++ b/vortex-array/src/aggregate_fn/fns/min_max/mod.rs @@ -330,6 +330,32 @@ mod tests { Ok(()) } + #[test] + fn test_prim_min_max_multiple_null_runs() -> VortexResult<()> { + // Several disjoint valid runs separated by nulls exercise the per-run fold; the extrema + // (min 1, max 9) fall in different runs. + let p = PrimitiveArray::from_option_iter([ + Some(5i32), + Some(3), + None, + None, + Some(9), + None, + Some(1), + Some(7), + ]) + .into_array(); + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + assert_eq!( + min_max(&p, &mut ctx)?, + Some(MinMaxResult { + min: 1.into(), + max: 9.into() + }) + ); + Ok(()) + } + #[test] fn test_bool_min_max() -> VortexResult<()> { let mut ctx = LEGACY_SESSION.create_execution_ctx(); diff --git a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs index 4d1b4242ef2..06b9749678d 100644 --- a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs +++ b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs @@ -46,27 +46,44 @@ where // Integers have no NaNs, so a plain min/max reduction is correct and, unlike the // `itertools::minmax_by` + NaN-filter path, autovectorizes to packed min/max. if T::PTYPE.is_int() { - integer_min_max(slice) + integer_min_max_raw(slice).map(min_max_result) } else { compute_min_max(slice.iter()) } } Mask::AllFalse(_) => None, - Mask::Values(v) => compute_min_max( - array - .as_slice::() - .iter() - .zip(v.bit_buffer().iter()) - .filter_map(|(v, m)| m.then_some(v)), - ), + Mask::Values(v) => { + let slice = array.as_slice::(); + // Each `[start, end)` run is fully valid, so integers can reuse the vectorized + // packed min/max per run and fold the run results; floats chain the runs through + // the NaN-filtering reduction. + if T::PTYPE.is_int() { + v.slices() + .iter() + .filter_map(|&(start, end)| integer_min_max_raw(&slice[start..end])) + .reduce(|(amin, amax), (rmin, rmax)| { + ( + if rmin.is_lt(amin) { rmin } else { amin }, + if rmax.is_gt(amax) { rmax } else { amax }, + ) + }) + .map(min_max_result) + } else { + compute_min_max( + v.slices() + .iter() + .flat_map(|&(start, end)| slice[start..end].iter()), + ) + } + } }, ) } -fn integer_min_max(slice: &[T]) -> Option +/// Min/max of an all-valid integer slice as native values. Autovectorizes to packed min/max. +fn integer_min_max_raw(slice: &[T]) -> Option<(T, T)> where T: NativePType, - PValue: From, { let (&first, rest) = slice.split_first()?; let mut min = first; @@ -79,10 +96,18 @@ where max = v; } } - Some(MinMaxResult { + Some((min, max)) +} + +fn min_max_result((min, max): (T, T)) -> MinMaxResult +where + T: NativePType, + PValue: From, +{ + MinMaxResult { min: Scalar::primitive(min, NonNullable), max: Scalar::primitive(max, NonNullable), - }) + } } fn compute_min_max<'a, T>(iter: impl Iterator) -> Option diff --git a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs index b78156d6639..44418fb5628 100644 --- a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs +++ b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs @@ -1,7 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use itertools::Itertools; use num_traits::AsPrimitive; use num_traits::ToPrimitive; use vortex_error::VortexExpect; @@ -28,10 +27,10 @@ pub(super) fn accumulate_primitive( ctx: &mut ExecutionCtx, ) -> VortexResult { let mask = p.as_ref().validity()?.execute_mask(p.as_ref().len(), ctx)?; - match mask.bit_buffer() { + match mask.slices() { AllOr::None => Ok(false), AllOr::All => accumulate_primitive_all(inner, p), - AllOr::Some(validity) => accumulate_primitive_valid(inner, p, validity), + AllOr::Some(slices) => accumulate_primitive_valid(inner, p, slices), } } @@ -110,16 +109,20 @@ where false } +/// Sum the valid elements, described as contiguous `[start, end)` runs of set validity bits. Each +/// run is a slice of fully-valid values, so it reuses the same vectorized reduction as the +/// all-valid path instead of a per-element validity branch. fn accumulate_primitive_valid( inner: &mut SumState, p: &PrimitiveArray, - validity: &vortex_buffer::BitBuffer, + slices: &[(usize, usize)], ) -> VortexResult { match inner { SumState::Unsigned(acc) => match_each_native_ptype!(p.ptype(), unsigned: |T| { - for (&v, valid) in p.as_slice::().iter().zip_eq(validity.iter()) { - if valid && checked_add_u64(acc, v.to_u64().vortex_expect("unsigned to u64")) { + let values = p.as_slice::(); + for &(start, end) in slices { + if sum_unsigned_all(acc, &values[start..end]) { return Ok(true); } } @@ -131,8 +134,9 @@ fn accumulate_primitive_valid( SumState::Signed(acc) => match_each_native_ptype!(p.ptype(), unsigned: |_T| { vortex_panic!("signed sum state with unsigned input") }, signed: |T| { - for (&v, valid) in p.as_slice::().iter().zip_eq(validity.iter()) { - if valid && checked_add_i64(acc, v.to_i64().vortex_expect("signed to i64")) { + let values = p.as_slice::(); + for &(start, end) in slices { + if sum_signed_all(acc, &values[start..end]) { return Ok(true); } } @@ -144,9 +148,12 @@ fn accumulate_primitive_valid( unsigned: |_T| { vortex_panic!("float sum state with unsigned input") }, signed: |_T| { vortex_panic!("float sum state with signed input") }, floating: |T| { - for (&v, valid) in p.as_slice::().iter().zip_eq(validity.iter()) { - if valid && !v.is_nan() { - *acc += ToPrimitive::to_f64(&v).vortex_expect("float to f64"); + let values = p.as_slice::(); + for &(start, end) in slices { + for &v in &values[start..end] { + if !v.is_nan() { + *acc += ToPrimitive::to_f64(&v).vortex_expect("float to f64"); + } } } Ok(false) @@ -210,6 +217,26 @@ mod tests { Ok(()) } + #[test] + fn sum_multiple_null_runs() -> VortexResult<()> { + // Several disjoint valid runs separated by nulls exercise the per-run fold. + let arr = PrimitiveArray::from_option_iter([ + Some(1i32), + Some(2), + None, + None, + Some(3), + None, + Some(4), + Some(5), + Some(6), + ]) + .into_array(); + let result = sum(&arr, &mut LEGACY_SESSION.create_execution_ctx())?; + assert_eq!(result.as_primitive().typed_value::(), Some(21)); + Ok(()) + } + #[test] fn sum_all_null() -> VortexResult<()> { let arr = PrimitiveArray::from_option_iter([None::, None, None]).into_array(); From 3d86cfe265d779acdf469791e1514fb39bc7e259 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Tue, 26 May 2026 14:22:14 +0100 Subject: [PATCH 7/7] fix Signed-off-by: Joe Isaacs --- vortex-array/benches/aggregate_max.rs | 18 ++++++++++++------ vortex-array/benches/aggregate_sum.rs | 18 ++++++++++++------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs index f5a10b3ae76..8962ade2974 100644 --- a/vortex-array/benches/aggregate_max.rs +++ b/vortex-array/benches/aggregate_max.rs @@ -1,12 +1,15 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::LazyLock; + use divan::Bencher; use rand::prelude::*; use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; +use vortex_array::session::ArraySession; +use vortex_session::VortexSession; fn main() { divan::main(); @@ -14,6 +17,9 @@ fn main() { const N: usize = 100_000; +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + #[divan::bench] fn max_i32(bencher: Bencher) { let mut rng = StdRng::seed_from_u64(1); @@ -22,7 +28,7 @@ fn max_i32(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + .compute_max::(&mut SESSION.create_execution_ctx()) }); } @@ -34,7 +40,7 @@ fn max_i64(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + .compute_max::(&mut SESSION.create_execution_ctx()) }); } @@ -46,7 +52,7 @@ fn max_f64(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + .compute_max::(&mut SESSION.create_execution_ctx()) }); } @@ -67,7 +73,7 @@ fn max_i32_nulls_clustered(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + .compute_max::(&mut SESSION.create_execution_ctx()) }); } @@ -82,6 +88,6 @@ fn max_i32_nulls_scattered(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_max::(&mut LEGACY_SESSION.create_execution_ctx()) + .compute_max::(&mut SESSION.create_execution_ctx()) }); } diff --git a/vortex-array/benches/aggregate_sum.rs b/vortex-array/benches/aggregate_sum.rs index f298230788d..db4bf5b284a 100644 --- a/vortex-array/benches/aggregate_sum.rs +++ b/vortex-array/benches/aggregate_sum.rs @@ -1,13 +1,16 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +use std::sync::LazyLock; + use divan::Bencher; use rand::prelude::*; use vortex_array::IntoArray; -use vortex_array::LEGACY_SESSION; use vortex_array::VortexSessionExecute; use vortex_array::arrays::PrimitiveArray; use vortex_array::expr::stats::Stat; +use vortex_array::session::ArraySession; +use vortex_session::VortexSession; fn main() { divan::main(); @@ -15,6 +18,9 @@ fn main() { const N: usize = 100_000; +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + #[divan::bench] fn sum_i32(bencher: Bencher) { let mut rng = StdRng::seed_from_u64(1); @@ -23,7 +29,7 @@ fn sum_i32(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + .compute_as::(Stat::Sum, &mut SESSION.create_execution_ctx()) }); } @@ -35,7 +41,7 @@ fn sum_u32(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + .compute_as::(Stat::Sum, &mut SESSION.create_execution_ctx()) }); } @@ -47,7 +53,7 @@ fn sum_i64(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + .compute_as::(Stat::Sum, &mut SESSION.create_execution_ctx()) }); } @@ -69,7 +75,7 @@ fn sum_i32_nulls_clustered(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + .compute_as::(Stat::Sum, &mut SESSION.create_execution_ctx()) }); } @@ -85,6 +91,6 @@ fn sum_i32_nulls_scattered(bencher: Bencher) { .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array()) .bench_refs(|a| { a.statistics() - .compute_as::(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx()) + .compute_as::(Stat::Sum, &mut SESSION.create_execution_ctx()) }); }