From 18317433cc3d90ba614336d7b52347696ee68fd4 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 11:13:54 +0000
Subject: [PATCH 1/7] Add aggregate max divan benchmark

Adds a divan benchmark exercising the min/max aggregation over primitive
arrays (i32/i64/f64, with and without nulls) so we can measure and inspect
the codegen of the max reduction path.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/Cargo.toml               |   4 +
 vortex-array/benches/aggregate_max.rs | 109 ++++++++++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 vortex-array/benches/aggregate_max.rs
diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
index 78a3bb0481f..beceaafe454 100644
--- a/vortex-array/Cargo.toml
+++ b/vortex-array/Cargo.toml
@@ -97,6 +97,10 @@ serde_json = { workspace = true }
 serde_test = { workspace = true }
 vortex-array = { path = ".", features = ["_test-harness", "table-display"] }
 
+[[bench]]
+name = "aggregate_max"
+harness = false
+
 [[bench]]
 name = "cast_primitive"
 harness = false
diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs
new file mode 100644
index 00000000000..f59f0c87ee0
--- /dev/null
+++ b/vortex-array/benches/aggregate_max.rs
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use divan::Bencher;
+use rand::prelude::*;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+
+fn main() {
+    divan::main();
+}
+
+const N: usize = 1_000_000;
+
+fn gen_i32(null_density: f64, seed: u64) -> Vec<Option<i32>> {
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..N)
+        .map(|_| {
+            if rng.random_bool(null_density) {
+                None
+            } else {
+                Some(rng.random::<i32>())
+            }
+        })
+        .collect()
+}
+
+fn gen_i64(null_density: f64, seed: u64) -> Vec<Option<i64>> {
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..N)
+        .map(|_| {
+            if rng.random_bool(null_density) {
+                None
+            } else {
+                Some(rng.random::<i64>())
+            }
+        })
+        .collect()
+}
+
+fn gen_f64(null_density: f64, seed: u64) -> Vec<Option<f64>> {
+    let mut rng = StdRng::seed_from_u64(seed);
+    (0..N)
+        .map(|_| {
+            if rng.random_bool(null_density) {
+                None
+            } else {
+                Some(rng.random::<f64>())
+            }
+        })
+        .collect()
+}
+
+#[divan::bench]
+fn max_i32_all_valid(bencher: Bencher) {
+    let data = gen_i32(0.0, 1);
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn max_i32_half_null(bencher: Bencher) {
+    let data = gen_i32(0.5, 2);
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn max_i64_all_valid(bencher: Bencher) {
+    let data = gen_i64(0.0, 3);
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i64>(&mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn max_f64_all_valid(bencher: Bencher) {
+    let data = gen_f64(0.0, 4);
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<f64>(&mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn max_f64_half_null(bencher: Bencher) {
+    let data = gen_f64(0.5, 5);
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<f64>(&mut LEGACY_SESSION.create_execution_ctx())
+        });
+}

From e9a31adb651a36e7b446fc00bdf21600bbf5ef37 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 12:26:10 +0000
Subject: [PATCH 2/7] Vectorize integer min/max for all-valid arrays

The all-valid primitive min/max path used `itertools::minmax_by` with a
`total_compare` closure preceded by a NaN filter, which the autovectorizer
could not lower to packed min/max, leaving a scalar cmov reduction.

Route the all-true mask case for integer ptypes through a plain reduction.
Integers have no NaNs, so the NaN filter is unnecessary and LLVM vectorizes
the loop (pmaxub/pmaxsw, and pcmpgtd-based blends for i32/i64). Floats keep
the existing NaN-aware path.

Benchmarked over 1M elements: i32 all-valid ~2.93ms -> ~0.36ms, i64
~3.02ms -> ~0.55ms.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/aggregate_fn/fns/min_max/primitive.rs | 33 ++++++++++++++++++-
 1 file changed, 32 insertions(+), 1 deletion(-)

diff --git a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs
index 3470ba3728d..4d1b4242ef2 100644
--- a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs
+++ b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs
@@ -41,7 +41,16 @@ where
             .validity()?
             .execute_mask(array.as_ref().len(), ctx)?
         {
-            Mask::AllTrue(_) => compute_min_max(array.as_slice::<T>().iter()),
+            Mask::AllTrue(_) => {
+                let slice = array.as_slice::<T>();
+                // Integers have no NaNs, so a plain min/max reduction is correct and, unlike the
+                // `itertools::minmax_by` + NaN-filter path, autovectorizes to packed min/max.
+                if T::PTYPE.is_int() {
+                    integer_min_max(slice)
+                } else {
+                    compute_min_max(slice.iter())
+                }
+            }
             Mask::AllFalse(_) => None,
             Mask::Values(v) => compute_min_max(
                 array
@@ -54,6 +63,28 @@ where
     )
 }
 
+fn integer_min_max<T>(slice: &[T]) -> Option<MinMaxResult>
+where
+    T: NativePType,
+    PValue: From<T>,
+{
+    let (&first, rest) = slice.split_first()?;
+    let mut min = first;
+    let mut max = first;
+    for &v in rest {
+        if v.is_lt(min) {
+            min = v;
+        }
+        if v.is_gt(max) {
+            max = v;
+        }
+    }
+    Some(MinMaxResult {
+        min: Scalar::primitive(min, NonNullable),
+        max: Scalar::primitive(max, NonNullable),
+    })
+}
+
 fn compute_min_max<'a, T>(iter: impl Iterator<Item = &'a T>) -> Option<MinMaxResult>
 where
     T: NativePType,

From ac47a14a577dd66c9f95cdd4e9e83c2b3b63c1d2 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 13:18:31 +0000
Subject: [PATCH 3/7] Simplify aggregate max benchmark to one bench per type

Keep a single all-valid bench for i32, i64, and f64 instead of the
per-type all-valid/half-null pairs.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/benches/aggregate_max.rs | 82 ++++-----------------------
 1 file changed, 12 insertions(+), 70 deletions(-)

diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs
index f59f0c87ee0..7fd3541fc89 100644
--- a/vortex-array/benches/aggregate_max.rs
+++ b/vortex-array/benches/aggregate_max.rs
@@ -14,61 +14,12 @@ fn main() {
 
 const N: usize = 1_000_000;
 
-fn gen_i32(null_density: f64, seed: u64) -> Vec<Option<i32>> {
-    let mut rng = StdRng::seed_from_u64(seed);
-    (0..N)
-        .map(|_| {
-            if rng.random_bool(null_density) {
-                None
-            } else {
-                Some(rng.random::<i32>())
-            }
-        })
-        .collect()
-}
-
-fn gen_i64(null_density: f64, seed: u64) -> Vec<Option<i64>> {
-    let mut rng = StdRng::seed_from_u64(seed);
-    (0..N)
-        .map(|_| {
-            if rng.random_bool(null_density) {
-                None
-            } else {
-                Some(rng.random::<i64>())
-            }
-        })
-        .collect()
-}
-
-fn gen_f64(null_density: f64, seed: u64) -> Vec<Option<f64>> {
-    let mut rng = StdRng::seed_from_u64(seed);
-    (0..N)
-        .map(|_| {
-            if rng.random_bool(null_density) {
-                None
-            } else {
-                Some(rng.random::<f64>())
-            }
-        })
-        .collect()
-}
-
-#[divan::bench]
-fn max_i32_all_valid(bencher: Bencher) {
-    let data = gen_i32(0.0, 1);
-    bencher
-        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
-        .bench_refs(|a| {
-            a.statistics()
-                .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
-        });
-}
-
 #[divan::bench]
-fn max_i32_half_null(bencher: Bencher) {
-    let data = gen_i32(0.5, 2);
+fn max_i32(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(1);
+    let data: Vec<i32> = (0..N).map(|_| rng.random::<i32>()).collect();
     bencher
-        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
                 .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
@@ -76,10 +27,11 @@ fn max_i32_half_null(bencher: Bencher) {
 }
 
 #[divan::bench]
-fn max_i64_all_valid(bencher: Bencher) {
-    let data = gen_i64(0.0, 3);
+fn max_i64(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(2);
+    let data: Vec<i64> = (0..N).map(|_| rng.random::<i64>()).collect();
     bencher
-        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
                 .compute_max::<i64>(&mut LEGACY_SESSION.create_execution_ctx())
@@ -87,21 +39,11 @@ fn max_i64_all_valid(bencher: Bencher) {
 }
 
 #[divan::bench]
-fn max_f64_all_valid(bencher: Bencher) {
-    let data = gen_f64(0.0, 4);
-    bencher
-        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
-        .bench_refs(|a| {
-            a.statistics()
-                .compute_max::<f64>(&mut LEGACY_SESSION.create_execution_ctx())
-        });
-}
-
-#[divan::bench]
-fn max_f64_half_null(bencher: Bencher) {
-    let data = gen_f64(0.5, 5);
+fn max_f64(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(3);
+    let data: Vec<f64> = (0..N).map(|_| rng.random::<f64>()).collect();
     bencher
-        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
                 .compute_max::<f64>(&mut LEGACY_SESSION.create_execution_ctx())

From 08abd6abac3ee066b4af84a425fd8abe5492a4cf Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 13:19:32 +0000
Subject: [PATCH 4/7] Reduce aggregate max benchmark array length to 100k

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/benches/aggregate_max.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs
index 7fd3541fc89..39e7734f19d 100644
--- a/vortex-array/benches/aggregate_max.rs
+++ b/vortex-array/benches/aggregate_max.rs
@@ -12,7 +12,7 @@ fn main() {
     divan::main();
 }
 
-const N: usize = 1_000_000;
+const N: usize = 100_000;
 
 #[divan::bench]
 fn max_i32(bencher: Bencher) {

From 8b98b5db7b7588f1b340fe46193841acfae3debd Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 22 May 2026 18:01:34 +0000
Subject: [PATCH 5/7] Vectorize integer sum via chunked widening accumulator

The all-valid integer sum did a per-element `checked_add`, whose overflow
early-return branch blocked autovectorization, leaving a scalar loop.

Sum narrower-than-64-bit integers in chunks of 65536 into a widened 64-bit
accumulator with no per-element check: a chunk of <64-bit values cannot
overflow the 64-bit accumulator (2^16 * (2^32-1) < 2^64), so only one
checked add per chunk is needed. This lets the inner loop vectorize to
packed widening adds (paddq + unpck). 64-bit inputs keep the per-element
checked path since a chunk of 64-bit values could itself overflow.

This observes overflow at chunk boundaries rather than per element, so a
signed sum whose running total transiently leaves i64 range but ends in
range now returns the true total instead of null. The final result is
unchanged whenever the existing per-batch combine did not already overflow.

Benchmarked over 100k elements: sum_i32 ~19us, sum_u32 ~15us, sum_i64 ~51us.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/Cargo.toml                       |  4 ++
 vortex-array/benches/aggregate_sum.rs         | 52 ++++++++++++++
 .../src/aggregate_fn/fns/sum/primitive.rs     | 72 ++++++++++++++-----
 3 files changed, 112 insertions(+), 16 deletions(-)
 create mode 100644 vortex-array/benches/aggregate_sum.rs

diff --git a/vortex-array/Cargo.toml b/vortex-array/Cargo.toml
index beceaafe454..666a23c02c4 100644
--- a/vortex-array/Cargo.toml
+++ b/vortex-array/Cargo.toml
@@ -101,6 +101,10 @@ vortex-array = { path = ".", features = ["_test-harness", "table-display"] }
 name = "aggregate_max"
 harness = false
 
+[[bench]]
+name = "aggregate_sum"
+harness = false
+
 [[bench]]
 name = "cast_primitive"
 harness = false
diff --git a/vortex-array/benches/aggregate_sum.rs b/vortex-array/benches/aggregate_sum.rs
new file mode 100644
index 00000000000..d52961355c2
--- /dev/null
+++ b/vortex-array/benches/aggregate_sum.rs
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use divan::Bencher;
+use rand::prelude::*;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::expr::stats::Stat;
+
+fn main() {
+    divan::main();
+}
+
+const N: usize = 100_000;
+
+#[divan::bench]
+fn sum_i32(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(1);
+    let data: Vec<i32> = (0..N).map(|_| rng.random_range(-1000..1000)).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn sum_u32(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(2);
+    let data: Vec<u32> = (0..N).map(|_| rng.random_range(0..2000)).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<u64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
+
+#[divan::bench]
+fn sum_i64(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(3);
+    let data: Vec<i64> = (0..N).map(|_| rng.random_range(-1000..1000)).collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
diff --git a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs
index 19c414b8204..b78156d6639 100644
--- a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs
+++ b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs
@@ -2,6 +2,7 @@
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
 use itertools::Itertools;
+use num_traits::AsPrimitive;
 use num_traits::ToPrimitive;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
@@ -13,8 +14,14 @@ use super::checked_add_i64;
 use super::checked_add_u64;
 use crate::ExecutionCtx;
 use crate::arrays::PrimitiveArray;
+use crate::dtype::NativePType;
+use crate::dtype::PType;
 use crate::match_each_native_ptype;
 
+/// Number of elements summed without an overflow check. Chosen so that a chunk of values narrower
+/// than 64 bits cannot overflow the 64-bit accumulator: `2^16 * (2^32 - 1) < 2^64`.
+const SUM_CHUNK: usize = 1 << 16;
+
 pub(super) fn accumulate_primitive(
     inner: &mut SumState,
     p: &PrimitiveArray,
@@ -31,27 +38,13 @@ pub(super) fn accumulate_primitive(
 fn accumulate_primitive_all(inner: &mut SumState, p: &PrimitiveArray) -> VortexResult<bool> {
     match inner {
         SumState::Unsigned(acc) => match_each_native_ptype!(p.ptype(),
-            unsigned: |T| {
-                for &v in p.as_slice::<T>() {
-                    if checked_add_u64(acc, v.to_u64().vortex_expect("unsigned to u64")) {
-                        return Ok(true);
-                    }
-                }
-                Ok(false)
-            },
+            unsigned: |T| { Ok(sum_unsigned_all(acc, p.as_slice::<T>())) },
             signed: |_T| { vortex_panic!("unsigned sum state with signed input") },
             floating: |_T| { vortex_panic!("unsigned sum state with float input") }
         ),
         SumState::Signed(acc) => match_each_native_ptype!(p.ptype(),
             unsigned: |_T| { vortex_panic!("signed sum state with unsigned input") },
-            signed: |T| {
-                for &v in p.as_slice::<T>() {
-                    if checked_add_i64(acc, v.to_i64().vortex_expect("signed to i64")) {
-                        return Ok(true);
-                    }
-                }
-                Ok(false)
-            },
+            signed: |T| { Ok(sum_signed_all(acc, p.as_slice::<T>())) },
             floating: |_T| { vortex_panic!("signed sum state with float input") }
         ),
         SumState::Float(acc) => match_each_native_ptype!(p.ptype(),
@@ -70,6 +63,53 @@ fn accumulate_primitive_all(inner: &mut SumState, p: &PrimitiveArray) -> VortexR
     }
 }
 
+/// Sum all values into a `u64` accumulator. For types narrower than 64 bits, values are summed in
+/// chunks of [`SUM_CHUNK`] with a single checked add per chunk, which lets the inner loop vectorize
+/// to packed widening adds. `u64` input keeps a per-element checked add since a chunk of `u64`s
+/// could itself overflow. Returns `true` on overflow.
+fn sum_unsigned_all<T>(acc: &mut u64, slice: &[T]) -> bool
+where
+    T: NativePType + AsPrimitive<u64>,
+{
+    if T::PTYPE == PType::U64 {
+        for &v in slice {
+            if checked_add_u64(acc, v.as_()) {
+                return true;
+            }
+        }
+        return false;
+    }
+    for chunk in slice.chunks(SUM_CHUNK) {
+        let chunk_sum: u64 = chunk.iter().map(|&v| v.as_()).sum();
+        if checked_add_u64(acc, chunk_sum) {
+            return true;
+        }
+    }
+    false
+}
+
+/// Signed counterpart of [`sum_unsigned_all`].
+fn sum_signed_all<T>(acc: &mut i64, slice: &[T]) -> bool
+where
+    T: NativePType + AsPrimitive<i64>,
+{
+    if T::PTYPE == PType::I64 {
+        for &v in slice {
+            if checked_add_i64(acc, v.as_()) {
+                return true;
+            }
+        }
+        return false;
+    }
+    for chunk in slice.chunks(SUM_CHUNK) {
+        let chunk_sum: i64 = chunk.iter().map(|&v| v.as_()).sum();
+        if checked_add_i64(acc, chunk_sum) {
+            return true;
+        }
+    }
+    false
+}
+
 fn accumulate_primitive_valid(
     inner: &mut SumState,
     p: &PrimitiveArray,

From 6a44f511adb5747829494fb058ae063b3d041a35 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sat, 23 May 2026 11:47:45 +0000
Subject: [PATCH 6/7] Vectorize integer sum and min/max over nullable arrays

The with-nulls paths for primitive sum and min/max walked the values
zipped with a per-element validity bit, which kept them scalar even
though their all-valid counterparts already autovectorize to packed
widening adds (sum) and packed min/max.

Drive both null paths from the validity mask's contiguous `[start, end)`
runs (`Mask::slices`, computed once and cached). Each run is fully valid,
so it reuses the existing vectorized all-valid reduction: sum folds each
run through the chunked widening accumulator; min/max folds the native
per-run integer min/max, with floats chaining the runs through the
NaN-filtering reduction. Results are unchanged.

To support the fold, integer min/max now returns native `(min, max)`
(`integer_min_max_raw`) which both the all-valid and run paths reduce
before building the result scalar.

Benchmarked over 100k i32 elements (added nullable bench cases):
- clustered nulls: sum 106us -> 29us, max 159us -> 35us
- scattered ~50% nulls: no regression (sum 584us -> 530us, max 606us -> 593us)

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/benches/aggregate_max.rs         | 36 ++++++++++++++
 vortex-array/benches/aggregate_sum.rs         | 38 ++++++++++++++
 .../src/aggregate_fn/fns/min_max/mod.rs       | 26 ++++++++++
 .../src/aggregate_fn/fns/min_max/primitive.rs | 49 ++++++++++++++-----
 .../src/aggregate_fn/fns/sum/primitive.rs     | 49 ++++++++++++++-----
 5 files changed, 175 insertions(+), 23 deletions(-)

diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs
index 39e7734f19d..f5a10b3ae76 100644
--- a/vortex-array/benches/aggregate_max.rs
+++ b/vortex-array/benches/aggregate_max.rs
@@ -49,3 +49,39 @@ fn max_f64(bencher: Bencher) {
                 .compute_max::<f64>(&mut LEGACY_SESSION.create_execution_ctx())
         });
 }
+
+// Clustered nulls: long valid runs broken up by null blocks (run-based path's best case).
+#[divan::bench]
+fn max_i32_nulls_clustered(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(4);
+    let data: Vec<Option<i32>> = (0..N)
+        .map(|i| {
+            if (i / 64) % 10 == 0 {
+                None
+            } else {
+                Some(rng.random::<i32>())
+            }
+        })
+        .collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
+
+// Scattered nulls: ~50% random nulls producing many short runs (run-based path's worst case).
+#[divan::bench]
+fn max_i32_nulls_scattered(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(5);
+    let data: Vec<Option<i32>> = (0..N)
+        .map(|_| rng.random_bool(0.5).then(|| rng.random::<i32>()))
+        .collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
diff --git a/vortex-array/benches/aggregate_sum.rs b/vortex-array/benches/aggregate_sum.rs
index d52961355c2..f298230788d 100644
--- a/vortex-array/benches/aggregate_sum.rs
+++ b/vortex-array/benches/aggregate_sum.rs
@@ -50,3 +50,41 @@ fn sum_i64(bencher: Bencher) {
                 .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
         });
 }
+
+// Clustered nulls: long runs of valid values broken up by occasional null blocks. This is the
+// case the run-based valid path is expected to accelerate.
+#[divan::bench]
+fn sum_i32_nulls_clustered(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(4);
+    let data: Vec<Option<i32>> = (0..N)
+        .map(|i| {
+            if (i / 64) % 10 == 0 {
+                None
+            } else {
+                Some(rng.random_range(-1000..1000))
+            }
+        })
+        .collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
+
+// Scattered nulls: ~50% nulls placed at random, producing many short runs. This is the worst case
+// for a run-based valid path, used to guard against regressions versus a per-element loop.
+#[divan::bench]
+fn sum_i32_nulls_scattered(bencher: Bencher) {
+    let mut rng = StdRng::seed_from_u64(5);
+    let data: Vec<Option<i32>> = (0..N)
+        .map(|_| rng.random_bool(0.5).then(|| rng.random_range(-1000..1000)))
+        .collect();
+    bencher
+        .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
+        .bench_refs(|a| {
+            a.statistics()
+                .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+        });
+}
diff --git a/vortex-array/src/aggregate_fn/fns/min_max/mod.rs b/vortex-array/src/aggregate_fn/fns/min_max/mod.rs
index 03b10e32301..b9663baa84d 100644
--- a/vortex-array/src/aggregate_fn/fns/min_max/mod.rs
+++ b/vortex-array/src/aggregate_fn/fns/min_max/mod.rs
@@ -330,6 +330,32 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn test_prim_min_max_multiple_null_runs() -> VortexResult<()> {
+        // Several disjoint valid runs separated by nulls exercise the per-run fold; the extrema
+        // (min 1, max 9) fall in different runs.
+        let p = PrimitiveArray::from_option_iter([
+            Some(5i32),
+            Some(3),
+            None,
+            None,
+            Some(9),
+            None,
+            Some(1),
+            Some(7),
+        ])
+        .into_array();
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        assert_eq!(
+            min_max(&p, &mut ctx)?,
+            Some(MinMaxResult {
+                min: 1.into(),
+                max: 9.into()
+            })
+        );
+        Ok(())
+    }
+
     #[test]
     fn test_bool_min_max() -> VortexResult<()> {
         let mut ctx = LEGACY_SESSION.create_execution_ctx();
diff --git a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs
index 4d1b4242ef2..06b9749678d 100644
--- a/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs
+++ b/vortex-array/src/aggregate_fn/fns/min_max/primitive.rs
@@ -46,27 +46,44 @@ where
                 // Integers have no NaNs, so a plain min/max reduction is correct and, unlike the
                 // `itertools::minmax_by` + NaN-filter path, autovectorizes to packed min/max.
                 if T::PTYPE.is_int() {
-                    integer_min_max(slice)
+                    integer_min_max_raw(slice).map(min_max_result)
                 } else {
                     compute_min_max(slice.iter())
                 }
             }
             Mask::AllFalse(_) => None,
-            Mask::Values(v) => compute_min_max(
-                array
-                    .as_slice::<T>()
-                    .iter()
-                    .zip(v.bit_buffer().iter())
-                    .filter_map(|(v, m)| m.then_some(v)),
-            ),
+            Mask::Values(v) => {
+                let slice = array.as_slice::<T>();
+                // Each `[start, end)` run is fully valid, so integers can reuse the vectorized
+                // packed min/max per run and fold the run results; floats chain the runs through
+                // the NaN-filtering reduction.
+                if T::PTYPE.is_int() {
+                    v.slices()
+                        .iter()
+                        .filter_map(|&(start, end)| integer_min_max_raw(&slice[start..end]))
+                        .reduce(|(amin, amax), (rmin, rmax)| {
+                            (
+                                if rmin.is_lt(amin) { rmin } else { amin },
+                                if rmax.is_gt(amax) { rmax } else { amax },
+                            )
+                        })
+                        .map(min_max_result)
+                } else {
+                    compute_min_max(
+                        v.slices()
+                            .iter()
+                            .flat_map(|&(start, end)| slice[start..end].iter()),
+                    )
+                }
+            }
         },
     )
 }
 
-fn integer_min_max<T>(slice: &[T]) -> Option<MinMaxResult>
+/// Min/max of an all-valid integer slice as native values. Autovectorizes to packed min/max.
+fn integer_min_max_raw<T>(slice: &[T]) -> Option<(T, T)>
 where
     T: NativePType,
-    PValue: From<T>,
 {
     let (&first, rest) = slice.split_first()?;
     let mut min = first;
@@ -79,10 +96,18 @@ where
             max = v;
         }
     }
-    Some(MinMaxResult {
+    Some((min, max))
+}
+
+fn min_max_result<T>((min, max): (T, T)) -> MinMaxResult
+where
+    T: NativePType,
+    PValue: From<T>,
+{
+    MinMaxResult {
         min: Scalar::primitive(min, NonNullable),
         max: Scalar::primitive(max, NonNullable),
-    })
+    }
 }
 
 fn compute_min_max<'a, T>(iter: impl Iterator<Item = &'a T>) -> Option<MinMaxResult>
diff --git a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs
index b78156d6639..44418fb5628 100644
--- a/vortex-array/src/aggregate_fn/fns/sum/primitive.rs
+++ b/vortex-array/src/aggregate_fn/fns/sum/primitive.rs
@@ -1,7 +1,6 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-use itertools::Itertools;
 use num_traits::AsPrimitive;
 use num_traits::ToPrimitive;
 use vortex_error::VortexExpect;
@@ -28,10 +27,10 @@ pub(super) fn accumulate_primitive(
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<bool> {
     let mask = p.as_ref().validity()?.execute_mask(p.as_ref().len(), ctx)?;
-    match mask.bit_buffer() {
+    match mask.slices() {
         AllOr::None => Ok(false),
         AllOr::All => accumulate_primitive_all(inner, p),
-        AllOr::Some(validity) => accumulate_primitive_valid(inner, p, validity),
+        AllOr::Some(slices) => accumulate_primitive_valid(inner, p, slices),
     }
 }
 
@@ -110,16 +109,20 @@ where
     false
 }
 
+/// Sum the valid elements, described as contiguous `[start, end)` runs of set validity bits. Each
+/// run is a slice of fully-valid values, so it reuses the same vectorized reduction as the
+/// all-valid path instead of a per-element validity branch.
 fn accumulate_primitive_valid(
     inner: &mut SumState,
     p: &PrimitiveArray,
-    validity: &vortex_buffer::BitBuffer,
+    slices: &[(usize, usize)],
 ) -> VortexResult<bool> {
     match inner {
         SumState::Unsigned(acc) => match_each_native_ptype!(p.ptype(),
             unsigned: |T| {
-                for (&v, valid) in p.as_slice::<T>().iter().zip_eq(validity.iter()) {
-                    if valid && checked_add_u64(acc, v.to_u64().vortex_expect("unsigned to u64")) {
+                let values = p.as_slice::<T>();
+                for &(start, end) in slices {
+                    if sum_unsigned_all(acc, &values[start..end]) {
                         return Ok(true);
                     }
                 }
@@ -131,8 +134,9 @@ fn accumulate_primitive_valid(
         SumState::Signed(acc) => match_each_native_ptype!(p.ptype(),
             unsigned: |_T| { vortex_panic!("signed sum state with unsigned input") },
             signed: |T| {
-                for (&v, valid) in p.as_slice::<T>().iter().zip_eq(validity.iter()) {
-                    if valid && checked_add_i64(acc, v.to_i64().vortex_expect("signed to i64")) {
+                let values = p.as_slice::<T>();
+                for &(start, end) in slices {
+                    if sum_signed_all(acc, &values[start..end]) {
                         return Ok(true);
                     }
                 }
@@ -144,9 +148,12 @@ fn accumulate_primitive_valid(
             unsigned: |_T| { vortex_panic!("float sum state with unsigned input") },
             signed: |_T| { vortex_panic!("float sum state with signed input") },
             floating: |T| {
-                for (&v, valid) in p.as_slice::<T>().iter().zip_eq(validity.iter()) {
-                    if valid && !v.is_nan() {
-                        *acc += ToPrimitive::to_f64(&v).vortex_expect("float to f64");
+                let values = p.as_slice::<T>();
+                for &(start, end) in slices {
+                    for &v in &values[start..end] {
+                        if !v.is_nan() {
+                            *acc += ToPrimitive::to_f64(&v).vortex_expect("float to f64");
+                        }
                     }
                 }
                 Ok(false)
@@ -210,6 +217,26 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn sum_multiple_null_runs() -> VortexResult<()> {
+        // Several disjoint valid runs separated by nulls exercise the per-run fold.
+        let arr = PrimitiveArray::from_option_iter([
+            Some(1i32),
+            Some(2),
+            None,
+            None,
+            Some(3),
+            None,
+            Some(4),
+            Some(5),
+            Some(6),
+        ])
+        .into_array();
+        let result = sum(&arr, &mut LEGACY_SESSION.create_execution_ctx())?;
+        assert_eq!(result.as_primitive().typed_value::<i64>(), Some(21));
+        Ok(())
+    }
+
     #[test]
     fn sum_all_null() -> VortexResult<()> {
         let arr = PrimitiveArray::from_option_iter([None::<i32>, None, None]).into_array();

From 3d86cfe265d779acdf469791e1514fb39bc7e259 Mon Sep 17 00:00:00 2001
From: Joe Isaacs <joe.isaacs@live.co.uk>
Date: Tue, 26 May 2026 14:22:14 +0100
Subject: [PATCH 7/7] fix

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 vortex-array/benches/aggregate_max.rs | 18 ++++++++++++------
 vortex-array/benches/aggregate_sum.rs | 18 ++++++++++++------
 2 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/vortex-array/benches/aggregate_max.rs b/vortex-array/benches/aggregate_max.rs
index f5a10b3ae76..8962ade2974 100644
--- a/vortex-array/benches/aggregate_max.rs
+++ b/vortex-array/benches/aggregate_max.rs
@@ -1,12 +1,15 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use std::sync::LazyLock;
+
 use divan::Bencher;
 use rand::prelude::*;
 use vortex_array::IntoArray;
-use vortex_array::LEGACY_SESSION;
 use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::session::ArraySession;
+use vortex_session::VortexSession;
 
 fn main() {
     divan::main();
@@ -14,6 +17,9 @@ fn main() {
 
 const N: usize = 100_000;
 
+static SESSION: LazyLock<VortexSession> =
+    LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+
 #[divan::bench]
 fn max_i32(bencher: Bencher) {
     let mut rng = StdRng::seed_from_u64(1);
@@ -22,7 +28,7 @@ fn max_i32(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
+                .compute_max::<i32>(&mut SESSION.create_execution_ctx())
         });
 }
 
@@ -34,7 +40,7 @@ fn max_i64(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_max::<i64>(&mut LEGACY_SESSION.create_execution_ctx())
+                .compute_max::<i64>(&mut SESSION.create_execution_ctx())
         });
 }
 
@@ -46,7 +52,7 @@ fn max_f64(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_max::<f64>(&mut LEGACY_SESSION.create_execution_ctx())
+                .compute_max::<f64>(&mut SESSION.create_execution_ctx())
         });
 }
 
@@ -67,7 +73,7 @@ fn max_i32_nulls_clustered(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
+                .compute_max::<i32>(&mut SESSION.create_execution_ctx())
         });
 }
 
@@ -82,6 +88,6 @@ fn max_i32_nulls_scattered(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_max::<i32>(&mut LEGACY_SESSION.create_execution_ctx())
+                .compute_max::<i32>(&mut SESSION.create_execution_ctx())
         });
 }
diff --git a/vortex-array/benches/aggregate_sum.rs b/vortex-array/benches/aggregate_sum.rs
index f298230788d..db4bf5b284a 100644
--- a/vortex-array/benches/aggregate_sum.rs
+++ b/vortex-array/benches/aggregate_sum.rs
@@ -1,13 +1,16 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+use std::sync::LazyLock;
+
 use divan::Bencher;
 use rand::prelude::*;
 use vortex_array::IntoArray;
-use vortex_array::LEGACY_SESSION;
 use vortex_array::VortexSessionExecute;
 use vortex_array::arrays::PrimitiveArray;
 use vortex_array::expr::stats::Stat;
+use vortex_array::session::ArraySession;
+use vortex_session::VortexSession;
 
 fn main() {
     divan::main();
@@ -15,6 +18,9 @@ fn main() {
 
 const N: usize = 100_000;
 
+static SESSION: LazyLock<VortexSession> =
+    LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+
 #[divan::bench]
 fn sum_i32(bencher: Bencher) {
     let mut rng = StdRng::seed_from_u64(1);
@@ -23,7 +29,7 @@ fn sum_i32(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+                .compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
         });
 }
 
@@ -35,7 +41,7 @@ fn sum_u32(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_as::<u64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+                .compute_as::<u64>(Stat::Sum, &mut SESSION.create_execution_ctx())
         });
 }
 
@@ -47,7 +53,7 @@ fn sum_i64(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+                .compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
         });
 }
 
@@ -69,7 +75,7 @@ fn sum_i32_nulls_clustered(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+                .compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
         });
 }
 
@@ -85,6 +91,6 @@ fn sum_i32_nulls_scattered(bencher: Bencher) {
         .with_inputs(|| PrimitiveArray::from_option_iter(data.iter().copied()).into_array())
         .bench_refs(|a| {
             a.statistics()
-                .compute_as::<i64>(Stat::Sum, &mut LEGACY_SESSION.create_execution_ctx())
+                .compute_as::<i64>(Stat::Sum, &mut SESSION.create_execution_ctx())
         });
 }