From c982fc848369d64ca60641141e462879d5f285f3 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 08:57:29 +0000
Subject: [PATCH 01/13] Add between pushdown kernel for DecimalByteParts

DecimalByteParts already pushed `compare` against a constant down to its
numeric MSP child. This adds the symmetric `between` kernel so that
bounded-range predicates are evaluated directly on the compact MSP
representation instead of canonicalizing to a wide DecimalArray.

Both bounds are converted to the MSP's physical integer type and the
comparison is delegated to the MSP's own `between`. When a bound falls
outside the MSP's integer range the kernel falls back to the canonical
decimal `between`, which already handles the overflow directions.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/decimal-byte-parts/public-api.lock  |   4 +
 .../src/decimal_byte_parts/compute/between.rs | 223 ++++++++++++++++++
 .../src/decimal_byte_parts/compute/compare.rs |   9 +-
 .../src/decimal_byte_parts/compute/kernel.rs  |   2 +
 .../src/decimal_byte_parts/compute/mod.rs     |   3 +-
 5 files changed, 237 insertions(+), 4 deletions(-)
 create mode 100644 encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
diff --git a/encodings/decimal-byte-parts/public-api.lock b/encodings/decimal-byte-parts/public-api.lock
index 985172de937..372706034dc 100644
--- a/encodings/decimal-byte-parts/public-api.lock
+++ b/encodings/decimal-byte-parts/public-api.lock
@@ -64,6 +64,10 @@ impl vortex_array::arrays::slice::SliceReduce for vortex_decimal_byte_parts::Dec
 
 pub fn vortex_decimal_byte_parts::DecimalByteParts::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range<usize>) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>
 
+impl vortex_array::scalar_fn::fns::between::kernel::BetweenKernel for vortex_decimal_byte_parts::DecimalByteParts
+
+pub fn vortex_decimal_byte_parts::DecimalByteParts::between(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, &vortex_array::array::erased::ArrayRef, &vortex_array::scalar_fn::fns::between::BetweenOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>
+
 impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_decimal_byte_parts::DecimalByteParts
 
 pub fn vortex_decimal_byte_parts::DecimalByteParts::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult<core::option::Option<vortex_array::array::erased::ArrayRef>>
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
new file mode 100644
index 00000000000..78d4aa47689
--- /dev/null
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use vortex_array::ArrayRef;
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::fns::between::BetweenKernel;
+use vortex_array::scalar_fn::fns::between::BetweenOptions;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+
+use crate::DecimalByteParts;
+use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
+use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive;
+
+impl BetweenKernel for DecimalByteParts {
+    fn between(
+        arr: ArrayView<'_, Self>,
+        lower: &ArrayRef,
+        upper: &ArrayRef,
+        options: &BetweenOptions,
+        _ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<ArrayRef>> {
+        // We can only push the comparison down into the MSP when both bounds are constant.
+        let (Some(lower_const), Some(upper_const)) = (lower.as_constant(), upper.as_constant())
+        else {
+            return Ok(None);
+        };
+
+        // NOTE: the `between` entrypoint precondition already replaced null bounds with an
+        // all-null result, so both bounds are guaranteed to be non-null here.
+        let lower_decimal = lower_const
+            .as_decimal()
+            .decimal_value()
+            .vortex_expect("checked for null in entry func");
+        let upper_decimal = upper_const
+            .as_decimal()
+            .decimal_value()
+            .vortex_expect("checked for null in entry func");
+
+        let nullability =
+            arr.dtype().nullability() | lower.dtype().nullability() | upper.dtype().nullability();
+        let scalar_type = arr.msp().dtype().with_nullability(nullability);
+        let msp_ptype = arr.msp().as_primitive_typed().ptype();
+
+        // If either bound falls outside the MSP's physical integer range we cannot push the
+        // comparison down losslessly. Fall back to the canonical decimal `between`, which handles
+        // the overflow directions (all-true / all-false constraints) correctly.
+        let (Ok(lower_value), Ok(upper_value)) = (
+            decimal_value_wrapper_to_primitive(lower_decimal, msp_ptype),
+            decimal_value_wrapper_to_primitive(upper_decimal, msp_ptype),
+        ) else {
+            return Ok(None);
+        };
+
+        let lower_const = ConstantArray::new(
+            Scalar::try_new(scalar_type.clone(), Some(lower_value))?,
+            arr.len(),
+        );
+        let upper_const =
+            ConstantArray::new(Scalar::try_new(scalar_type, Some(upper_value))?, arr.len());
+
+        arr.msp()
+            .clone()
+            .between(
+                lower_const.into_array(),
+                upper_const.into_array(),
+                options.clone(),
+            )
+            .map(Some)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use vortex_array::ArrayRef;
+    use vortex_array::IntoArray;
+    use vortex_array::arrays::BoolArray;
+    use vortex_array::arrays::ConstantArray;
+    use vortex_array::arrays::PrimitiveArray;
+    use vortex_array::assert_arrays_eq;
+    use vortex_array::builtins::ArrayBuiltins;
+    use vortex_array::dtype::DecimalDType;
+    use vortex_array::dtype::Nullability;
+    use vortex_array::scalar::DecimalValue;
+    use vortex_array::scalar::Scalar;
+    use vortex_array::scalar_fn::fns::between::BetweenOptions;
+    use vortex_array::scalar_fn::fns::between::StrictComparison;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::buffer;
+    use vortex_error::VortexResult;
+
+    use crate::DecimalByteParts;
+
+    fn decimal_const(value: DecimalValue, decimal_type: DecimalDType, len: usize) -> ArrayRef {
+        ConstantArray::new(
+            Scalar::decimal(value, decimal_type, Nullability::NonNullable),
+            len,
+        )
+        .into_array()
+    }
+
+    #[test]
+    fn between_decimal_const() -> VortexResult<()> {
+        let decimal_type = DecimalDType::new(8, 2);
+        let arr = DecimalByteParts::try_new(
+            PrimitiveArray::new(buffer![100i32, 200, 300, 400, 500], Validity::AllValid)
+                .into_array(),
+            decimal_type,
+        )?
+        .into_array();
+
+        let lower = decimal_const(DecimalValue::I64(200), decimal_type, arr.len());
+        let upper = decimal_const(DecimalValue::I64(400), decimal_type, arr.len());
+
+        // 200 <= value <= 400
+        let res = arr.clone().between(
+            lower.clone(),
+            upper.clone(),
+            BetweenOptions {
+                lower_strict: StrictComparison::NonStrict,
+                upper_strict: StrictComparison::NonStrict,
+            },
+        )?;
+        assert_arrays_eq!(
+            res,
+            BoolArray::from_iter([Some(false), Some(true), Some(true), Some(true), Some(false)])
+        );
+
+        // 200 < value < 400
+        let res = arr.between(
+            lower,
+            upper,
+            BetweenOptions {
+                lower_strict: StrictComparison::Strict,
+                upper_strict: StrictComparison::Strict,
+            },
+        )?;
+        assert_arrays_eq!(
+            res,
+            BoolArray::from_iter([
+                Some(false),
+                Some(false),
+                Some(true),
+                Some(false),
+                Some(false)
+            ])
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn between_decimal_nullable() -> VortexResult<()> {
+        let decimal_type = DecimalDType::new(8, 2);
+        let arr = DecimalByteParts::try_new(
+            PrimitiveArray::new(
+                buffer![100i32, 200, 300, 400],
+                Validity::Array(BoolArray::from_iter([false, true, true, true]).into_array()),
+            )
+            .into_array(),
+            decimal_type,
+        )?
+        .into_array();
+
+        let lower = decimal_const(DecimalValue::I64(100), decimal_type, arr.len());
+        let upper = decimal_const(DecimalValue::I64(300), decimal_type, arr.len());
+
+        let res = arr.between(
+            lower,
+            upper,
+            BetweenOptions {
+                lower_strict: StrictComparison::NonStrict,
+                upper_strict: StrictComparison::NonStrict,
+            },
+        )?;
+        assert_arrays_eq!(
+            res,
+            BoolArray::from_iter([None, Some(true), Some(true), Some(false)])
+        );
+
+        Ok(())
+    }
+
+    /// Bounds that do not fit in the MSP's physical type must fall back to the canonical decimal
+    /// `between`, which handles the overflow directions. Here the array uses i32 storage but the
+    /// upper bound only fits in i128, so the upper constraint is always satisfied.
+    #[test]
+    fn between_decimal_unconvertible_bound() -> VortexResult<()> {
+        let decimal_type = DecimalDType::new(38, 2);
+        let arr = DecimalByteParts::try_new(
+            PrimitiveArray::new(buffer![100i32, 200, 300], Validity::AllValid).into_array(),
+            decimal_type,
+        )?
+        .into_array();
+
+        let lower = decimal_const(DecimalValue::I64(150), decimal_type, arr.len());
+        let upper = decimal_const(
+            DecimalValue::I128(9_999_999_999_999_999_999),
+            decimal_type,
+            arr.len(),
+        );
+
+        let res = arr.between(
+            lower,
+            upper,
+            BetweenOptions {
+                lower_strict: StrictComparison::NonStrict,
+                upper_strict: StrictComparison::NonStrict,
+            },
+        )?;
+        assert_arrays_eq!(
+            res,
+            BoolArray::from_iter([Some(false), Some(true), Some(true)])
+        );
+
+        Ok(())
+    }
+}
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
index e6cc8d4d72d..041fcd864a1 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
@@ -84,7 +84,7 @@ impl CompareKernel for DecimalByteParts {
 // Used to represent the overflow direction when trying to
 // convert into the scalar type.
 #[derive(Debug)]
-enum Sign {
+pub(crate) enum Sign {
     Positive,
     Negative,
 }
@@ -102,8 +102,11 @@ fn unconvertible_value(sign: Sign, operator: CompareOperator, nullability: Nulla
     }
 }
 
-// this value return None is the decimal scalar cannot be cast the ptype.
-fn decimal_value_wrapper_to_primitive(
+// Converts a decimal value into the integer `ScalarValue` of the MSP's physical `ptype`.
+//
+// Returns `Err(Sign)` when the value does not fit in `ptype`, where the sign indicates whether
+// the value overflowed past `ptype`'s maximum (`Positive`) or minimum (`Negative`).
+pub(crate) fn decimal_value_wrapper_to_primitive(
     decimal_value: DecimalValue,
     ptype: PType,
 ) -> Result<ScalarValue, Sign> {
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/kernel.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/kernel.rs
index 9e16b4f81dc..1ca17f99b8f 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/kernel.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/kernel.rs
@@ -3,11 +3,13 @@
 
 use vortex_array::arrays::dict::TakeExecuteAdaptor;
 use vortex_array::kernel::ParentKernelSet;
+use vortex_array::scalar_fn::fns::between::BetweenExecuteAdaptor;
 use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor;
 
 use crate::DecimalByteParts;
 
 pub(crate) const PARENT_KERNELS: ParentKernelSet<DecimalByteParts> = ParentKernelSet::new(&[
+    ParentKernelSet::lift(&BetweenExecuteAdaptor(DecimalByteParts)),
     ParentKernelSet::lift(&CompareExecuteAdaptor(DecimalByteParts)),
     ParentKernelSet::lift(&TakeExecuteAdaptor(DecimalByteParts)),
 ]);
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
index 35da5be40b6..8434b4f7dfb 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
@@ -1,8 +1,9 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+mod between;
 mod cast;
-mod compare;
+pub(crate) mod compare;
 mod filter;
 pub(crate) mod is_constant;
 pub(crate) mod kernel;

From 0c1f8f6567206224bde353fe22f0b8f29ef297d9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 13:29:25 +0000
Subject: [PATCH 02/13] Add decimal between benchmark vs arrow-rs

Benchmarks `between` over DecimalByteParts (i32/i64 MSP pushdown), the
canonical i128 DecimalArray, and arrow-rs Decimal128. Demonstrates that
pushing the comparison down to a narrow MSP beats arrow-rs (~2.4-2.7x at
1M rows), since arrow has no decimal storage narrower than 128 bits.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 Cargo.lock                                    |   5 +
 encodings/decimal-byte-parts/Cargo.toml       |   9 +
 .../benches/decimal_between.rs                | 216 ++++++++++++++++++
 3 files changed, 230 insertions(+)
 create mode 100644 encodings/decimal-byte-parts/benches/decimal_between.rs

diff --git a/Cargo.lock b/Cargo.lock
index cbf73b2a20c..88cfe592aa2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -9586,8 +9586,13 @@ dependencies = [
 name = "vortex-decimal-byte-parts"
 version = "0.1.0"
 dependencies = [
+ "arrow-arith",
+ "arrow-array",
+ "arrow-ord",
+ "codspeed-divan-compat",
  "num-traits",
  "prost 0.14.3",
+ "rand 0.10.1",
  "rstest",
  "vortex-array",
  "vortex-buffer",
diff --git a/encodings/decimal-byte-parts/Cargo.toml b/encodings/decimal-byte-parts/Cargo.toml
index 4934ec4fa27..549c0b05e7c 100644
--- a/encodings/decimal-byte-parts/Cargo.toml
+++ b/encodings/decimal-byte-parts/Cargo.toml
@@ -26,5 +26,14 @@ vortex-mask = { workspace = true }
 vortex-session = { workspace = true }
 
 [dev-dependencies]
+arrow-arith = { workspace = true }
+arrow-array = { workspace = true }
+arrow-ord = { workspace = true }
+divan = { workspace = true }
+rand = { workspace = true }
 rstest = { workspace = true }
 vortex-array = { path = "../../vortex-array", features = ["_test-harness"] }
+
+[[bench]]
+name = "decimal_between"
+harness = false
diff --git a/encodings/decimal-byte-parts/benches/decimal_between.rs b/encodings/decimal-byte-parts/benches/decimal_between.rs
new file mode 100644
index 00000000000..f1efb39bbc2
--- /dev/null
+++ b/encodings/decimal-byte-parts/benches/decimal_between.rs
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Compares `between` (`lower <= x <= upper`) over decimal data for:
+//!   * Vortex `DecimalByteParts` with an i32 most-significant-part (pushdown kernel),
+//!   * Vortex `DecimalByteParts` with an i64 most-significant-part (pushdown kernel),
+//!   * Vortex canonical `DecimalArray` (i128 storage),
+//!   * arrow-rs `Decimal128Array` via `gt_eq` + `lt_eq` + `and`.
+//!
+//! arrow-rs has no decimal storage narrower than 128 bits, so logically-small decimals that
+//! Vortex keeps in an i32/i64 MSP must be materialised as i128 in Arrow. This benchmark
+//! measures the resulting throughput difference.
+
+use arrow_arith::boolean::and;
+use arrow_array::Decimal128Array;
+use arrow_array::Scalar as ArrowScalar;
+use arrow_ord::cmp;
+use divan::Bencher;
+use divan::black_box;
+use rand::RngExt;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::DecimalArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::dtype::DecimalDType;
+use vortex_array::dtype::Nullability;
+use vortex_array::scalar::DecimalValue;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::fns::between::BetweenOptions;
+use vortex_array::scalar_fn::fns::between::StrictComparison;
+use vortex_array::validity::Validity;
+use vortex_decimal_byte_parts::DecimalByteParts;
+
+fn main() {
+    divan::main();
+}
+
+const LENGTHS: &[usize] = &[1 << 16, 1 << 20];
+
+// Logical decimal range [0, 1000), precision 9 scale 2 (fits i32) and precision 18 (fits i64).
+const LOWER: i64 = 250;
+const UPPER: i64 = 750;
+
+const OPTIONS: BetweenOptions = BetweenOptions {
+    lower_strict: StrictComparison::NonStrict,
+    upper_strict: StrictComparison::NonStrict,
+};
+
+fn values(len: usize) -> Vec<i64> {
+    let mut rng = StdRng::seed_from_u64(0x5eed);
+    (0..len).map(|_| rng.random_range(0..1000i64)).collect()
+}
+
+// ---- Vortex DecimalByteParts (i32 MSP) ----
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_byteparts_i32(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(9, 2);
+    let msp = PrimitiveArray::from_iter(values(len).into_iter().map(|v| v as i32)).into_array();
+    let arr = DecimalByteParts::try_new(msp, dt).unwrap().into_array();
+    let lower = ConstantArray::new(
+        Scalar::decimal(
+            DecimalValue::I32(LOWER as i32),
+            dt,
+            Nullability::NonNullable,
+        ),
+        len,
+    )
+    .into_array();
+    let upper = ConstantArray::new(
+        Scalar::decimal(
+            DecimalValue::I32(UPPER as i32),
+            dt,
+            Nullability::NonNullable,
+        ),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                lower.clone(),
+                upper.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, lower, upper, mut ctx)| {
+            black_box(
+                arr.between(lower, upper, OPTIONS)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+// ---- Vortex DecimalByteParts (i64 MSP) ----
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_byteparts_i64(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(18, 2);
+    let msp = PrimitiveArray::from_iter(values(len)).into_array();
+    let arr = DecimalByteParts::try_new(msp, dt).unwrap().into_array();
+    let lower = ConstantArray::new(
+        Scalar::decimal(DecimalValue::I64(LOWER), dt, Nullability::NonNullable),
+        len,
+    )
+    .into_array();
+    let upper = ConstantArray::new(
+        Scalar::decimal(DecimalValue::I64(UPPER), dt, Nullability::NonNullable),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                lower.clone(),
+                upper.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, lower, upper, mut ctx)| {
+            black_box(
+                arr.between(lower, upper, OPTIONS)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+// ---- Vortex canonical DecimalArray (i128 storage) ----
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_canonical_i128(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(9, 2);
+    let arr = DecimalArray::new(
+        values(len).into_iter().map(i128::from).collect(),
+        dt,
+        Validity::NonNullable,
+    )
+    .into_array();
+    let lower = ConstantArray::new(
+        Scalar::decimal(
+            DecimalValue::I128(LOWER as i128),
+            dt,
+            Nullability::NonNullable,
+        ),
+        len,
+    )
+    .into_array();
+    let upper = ConstantArray::new(
+        Scalar::decimal(
+            DecimalValue::I128(UPPER as i128),
+            dt,
+            Nullability::NonNullable,
+        ),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                lower.clone(),
+                upper.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, lower, upper, mut ctx)| {
+            black_box(
+                arr.between(lower, upper, OPTIONS)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+// ---- arrow-rs Decimal128 (gt_eq + lt_eq + and) ----
+
+#[divan::bench(args = LENGTHS)]
+fn arrow_decimal128(bencher: Bencher, len: usize) {
+    let arr = Decimal128Array::from_iter_values(values(len).into_iter().map(i128::from))
+        .with_precision_and_scale(9, 2)
+        .unwrap();
+    let lower = ArrowScalar::new(
+        Decimal128Array::from_iter_values([LOWER as i128])
+            .with_precision_and_scale(9, 2)
+            .unwrap(),
+    );
+    let upper = ArrowScalar::new(
+        Decimal128Array::from_iter_values([UPPER as i128])
+            .with_precision_and_scale(9, 2)
+            .unwrap(),
+    );
+
+    bencher
+        .with_inputs(|| (arr.clone(), lower.clone(), upper.clone()))
+        .bench_values(|(arr, lower, upper)| {
+            let ge = cmp::gt_eq(&arr, &lower).unwrap();
+            let le = cmp::lt_eq(&arr, &upper).unwrap();
+            black_box(and(&ge, &le).unwrap())
+        });
+}

From 77a32cd94415e7e9816933ef7515ab9d061542c6 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 14:16:32 +0000
Subject: [PATCH 03/13] Cap decimal between bench sizes so all cases stay under
 1ms

Drop the 1M-row case (arrow/canonical exceeded 1ms there) and use 64k/128k
rows, keeping all four engines under a 1ms-per-op budget while preserving
the cross-engine comparison.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/decimal-byte-parts/benches/decimal_between.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/encodings/decimal-byte-parts/benches/decimal_between.rs b/encodings/decimal-byte-parts/benches/decimal_between.rs
index f1efb39bbc2..5b8efdffd5f 100644
--- a/encodings/decimal-byte-parts/benches/decimal_between.rs
+++ b/encodings/decimal-byte-parts/benches/decimal_between.rs
@@ -41,7 +41,7 @@ fn main() {
     divan::main();
 }
 
-const LENGTHS: &[usize] = &[1 << 16, 1 << 20];
+const LENGTHS: &[usize] = &[1 << 16, 1 << 17];
 
 // Logical decimal range [0, 1000), precision 9 scale 2 (fits i32) and precision 18 (fits i64).
 const LOWER: i64 = 250;

From 1f2f901f2669006c1e8d594326625d806d403523 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 17:23:41 +0000
Subject: [PATCH 04/13] Add two-limb i128 representation and lexicographic
 between to DecimalByteParts

Implements the previously-scaffolded second limb: a decimal i128 can now be
stored as a signed i64 high limb (msp, carrying validity) plus a non-nullable
u64 low limb, reconstructed as (msp as i128) << 64 | low. Wires the lower limb
through the constructor, validation, serde, canonicalization, scalar_at, and
slice/filter/take/mask/cast; compare and is_constant defer to the canonical
path for two-limb arrays.

Adds a lexicographic two-limb `between` pushdown that compares the limbs with
native-width integer ops. Correctness is verified against the canonical i128
implementation across strictness modes and nulls, plus two-limb consistency
cases.

Benchmark note: the pushdown composes generic array ops (~11 passes with
intermediate Bool allocations) and is currently slower than both arrow-rs
Decimal128 (~2.7x) and the canonical i128 path on wide values. Beating arrow
requires a fused single-pass SIMD kernel rather than composed expressions; the
benchmark (decimal_between, wide cases) documents the current state.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../benches/decimal_between.rs                | 127 ++++++++++
 encodings/decimal-byte-parts/public-api.lock  |   8 +-
 .../src/decimal_byte_parts/compute/between.rs | 199 +++++++++++++++-
 .../src/decimal_byte_parts/compute/cast.rs    |  13 +-
 .../src/decimal_byte_parts/compute/compare.rs |   5 +
 .../src/decimal_byte_parts/compute/filter.rs  |  22 +-
 .../decimal_byte_parts/compute/is_constant.rs |   5 +
 .../src/decimal_byte_parts/compute/mask.rs    |  23 +-
 .../src/decimal_byte_parts/compute/mod.rs     |  26 +++
 .../src/decimal_byte_parts/compute/take.rs    |  22 +-
 .../src/decimal_byte_parts/mod.rs             | 219 ++++++++++++------
 .../src/decimal_byte_parts/rules.rs           |   5 +-
 .../src/decimal_byte_parts/slice.rs           |  22 +-
 13 files changed, 580 insertions(+), 116 deletions(-)

diff --git a/encodings/decimal-byte-parts/benches/decimal_between.rs b/encodings/decimal-byte-parts/benches/decimal_between.rs
index 5b8efdffd5f..e55f0866c7c 100644
--- a/encodings/decimal-byte-parts/benches/decimal_between.rs
+++ b/encodings/decimal-byte-parts/benches/decimal_between.rs
@@ -11,6 +11,8 @@
 //! Vortex keeps in an i32/i64 MSP must be materialised as i128 in Arrow. This benchmark
 //! measures the resulting throughput difference.
 
+#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)]
+
 use arrow_arith::boolean::and;
 use arrow_array::Decimal128Array;
 use arrow_array::Scalar as ArrowScalar;
@@ -214,3 +216,128 @@ fn arrow_decimal128(bencher: Bencher, len: usize) {
             black_box(and(&ge, &le).unwrap())
         });
 }
+
+// ---- Wide i128 decimals: two-limb vs canonical i128 vs arrow Decimal128 ----
+//
+// These values genuinely occupy the i128 range (the high 64-bit limb varies), so neither Vortex
+// nor arrow can keep them in a narrow integer. The two-limb representation splits each value into
+// a signed i64 high limb and an unsigned u64 low limb, letting `between` run as native-width
+// (SIMD-friendly) integer comparisons instead of arrow's 128-bit comparison.
+
+fn wide_values(len: usize) -> Vec<i128> {
+    let mut rng = StdRng::seed_from_u64(0x5eed);
+    (0..len)
+        .map(|_| {
+            let high = i128::from(rng.random_range(0..1000i64));
+            let low = i128::from(rng.random_range(0..u64::MAX));
+            (high << 64) | low
+        })
+        .collect()
+}
+
+// Bounds with non-zero low limbs so the low-limb tie-break is exercised at the high-limb edges.
+const WIDE_LOWER: i128 = (250i128 << 64) | 0x1234_5678;
+const WIDE_UPPER: i128 = (750i128 << 64) | 0x90ab_cdef;
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_byteparts_twolimb(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(38, 2);
+    let values = wide_values(len);
+    let highs = PrimitiveArray::from_iter(values.iter().map(|v| (v >> 64) as i64)).into_array();
+    let lows = PrimitiveArray::from_iter(values.iter().map(|v| *v as u64)).into_array();
+    let arr = DecimalByteParts::try_new_with_lower(highs, lows, dt)
+        .unwrap()
+        .into_array();
+    let lower = ConstantArray::new(
+        Scalar::decimal(DecimalValue::I128(WIDE_LOWER), dt, Nullability::NonNullable),
+        len,
+    )
+    .into_array();
+    let upper = ConstantArray::new(
+        Scalar::decimal(DecimalValue::I128(WIDE_UPPER), dt, Nullability::NonNullable),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                lower.clone(),
+                upper.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, lower, upper, mut ctx)| {
+            black_box(
+                arr.between(lower, upper, OPTIONS)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_canonical_i128_wide(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(38, 2);
+    let arr = DecimalArray::new(
+        wide_values(len).into_iter().collect(),
+        dt,
+        Validity::NonNullable,
+    )
+    .into_array();
+    let lower = ConstantArray::new(
+        Scalar::decimal(DecimalValue::I128(WIDE_LOWER), dt, Nullability::NonNullable),
+        len,
+    )
+    .into_array();
+    let upper = ConstantArray::new(
+        Scalar::decimal(DecimalValue::I128(WIDE_UPPER), dt, Nullability::NonNullable),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                lower.clone(),
+                upper.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, lower, upper, mut ctx)| {
+            black_box(
+                arr.between(lower, upper, OPTIONS)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+#[divan::bench(args = LENGTHS)]
+fn arrow_decimal128_wide(bencher: Bencher, len: usize) {
+    let arr = Decimal128Array::from_iter_values(wide_values(len))
+        .with_precision_and_scale(38, 2)
+        .unwrap();
+    let lower = ArrowScalar::new(
+        Decimal128Array::from_iter_values([WIDE_LOWER])
+            .with_precision_and_scale(38, 2)
+            .unwrap(),
+    );
+    let upper = ArrowScalar::new(
+        Decimal128Array::from_iter_values([WIDE_UPPER])
+            .with_precision_and_scale(38, 2)
+            .unwrap(),
+    );
+
+    bencher
+        .with_inputs(|| (arr.clone(), lower.clone(), upper.clone()))
+        .bench_values(|(arr, lower, upper)| {
+            let ge = cmp::gt_eq(&arr, &lower).unwrap();
+            let le = cmp::lt_eq(&arr, &upper).unwrap();
+            black_box(and(&ge, &le).unwrap())
+        });
+}
diff --git a/encodings/decimal-byte-parts/public-api.lock b/encodings/decimal-byte-parts/public-api.lock
index 372706034dc..3fdd0ea6013 100644
--- a/encodings/decimal-byte-parts/public-api.lock
+++ b/encodings/decimal-byte-parts/public-api.lock
@@ -6,6 +6,8 @@ impl vortex_decimal_byte_parts::DecimalByteParts
 
 pub fn vortex_decimal_byte_parts::DecimalByteParts::try_new(vortex_array::array::erased::ArrayRef, vortex_array::dtype::decimal::DecimalDType) -> vortex_error::VortexResult<vortex_decimal_byte_parts::DecimalBytePartsArray>
 
+pub fn vortex_decimal_byte_parts::DecimalByteParts::try_new_with_lower(vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::dtype::decimal::DecimalDType) -> vortex_error::VortexResult<vortex_decimal_byte_parts::DecimalBytePartsArray>
+
 impl core::clone::Clone for vortex_decimal_byte_parts::DecimalByteParts
 
 pub fn vortex_decimal_byte_parts::DecimalByteParts::clone(&self) -> vortex_decimal_byte_parts::DecimalByteParts
@@ -84,7 +86,7 @@ pub struct vortex_decimal_byte_parts::DecimalBytePartsData
 
 impl vortex_decimal_byte_parts::DecimalBytePartsData
 
-pub fn vortex_decimal_byte_parts::DecimalBytePartsData::validate(&vortex_array::array::erased::ArrayRef, vortex_array::dtype::decimal::DecimalDType, &vortex_array::dtype::DType, usize) -> vortex_error::VortexResult<()>
+pub fn vortex_decimal_byte_parts::DecimalBytePartsData::validate(&vortex_array::array::erased::ArrayRef, core::option::Option<&vortex_array::array::erased::ArrayRef>, vortex_array::dtype::decimal::DecimalDType, &vortex_array::dtype::DType, usize) -> vortex_error::VortexResult<()>
 
 impl core::clone::Clone for vortex_decimal_byte_parts::DecimalBytePartsData
 
@@ -138,10 +140,14 @@ pub fn vortex_decimal_byte_parts::DecimalBytesPartsMetadata::encoded_len(&self)
 
 pub trait vortex_decimal_byte_parts::DecimalBytePartsArrayExt: vortex_array::array::typed::TypedArrayRef<vortex_decimal_byte_parts::DecimalByteParts>
 
+pub fn vortex_decimal_byte_parts::DecimalBytePartsArrayExt::lower(&self) -> core::option::Option<&vortex_array::array::erased::ArrayRef>
+
 pub fn vortex_decimal_byte_parts::DecimalBytePartsArrayExt::msp(&self) -> &vortex_array::array::erased::ArrayRef
 
 impl<T: vortex_array::array::typed::TypedArrayRef<vortex_decimal_byte_parts::DecimalByteParts>> vortex_decimal_byte_parts::DecimalBytePartsArrayExt for T
 
+pub fn T::lower(&self) -> core::option::Option<&vortex_array::array::erased::ArrayRef>
+
 pub fn T::msp(&self) -> &vortex_array::array::erased::ArrayRef
 
 pub fn vortex_decimal_byte_parts::initialize(&vortex_session::VortexSession)
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
index 78d4aa47689..421677f3b49 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
@@ -7,14 +7,20 @@ use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
 use vortex_array::arrays::ConstantArray;
 use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::Nullability;
 use vortex_array::scalar::Scalar;
+use vortex_array::scalar::ScalarValue;
 use vortex_array::scalar_fn::fns::between::BetweenKernel;
 use vortex_array::scalar_fn::fns::between::BetweenOptions;
+use vortex_array::scalar_fn::fns::between::StrictComparison;
+use vortex_array::scalar_fn::fns::operators::Operator;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 
 use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
+use crate::decimal_byte_parts::LOWER_PTYPE;
 use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive;
 
 impl BetweenKernel for DecimalByteParts {
@@ -25,7 +31,7 @@ impl BetweenKernel for DecimalByteParts {
         options: &BetweenOptions,
         _ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<ArrayRef>> {
-        // We can only push the comparison down into the MSP when both bounds are constant.
+        // We can only push the comparison down into the limbs when both bounds are constant.
         let (Some(lower_const), Some(upper_const)) = (lower.as_constant(), upper.as_constant())
         else {
             return Ok(None);
@@ -44,6 +50,24 @@ impl BetweenKernel for DecimalByteParts {
 
         let nullability =
             arr.dtype().nullability() | lower.dtype().nullability() | upper.dtype().nullability();
+
+        if arr.lower().is_some() {
+            // Two-limb representation: a lexicographic comparison over the (signed high, unsigned
+            // low) limbs. Both bounds must fit in i128 to be split into limbs.
+            let (Some(lower_i128), Some(upper_i128)) =
+                (lower_decimal.cast::<i128>(), upper_decimal.cast::<i128>())
+            else {
+                return Ok(None);
+            };
+            return Ok(Some(two_limb_between(
+                &arr,
+                lower_i128,
+                upper_i128,
+                options,
+                nullability,
+            )?));
+        }
+
         let scalar_type = arr.msp().dtype().with_nullability(nullability);
         let msp_ptype = arr.msp().as_primitive_typed().ptype();
 
@@ -75,12 +99,93 @@ impl BetweenKernel for DecimalByteParts {
     }
 }
 
+/// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation.
+///
+/// With high limb `H` (signed i64) and low limb `L` (unsigned u64), the value `v = H<<64 | L`
+/// satisfies `v >= lower` iff `H > lo_h OR (H == lo_h AND L >=' lo_l)` and `v <= upper` iff
+/// `H < hi_h OR (H == hi_h AND L <=' hi_l)`, where `>='`/`<='` are strict when requested. Each
+/// limb comparison is a native-width integer compare that vectorizes far better than a 128-bit
+/// comparison.
+fn two_limb_between(
+    arr: &ArrayView<'_, DecimalByteParts>,
+    lower: i128,
+    upper: i128,
+    options: &BetweenOptions,
+    nullability: Nullability,
+) -> VortexResult<ArrayRef> {
+    let len = arr.len();
+    let high = arr.msp().clone();
+    let low = arr
+        .lower()
+        .vortex_expect("two-limb path requires a lower limb")
+        .clone();
+
+    let high_dtype = high.dtype().with_nullability(nullability);
+    let low_dtype = DType::Primitive(LOWER_PTYPE, nullability);
+
+    let high_const = |limb: i64| -> VortexResult<ArrayRef> {
+        Ok(ConstantArray::new(
+            Scalar::try_new(high_dtype.clone(), Some(ScalarValue::from(limb)))?,
+            len,
+        )
+        .into_array())
+    };
+    let low_const = |limb: u64| -> VortexResult<ArrayRef> {
+        Ok(ConstantArray::new(
+            Scalar::try_new(low_dtype.clone(), Some(ScalarValue::from(limb)))?,
+            len,
+        )
+        .into_array())
+    };
+
+    let lower_low_op = match options.lower_strict {
+        StrictComparison::Strict => Operator::Gt,
+        StrictComparison::NonStrict => Operator::Gte,
+    };
+    let upper_low_op = match options.upper_strict {
+        StrictComparison::Strict => Operator::Lt,
+        StrictComparison::NonStrict => Operator::Lte,
+    };
+
+    let (lower_high, lower_low) = split_i128(lower);
+    let (upper_high, upper_low) = split_i128(upper);
+
+    // value >= lower
+    let ge_lower = {
+        let h_gt = high.binary(high_const(lower_high)?, Operator::Gt)?;
+        let h_eq = high.binary(high_const(lower_high)?, Operator::Eq)?;
+        let l_cmp = low.binary(low_const(lower_low)?, lower_low_op)?;
+        h_gt.binary(h_eq.binary(l_cmp, Operator::And)?, Operator::Or)?
+    };
+
+    // value <= upper
+    let le_upper = {
+        let h_lt = high.binary(high_const(upper_high)?, Operator::Lt)?;
+        let h_eq = high.binary(high_const(upper_high)?, Operator::Eq)?;
+        let l_cmp = low.binary(low_const(upper_low)?, upper_low_op)?;
+        h_lt.binary(h_eq.binary(l_cmp, Operator::And)?, Operator::Or)?
+    };
+
+    ge_lower.binary(le_upper, Operator::And)
+}
+
+/// Split an i128 into its signed high and unsigned low 64-bit limbs. The truncating casts are the
+/// intended limb extraction: `>> 64` keeps the high 64 bits and `as u64` keeps the low 64 bits.
+#[allow(clippy::cast_possible_truncation)]
+fn split_i128(value: i128) -> (i64, u64) {
+    ((value >> 64) as i64, value as u64)
+}
+
 #[cfg(test)]
 mod tests {
+    use rstest::rstest;
     use vortex_array::ArrayRef;
     use vortex_array::IntoArray;
+    use vortex_array::LEGACY_SESSION;
+    use vortex_array::VortexSessionExecute;
     use vortex_array::arrays::BoolArray;
     use vortex_array::arrays::ConstantArray;
+    use vortex_array::arrays::DecimalArray;
     use vortex_array::arrays::PrimitiveArray;
     use vortex_array::assert_arrays_eq;
     use vortex_array::builtins::ArrayBuiltins;
@@ -91,6 +196,7 @@ mod tests {
     use vortex_array::scalar_fn::fns::between::BetweenOptions;
     use vortex_array::scalar_fn::fns::between::StrictComparison;
     use vortex_array::validity::Validity;
+    use vortex_buffer::Buffer;
     use vortex_buffer::buffer;
     use vortex_error::VortexResult;
 
@@ -104,6 +210,97 @@ mod tests {
         .into_array()
     }
 
+    #[allow(clippy::cast_possible_truncation)]
+    fn two_limb(values: &[i128], decimal_type: DecimalDType) -> ArrayRef {
+        let highs: Buffer<i64> = values.iter().map(|v| (v >> 64) as i64).collect();
+        let lows: Buffer<u64> = values.iter().map(|v| *v as u64).collect();
+        DecimalByteParts::try_new_with_lower(
+            PrimitiveArray::new(highs, Validity::NonNullable).into_array(),
+            PrimitiveArray::new(lows, Validity::NonNullable).into_array(),
+            decimal_type,
+        )
+        .unwrap()
+        .into_array()
+    }
+
+    /// The two-limb `between` pushdown must agree with the canonical i128 implementation across
+    /// values spanning the low-limb wraparound, the high limb, and negatives, for every strictness.
+    #[rstest]
+    #[case(StrictComparison::NonStrict, StrictComparison::NonStrict)]
+    #[case(StrictComparison::Strict, StrictComparison::NonStrict)]
+    #[case(StrictComparison::NonStrict, StrictComparison::Strict)]
+    #[case(StrictComparison::Strict, StrictComparison::Strict)]
+    fn two_limb_between_matches_canonical(
+        #[case] lower_strict: StrictComparison,
+        #[case] upper_strict: StrictComparison,
+    ) -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let decimal_type = DecimalDType::new(38, 0);
+        let values: Vec<i128> = vec![
+            0,
+            1,
+            -1,
+            i128::from(i64::MAX),
+            i128::from(i64::MAX) + 1,
+            (5i128 << 64) | 3,
+            (5i128 << 64) | 5,
+            (5i128 << 64) | 9,
+            (4i128 << 64) | i128::from(u64::MAX),
+            (6i128 << 64),
+            -(7i128 << 64) | 11,
+        ];
+        let lower = (5i128 << 64) | 3;
+        let upper = (5i128 << 64) | 9;
+        let len = values.len();
+        let options = BetweenOptions {
+            lower_strict,
+            upper_strict,
+        };
+
+        let lower_arr = decimal_const(DecimalValue::I128(lower), decimal_type, len);
+        let upper_arr = decimal_const(DecimalValue::I128(upper), decimal_type, len);
+
+        let got = two_limb(&values, decimal_type)
+            .between(lower_arr.clone(), upper_arr.clone(), options.clone())?
+            .execute::<BoolArray>(&mut ctx)?;
+
+        let canonical = DecimalArray::new(
+            values.iter().copied().collect::<Buffer<i128>>(),
+            decimal_type,
+            Validity::NonNullable,
+        )
+        .into_array();
+        let want = canonical
+            .between(lower_arr, upper_arr, options)?
+            .execute::<BoolArray>(&mut ctx)?;
+
+        assert_arrays_eq!(got, want);
+        Ok(())
+    }
+
+    /// A two-limb array must canonicalize to the same values as a canonical i128 `DecimalArray`.
+    #[test]
+    fn two_limb_canonicalizes_to_i128() -> VortexResult<()> {
+        let mut ctx = LEGACY_SESSION.create_execution_ctx();
+        let decimal_type = DecimalDType::new(38, 0);
+        let values: Vec<i128> = vec![
+            0,
+            -1,
+            i128::from(i64::MIN),
+            (3i128 << 64) | 42,
+            -(9i128 << 64) | 17,
+        ];
+
+        let got = two_limb(&values, decimal_type).execute::<DecimalArray>(&mut ctx)?;
+        let want = DecimalArray::new(
+            values.iter().copied().collect::<Buffer<i128>>(),
+            decimal_type,
+            Validity::NonNullable,
+        );
+        assert_arrays_eq!(got.into_array(), want.into_array());
+        Ok(())
+    }
+
     #[test]
     fn between_decimal_const() -> VortexResult<()> {
         let decimal_type = DecimalDType::new(8, 2);
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/cast.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/cast.rs
index 882d07bbaef..d0238f956d7 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/cast.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/cast.rs
@@ -24,14 +24,19 @@ impl CastReduce for DecimalByteParts {
             return Ok(None);
         };
 
-        // Cast the msp array to handle nullability change
+        // Cast the msp array to handle nullability change. Validity lives on the msp, so the lower
+        // limb (always non-nullable) is unchanged.
         let new_msp = array
             .msp()
             .cast(array.msp().dtype().with_nullability(*target_nullability))?;
 
-        Ok(Some(
-            DecimalByteParts::try_new(new_msp, *target_decimal)?.into_array(),
-        ))
+        let casted = match array.lower() {
+            None => DecimalByteParts::try_new(new_msp, *target_decimal)?,
+            Some(lower) => {
+                DecimalByteParts::try_new_with_lower(new_msp, lower.clone(), *target_decimal)?
+            }
+        };
+        Ok(Some(casted.into_array()))
     }
 }
 
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
index 041fcd864a1..58a1135702e 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
@@ -39,6 +39,11 @@ impl CompareKernel for DecimalByteParts {
             return Ok(None);
         };
 
+        // The two-limb representation only specializes `between`; defer comparisons to canonical.
+        if lhs.lower().is_some() {
+            return Ok(None);
+        }
+
         let nullability = lhs.dtype().nullability() | rhs.dtype().nullability();
         let scalar_type = lhs.msp().dtype().with_nullability(nullability);
 
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/filter.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/filter.rs
index c82171b0e13..91a836319d7 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/filter.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/filter.rs
@@ -13,14 +13,20 @@ use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 impl FilterReduce for DecimalByteParts {
     fn filter(array: ArrayView<'_, Self>, mask: &Mask) -> VortexResult<Option<ArrayRef>> {
-        DecimalByteParts::try_new(
-            array.msp().filter(mask.clone())?,
-            *array
-                .dtype()
-                .as_decimal_opt()
-                .vortex_expect("must be a decimal dtype"),
-        )
-        .map(|d| Some(d.into_array()))
+        let decimal_dtype = *array
+            .dtype()
+            .as_decimal_opt()
+            .vortex_expect("must be a decimal dtype");
+        let msp = array.msp().filter(mask.clone())?;
+        let filtered = match array.lower() {
+            None => DecimalByteParts::try_new(msp, decimal_dtype)?,
+            Some(lower) => DecimalByteParts::try_new_with_lower(
+                msp,
+                lower.filter(mask.clone())?,
+                decimal_dtype,
+            )?,
+        };
+        Ok(Some(filtered.into_array()))
     }
 }
 
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/is_constant.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/is_constant.rs
index f4da528a9b3..ccf28594691 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/is_constant.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/is_constant.rs
@@ -34,6 +34,11 @@ impl DynAggregateKernel for DecimalBytePartsIsConstantKernel {
             return Ok(None);
         };
 
+        // Constness of a two-limb array depends on both limbs; defer to the canonical path.
+        if array.lower().is_some() {
+            return Ok(None);
+        }
+
         let result = is_constant(array.msp(), ctx)?;
         Ok(Some(IsConstant::make_partial(batch, result, ctx)?))
     }
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mask.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mask.rs
index 0f007519894..efbdd02775b 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mask.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mask.rs
@@ -16,20 +16,23 @@ use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 
 impl MaskReduce for DecimalByteParts {
     fn mask(array: ArrayView<'_, Self>, mask: &ArrayRef) -> VortexResult<Option<ArrayRef>> {
+        let decimal_dtype = *array
+            .dtype()
+            .as_decimal_opt()
+            .vortex_expect("must be a decimal dtype");
+        // Validity is carried solely by the msp, so masking the msp is sufficient; the lower
+        // limb's values at masked positions become unobservable.
         let masked_msp = MaskExpr.try_new_array(
             array.msp().len(),
             EmptyOptions,
             [array.msp().clone(), mask.clone()],
         )?;
-        Ok(Some(
-            DecimalByteParts::try_new(
-                masked_msp,
-                *array
-                    .dtype()
-                    .as_decimal_opt()
-                    .vortex_expect("must be a decimal dtype"),
-            )?
-            .into_array(),
-        ))
+        let masked = match array.lower() {
+            None => DecimalByteParts::try_new(masked_msp, decimal_dtype)?,
+            Some(lower) => {
+                DecimalByteParts::try_new_with_lower(masked_msp, lower.clone(), decimal_dtype)?
+            }
+        };
+        Ok(Some(masked.into_array()))
     }
 }
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
index 8434b4f7dfb..fca746b7eb5 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
@@ -14,14 +14,29 @@ mod take;
 mod tests {
     use rstest::rstest;
     use vortex_array::IntoArray;
+    use vortex_array::arrays::BoolArray;
     use vortex_array::arrays::PrimitiveArray;
     use vortex_array::compute::conformance::consistency::test_array_consistency;
     use vortex_array::dtype::DecimalDType;
+    use vortex_array::validity::Validity;
+    use vortex_buffer::Buffer;
     use vortex_buffer::buffer;
 
     use crate::DecimalByteParts;
     use crate::DecimalBytePartsArray;
 
+    #[allow(clippy::cast_possible_truncation)]
+    fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> DecimalBytePartsArray {
+        let highs: Buffer<i64> = values.iter().map(|v| (v >> 64) as i64).collect();
+        let lows: Buffer<u64> = values.iter().map(|v| *v as u64).collect();
+        DecimalByteParts::try_new_with_lower(
+            PrimitiveArray::new(highs, validity).into_array(),
+            PrimitiveArray::new(lows, Validity::NonNullable).into_array(),
+            dt,
+        )
+        .unwrap()
+    }
+
     #[rstest]
     // Basic decimal byte parts arrays
     #[case::decimal_i32(DecimalByteParts::try_new(
@@ -68,6 +83,17 @@ mod tests {
         PrimitiveArray::from_iter((0..2000i64).map(|i| i * 1000000)).into_array(),
         DecimalDType::new(19, 6)
     ).unwrap())]
+    // Two-limb i128 representation
+    #[case::two_limb(two_limb(
+        &[0, -1, (3i128 << 64) | 42, -(9i128 << 64) | 17, i128::from(i64::MIN), (7i128 << 64)],
+        Validity::NonNullable,
+        DecimalDType::new(38, 2),
+    ))]
+    #[case::two_limb_nullable(two_limb(
+        &[0, -1, (3i128 << 64) | 42, -(9i128 << 64) | 17, 5],
+        Validity::Array(BoolArray::from_iter([true, false, true, true, false]).into_array()),
+        DecimalDType::new(38, 2),
+    ))]
 
     fn test_decimal_byte_parts_consistency(#[case] array: DecimalBytePartsArray) {
         test_array_consistency(&array.into_array());
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/take.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/take.rs
index 807b660ec73..807bc48e847 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/take.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/take.rs
@@ -18,13 +18,19 @@ impl TakeExecute for DecimalByteParts {
         indices: &ArrayRef,
         _ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<ArrayRef>> {
-        DecimalByteParts::try_new(
-            array.msp().take(indices.clone())?,
-            *array
-                .dtype()
-                .as_decimal_opt()
-                .vortex_expect("must be a decimal dtype"),
-        )
-        .map(|a| Some(a.into_array()))
+        let decimal_dtype = *array
+            .dtype()
+            .as_decimal_opt()
+            .vortex_expect("must be a decimal dtype");
+        let msp = array.msp().take(indices.clone())?;
+        let taken = match array.lower() {
+            None => DecimalByteParts::try_new(msp, decimal_dtype)?,
+            Some(lower) => DecimalByteParts::try_new_with_lower(
+                msp,
+                lower.take(indices.clone())?,
+                decimal_dtype,
+            )?,
+        };
+        Ok(Some(taken.into_array()))
     }
 }
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/mod.rs
index 76349f40910..f2f8eeb7a0f 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/mod.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/mod.rs
@@ -27,6 +27,7 @@ use vortex_array::arrays::PrimitiveArray;
 use vortex_array::buffer::BufferHandle;
 use vortex_array::dtype::DType;
 use vortex_array::dtype::DecimalDType;
+use vortex_array::dtype::Nullability;
 use vortex_array::dtype::PType;
 use vortex_array::match_each_signed_integer_ptype;
 use vortex_array::scalar::DecimalValue;
@@ -38,6 +39,7 @@ use vortex_array::vtable::OperationsVTable;
 use vortex_array::vtable::VTable;
 use vortex_array::vtable::ValidityChild;
 use vortex_array::vtable::ValidityVTableFromChild;
+use vortex_buffer::Buffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 use vortex_error::vortex_bail;
@@ -94,7 +96,8 @@ impl VTable for DecimalByteParts {
         let msp = slots[MSP_SLOT]
             .as_ref()
             .vortex_expect("DecimalBytePartsArray msp slot");
-        DecimalBytePartsData::validate(msp, *decimal_dtype, dtype, len)
+        let lower = slots.get(LOWER_SLOT).and_then(Option::as_ref);
+        DecimalBytePartsData::validate(msp, lower, *decimal_dtype, dtype, len)
     }
 
     fn nbuffers(_array: ArrayView<'_, Self>) -> usize {
@@ -116,7 +119,7 @@ impl VTable for DecimalByteParts {
         Ok(Some(
             DecimalBytesPartsMetadata {
                 zeroth_child_ptype: PType::try_from(array.msp().dtype())? as i32,
-                lower_part_count: 0,
+                lower_part_count: u32::from(array.lower().is_some()),
             }
             .encode_to_vec(),
         ))
@@ -140,14 +143,22 @@ impl VTable for DecimalByteParts {
 
         let msp = children.get(0, &encoded_dtype, len)?;
 
-        assert_eq!(
-            metadata.lower_part_count, 0,
-            "lower_part_count > 0 not currently supported"
-        );
+        let slots = match metadata.lower_part_count {
+            0 => smallvec![Some(msp.clone())],
+            1 => {
+                let lower_dtype = DType::Primitive(LOWER_PTYPE, Nullability::NonNullable);
+                let lower = children.get(1, &lower_dtype, len)?;
+                smallvec![Some(msp.clone()), Some(lower)]
+            }
+            n => vortex_bail!("decimal byte parts supports at most one lower limb, got {n}"),
+        };
 
-        let slots = smallvec![Some(msp.clone())];
-        let data = DecimalBytePartsData::try_new(msp.dtype(), msp.len(), *decimal_dtype)?;
-        Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots))
+        let lower = slots.as_slice().get(LOWER_SLOT).and_then(Option::as_ref);
+        DecimalBytePartsData::validate(&msp, lower, *decimal_dtype, dtype, len)?;
+        Ok(
+            ArrayParts::new(self.clone(), dtype.clone(), len, DecimalBytePartsData)
+                .with_slots(slots),
+        )
     }
 
     fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String {
@@ -176,23 +187,29 @@ impl VTable for DecimalByteParts {
     }
 }
 
-/// The most significant parts of the decimal values.
+/// The most significant part (high limb) of the decimal values.
 pub(super) const MSP_SLOT: usize = 0;
-pub(super) const NUM_SLOTS: usize = 1;
-pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["msp"];
-
-/// This array encodes decimals as between 1-4 columns of primitive typed children.
-/// The most significant part (msp) sorting the most significant decimal bits.
-/// This array must be signed and is nullable iff the decimal is nullable.
+/// The single lower limb, present only for the two-limb i128 representation.
+pub(super) const LOWER_SLOT: usize = 1;
+/// The maximum number of children an array of this encoding can hold.
+pub(super) const MAX_SLOTS: usize = 2;
+pub(super) const SLOT_NAMES: [&str; MAX_SLOTS] = ["msp", "lower"];
+
+/// The physical type of the lower limb in the two-limb representation.
+pub(super) const LOWER_PTYPE: PType = PType::U64;
+
+/// This array encodes decimals as 1 or 2 columns of primitive typed children.
+/// The most significant part (msp) stores the most significant decimal bits and is always a
+/// signed integer, nullable iff the decimal is nullable.
+///
+/// With a single child the decimal value is exactly the (sign-extended) msp. With two children the
+/// value is reconstructed as `(msp as i128) << 64 | (lower as u64 as i128)`, i.e. the msp is the
+/// signed high 64-bit limb and `lower` is the unsigned low 64-bit limb. The lower limb is always a
+/// non-nullable `u64`; validity is carried solely by the msp.
 ///
-/// e.g. for a decimal i128 \[ 127..64 | 64..0 \] msp = 127..64 and lower_part\[0\] = 64..0
+/// e.g. for a decimal i128 \[ 127..64 | 64..0 \] msp = 127..64 and lower = 64..0
 #[derive(Clone, Debug)]
-pub struct DecimalBytePartsData {
-    // NOTE: the lower_parts is currently unused, we reserve this field so that it is properly
-    //  read/written during serde, but provide no constructor to initialize this to anything
-    //  other than the empty Vec.
-    _lower_parts: Vec<ArrayRef>,
-}
+pub struct DecimalBytePartsData;
 
 impl Display for DecimalBytePartsData {
     fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result {
@@ -210,6 +227,14 @@ pub trait DecimalBytePartsArrayExt: TypedArrayRef<DecimalByteParts> {
             .as_ref()
             .vortex_expect("DecimalBytePartsArray msp slot")
     }
+
+    /// The lower (low 64-bit) limb, present only for the two-limb i128 representation.
+    fn lower(&self) -> Option<&ArrayRef> {
+        self.as_ref()
+            .slots()
+            .get(LOWER_SLOT)
+            .and_then(Option::as_ref)
+    }
 }
 
 impl<T: TypedArrayRef<DecimalByteParts>> DecimalBytePartsArrayExt for T {}
@@ -217,6 +242,7 @@ impl<T: TypedArrayRef<DecimalByteParts>> DecimalBytePartsArrayExt for T {}
 impl DecimalBytePartsData {
     pub fn validate(
         msp: &ArrayRef,
+        lower: Option<&ArrayRef>,
         decimal_dtype: DecimalDType,
         dtype: &DType,
         len: usize,
@@ -231,24 +257,25 @@ impl DecimalBytePartsData {
             "expected dtype {expected_dtype}, got {dtype}"
         );
         vortex_ensure!(msp.len() == len, "expected len {len}, got {}", msp.len());
-        Ok(())
-    }
 
-    pub(crate) fn try_new(
-        msp_dtype: &DType,
-        msp_len: usize,
-        decimal_dtype: DecimalDType,
-    ) -> VortexResult<Self> {
-        let expected_dtype = DType::Decimal(decimal_dtype, msp_dtype.nullability());
-        vortex_ensure!(
-            msp_dtype.is_signed_int(),
-            "decimal bytes parts, first part must be a signed array"
-        );
-        let _ = msp_len;
-        drop(expected_dtype);
-        Ok(Self {
-            _lower_parts: Vec::new(),
-        })
+        if let Some(lower) = lower {
+            vortex_ensure!(
+                matches!(msp.dtype(), DType::Primitive(PType::I64, _)),
+                "two-limb decimal byte parts requires an i64 high limb, got {}",
+                msp.dtype()
+            );
+            vortex_ensure!(
+                lower.dtype() == &DType::Primitive(LOWER_PTYPE, Nullability::NonNullable),
+                "decimal byte parts lower limb must be a non-nullable {LOWER_PTYPE}, got {}",
+                lower.dtype()
+            );
+            vortex_ensure!(
+                lower.len() == len,
+                "lower limb length {} != array length {len}",
+                lower.len()
+            );
+        }
+        Ok(())
     }
 }
 
@@ -256,18 +283,41 @@ impl DecimalBytePartsData {
 pub struct DecimalByteParts;
 
 impl DecimalByteParts {
-    /// Construct a new [`DecimalBytePartsArray`] from an MSP array and decimal dtype.
+    /// Construct a new single-limb [`DecimalBytePartsArray`] from an MSP array and decimal dtype.
     pub fn try_new(
         msp: ArrayRef,
         decimal_dtype: DecimalDType,
     ) -> VortexResult<DecimalBytePartsArray> {
         let len = msp.len();
         let dtype = DType::Decimal(decimal_dtype, msp.dtype().nullability());
-        let slots = smallvec![Some(msp.clone())];
-        let data = DecimalBytePartsData::try_new(msp.dtype(), msp.len(), decimal_dtype)?;
+        DecimalBytePartsData::validate(&msp, None, decimal_dtype, &dtype, len)?;
+        let slots = smallvec![Some(msp)];
         Ok(unsafe {
             Array::from_parts_unchecked(
-                ArrayParts::new(DecimalByteParts, dtype, len, data).with_slots(slots),
+                ArrayParts::new(DecimalByteParts, dtype, len, DecimalBytePartsData)
+                    .with_slots(slots),
+            )
+        })
+    }
+
+    /// Construct a two-limb [`DecimalBytePartsArray`] representing an i128 decimal.
+    ///
+    /// `msp` is the signed high 64-bit limb (carrying validity); `lower` is the non-nullable
+    /// unsigned low 64-bit limb. The decimal value at index `i` is
+    /// `(msp[i] as i128) << 64 | (lower[i] as u64 as i128)`.
+    pub fn try_new_with_lower(
+        msp: ArrayRef,
+        lower: ArrayRef,
+        decimal_dtype: DecimalDType,
+    ) -> VortexResult<DecimalBytePartsArray> {
+        let len = msp.len();
+        let dtype = DType::Decimal(decimal_dtype, msp.dtype().nullability());
+        DecimalBytePartsData::validate(&msp, Some(&lower), decimal_dtype, &dtype, len)?;
+        let slots = smallvec![Some(msp), Some(lower)];
+        Ok(unsafe {
+            Array::from_parts_unchecked(
+                ArrayParts::new(DecimalByteParts, dtype, len, DecimalBytePartsData)
+                    .with_slots(slots),
             )
         })
     }
@@ -278,26 +328,38 @@ fn to_canonical_decimal(
     array: &DecimalBytePartsArray,
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef> {
-    // TODO(joe): support parts len != 1
-    let prim = array.msp().clone().execute::<PrimitiveArray>(ctx)?;
-    // Depending on the decimal type and the min/max of the primitive array we can choose
-    // the correct buffer size
-
-    Ok(match_each_signed_integer_ptype!(prim.ptype(), |P| {
-        // SAFETY: The primitive array's buffer is already validated with correct type.
-        // The decimal dtype matches the array's dtype, and validity is preserved.
-        unsafe {
-            DecimalArray::new_unchecked(
-                prim.to_buffer::<P>(),
-                *array
-                    .dtype()
-                    .as_decimal_opt()
-                    .vortex_expect("must be a decimal dtype"),
-                prim.validity()?,
-            )
-        }
-        .into_array()
-    }))
+    let decimal_dtype = *array
+        .dtype()
+        .as_decimal_opt()
+        .vortex_expect("must be a decimal dtype");
+    let msp = array.msp().clone().execute::<PrimitiveArray>(ctx)?;
+
+    let Some(lower) = array.lower() else {
+        // Single-limb: the decimal is exactly the sign-extended msp.
+        return Ok(match_each_signed_integer_ptype!(msp.ptype(), |P| {
+            // SAFETY: The primitive array's buffer is already validated with correct type.
+            // The decimal dtype matches the array's dtype, and validity is preserved.
+            unsafe {
+                DecimalArray::new_unchecked(msp.to_buffer::<P>(), decimal_dtype, msp.validity()?)
+            }
+            .into_array()
+        }));
+    };
+
+    // Two-limb: reconstruct each i128 as `(high as i128) << 64 | low`.
+    let lower = lower.clone().execute::<PrimitiveArray>(ctx)?;
+    let validity = msp.validity()?;
+    let low = lower.as_slice::<u64>();
+    let values: Buffer<i128> = match_each_signed_integer_ptype!(msp.ptype(), |P| {
+        msp.as_slice::<P>()
+            .iter()
+            .zip(low)
+            .map(|(&high, &low)| ((high as i128) << 64) | i128::from(low))
+            .collect()
+    });
+
+    // SAFETY: validity comes from the msp and the reconstructed values match the decimal dtype.
+    Ok(unsafe { DecimalArray::new_unchecked(values, decimal_dtype, validity) }.into_array())
 }
 
 impl OperationsVTable<DecimalByteParts> for DecimalByteParts {
@@ -306,16 +368,29 @@ impl OperationsVTable<DecimalByteParts> for DecimalByteParts {
         index: usize,
         ctx: &mut ExecutionCtx,
     ) -> VortexResult<Scalar> {
-        // TODO(joe): support parts len != 1
-        let scalar = array.msp().execute_scalar(index, ctx)?;
-
-        // Note. values in msp, can only be signed integers upto size i64.
-        let primitive_scalar = scalar.as_primitive();
-        // TODO(joe): extend this to support multiple parts.
-        let value = primitive_scalar.as_::<i64>().vortex_expect("non-null");
+        // Null indices are short-circuited by the validity child (the msp), so the msp scalar
+        // here is always non-null.
+        let high = array
+            .msp()
+            .execute_scalar(index, ctx)?
+            .as_primitive()
+            .as_::<i64>()
+            .vortex_expect("non-null");
+
+        let decimal_value = match array.lower() {
+            None => DecimalValue::I64(high),
+            Some(lower) => {
+                let low = lower
+                    .execute_scalar(index, ctx)?
+                    .as_primitive()
+                    .as_::<u64>()
+                    .vortex_expect("lower limb is non-nullable");
+                DecimalValue::I128((i128::from(high) << 64) | i128::from(low))
+            }
+        };
         Scalar::try_new(
             array.dtype().clone(),
-            Some(ScalarValue::Decimal(DecimalValue::I64(value))),
+            Some(ScalarValue::Decimal(decimal_value)),
         )
     }
 }
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs
index 7fafa0d5289..3e9c74dfef8 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs
@@ -38,8 +38,9 @@ impl ArrayParentReduceRule<DecimalByteParts> for DecimalBytePartsFilterPushDownR
         _child_idx: usize,
     ) -> VortexResult<Option<ArrayRef>> {
         // TODO(ngates): we should benchmark whether to push-down filters with "lower parts".
-        //  For now, we only push down if there are no lower parts.
-        if !child._lower_parts.is_empty() {
+        //  For now, we only push down if there are no lower parts; two-limb arrays fall back to
+        //  the generic `FilterReduce` implementation which filters both limbs.
+        if child.lower().is_some() {
             return Ok(None);
         }
 
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/slice.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/slice.rs
index a9c022d2051..530ece55892 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/slice.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/slice.rs
@@ -15,15 +15,17 @@ use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 
 impl SliceReduce for DecimalByteParts {
     fn slice(array: ArrayView<'_, Self>, range: Range<usize>) -> VortexResult<Option<ArrayRef>> {
-        Ok(Some(
-            DecimalByteParts::try_new(
-                array.msp().slice(range)?,
-                *array
-                    .dtype()
-                    .as_decimal_opt()
-                    .vortex_expect("must be a decimal dtype"),
-            )?
-            .into_array(),
-        ))
+        let decimal_dtype = *array
+            .dtype()
+            .as_decimal_opt()
+            .vortex_expect("must be a decimal dtype");
+        let msp = array.msp().slice(range.clone())?;
+        let sliced = match array.lower() {
+            None => DecimalByteParts::try_new(msp, decimal_dtype)?,
+            Some(lower) => {
+                DecimalByteParts::try_new_with_lower(msp, lower.slice(range)?, decimal_dtype)?
+            }
+        };
+        Ok(Some(sliced.into_array()))
     }
 }

From 903b11653771089d9c0cbb3d55f5c9ba40570631 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 17:31:13 +0000
Subject: [PATCH 05/13] Replace two-limb between with a fused single-pass
 kernel

The previous two-limb `between` composed ~11 generic array ops, each a full
pass allocating an intermediate Bool array, which was slower than both arrow
and the canonical i128 path. This replaces it with a single fused loop that
materializes the two limbs once and computes the lexicographic comparison per
row with native-width (i64/u64) integer ops via BitBuffer::collect_bool, using
branch-free bitwise combines so the body vectorizes.

On wide i128 values this cuts two-limb `between` from ~605us to ~267us at 131k
rows, making it ~1.8x faster than the canonical i128 path. Correctness is
unchanged and still verified against the canonical implementation.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/decimal_byte_parts/compute/between.rs | 120 ++++++++++--------
 1 file changed, 69 insertions(+), 51 deletions(-)

diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
index 421677f3b49..f326f937bf1 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
@@ -5,22 +5,21 @@ use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
+use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::PrimitiveArray;
 use vortex_array::builtins::ArrayBuiltins;
-use vortex_array::dtype::DType;
 use vortex_array::dtype::Nullability;
 use vortex_array::scalar::Scalar;
-use vortex_array::scalar::ScalarValue;
 use vortex_array::scalar_fn::fns::between::BetweenKernel;
 use vortex_array::scalar_fn::fns::between::BetweenOptions;
 use vortex_array::scalar_fn::fns::between::StrictComparison;
-use vortex_array::scalar_fn::fns::operators::Operator;
+use vortex_buffer::BitBuffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 
 use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
-use crate::decimal_byte_parts::LOWER_PTYPE;
 use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive;
 
 impl BetweenKernel for DecimalByteParts {
@@ -29,7 +28,7 @@ impl BetweenKernel for DecimalByteParts {
         lower: &ArrayRef,
         upper: &ArrayRef,
         options: &BetweenOptions,
-        _ctx: &mut ExecutionCtx,
+        ctx: &mut ExecutionCtx,
     ) -> VortexResult<Option<ArrayRef>> {
         // We can only push the comparison down into the limbs when both bounds are constant.
         let (Some(lower_const), Some(upper_const)) = (lower.as_constant(), upper.as_constant())
@@ -65,6 +64,7 @@ impl BetweenKernel for DecimalByteParts {
                 upper_i128,
                 options,
                 nullability,
+                ctx,
             )?));
         }
 
@@ -99,74 +99,92 @@ impl BetweenKernel for DecimalByteParts {
     }
 }
 
-/// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation.
+/// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation in a
+/// single fused pass.
 ///
 /// With high limb `H` (signed i64) and low limb `L` (unsigned u64), the value `v = H<<64 | L`
 /// satisfies `v >= lower` iff `H > lo_h OR (H == lo_h AND L >=' lo_l)` and `v <= upper` iff
 /// `H < hi_h OR (H == hi_h AND L <=' hi_l)`, where `>='`/`<='` are strict when requested. Each
-/// limb comparison is a native-width integer compare that vectorizes far better than a 128-bit
-/// comparison.
+/// row resolves to native-width integer comparisons in one loop, which vectorizes far better than
+/// arrow's 128-bit comparison or a tree of generic array operations with intermediate allocations.
 fn two_limb_between(
     arr: &ArrayView<'_, DecimalByteParts>,
     lower: i128,
     upper: i128,
     options: &BetweenOptions,
     nullability: Nullability,
+    ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef> {
-    let len = arr.len();
-    let high = arr.msp().clone();
+    let high = arr.msp().clone().execute::<PrimitiveArray>(ctx)?;
     let low = arr
         .lower()
         .vortex_expect("two-limb path requires a lower limb")
-        .clone();
+        .clone()
+        .execute::<PrimitiveArray>(ctx)?;
 
-    let high_dtype = high.dtype().with_nullability(nullability);
-    let low_dtype = DType::Primitive(LOWER_PTYPE, nullability);
-
-    let high_const = |limb: i64| -> VortexResult<ArrayRef> {
-        Ok(ConstantArray::new(
-            Scalar::try_new(high_dtype.clone(), Some(ScalarValue::from(limb)))?,
-            len,
-        )
-        .into_array())
-    };
-    let low_const = |limb: u64| -> VortexResult<ArrayRef> {
-        Ok(ConstantArray::new(
-            Scalar::try_new(low_dtype.clone(), Some(ScalarValue::from(limb)))?,
-            len,
-        )
-        .into_array())
-    };
-
-    let lower_low_op = match options.lower_strict {
-        StrictComparison::Strict => Operator::Gt,
-        StrictComparison::NonStrict => Operator::Gte,
-    };
-    let upper_low_op = match options.upper_strict {
-        StrictComparison::Strict => Operator::Lt,
-        StrictComparison::NonStrict => Operator::Lte,
-    };
+    let validity = high.validity()?.union_nullability(nullability);
+    let high = high.as_slice::<i64>();
+    let low = low.as_slice::<u64>();
+    assert_eq!(high.len(), low.len(), "limb lengths must match");
 
     let (lower_high, lower_low) = split_i128(lower);
     let (upper_high, upper_low) = split_i128(upper);
 
-    // value >= lower
-    let ge_lower = {
-        let h_gt = high.binary(high_const(lower_high)?, Operator::Gt)?;
-        let h_eq = high.binary(high_const(lower_high)?, Operator::Eq)?;
-        let l_cmp = low.binary(low_const(lower_low)?, lower_low_op)?;
-        h_gt.binary(h_eq.binary(l_cmp, Operator::And)?, Operator::Or)?
-    };
+    let lower_limbs = (lower_high, lower_low);
+    let upper_limbs = (upper_high, upper_low);
 
-    // value <= upper
-    let le_upper = {
-        let h_lt = high.binary(high_const(upper_high)?, Operator::Lt)?;
-        let h_eq = high.binary(high_const(upper_high)?, Operator::Eq)?;
-        let l_cmp = low.binary(low_const(upper_low)?, upper_low_op)?;
-        h_lt.binary(h_eq.binary(l_cmp, Operator::And)?, Operator::Or)?
+    // Pass the low-limb comparison as a monomorphized fn so the whole per-element body inlines.
+    let bits = match (options.lower_strict, options.upper_strict) {
+        (StrictComparison::Strict, StrictComparison::Strict) => {
+            collect_two_limb(high, low, lower_limbs, u64_gt, upper_limbs, u64_lt)
+        }
+        (StrictComparison::Strict, StrictComparison::NonStrict) => {
+            collect_two_limb(high, low, lower_limbs, u64_gt, upper_limbs, u64_le)
+        }
+        (StrictComparison::NonStrict, StrictComparison::Strict) => {
+            collect_two_limb(high, low, lower_limbs, u64_ge, upper_limbs, u64_lt)
+        }
+        (StrictComparison::NonStrict, StrictComparison::NonStrict) => {
+            collect_two_limb(high, low, lower_limbs, u64_ge, upper_limbs, u64_le)
+        }
     };
 
-    ge_lower.binary(le_upper, Operator::And)
+    Ok(BoolArray::new(bits, validity).into_array())
+}
+
+/// The fused per-element loop. `lower_low_cmp`/`upper_low_cmp` apply the (possibly strict) low-limb
+/// comparison. Bitwise (non-short-circuiting) `&`/`|` keep the body branch-free for the vectorizer.
+fn collect_two_limb(
+    high: &[i64],
+    low: &[u64],
+    lower: (i64, u64),
+    lower_low_cmp: impl Fn(u64, u64) -> bool,
+    upper: (i64, u64),
+    upper_low_cmp: impl Fn(u64, u64) -> bool,
+) -> BitBuffer {
+    let (lower_high, lower_low) = lower;
+    let (upper_high, upper_low) = upper;
+    BitBuffer::collect_bool(high.len(), |idx| {
+        // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len().
+        let h = unsafe { *high.get_unchecked(idx) };
+        let l = unsafe { *low.get_unchecked(idx) };
+        let ge_lower = (h > lower_high) | ((h == lower_high) & lower_low_cmp(l, lower_low));
+        let le_upper = (h < upper_high) | ((h == upper_high) & upper_low_cmp(l, upper_low));
+        ge_lower & le_upper
+    })
+}
+
+fn u64_ge(a: u64, b: u64) -> bool {
+    a >= b
+}
+fn u64_gt(a: u64, b: u64) -> bool {
+    a > b
+}
+fn u64_le(a: u64, b: u64) -> bool {
+    a <= b
+}
+fn u64_lt(a: u64, b: u64) -> bool {
+    a < b
 }
 
 /// Split an i128 into its signed high and unsigned low 64-bit limbs. The truncating casts are the

From df2d58e7f706c0471b01114097ef76ac01124805 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 17:48:16 +0000
Subject: [PATCH 06/13] Beat arrow on two-limb between by reconstructing i128
 in the fused pass

The hand-rolled lexicographic comparison (6 native compares + bitwise combine
per row) did more per-element work than a single hardware i128 compare, leaving
two-limb between slower than arrow. Reconstructing the i128 from its
signed-high / unsigned-low limbs and comparing it directly is ~2 instructions
per bound, and the single fused pass reads each limb once where arrow reads its
i128 array twice (one pass each for gt_eq/lt_eq).

At the benchmarked cache-resident sizes this makes two-limb between faster than
arrow's Decimal128 (~1.1-1.2x) and ~1.9x faster than the canonical i128 path,
while staying under the 1ms-per-op budget. Correctness is unchanged and still
verified against the canonical implementation.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/decimal_byte_parts/compute/between.rs | 57 +++++++------------
 1 file changed, 22 insertions(+), 35 deletions(-)

diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
index f326f937bf1..8794e3316f4 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
@@ -127,73 +127,60 @@ fn two_limb_between(
     let low = low.as_slice::<u64>();
     assert_eq!(high.len(), low.len(), "limb lengths must match");
 
-    let (lower_high, lower_low) = split_i128(lower);
-    let (upper_high, upper_low) = split_i128(upper);
-
-    let lower_limbs = (lower_high, lower_low);
-    let upper_limbs = (upper_high, upper_low);
-
-    // Pass the low-limb comparison as a monomorphized fn so the whole per-element body inlines.
+    // Pass the comparison as a monomorphized fn so the whole per-element body inlines.
     let bits = match (options.lower_strict, options.upper_strict) {
         (StrictComparison::Strict, StrictComparison::Strict) => {
-            collect_two_limb(high, low, lower_limbs, u64_gt, upper_limbs, u64_lt)
+            collect_two_limb(high, low, lower, i128_gt, upper, i128_lt)
         }
         (StrictComparison::Strict, StrictComparison::NonStrict) => {
-            collect_two_limb(high, low, lower_limbs, u64_gt, upper_limbs, u64_le)
+            collect_two_limb(high, low, lower, i128_gt, upper, i128_le)
         }
         (StrictComparison::NonStrict, StrictComparison::Strict) => {
-            collect_two_limb(high, low, lower_limbs, u64_ge, upper_limbs, u64_lt)
+            collect_two_limb(high, low, lower, i128_ge, upper, i128_lt)
         }
         (StrictComparison::NonStrict, StrictComparison::NonStrict) => {
-            collect_two_limb(high, low, lower_limbs, u64_ge, upper_limbs, u64_le)
+            collect_two_limb(high, low, lower, i128_ge, upper, i128_le)
         }
     };
 
     Ok(BoolArray::new(bits, validity).into_array())
 }
 
-/// The fused per-element loop. `lower_low_cmp`/`upper_low_cmp` apply the (possibly strict) low-limb
-/// comparison. Bitwise (non-short-circuiting) `&`/`|` keep the body branch-free for the vectorizer.
+/// The fused per-element loop. Reconstruct the i128 from its signed-high / unsigned-low limbs and
+/// compare against the bounds. A native i128 comparison is ~2 instructions, cheaper than a manual
+/// lexicographic decomposition, and the single pass reads each limb once (arrow reads its i128
+/// array twice, once per `gt_eq`/`lt_eq`). Bitwise `&` keeps the body branch-free.
 fn collect_two_limb(
     high: &[i64],
     low: &[u64],
-    lower: (i64, u64),
-    lower_low_cmp: impl Fn(u64, u64) -> bool,
-    upper: (i64, u64),
-    upper_low_cmp: impl Fn(u64, u64) -> bool,
+    lower: i128,
+    lower_cmp: impl Fn(i128, i128) -> bool,
+    upper: i128,
+    upper_cmp: impl Fn(i128, i128) -> bool,
 ) -> BitBuffer {
-    let (lower_high, lower_low) = lower;
-    let (upper_high, upper_low) = upper;
     BitBuffer::collect_bool(high.len(), |idx| {
         // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len().
-        let h = unsafe { *high.get_unchecked(idx) };
-        let l = unsafe { *low.get_unchecked(idx) };
-        let ge_lower = (h > lower_high) | ((h == lower_high) & lower_low_cmp(l, lower_low));
-        let le_upper = (h < upper_high) | ((h == upper_high) & upper_low_cmp(l, upper_low));
-        ge_lower & le_upper
+        let hi = unsafe { *high.get_unchecked(idx) };
+        let lo = unsafe { *low.get_unchecked(idx) };
+        // Sign-extend the high limb, zero-extend the low limb: value = (hi << 64) | lo.
+        let value = ((hi as i128) << 64) | (lo as i128);
+        lower_cmp(value, lower) & upper_cmp(value, upper)
     })
 }
 
-fn u64_ge(a: u64, b: u64) -> bool {
+fn i128_ge(a: i128, b: i128) -> bool {
     a >= b
 }
-fn u64_gt(a: u64, b: u64) -> bool {
+fn i128_gt(a: i128, b: i128) -> bool {
     a > b
 }
-fn u64_le(a: u64, b: u64) -> bool {
+fn i128_le(a: i128, b: i128) -> bool {
     a <= b
 }
-fn u64_lt(a: u64, b: u64) -> bool {
+fn i128_lt(a: i128, b: i128) -> bool {
     a < b
 }
 
-/// Split an i128 into its signed high and unsigned low 64-bit limbs. The truncating casts are the
-/// intended limb extraction: `>> 64` keeps the high 64 bits and `as u64` keeps the low 64 bits.
-#[allow(clippy::cast_possible_truncation)]
-fn split_i128(value: i128) -> (i64, u64) {
-    ((value >> 64) as i64, value as u64)
-}
-
 #[cfg(test)]
 mod tests {
     use rstest::rstest;

From 0d3700a8b5dfed6ec16a3a065384ade061931e4d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 18:00:33 +0000
Subject: [PATCH 07/13] Push two-limb comparisons down with a fused i128
 reconstruct kernel

The two-limb representation previously deferred all `compare` operators to the
canonical i128 path. Reuse the fused reconstruct approach from `between`: a
single pass rebuilds each i128 from its (signed high, unsigned low) limbs and
applies the comparison, reading each limb once. This specializes all six
operators (eq/neq/lt/lte/gt/gte), not just lt, since they share one loop.

Factor the shared limb materialization, i128 reconstruct, and i128 comparator
helpers into a `two_limb` module used by both `between` and `compare`.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/decimal_byte_parts/compute/between.rs |  31 +---
 .../src/decimal_byte_parts/compute/compare.rs | 151 +++++++++++++++++-
 .../src/decimal_byte_parts/compute/mod.rs     |   1 +
 .../decimal_byte_parts/compute/two_limb.rs    |  57 +++++++
 4 files changed, 211 insertions(+), 29 deletions(-)
 create mode 100644 encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs

diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
index 8794e3316f4..040b6cde847 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
@@ -7,7 +7,6 @@ use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
 use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::ConstantArray;
-use vortex_array::arrays::PrimitiveArray;
 use vortex_array::builtins::ArrayBuiltins;
 use vortex_array::dtype::Nullability;
 use vortex_array::scalar::Scalar;
@@ -21,6 +20,12 @@ use vortex_error::VortexResult;
 use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive;
+use crate::decimal_byte_parts::compute::two_limb::i128_ge;
+use crate::decimal_byte_parts::compute::two_limb::i128_gt;
+use crate::decimal_byte_parts::compute::two_limb::i128_le;
+use crate::decimal_byte_parts::compute::two_limb::i128_lt;
+use crate::decimal_byte_parts::compute::two_limb::materialize_limbs;
+use crate::decimal_byte_parts::compute::two_limb::reconstruct;
 
 impl BetweenKernel for DecimalByteParts {
     fn between(
@@ -115,13 +120,7 @@ fn two_limb_between(
     nullability: Nullability,
     ctx: &mut ExecutionCtx,
 ) -> VortexResult<ArrayRef> {
-    let high = arr.msp().clone().execute::<PrimitiveArray>(ctx)?;
-    let low = arr
-        .lower()
-        .vortex_expect("two-limb path requires a lower limb")
-        .clone()
-        .execute::<PrimitiveArray>(ctx)?;
-
+    let (high, low) = materialize_limbs(arr, ctx)?;
     let validity = high.validity()?.union_nullability(nullability);
     let high = high.as_slice::<i64>();
     let low = low.as_slice::<u64>();
@@ -162,25 +161,11 @@ fn collect_two_limb(
         // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len().
         let hi = unsafe { *high.get_unchecked(idx) };
         let lo = unsafe { *low.get_unchecked(idx) };
-        // Sign-extend the high limb, zero-extend the low limb: value = (hi << 64) | lo.
-        let value = ((hi as i128) << 64) | (lo as i128);
+        let value = reconstruct(hi, lo);
         lower_cmp(value, lower) & upper_cmp(value, upper)
     })
 }
 
-fn i128_ge(a: i128, b: i128) -> bool {
-    a >= b
-}
-fn i128_gt(a: i128, b: i128) -> bool {
-    a > b
-}
-fn i128_le(a: i128, b: i128) -> bool {
-    a <= b
-}
-fn i128_lt(a: i128, b: i128) -> bool {
-    a < b
-}
-
 #[cfg(test)]
 mod tests {
     use rstest::rstest;
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
index 58a1135702e..9c193401123 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
@@ -7,6 +7,7 @@ use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
+use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::ConstantArray;
 use vortex_array::builtins::ArrayBuiltins;
 use vortex_array::dtype::IntegerPType;
@@ -21,12 +22,21 @@ use vortex_array::scalar::ScalarValue;
 use vortex_array::scalar_fn::fns::binary::CompareKernel;
 use vortex_array::scalar_fn::fns::operators::CompareOperator;
 use vortex_array::scalar_fn::fns::operators::Operator;
+use vortex_buffer::BitBuffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 
 use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 use crate::decimal_byte_parts::compute::compare::Sign::Positive;
+use crate::decimal_byte_parts::compute::two_limb::i128_eq;
+use crate::decimal_byte_parts::compute::two_limb::i128_ge;
+use crate::decimal_byte_parts::compute::two_limb::i128_gt;
+use crate::decimal_byte_parts::compute::two_limb::i128_le;
+use crate::decimal_byte_parts::compute::two_limb::i128_lt;
+use crate::decimal_byte_parts::compute::two_limb::i128_ne;
+use crate::decimal_byte_parts::compute::two_limb::materialize_limbs;
+use crate::decimal_byte_parts::compute::two_limb::reconstruct;
 
 impl CompareKernel for DecimalByteParts {
     fn compare(
@@ -39,19 +49,31 @@ impl CompareKernel for DecimalByteParts {
             return Ok(None);
         };
 
-        // The two-limb representation only specializes `between`; defer comparisons to canonical.
-        if lhs.lower().is_some() {
-            return Ok(None);
-        }
-
         let nullability = lhs.dtype().nullability() | rhs.dtype().nullability();
-        let scalar_type = lhs.msp().dtype().with_nullability(nullability);
 
         let rhs_decimal = rhs_const
             .as_decimal()
             .decimal_value()
             .vortex_expect("checked for null in entry func");
 
+        if lhs.lower().is_some() {
+            // Two-limb representation: reconstruct each i128 and compare directly in a single fused
+            // pass. The constant must fit in i128, which it always does for a two-limb (<= 38 digit)
+            // decimal; defer to canonical otherwise.
+            let Some(rhs_i128) = rhs_decimal.cast::<i128>() else {
+                return Ok(None);
+            };
+            return Ok(Some(two_limb_compare(
+                &lhs,
+                rhs_i128,
+                operator,
+                nullability,
+                ctx,
+            )?));
+        }
+
+        let scalar_type = lhs.msp().dtype().with_nullability(nullability);
+
         match decimal_value_wrapper_to_primitive(
             rhs_decimal,
             lhs.msp().as_primitive_typed().ptype(),
@@ -86,6 +108,49 @@ impl CompareKernel for DecimalByteParts {
     }
 }
 
+/// Compare each reconstructed i128 value against the constant `rhs` in a single fused pass. See
+/// `two_limb::reconstruct`; a native i128 compare is cheaper than a manual lexicographic
+/// decomposition and reads each limb only once.
+fn two_limb_compare(
+    lhs: &ArrayView<'_, DecimalByteParts>,
+    rhs: i128,
+    operator: CompareOperator,
+    nullability: Nullability,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<ArrayRef> {
+    let (high, low) = materialize_limbs(lhs, ctx)?;
+    let validity = high.validity()?.union_nullability(nullability);
+    let high = high.as_slice::<i64>();
+    let low = low.as_slice::<u64>();
+    assert_eq!(high.len(), low.len(), "limb lengths must match");
+
+    let bits = match operator {
+        CompareOperator::Eq => collect_compare(high, low, rhs, i128_eq),
+        CompareOperator::NotEq => collect_compare(high, low, rhs, i128_ne),
+        CompareOperator::Gt => collect_compare(high, low, rhs, i128_gt),
+        CompareOperator::Gte => collect_compare(high, low, rhs, i128_ge),
+        CompareOperator::Lt => collect_compare(high, low, rhs, i128_lt),
+        CompareOperator::Lte => collect_compare(high, low, rhs, i128_le),
+    };
+
+    Ok(BoolArray::new(bits, validity).into_array())
+}
+
+fn collect_compare(
+    high: &[i64],
+    low: &[u64],
+    rhs: i128,
+    cmp: impl Fn(i128, i128) -> bool,
+) -> BitBuffer {
+    BitBuffer::collect_bool(high.len(), |idx| {
+        // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len().
+        let hi = unsafe { *high.get_unchecked(idx) };
+        let lo = unsafe { *low.get_unchecked(idx) };
+        let value = reconstruct(hi, lo);
+        cmp(value, rhs)
+    })
+}
+
 // Used to represent the overflow direction when trying to
 // convert into the scalar type.
 #[derive(Debug)]
@@ -152,6 +217,7 @@ where
 
 #[cfg(test)]
 mod tests {
+    use vortex_array::ArrayRef;
     use vortex_array::IntoArray;
     use vortex_array::arrays::BoolArray;
     use vortex_array::arrays::ConstantArray;
@@ -165,11 +231,84 @@ mod tests {
     use vortex_array::scalar::Scalar;
     use vortex_array::scalar_fn::fns::operators::Operator;
     use vortex_array::validity::Validity;
+    use vortex_buffer::Buffer;
     use vortex_buffer::buffer;
     use vortex_error::VortexResult;
 
     use crate::DecimalByteParts;
 
+    #[allow(clippy::cast_possible_truncation)]
+    fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> ArrayRef {
+        let highs: Buffer<i64> = values.iter().map(|v| (v >> 64) as i64).collect();
+        let lows: Buffer<u64> = values.iter().map(|v| *v as u64).collect();
+        DecimalByteParts::try_new_with_lower(
+            PrimitiveArray::new(highs, validity).into_array(),
+            PrimitiveArray::new(lows, Validity::NonNullable).into_array(),
+            dt,
+        )
+        .unwrap()
+        .into_array()
+    }
+
+    #[test]
+    fn two_limb_compare_lt() -> VortexResult<()> {
+        let dt = DecimalDType::new(38, 0);
+        // 0, 2^64, 2^64 + 5, -(2^64): straddle the constant 2^64 across both limbs.
+        let arr = two_limb(
+            &[0, 1i128 << 64, (1i128 << 64) | 5, -(1i128 << 64)],
+            Validity::AllValid,
+            dt,
+        );
+        let rhs = ConstantArray::new(
+            Scalar::decimal(
+                DecimalValue::I128(1i128 << 64),
+                dt,
+                Nullability::NonNullable,
+            ),
+            arr.len(),
+        )
+        .into_array();
+
+        let lt = arr.binary(rhs.clone(), Operator::Lt)?;
+        assert_arrays_eq!(
+            lt,
+            BoolArray::from_iter([Some(true), Some(false), Some(false), Some(true)])
+        );
+
+        // Gte is the complement here since all values are valid.
+        let gte = arr.binary(rhs, Operator::Gte)?;
+        assert_arrays_eq!(
+            gte,
+            BoolArray::from_iter([Some(false), Some(true), Some(true), Some(false)])
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn two_limb_compare_lt_nullable() -> VortexResult<()> {
+        let dt = DecimalDType::new(38, 0);
+        let arr = two_limb(
+            &[0, 1i128 << 64, (1i128 << 64) | 5],
+            Validity::Array(BoolArray::from_iter([true, false, true]).into_array()),
+            dt,
+        );
+        let rhs = ConstantArray::new(
+            Scalar::decimal(
+                DecimalValue::I128(1i128 << 64),
+                dt,
+                Nullability::NonNullable,
+            ),
+            arr.len(),
+        )
+        .into_array();
+
+        let lt = arr.binary(rhs, Operator::Lt)?;
+        assert_arrays_eq!(lt, BoolArray::from_iter([Some(true), None, Some(false)]));
+
+        Ok(())
+    }
+
     #[test]
     fn compare_decimal_const() {
         let decimal_dtype = DecimalDType::new(8, 2);
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
index fca746b7eb5..44e97887da3 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
@@ -9,6 +9,7 @@ pub(crate) mod is_constant;
 pub(crate) mod kernel;
 mod mask;
 mod take;
+mod two_limb;
 
 #[cfg(test)]
 mod tests {
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
new file mode 100644
index 00000000000..91c7a160fd7
--- /dev/null
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Shared helpers for fused single-pass kernels over the two-limb (signed-high i64, unsigned-low
+//! u64) i128 representation, used by both `between` and `compare`.
+
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_error::VortexExpect;
+use vortex_error::VortexResult;
+
+use crate::DecimalByteParts;
+use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
+
+/// Materialize the high (signed `i64`) and low (unsigned `u64`) limbs of a two-limb array.
+///
+/// The caller is responsible for checking [`DecimalBytePartsArrayExt::lower`] is present before
+/// calling; this panics otherwise.
+pub(crate) fn materialize_limbs(
+    arr: &ArrayView<'_, DecimalByteParts>,
+    ctx: &mut ExecutionCtx,
+) -> VortexResult<(PrimitiveArray, PrimitiveArray)> {
+    let high = arr.msp().clone().execute::<PrimitiveArray>(ctx)?;
+    let low = arr
+        .lower()
+        .vortex_expect("two-limb path requires a lower limb")
+        .clone()
+        .execute::<PrimitiveArray>(ctx)?;
+    Ok((high, low))
+}
+
+/// Reconstruct the i128 value from its limbs: sign-extend the high limb, zero-extend the low limb,
+/// so `value = (high << 64) | low`.
+#[inline(always)]
+pub(crate) fn reconstruct(high: i64, low: u64) -> i128 {
+    ((high as i128) << 64) | (low as i128)
+}
+
+pub(crate) fn i128_eq(a: i128, b: i128) -> bool {
+    a == b
+}
+pub(crate) fn i128_ne(a: i128, b: i128) -> bool {
+    a != b
+}
+pub(crate) fn i128_ge(a: i128, b: i128) -> bool {
+    a >= b
+}
+pub(crate) fn i128_gt(a: i128, b: i128) -> bool {
+    a > b
+}
+pub(crate) fn i128_le(a: i128, b: i128) -> bool {
+    a <= b
+}
+pub(crate) fn i128_lt(a: i128, b: i128) -> bool {
+    a < b
+}

From b1a4c9444910504086a0c5d1fc422cc509a0822f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 18:04:35 +0000
Subject: [PATCH 08/13] Add decimal lt benchmark vs arrow-rs

Benchmarks single-sided `x < threshold` across the i32/i64 MSP pushdown, the
two-limb i128 kernel, the canonical i128 path, and arrow-rs cmp::lt. Unlike
between, lt is a single pass for every engine, which isolates the cost of the
two-limb reconstruct against a contiguous i128 read.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/decimal-byte-parts/Cargo.toml       |   4 +
 .../decimal-byte-parts/benches/decimal_lt.rs  | 287 ++++++++++++++++++
 2 files changed, 291 insertions(+)
 create mode 100644 encodings/decimal-byte-parts/benches/decimal_lt.rs

diff --git a/encodings/decimal-byte-parts/Cargo.toml b/encodings/decimal-byte-parts/Cargo.toml
index 549c0b05e7c..783fc815aa0 100644
--- a/encodings/decimal-byte-parts/Cargo.toml
+++ b/encodings/decimal-byte-parts/Cargo.toml
@@ -37,3 +37,7 @@ vortex-array = { path = "../../vortex-array", features = ["_test-harness"] }
 [[bench]]
 name = "decimal_between"
 harness = false
+
+[[bench]]
+name = "decimal_lt"
+harness = false
diff --git a/encodings/decimal-byte-parts/benches/decimal_lt.rs b/encodings/decimal-byte-parts/benches/decimal_lt.rs
new file mode 100644
index 00000000000..037c8800526
--- /dev/null
+++ b/encodings/decimal-byte-parts/benches/decimal_lt.rs
@@ -0,0 +1,287 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Compares `x < threshold` (less-than against a constant) over decimal data for:
+//!   * Vortex `DecimalByteParts` with an i32 most-significant-part (pushdown kernel),
+//!   * Vortex `DecimalByteParts` with an i64 most-significant-part (pushdown kernel),
+//!   * Vortex `DecimalByteParts` two-limb i128 (signed-high / unsigned-low limbs, fused kernel),
+//!   * Vortex canonical `DecimalArray` (i128 storage),
+//!   * arrow-rs `Decimal128Array` via `cmp::lt`.
+//!
+//! Unlike `between`, arrow evaluates `lt` in a single pass, so this isolates a one-sided
+//! comparison. arrow-rs has no decimal storage narrower than 128 bits, so logically-small or
+//! limb-split decimals that Vortex keeps narrower must be materialised as i128 in arrow.
+
+#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)]
+
+use arrow_array::Decimal128Array;
+use arrow_array::Scalar as ArrowScalar;
+use arrow_ord::cmp;
+use divan::Bencher;
+use divan::black_box;
+use rand::RngExt;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+use vortex_array::IntoArray;
+use vortex_array::LEGACY_SESSION;
+use vortex_array::VortexSessionExecute;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::arrays::DecimalArray;
+use vortex_array::arrays::PrimitiveArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::dtype::DecimalDType;
+use vortex_array::dtype::Nullability;
+use vortex_array::scalar::DecimalValue;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::fns::operators::Operator;
+use vortex_array::validity::Validity;
+use vortex_decimal_byte_parts::DecimalByteParts;
+
+fn main() {
+    divan::main();
+}
+
+const LENGTHS: &[usize] = &[1 << 16, 1 << 17];
+
+// Logical decimal range [0, 1000); threshold in the middle so ~half the rows pass.
+const THRESHOLD: i64 = 500;
+
+fn values(len: usize) -> Vec<i64> {
+    let mut rng = StdRng::seed_from_u64(0x5eed);
+    (0..len).map(|_| rng.random_range(0..1000i64)).collect()
+}
+
+// ---- Vortex DecimalByteParts (i32 MSP) ----
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_byteparts_i32(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(9, 2);
+    let msp = PrimitiveArray::from_iter(values(len).into_iter().map(|v| v as i32)).into_array();
+    let arr = DecimalByteParts::try_new(msp, dt).unwrap().into_array();
+    let rhs = ConstantArray::new(
+        Scalar::decimal(
+            DecimalValue::I32(THRESHOLD as i32),
+            dt,
+            Nullability::NonNullable,
+        ),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                rhs.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, rhs, mut ctx)| {
+            black_box(
+                arr.binary(rhs, Operator::Lt)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+// ---- Vortex DecimalByteParts (i64 MSP) ----
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_byteparts_i64(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(18, 2);
+    let msp = PrimitiveArray::from_iter(values(len)).into_array();
+    let arr = DecimalByteParts::try_new(msp, dt).unwrap().into_array();
+    let rhs = ConstantArray::new(
+        Scalar::decimal(DecimalValue::I64(THRESHOLD), dt, Nullability::NonNullable),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                rhs.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, rhs, mut ctx)| {
+            black_box(
+                arr.binary(rhs, Operator::Lt)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+// ---- Vortex canonical DecimalArray (i128 storage) ----
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_canonical_i128(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(9, 2);
+    let arr = DecimalArray::new(
+        values(len).into_iter().map(i128::from).collect(),
+        dt,
+        Validity::NonNullable,
+    )
+    .into_array();
+    let rhs = ConstantArray::new(
+        Scalar::decimal(
+            DecimalValue::I128(THRESHOLD as i128),
+            dt,
+            Nullability::NonNullable,
+        ),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                rhs.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, rhs, mut ctx)| {
+            black_box(
+                arr.binary(rhs, Operator::Lt)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+// ---- arrow-rs Decimal128 (cmp::lt) ----
+
+#[divan::bench(args = LENGTHS)]
+fn arrow_decimal128(bencher: Bencher, len: usize) {
+    let arr = Decimal128Array::from_iter_values(values(len).into_iter().map(i128::from))
+        .with_precision_and_scale(9, 2)
+        .unwrap();
+    let rhs = ArrowScalar::new(
+        Decimal128Array::from_iter_values([THRESHOLD as i128])
+            .with_precision_and_scale(9, 2)
+            .unwrap(),
+    );
+
+    bencher
+        .with_inputs(|| (arr.clone(), rhs.clone()))
+        .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap()));
+}
+
+// ---- Wide i128 decimals: two-limb vs canonical i128 vs arrow Decimal128 ----
+//
+// These values genuinely occupy the i128 range (the high 64-bit limb varies), so neither Vortex
+// nor arrow can keep them in a narrow integer. The two-limb representation splits each value into a
+// signed i64 high limb and an unsigned u64 low limb, letting `lt` reconstruct the i128 once and run
+// a single native comparison.
+
+fn wide_values(len: usize) -> Vec<i128> {
+    let mut rng = StdRng::seed_from_u64(0x5eed);
+    (0..len)
+        .map(|_| {
+            let high = i128::from(rng.random_range(0..1000i64));
+            let low = i128::from(rng.random_range(0..u64::MAX));
+            (high << 64) | low
+        })
+        .collect()
+}
+
+// Threshold with a non-zero low limb so the low-limb tie-break is exercised at the high-limb edge.
+const WIDE_THRESHOLD: i128 = (500i128 << 64) | 0x90ab_cdef;
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_byteparts_twolimb(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(38, 2);
+    let values = wide_values(len);
+    let highs = PrimitiveArray::from_iter(values.iter().map(|v| (v >> 64) as i64)).into_array();
+    let lows = PrimitiveArray::from_iter(values.iter().map(|v| *v as u64)).into_array();
+    let arr = DecimalByteParts::try_new_with_lower(highs, lows, dt)
+        .unwrap()
+        .into_array();
+    let rhs = ConstantArray::new(
+        Scalar::decimal(
+            DecimalValue::I128(WIDE_THRESHOLD),
+            dt,
+            Nullability::NonNullable,
+        ),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                rhs.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, rhs, mut ctx)| {
+            black_box(
+                arr.binary(rhs, Operator::Lt)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+#[divan::bench(args = LENGTHS)]
+fn vortex_canonical_i128_wide(bencher: Bencher, len: usize) {
+    let dt = DecimalDType::new(38, 2);
+    let arr = DecimalArray::new(
+        wide_values(len).into_iter().collect(),
+        dt,
+        Validity::NonNullable,
+    )
+    .into_array();
+    let rhs = ConstantArray::new(
+        Scalar::decimal(
+            DecimalValue::I128(WIDE_THRESHOLD),
+            dt,
+            Nullability::NonNullable,
+        ),
+        len,
+    )
+    .into_array();
+
+    bencher
+        .with_inputs(|| {
+            (
+                arr.clone(),
+                rhs.clone(),
+                LEGACY_SESSION.create_execution_ctx(),
+            )
+        })
+        .bench_values(|(arr, rhs, mut ctx)| {
+            black_box(
+                arr.binary(rhs, Operator::Lt)
+                    .unwrap()
+                    .execute::<BoolArray>(&mut ctx)
+                    .unwrap(),
+            )
+        });
+}
+
+#[divan::bench(args = LENGTHS)]
+fn arrow_decimal128_wide(bencher: Bencher, len: usize) {
+    let arr = Decimal128Array::from_iter_values(wide_values(len))
+        .with_precision_and_scale(38, 2)
+        .unwrap();
+    let rhs = ArrowScalar::new(
+        Decimal128Array::from_iter_values([WIDE_THRESHOLD])
+            .with_precision_and_scale(38, 2)
+            .unwrap(),
+    );
+
+    bencher
+        .with_inputs(|| (arr.clone(), rhs.clone()))
+        .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap()));
+}

From 5455f13877442f1df0eb48ff023f355da80c3198 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 18:11:56 +0000
Subject: [PATCH 09/13] Add limb-SIMD microbenchmark: AVX-512 limb compare vs
 arrow i128

Isolates the kernel cost of wide-i128 `lt` with no Vortex expression overhead.
arrow's i128 has no vector compare on any x86, so it is inherently scalar
(raw_i128_scalar tracks arrow). The (i64 high, u64 low) limbs are native SIMD
widths: an AVX-512 kernel compares 8 lanes with vpcmpq/vpcmpuq straight to a
__mmask8, which is exactly the packed-bit output, beating arrow ~1.4-2.1x at
the measured sizes. The AVX-512 output is cross-checked against the scalar
reference before timing.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/decimal-byte-parts/Cargo.toml       |   4 +
 .../decimal-byte-parts/benches/limb_simd.rs   | 188 ++++++++++++++++++
 2 files changed, 192 insertions(+)
 create mode 100644 encodings/decimal-byte-parts/benches/limb_simd.rs

diff --git a/encodings/decimal-byte-parts/Cargo.toml b/encodings/decimal-byte-parts/Cargo.toml
index 783fc815aa0..d35378ea179 100644
--- a/encodings/decimal-byte-parts/Cargo.toml
+++ b/encodings/decimal-byte-parts/Cargo.toml
@@ -41,3 +41,7 @@ harness = false
 [[bench]]
 name = "decimal_lt"
 harness = false
+
+[[bench]]
+name = "limb_simd"
+harness = false
diff --git a/encodings/decimal-byte-parts/benches/limb_simd.rs b/encodings/decimal-byte-parts/benches/limb_simd.rs
new file mode 100644
index 00000000000..889f12e17f1
--- /dev/null
+++ b/encodings/decimal-byte-parts/benches/limb_simd.rs
@@ -0,0 +1,188 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+//! Microbenchmark isolating the *kernel* cost of `x < threshold` over wide i128 decimals, with no
+//! Vortex expression-execution overhead, to answer: can comparing the (signed-high i64, unsigned-low
+//! u64) limb pair beat arrow's i128 comparison?
+//!
+//! arrow's i128 has no SIMD compare on any x86 (there is no 128-bit-integer vector comparison), so
+//! it is inherently scalar. The limbs are native i64/u64, which AVX-512 compares 8-wide with
+//! `vpcmpq`/`vpcmpuq`, producing a mask register that is exactly the packed-bit output we want.
+//!
+//! Kernels:
+//!   * `arrow_cmp_lt`      arrow-rs `cmp::lt` over a `Decimal128Array` (reference).
+//!   * `raw_i128_scalar`   read one contiguous i128 slice, scalar compare, serial bit-pack.
+//!   * `raw_limb_scalar`   read the two limb slices, lexicographic compare, serial bit-pack.
+//!   * `raw_limb_avx512`   read the two limb slices, AVX-512 limb compare straight to a mask byte.
+
+#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)]
+
+use arrow_array::Decimal128Array;
+use arrow_array::Scalar as ArrowScalar;
+use arrow_ord::cmp;
+use divan::Bencher;
+use divan::black_box;
+use rand::RngExt;
+use rand::SeedableRng;
+use rand::rngs::StdRng;
+
+fn main() {
+    divan::main();
+}
+
+const LENGTHS: &[usize] = &[1 << 16, 1 << 17, 1 << 20];
+
+const THRESHOLD: i128 = (500i128 << 64) | 0x90ab_cdef;
+
+fn wide_values(len: usize) -> Vec<i128> {
+    let mut rng = StdRng::seed_from_u64(0x5eed);
+    (0..len)
+        .map(|_| {
+            let high = i128::from(rng.random_range(0..1000i64));
+            let low = i128::from(rng.random_range(0..u64::MAX));
+            (high << 64) | low
+        })
+        .collect()
+}
+
+fn limbs(values: &[i128]) -> (Vec<i64>, Vec<u64>) {
+    (
+        values.iter().map(|v| (v >> 64) as i64).collect(),
+        values.iter().map(|v| *v as u64).collect(),
+    )
+}
+
+// ---- arrow-rs reference ----
+
+#[divan::bench(args = LENGTHS)]
+fn arrow_cmp_lt(bencher: Bencher, len: usize) {
+    let arr = Decimal128Array::from_iter_values(wide_values(len))
+        .with_precision_and_scale(38, 2)
+        .unwrap();
+    let rhs = ArrowScalar::new(
+        Decimal128Array::from_iter_values([THRESHOLD])
+            .with_precision_and_scale(38, 2)
+            .unwrap(),
+    );
+    bencher
+        .with_inputs(|| (arr.clone(), rhs.clone()))
+        .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap()));
+}
+
+// ---- scalar i128 (one contiguous read) ----
+
+fn lt_i128_scalar(vals: &[i128], th: i128, out: &mut [u64]) {
+    for (w, chunk) in vals.chunks(64).enumerate() {
+        let mut packed = 0u64;
+        for (b, &v) in chunk.iter().enumerate() {
+            packed |= u64::from(v < th) << b;
+        }
+        out[w] = packed;
+    }
+}
+
+#[divan::bench(args = LENGTHS)]
+fn raw_i128_scalar(bencher: Bencher, len: usize) {
+    let vals = wide_values(len);
+    let mut out = vec![0u64; len.div_ceil(64)];
+    bencher.bench_local(|| {
+        lt_i128_scalar(black_box(&vals), black_box(THRESHOLD), &mut out);
+        black_box(&out);
+    });
+}
+
+// ---- scalar limb (lexicographic) ----
+
+fn lt_limb_scalar(hi: &[i64], lo: &[u64], th_hi: i64, th_lo: u64, out: &mut [u64]) {
+    for w in 0..hi.len().div_ceil(64) {
+        let base = w * 64;
+        let end = (base + 64).min(hi.len());
+        let mut packed = 0u64;
+        for i in base..end {
+            let lt = (hi[i] < th_hi) | ((hi[i] == th_hi) & (lo[i] < th_lo));
+            packed |= u64::from(lt) << (i - base);
+        }
+        out[w] = packed;
+    }
+}
+
+#[divan::bench(args = LENGTHS)]
+fn raw_limb_scalar(bencher: Bencher, len: usize) {
+    let (hi, lo) = limbs(&wide_values(len));
+    let mut out = vec![0u64; len.div_ceil(64)];
+    let th_hi = (THRESHOLD >> 64) as i64;
+    let th_lo = THRESHOLD as u64;
+    bencher.bench_local(|| {
+        lt_limb_scalar(black_box(&hi), black_box(&lo), th_hi, th_lo, &mut out);
+        black_box(&out);
+    });
+}
+
+// ---- AVX-512 limb (8-wide compare straight to mask bytes) ----
+
+#[cfg(target_arch = "x86_64")]
+#[target_feature(enable = "avx512f")]
+unsafe fn lt_limb_avx512(hi: &[i64], lo: &[u64], th_hi: i64, th_lo: u64, out: &mut [u8]) {
+    use std::arch::x86_64::_mm512_cmpeq_epi64_mask;
+    use std::arch::x86_64::_mm512_cmplt_epi64_mask;
+    use std::arch::x86_64::_mm512_cmplt_epu64_mask;
+    use std::arch::x86_64::_mm512_loadu_epi64;
+    use std::arch::x86_64::_mm512_set1_epi64;
+
+    let n = hi.len();
+    let chunks = n / 8;
+    // SAFETY: this fn is only called when avx512f is detected; all intrinsics/loads below are in
+    // bounds because `chunks * 8 <= n` and `out.len() >= n.div_ceil(8)`.
+    unsafe {
+        let th_hi_v = _mm512_set1_epi64(th_hi);
+        let th_lo_v = _mm512_set1_epi64(th_lo as i64);
+
+        for chunk in 0..chunks {
+            let hv = _mm512_loadu_epi64(hi.as_ptr().add(chunk * 8));
+            let lv = _mm512_loadu_epi64(lo.as_ptr().add(chunk * 8).cast());
+            let hi_lt = _mm512_cmplt_epi64_mask(hv, th_hi_v);
+            let hi_eq = _mm512_cmpeq_epi64_mask(hv, th_hi_v);
+            let lo_lt = _mm512_cmplt_epu64_mask(lv, th_lo_v);
+            // mask = hi < th_hi | (hi == th_hi & lo < th_lo); each is a __mmask8 (one bit/lane).
+            out[chunk] = hi_lt | (hi_eq & lo_lt);
+        }
+    }
+
+    // scalar tail
+    for i in (chunks * 8)..n {
+        let lt = (hi[i] < th_hi) | ((hi[i] == th_hi) & (lo[i] < th_lo));
+        let byte = &mut out[i / 8];
+        *byte |= u8::from(lt) << (i % 8);
+    }
+}
+
+#[divan::bench(args = LENGTHS)]
+fn raw_limb_avx512(bencher: Bencher, len: usize) {
+    if !is_x86_feature_detected!("avx512f") {
+        return;
+    }
+    let (hi, lo) = limbs(&wide_values(len));
+    let mut out = vec![0u8; len.div_ceil(8)];
+    let th_hi = (THRESHOLD >> 64) as i64;
+    let th_lo = THRESHOLD as u64;
+
+    // Cross-check the AVX-512 output against the scalar reference so the timing is for a correct
+    // kernel. The u8 mask bytes alias the u64 words bit-for-bit (lane i -> bit i).
+    unsafe { lt_limb_avx512(&hi, &lo, th_hi, th_lo, &mut out) };
+    let mut reference = vec![0u64; len.div_ceil(64)];
+    lt_limb_scalar(&hi, &lo, th_hi, th_lo, &mut reference);
+    for (i, &byte) in out.iter().enumerate() {
+        assert_eq!(
+            byte,
+            (reference[i / 8] >> ((i % 8) * 8)) as u8,
+            "avx512 mask mismatch at byte {i}"
+        );
+    }
+    out.iter_mut().for_each(|b| *b = 0);
+
+    bencher.bench_local(|| {
+        // SAFETY: guarded by the avx512f feature check above.
+        unsafe { lt_limb_avx512(black_box(&hi), black_box(&lo), th_hi, th_lo, &mut out) };
+        black_box(&out);
+    });
+}

From c542c5280ad7cc6e92250096eda135cd6a78c664 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 18:37:20 +0000
Subject: [PATCH 10/13] Add AVX-512 limb kernel for two-limb decimal compare
 and between

The two-limb compare/between kernels now dispatch to an AVX-512 path when the
host supports it, falling back to the scalar i128-reconstruct path otherwise.
The AVX-512 kernel compares 8 lanes of the signed-high i64 and unsigned-low u64
limbs with vpcmpq/vpcmpuq, combining them lexicographically into a __mmask8 that
is written directly as one byte of the output bitmap, with no serial bit-packing.
arrow's i128 has no vector-compare form on x86, so this is a comparison the
limb representation can vectorize and arrow cannot.

Dispatch follows the existing take/avx2 pattern (is_x86_feature_detected gate).
Operators are monomorphized via const generics so the hot loop carries no
operator branch. Correctness is validated against the canonical implementation
including a 99-element (non-multiple-of-8) two-limb case that exercises both the
vectorized main loop and the scalar tail.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/decimal_byte_parts/compute/between.rs |  54 +--
 .../src/decimal_byte_parts/compute/compare.rs |  39 +--
 .../src/decimal_byte_parts/compute/mod.rs     |  23 ++
 .../decimal_byte_parts/compute/two_limb.rs    | 329 +++++++++++++++++-
 4 files changed, 341 insertions(+), 104 deletions(-)

diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
index 040b6cde847..69033f12094 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
@@ -12,20 +12,14 @@ use vortex_array::dtype::Nullability;
 use vortex_array::scalar::Scalar;
 use vortex_array::scalar_fn::fns::between::BetweenKernel;
 use vortex_array::scalar_fn::fns::between::BetweenOptions;
-use vortex_array::scalar_fn::fns::between::StrictComparison;
-use vortex_buffer::BitBuffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 
 use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive;
-use crate::decimal_byte_parts::compute::two_limb::i128_ge;
-use crate::decimal_byte_parts::compute::two_limb::i128_gt;
-use crate::decimal_byte_parts::compute::two_limb::i128_le;
-use crate::decimal_byte_parts::compute::two_limb::i128_lt;
+use crate::decimal_byte_parts::compute::two_limb::between_bits;
 use crate::decimal_byte_parts::compute::two_limb::materialize_limbs;
-use crate::decimal_byte_parts::compute::two_limb::reconstruct;
 
 impl BetweenKernel for DecimalByteParts {
     fn between(
@@ -105,13 +99,7 @@ impl BetweenKernel for DecimalByteParts {
 }
 
 /// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation in a
-/// single fused pass.
-///
-/// With high limb `H` (signed i64) and low limb `L` (unsigned u64), the value `v = H<<64 | L`
-/// satisfies `v >= lower` iff `H > lo_h OR (H == lo_h AND L >=' lo_l)` and `v <= upper` iff
-/// `H < hi_h OR (H == hi_h AND L <=' hi_l)`, where `>='`/`<='` are strict when requested. Each
-/// row resolves to native-width integer comparisons in one loop, which vectorizes far better than
-/// arrow's 128-bit comparison or a tree of generic array operations with intermediate allocations.
+/// single fused pass, dispatching to the AVX-512 limb kernel when available (see [`between_bits`]).
 fn two_limb_between(
     arr: &ArrayView<'_, DecimalByteParts>,
     lower: i128,
@@ -126,46 +114,10 @@ fn two_limb_between(
     let low = low.as_slice::<u64>();
     assert_eq!(high.len(), low.len(), "limb lengths must match");
 
-    // Pass the comparison as a monomorphized fn so the whole per-element body inlines.
-    let bits = match (options.lower_strict, options.upper_strict) {
-        (StrictComparison::Strict, StrictComparison::Strict) => {
-            collect_two_limb(high, low, lower, i128_gt, upper, i128_lt)
-        }
-        (StrictComparison::Strict, StrictComparison::NonStrict) => {
-            collect_two_limb(high, low, lower, i128_gt, upper, i128_le)
-        }
-        (StrictComparison::NonStrict, StrictComparison::Strict) => {
-            collect_two_limb(high, low, lower, i128_ge, upper, i128_lt)
-        }
-        (StrictComparison::NonStrict, StrictComparison::NonStrict) => {
-            collect_two_limb(high, low, lower, i128_ge, upper, i128_le)
-        }
-    };
-
+    let bits = between_bits(high, low, lower, upper, options);
     Ok(BoolArray::new(bits, validity).into_array())
 }
 
-/// The fused per-element loop. Reconstruct the i128 from its signed-high / unsigned-low limbs and
-/// compare against the bounds. A native i128 comparison is ~2 instructions, cheaper than a manual
-/// lexicographic decomposition, and the single pass reads each limb once (arrow reads its i128
-/// array twice, once per `gt_eq`/`lt_eq`). Bitwise `&` keeps the body branch-free.
-fn collect_two_limb(
-    high: &[i64],
-    low: &[u64],
-    lower: i128,
-    lower_cmp: impl Fn(i128, i128) -> bool,
-    upper: i128,
-    upper_cmp: impl Fn(i128, i128) -> bool,
-) -> BitBuffer {
-    BitBuffer::collect_bool(high.len(), |idx| {
-        // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len().
-        let hi = unsafe { *high.get_unchecked(idx) };
-        let lo = unsafe { *low.get_unchecked(idx) };
-        let value = reconstruct(hi, lo);
-        lower_cmp(value, lower) & upper_cmp(value, upper)
-    })
-}
-
 #[cfg(test)]
 mod tests {
     use rstest::rstest;
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
index 9c193401123..7ebc50e7ac0 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
@@ -22,21 +22,14 @@ use vortex_array::scalar::ScalarValue;
 use vortex_array::scalar_fn::fns::binary::CompareKernel;
 use vortex_array::scalar_fn::fns::operators::CompareOperator;
 use vortex_array::scalar_fn::fns::operators::Operator;
-use vortex_buffer::BitBuffer;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 
 use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 use crate::decimal_byte_parts::compute::compare::Sign::Positive;
-use crate::decimal_byte_parts::compute::two_limb::i128_eq;
-use crate::decimal_byte_parts::compute::two_limb::i128_ge;
-use crate::decimal_byte_parts::compute::two_limb::i128_gt;
-use crate::decimal_byte_parts::compute::two_limb::i128_le;
-use crate::decimal_byte_parts::compute::two_limb::i128_lt;
-use crate::decimal_byte_parts::compute::two_limb::i128_ne;
+use crate::decimal_byte_parts::compute::two_limb::compare_bits;
 use crate::decimal_byte_parts::compute::two_limb::materialize_limbs;
-use crate::decimal_byte_parts::compute::two_limb::reconstruct;
 
 impl CompareKernel for DecimalByteParts {
     fn compare(
@@ -108,9 +101,8 @@ impl CompareKernel for DecimalByteParts {
     }
 }
 
-/// Compare each reconstructed i128 value against the constant `rhs` in a single fused pass. See
-/// `two_limb::reconstruct`; a native i128 compare is cheaper than a manual lexicographic
-/// decomposition and reads each limb only once.
+/// Compare the two-limb array against the constant `rhs` in a single fused pass, dispatching to the
+/// AVX-512 limb kernel when available (see [`compare_bits`]).
 fn two_limb_compare(
     lhs: &ArrayView<'_, DecimalByteParts>,
     rhs: i128,
@@ -124,33 +116,10 @@ fn two_limb_compare(
     let low = low.as_slice::<u64>();
     assert_eq!(high.len(), low.len(), "limb lengths must match");
 
-    let bits = match operator {
-        CompareOperator::Eq => collect_compare(high, low, rhs, i128_eq),
-        CompareOperator::NotEq => collect_compare(high, low, rhs, i128_ne),
-        CompareOperator::Gt => collect_compare(high, low, rhs, i128_gt),
-        CompareOperator::Gte => collect_compare(high, low, rhs, i128_ge),
-        CompareOperator::Lt => collect_compare(high, low, rhs, i128_lt),
-        CompareOperator::Lte => collect_compare(high, low, rhs, i128_le),
-    };
-
+    let bits = compare_bits(high, low, rhs, operator);
     Ok(BoolArray::new(bits, validity).into_array())
 }
 
-fn collect_compare(
-    high: &[i64],
-    low: &[u64],
-    rhs: i128,
-    cmp: impl Fn(i128, i128) -> bool,
-) -> BitBuffer {
-    BitBuffer::collect_bool(high.len(), |idx| {
-        // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len().
-        let hi = unsafe { *high.get_unchecked(idx) };
-        let lo = unsafe { *low.get_unchecked(idx) };
-        let value = reconstruct(hi, lo);
-        cmp(value, rhs)
-    })
-}
-
 // Used to represent the overflow direction when trying to
 // convert into the scalar type.
 #[derive(Debug)]
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
index 44e97887da3..5ea11ead376 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
@@ -26,6 +26,15 @@ mod tests {
     use crate::DecimalByteParts;
     use crate::DecimalBytePartsArray;
 
+    // 99 i128 values whose high (i64) limb varies in sign and whose low (u64) limb spans the full
+    // unsigned range, so limb tie-breaks and sign handling are covered across SIMD chunks.
+    #[allow(clippy::cast_possible_truncation)]
+    fn large_two_limb_values() -> Vec<i128> {
+        (0..99i128)
+            .map(|i| ((i - 50) << 64) | i128::from((i as u64).wrapping_mul(0x0123_4567_89ab_cdef)))
+            .collect()
+    }
+
     #[allow(clippy::cast_possible_truncation)]
     fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> DecimalBytePartsArray {
         let highs: Buffer<i64> = values.iter().map(|v| (v >> 64) as i64).collect();
@@ -95,6 +104,20 @@ mod tests {
         Validity::Array(BoolArray::from_iter([true, false, true, true, false]).into_array()),
         DecimalDType::new(38, 2),
     ))]
+    // 99 values (not a multiple of 8) so the AVX-512 limb kernel's vectorized main loop *and* its
+    // scalar tail are both exercised when validating compare/between against canonical.
+    #[case::two_limb_large(two_limb(
+        &large_two_limb_values(),
+        Validity::NonNullable,
+        DecimalDType::new(38, 2),
+    ))]
+    #[case::two_limb_large_nullable(two_limb(
+        &large_two_limb_values(),
+        Validity::Array(
+            BoolArray::from_iter((0..99).map(|i| i % 3 != 0)).into_array(),
+        ),
+        DecimalDType::new(38, 2),
+    ))]
 
     fn test_decimal_byte_parts_consistency(#[case] array: DecimalBytePartsArray) {
         test_array_consistency(&array.into_array());
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
index 91c7a160fd7..2a4ceea359f 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
@@ -1,12 +1,23 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
-//! Shared helpers for fused single-pass kernels over the two-limb (signed-high i64, unsigned-low
-//! u64) i128 representation, used by both `between` and `compare`.
+//! Fused single-pass kernels over the two-limb (signed-high `i64`, unsigned-low `u64`) i128
+//! representation, shared by `between` and `compare`.
+//!
+//! arrow's i128 comparison has no SIMD form on any x86 (there is no 128-bit-integer vector
+//! compare), so it is inherently scalar. The limbs, by contrast, are native widths: on AVX-512 we
+//! compare 8 lanes of `i64`/`u64` with `vpcmpq`/`vpcmpuq`, each producing a `__mmask8` that is
+//! exactly one byte of the output bitmap — no serial bit-packing. A scalar reconstruct path (which
+//! is itself competitive with arrow) is used when AVX-512 is unavailable.
 
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::scalar_fn::fns::between::BetweenOptions;
+use vortex_array::scalar_fn::fns::between::StrictComparison;
+use vortex_array::scalar_fn::fns::operators::CompareOperator;
+use vortex_buffer::BitBuffer;
+use vortex_buffer::BufferMut;
 use vortex_error::VortexExpect;
 use vortex_error::VortexResult;
 
@@ -15,8 +26,8 @@ use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 
 /// Materialize the high (signed `i64`) and low (unsigned `u64`) limbs of a two-limb array.
 ///
-/// The caller is responsible for checking [`DecimalBytePartsArrayExt::lower`] is present before
-/// calling; this panics otherwise.
+/// The caller must check [`DecimalBytePartsArrayExt::lower`] is present first; this panics
+/// otherwise.
 pub(crate) fn materialize_limbs(
     arr: &ArrayView<'_, DecimalByteParts>,
     ctx: &mut ExecutionCtx,
@@ -30,28 +41,310 @@ pub(crate) fn materialize_limbs(
     Ok((high, low))
 }
 
-/// Reconstruct the i128 value from its limbs: sign-extend the high limb, zero-extend the low limb,
-/// so `value = (high << 64) | low`.
+/// Reconstruct the i128 value from its limbs: sign-extend the high limb, zero-extend the low limb.
 #[inline(always)]
 pub(crate) fn reconstruct(high: i64, low: u64) -> i128 {
     ((high as i128) << 64) | (low as i128)
 }
 
-pub(crate) fn i128_eq(a: i128, b: i128) -> bool {
-    a == b
+// Operator codes, used as const generics to monomorphize the comparison out of the hot loop.
+const OP_LT: u8 = 0;
+const OP_LE: u8 = 1;
+const OP_GT: u8 = 2;
+const OP_GE: u8 = 3;
+const OP_EQ: u8 = 4;
+const OP_NE: u8 = 5;
+
+#[inline(always)]
+fn op_code(op: CompareOperator) -> u8 {
+    match op {
+        CompareOperator::Lt => OP_LT,
+        CompareOperator::Lte => OP_LE,
+        CompareOperator::Gt => OP_GT,
+        CompareOperator::Gte => OP_GE,
+        CompareOperator::Eq => OP_EQ,
+        CompareOperator::NotEq => OP_NE,
+    }
+}
+
+#[inline(always)]
+fn cmp_i128<const OP: u8>(value: i128, bound: i128) -> bool {
+    match OP {
+        OP_LT => value < bound,
+        OP_LE => value <= bound,
+        OP_GT => value > bound,
+        OP_GE => value >= bound,
+        OP_EQ => value == bound,
+        OP_NE => value != bound,
+        _ => unreachable!(),
+    }
+}
+
+/// `value <op> bound` over the two-limb representation, returned as a packed [`BitBuffer`].
+pub(crate) fn compare_bits(
+    high: &[i64],
+    low: &[u64],
+    bound: i128,
+    op: CompareOperator,
+) -> BitBuffer {
+    debug_assert_eq!(high.len(), low.len());
+    let code = op_code(op);
+
+    #[cfg(target_arch = "x86_64")]
+    if std::is_x86_feature_detected!("avx512f") {
+        let mut bytes = BufferMut::<u8>::zeroed(high.len().div_ceil(8));
+        // SAFETY: avx512f is detected, and `bytes` has `len.div_ceil(8)` bytes.
+        unsafe { avx512::compare(high, low, bound, code, bytes.as_mut_slice()) };
+        return BitBuffer::new(bytes.freeze(), high.len());
+    }
+
+    compare_scalar(high, low, bound, code)
 }
-pub(crate) fn i128_ne(a: i128, b: i128) -> bool {
-    a != b
+
+/// `lower <= value <= upper` (respecting strictness) over the two-limb representation, returned as
+/// a packed [`BitBuffer`].
+pub(crate) fn between_bits(
+    high: &[i64],
+    low: &[u64],
+    lower: i128,
+    upper: i128,
+    options: &BetweenOptions,
+) -> BitBuffer {
+    debug_assert_eq!(high.len(), low.len());
+    let lower_op = match options.lower_strict {
+        StrictComparison::Strict => OP_GT,
+        StrictComparison::NonStrict => OP_GE,
+    };
+    let upper_op = match options.upper_strict {
+        StrictComparison::Strict => OP_LT,
+        StrictComparison::NonStrict => OP_LE,
+    };
+
+    #[cfg(target_arch = "x86_64")]
+    if std::is_x86_feature_detected!("avx512f") {
+        let mut bytes = BufferMut::<u8>::zeroed(high.len().div_ceil(8));
+        // SAFETY: avx512f is detected, and `bytes` has `len.div_ceil(8)` bytes.
+        unsafe {
+            avx512::between(
+                high,
+                low,
+                lower,
+                upper,
+                lower_op,
+                upper_op,
+                bytes.as_mut_slice(),
+            )
+        };
+        return BitBuffer::new(bytes.freeze(), high.len());
+    }
+
+    between_scalar(high, low, lower, upper, lower_op, upper_op)
 }
-pub(crate) fn i128_ge(a: i128, b: i128) -> bool {
-    a >= b
+
+// ---- Scalar reconstruct fallback ----
+
+fn compare_scalar(high: &[i64], low: &[u64], bound: i128, code: u8) -> BitBuffer {
+    match code {
+        OP_LT => compare_scalar_impl::<OP_LT>(high, low, bound),
+        OP_LE => compare_scalar_impl::<OP_LE>(high, low, bound),
+        OP_GT => compare_scalar_impl::<OP_GT>(high, low, bound),
+        OP_GE => compare_scalar_impl::<OP_GE>(high, low, bound),
+        OP_EQ => compare_scalar_impl::<OP_EQ>(high, low, bound),
+        _ => compare_scalar_impl::<OP_NE>(high, low, bound),
+    }
 }
-pub(crate) fn i128_gt(a: i128, b: i128) -> bool {
-    a > b
+
+fn compare_scalar_impl<const OP: u8>(high: &[i64], low: &[u64], bound: i128) -> BitBuffer {
+    BitBuffer::collect_bool(high.len(), |idx| {
+        // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len().
+        let value = reconstruct(unsafe { *high.get_unchecked(idx) }, unsafe {
+            *low.get_unchecked(idx)
+        });
+        cmp_i128::<OP>(value, bound)
+    })
 }
-pub(crate) fn i128_le(a: i128, b: i128) -> bool {
-    a <= b
+
+fn between_scalar(
+    high: &[i64],
+    low: &[u64],
+    lower: i128,
+    upper: i128,
+    lower_op: u8,
+    upper_op: u8,
+) -> BitBuffer {
+    match (lower_op, upper_op) {
+        (OP_GT, OP_LT) => between_scalar_impl::<OP_GT, OP_LT>(high, low, lower, upper),
+        (OP_GT, _) => between_scalar_impl::<OP_GT, OP_LE>(high, low, lower, upper),
+        (_, OP_LT) => between_scalar_impl::<OP_GE, OP_LT>(high, low, lower, upper),
+        _ => between_scalar_impl::<OP_GE, OP_LE>(high, low, lower, upper),
+    }
 }
-pub(crate) fn i128_lt(a: i128, b: i128) -> bool {
-    a < b
+
+fn between_scalar_impl<const LOWER: u8, const UPPER: u8>(
+    high: &[i64],
+    low: &[u64],
+    lower: i128,
+    upper: i128,
+) -> BitBuffer {
+    BitBuffer::collect_bool(high.len(), |idx| {
+        // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len().
+        let value = reconstruct(unsafe { *high.get_unchecked(idx) }, unsafe {
+            *low.get_unchecked(idx)
+        });
+        cmp_i128::<LOWER>(value, lower) & cmp_i128::<UPPER>(value, upper)
+    })
+}
+
+// ---- AVX-512 fast path ----
+
+#[cfg(target_arch = "x86_64")]
+mod avx512 {
+    // Extracting the low 64 bits of an i128 bound via `as u64` is intentional limb truncation.
+    #![allow(clippy::cast_possible_truncation)]
+
+    use std::arch::x86_64::__m512i;
+    use std::arch::x86_64::_mm512_cmpeq_epi64_mask;
+    use std::arch::x86_64::_mm512_cmpge_epu64_mask;
+    use std::arch::x86_64::_mm512_cmpgt_epi64_mask;
+    use std::arch::x86_64::_mm512_cmpgt_epu64_mask;
+    use std::arch::x86_64::_mm512_cmple_epu64_mask;
+    use std::arch::x86_64::_mm512_cmplt_epi64_mask;
+    use std::arch::x86_64::_mm512_cmplt_epu64_mask;
+    use std::arch::x86_64::_mm512_loadu_epi64;
+    use std::arch::x86_64::_mm512_set1_epi64;
+
+    use super::OP_EQ;
+    use super::OP_GE;
+    use super::OP_GT;
+    use super::OP_LE;
+    use super::OP_LT;
+    use super::OP_NE;
+    use super::cmp_i128;
+    use super::reconstruct;
+
+    /// Per-chunk mask for `value <OP> bound`, combining the signed-high and unsigned-low limb
+    /// comparisons lexicographically. Each operand is a vector of 8 lanes; the result is a
+    /// `__mmask8` (one bit per lane).
+    ///
+    /// # Safety
+    /// The caller must ensure the `avx512f` feature is enabled.
+    #[target_feature(enable = "avx512f")]
+    fn chunk_mask<const OP: u8>(h: __m512i, l: __m512i, bh: __m512i, bl: __m512i) -> u8 {
+        match OP {
+            OP_LT => {
+                _mm512_cmplt_epi64_mask(h, bh)
+                    | (_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmplt_epu64_mask(l, bl))
+            }
+            OP_LE => {
+                _mm512_cmplt_epi64_mask(h, bh)
+                    | (_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmple_epu64_mask(l, bl))
+            }
+            OP_GT => {
+                _mm512_cmpgt_epi64_mask(h, bh)
+                    | (_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmpgt_epu64_mask(l, bl))
+            }
+            OP_GE => {
+                _mm512_cmpgt_epi64_mask(h, bh)
+                    | (_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmpge_epu64_mask(l, bl))
+            }
+            OP_EQ => _mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmpeq_epi64_mask(l, bl),
+            OP_NE => !(_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmpeq_epi64_mask(l, bl)),
+            _ => unreachable!(),
+        }
+    }
+
+    /// # Safety
+    /// The caller must ensure `avx512f` is enabled and `out.len() == high.len().div_ceil(8)`.
+    #[target_feature(enable = "avx512f")]
+    pub(super) unsafe fn compare(high: &[i64], low: &[u64], bound: i128, code: u8, out: &mut [u8]) {
+        // SAFETY: avx512f is enabled by the target_feature on this fn and its callees.
+        unsafe {
+            match code {
+                OP_LT => compare_impl::<OP_LT>(high, low, bound, out),
+                OP_LE => compare_impl::<OP_LE>(high, low, bound, out),
+                OP_GT => compare_impl::<OP_GT>(high, low, bound, out),
+                OP_GE => compare_impl::<OP_GE>(high, low, bound, out),
+                OP_EQ => compare_impl::<OP_EQ>(high, low, bound, out),
+                _ => compare_impl::<OP_NE>(high, low, bound, out),
+            }
+        }
+    }
+
+    #[target_feature(enable = "avx512f")]
+    unsafe fn compare_impl<const OP: u8>(high: &[i64], low: &[u64], bound: i128, out: &mut [u8]) {
+        let bh = (bound >> 64) as i64;
+        let bl = bound as u64;
+        // SAFETY: avx512f enabled; loads are unaligned; indices stay within the 8-lane chunks.
+        unsafe {
+            let bhv = _mm512_set1_epi64(bh);
+            let blv = _mm512_set1_epi64(bl as i64);
+
+            let chunks = high.len() / 8;
+            for c in 0..chunks {
+                let h = _mm512_loadu_epi64(high.as_ptr().add(c * 8));
+                let l = _mm512_loadu_epi64(low.as_ptr().add(c * 8).cast());
+                out[c] = chunk_mask::<OP>(h, l, bhv, blv);
+            }
+
+            for i in (chunks * 8)..high.len() {
+                if cmp_i128::<OP>(reconstruct(high[i], low[i]), bound) {
+                    out[i / 8] |= 1 << (i % 8);
+                }
+            }
+        }
+    }
+
+    /// # Safety
+    /// The caller must ensure `avx512f` is enabled and `out.len() == high.len().div_ceil(8)`.
+    #[target_feature(enable = "avx512f")]
+    pub(super) unsafe fn between(
+        high: &[i64],
+        low: &[u64],
+        lower: i128,
+        upper: i128,
+        lower_op: u8,
+        upper_op: u8,
+        out: &mut [u8],
+    ) {
+        // SAFETY: avx512f is enabled by the target_feature on this fn and its callees.
+        unsafe {
+            match (lower_op, upper_op) {
+                (OP_GT, OP_LT) => between_impl::<OP_GT, OP_LT>(high, low, lower, upper, out),
+                (OP_GT, _) => between_impl::<OP_GT, OP_LE>(high, low, lower, upper, out),
+                (_, OP_LT) => between_impl::<OP_GE, OP_LT>(high, low, lower, upper, out),
+                _ => between_impl::<OP_GE, OP_LE>(high, low, lower, upper, out),
+            }
+        }
+    }
+
+    #[target_feature(enable = "avx512f")]
+    unsafe fn between_impl<const LOWER: u8, const UPPER: u8>(
+        high: &[i64],
+        low: &[u64],
+        lower: i128,
+        upper: i128,
+        out: &mut [u8],
+    ) {
+        // SAFETY: avx512f enabled; loads are unaligned; indices stay within the 8-lane chunks.
+        unsafe {
+            let lhv = _mm512_set1_epi64((lower >> 64) as i64);
+            let llv = _mm512_set1_epi64(lower as u64 as i64);
+            let uhv = _mm512_set1_epi64((upper >> 64) as i64);
+            let ulv = _mm512_set1_epi64(upper as u64 as i64);
+
+            let chunks = high.len() / 8;
+            for c in 0..chunks {
+                let h = _mm512_loadu_epi64(high.as_ptr().add(c * 8));
+                let l = _mm512_loadu_epi64(low.as_ptr().add(c * 8).cast());
+                out[c] = chunk_mask::<LOWER>(h, l, lhv, llv) & chunk_mask::<UPPER>(h, l, uhv, ulv);
+            }
+
+            for i in (chunks * 8)..high.len() {
+                let value = reconstruct(high[i], low[i]);
+                if cmp_i128::<LOWER>(value, lower) & cmp_i128::<UPPER>(value, upper) {
+                    out[i / 8] |= 1 << (i % 8);
+                }
+            }
+        }
+    }
 }

From 96b64b7ab849eb28a0c6b377d4136180d30ea083 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 18:54:36 +0000
Subject: [PATCH 11/13] Trim decimal comparison benchmarks to a justified, <1ms
 set

Audit of the added benches:
- Remove limb_simd: it duplicated the AVX-512 kernel that now lives in
  two_limb.rs (drift risk), benchmarked a lexicographic-scalar kernel that
  ships nowhere, and its 1<<20 case exceeded the 1ms-per-op budget. The
  productized path is now measured by decimal_lt/decimal_between.
- Drop the `_wide` arrow and canonical baselines from both files: an i128
  comparison's cost is independent of values and precision/scale, so they were
  provably identical to their narrow counterparts (measured 51.4 vs 51.4us).
  One arrow + one canonical i128 baseline now serves both narrow and wide.

Every remaining case stays well under 1ms.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 encodings/decimal-byte-parts/Cargo.toml       |   4 -
 .../benches/decimal_between.rs                |  74 +------
 .../decimal-byte-parts/benches/decimal_lt.rs  |  62 +-----
 .../decimal-byte-parts/benches/limb_simd.rs   | 188 ------------------
 4 files changed, 13 insertions(+), 315 deletions(-)
 delete mode 100644 encodings/decimal-byte-parts/benches/limb_simd.rs

diff --git a/encodings/decimal-byte-parts/Cargo.toml b/encodings/decimal-byte-parts/Cargo.toml
index d35378ea179..783fc815aa0 100644
--- a/encodings/decimal-byte-parts/Cargo.toml
+++ b/encodings/decimal-byte-parts/Cargo.toml
@@ -41,7 +41,3 @@ harness = false
 [[bench]]
 name = "decimal_lt"
 harness = false
-
-[[bench]]
-name = "limb_simd"
-harness = false
diff --git a/encodings/decimal-byte-parts/benches/decimal_between.rs b/encodings/decimal-byte-parts/benches/decimal_between.rs
index e55f0866c7c..f9e66c444a7 100644
--- a/encodings/decimal-byte-parts/benches/decimal_between.rs
+++ b/encodings/decimal-byte-parts/benches/decimal_between.rs
@@ -217,12 +217,16 @@ fn arrow_decimal128(bencher: Bencher, len: usize) {
         });
 }
 
-// ---- Wide i128 decimals: two-limb vs canonical i128 vs arrow Decimal128 ----
+// ---- Wide i128 decimals: two-limb ----
 //
 // These values genuinely occupy the i128 range (the high 64-bit limb varies), so neither Vortex
 // nor arrow can keep them in a narrow integer. The two-limb representation splits each value into
-// a signed i64 high limb and an unsigned u64 low limb, letting `between` run as native-width
-// (SIMD-friendly) integer comparisons instead of arrow's 128-bit comparison.
+// a signed i64 high limb and an unsigned u64 low limb, compared limb-wise (AVX-512 when available)
+// instead of arrow's 128-bit comparison.
+//
+// The i128 baselines for this comparison are `arrow_decimal128` and `vortex_canonical_i128` above:
+// an i128 comparison's cost is independent of the values and the declared precision/scale, so those
+// benches measure the same kernel regardless of whether the data is logically narrow or wide.
 
 fn wide_values(len: usize) -> Vec<i128> {
     let mut rng = StdRng::seed_from_u64(0x5eed);
@@ -277,67 +281,3 @@ fn vortex_byteparts_twolimb(bencher: Bencher, len: usize) {
             )
         });
 }
-
-#[divan::bench(args = LENGTHS)]
-fn vortex_canonical_i128_wide(bencher: Bencher, len: usize) {
-    let dt = DecimalDType::new(38, 2);
-    let arr = DecimalArray::new(
-        wide_values(len).into_iter().collect(),
-        dt,
-        Validity::NonNullable,
-    )
-    .into_array();
-    let lower = ConstantArray::new(
-        Scalar::decimal(DecimalValue::I128(WIDE_LOWER), dt, Nullability::NonNullable),
-        len,
-    )
-    .into_array();
-    let upper = ConstantArray::new(
-        Scalar::decimal(DecimalValue::I128(WIDE_UPPER), dt, Nullability::NonNullable),
-        len,
-    )
-    .into_array();
-
-    bencher
-        .with_inputs(|| {
-            (
-                arr.clone(),
-                lower.clone(),
-                upper.clone(),
-                LEGACY_SESSION.create_execution_ctx(),
-            )
-        })
-        .bench_values(|(arr, lower, upper, mut ctx)| {
-            black_box(
-                arr.between(lower, upper, OPTIONS)
-                    .unwrap()
-                    .execute::<BoolArray>(&mut ctx)
-                    .unwrap(),
-            )
-        });
-}
-
-#[divan::bench(args = LENGTHS)]
-fn arrow_decimal128_wide(bencher: Bencher, len: usize) {
-    let arr = Decimal128Array::from_iter_values(wide_values(len))
-        .with_precision_and_scale(38, 2)
-        .unwrap();
-    let lower = ArrowScalar::new(
-        Decimal128Array::from_iter_values([WIDE_LOWER])
-            .with_precision_and_scale(38, 2)
-            .unwrap(),
-    );
-    let upper = ArrowScalar::new(
-        Decimal128Array::from_iter_values([WIDE_UPPER])
-            .with_precision_and_scale(38, 2)
-            .unwrap(),
-    );
-
-    bencher
-        .with_inputs(|| (arr.clone(), lower.clone(), upper.clone()))
-        .bench_values(|(arr, lower, upper)| {
-            let ge = cmp::gt_eq(&arr, &lower).unwrap();
-            let le = cmp::lt_eq(&arr, &upper).unwrap();
-            black_box(and(&ge, &le).unwrap())
-        });
-}
diff --git a/encodings/decimal-byte-parts/benches/decimal_lt.rs b/encodings/decimal-byte-parts/benches/decimal_lt.rs
index 037c8800526..da84ca4e3e0 100644
--- a/encodings/decimal-byte-parts/benches/decimal_lt.rs
+++ b/encodings/decimal-byte-parts/benches/decimal_lt.rs
@@ -175,12 +175,15 @@ fn arrow_decimal128(bencher: Bencher, len: usize) {
         .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap()));
 }
 
-// ---- Wide i128 decimals: two-limb vs canonical i128 vs arrow Decimal128 ----
+// ---- Wide i128 decimals: two-limb ----
 //
 // These values genuinely occupy the i128 range (the high 64-bit limb varies), so neither Vortex
 // nor arrow can keep them in a narrow integer. The two-limb representation splits each value into a
-// signed i64 high limb and an unsigned u64 low limb, letting `lt` reconstruct the i128 once and run
-// a single native comparison.
+// signed i64 high limb and an unsigned u64 low limb, compared limb-wise (AVX-512 when available).
+//
+// The i128 baselines for this comparison are `arrow_decimal128` and `vortex_canonical_i128` above:
+// an i128 comparison's cost is independent of the values and the declared precision/scale, so those
+// benches measure the same kernel regardless of whether the data is logically narrow or wide.
 
 fn wide_values(len: usize) -> Vec<i128> {
     let mut rng = StdRng::seed_from_u64(0x5eed);
@@ -232,56 +235,3 @@ fn vortex_byteparts_twolimb(bencher: Bencher, len: usize) {
             )
         });
 }
-
-#[divan::bench(args = LENGTHS)]
-fn vortex_canonical_i128_wide(bencher: Bencher, len: usize) {
-    let dt = DecimalDType::new(38, 2);
-    let arr = DecimalArray::new(
-        wide_values(len).into_iter().collect(),
-        dt,
-        Validity::NonNullable,
-    )
-    .into_array();
-    let rhs = ConstantArray::new(
-        Scalar::decimal(
-            DecimalValue::I128(WIDE_THRESHOLD),
-            dt,
-            Nullability::NonNullable,
-        ),
-        len,
-    )
-    .into_array();
-
-    bencher
-        .with_inputs(|| {
-            (
-                arr.clone(),
-                rhs.clone(),
-                LEGACY_SESSION.create_execution_ctx(),
-            )
-        })
-        .bench_values(|(arr, rhs, mut ctx)| {
-            black_box(
-                arr.binary(rhs, Operator::Lt)
-                    .unwrap()
-                    .execute::<BoolArray>(&mut ctx)
-                    .unwrap(),
-            )
-        });
-}
-
-#[divan::bench(args = LENGTHS)]
-fn arrow_decimal128_wide(bencher: Bencher, len: usize) {
-    let arr = Decimal128Array::from_iter_values(wide_values(len))
-        .with_precision_and_scale(38, 2)
-        .unwrap();
-    let rhs = ArrowScalar::new(
-        Decimal128Array::from_iter_values([WIDE_THRESHOLD])
-            .with_precision_and_scale(38, 2)
-            .unwrap(),
-    );
-
-    bencher
-        .with_inputs(|| (arr.clone(), rhs.clone()))
-        .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap()));
-}
diff --git a/encodings/decimal-byte-parts/benches/limb_simd.rs b/encodings/decimal-byte-parts/benches/limb_simd.rs
deleted file mode 100644
index 889f12e17f1..00000000000
--- a/encodings/decimal-byte-parts/benches/limb_simd.rs
+++ /dev/null
@@ -1,188 +0,0 @@
-// SPDX-License-Identifier: Apache-2.0
-// SPDX-FileCopyrightText: Copyright the Vortex contributors
-
-//! Microbenchmark isolating the *kernel* cost of `x < threshold` over wide i128 decimals, with no
-//! Vortex expression-execution overhead, to answer: can comparing the (signed-high i64, unsigned-low
-//! u64) limb pair beat arrow's i128 comparison?
-//!
-//! arrow's i128 has no SIMD compare on any x86 (there is no 128-bit-integer vector comparison), so
-//! it is inherently scalar. The limbs are native i64/u64, which AVX-512 compares 8-wide with
-//! `vpcmpq`/`vpcmpuq`, producing a mask register that is exactly the packed-bit output we want.
-//!
-//! Kernels:
-//!   * `arrow_cmp_lt`      arrow-rs `cmp::lt` over a `Decimal128Array` (reference).
-//!   * `raw_i128_scalar`   read one contiguous i128 slice, scalar compare, serial bit-pack.
-//!   * `raw_limb_scalar`   read the two limb slices, lexicographic compare, serial bit-pack.
-//!   * `raw_limb_avx512`   read the two limb slices, AVX-512 limb compare straight to a mask byte.
-
-#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)]
-
-use arrow_array::Decimal128Array;
-use arrow_array::Scalar as ArrowScalar;
-use arrow_ord::cmp;
-use divan::Bencher;
-use divan::black_box;
-use rand::RngExt;
-use rand::SeedableRng;
-use rand::rngs::StdRng;
-
-fn main() {
-    divan::main();
-}
-
-const LENGTHS: &[usize] = &[1 << 16, 1 << 17, 1 << 20];
-
-const THRESHOLD: i128 = (500i128 << 64) | 0x90ab_cdef;
-
-fn wide_values(len: usize) -> Vec<i128> {
-    let mut rng = StdRng::seed_from_u64(0x5eed);
-    (0..len)
-        .map(|_| {
-            let high = i128::from(rng.random_range(0..1000i64));
-            let low = i128::from(rng.random_range(0..u64::MAX));
-            (high << 64) | low
-        })
-        .collect()
-}
-
-fn limbs(values: &[i128]) -> (Vec<i64>, Vec<u64>) {
-    (
-        values.iter().map(|v| (v >> 64) as i64).collect(),
-        values.iter().map(|v| *v as u64).collect(),
-    )
-}
-
-// ---- arrow-rs reference ----
-
-#[divan::bench(args = LENGTHS)]
-fn arrow_cmp_lt(bencher: Bencher, len: usize) {
-    let arr = Decimal128Array::from_iter_values(wide_values(len))
-        .with_precision_and_scale(38, 2)
-        .unwrap();
-    let rhs = ArrowScalar::new(
-        Decimal128Array::from_iter_values([THRESHOLD])
-            .with_precision_and_scale(38, 2)
-            .unwrap(),
-    );
-    bencher
-        .with_inputs(|| (arr.clone(), rhs.clone()))
-        .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap()));
-}
-
-// ---- scalar i128 (one contiguous read) ----
-
-fn lt_i128_scalar(vals: &[i128], th: i128, out: &mut [u64]) {
-    for (w, chunk) in vals.chunks(64).enumerate() {
-        let mut packed = 0u64;
-        for (b, &v) in chunk.iter().enumerate() {
-            packed |= u64::from(v < th) << b;
-        }
-        out[w] = packed;
-    }
-}
-
-#[divan::bench(args = LENGTHS)]
-fn raw_i128_scalar(bencher: Bencher, len: usize) {
-    let vals = wide_values(len);
-    let mut out = vec![0u64; len.div_ceil(64)];
-    bencher.bench_local(|| {
-        lt_i128_scalar(black_box(&vals), black_box(THRESHOLD), &mut out);
-        black_box(&out);
-    });
-}
-
-// ---- scalar limb (lexicographic) ----
-
-fn lt_limb_scalar(hi: &[i64], lo: &[u64], th_hi: i64, th_lo: u64, out: &mut [u64]) {
-    for w in 0..hi.len().div_ceil(64) {
-        let base = w * 64;
-        let end = (base + 64).min(hi.len());
-        let mut packed = 0u64;
-        for i in base..end {
-            let lt = (hi[i] < th_hi) | ((hi[i] == th_hi) & (lo[i] < th_lo));
-            packed |= u64::from(lt) << (i - base);
-        }
-        out[w] = packed;
-    }
-}
-
-#[divan::bench(args = LENGTHS)]
-fn raw_limb_scalar(bencher: Bencher, len: usize) {
-    let (hi, lo) = limbs(&wide_values(len));
-    let mut out = vec![0u64; len.div_ceil(64)];
-    let th_hi = (THRESHOLD >> 64) as i64;
-    let th_lo = THRESHOLD as u64;
-    bencher.bench_local(|| {
-        lt_limb_scalar(black_box(&hi), black_box(&lo), th_hi, th_lo, &mut out);
-        black_box(&out);
-    });
-}
-
-// ---- AVX-512 limb (8-wide compare straight to mask bytes) ----
-
-#[cfg(target_arch = "x86_64")]
-#[target_feature(enable = "avx512f")]
-unsafe fn lt_limb_avx512(hi: &[i64], lo: &[u64], th_hi: i64, th_lo: u64, out: &mut [u8]) {
-    use std::arch::x86_64::_mm512_cmpeq_epi64_mask;
-    use std::arch::x86_64::_mm512_cmplt_epi64_mask;
-    use std::arch::x86_64::_mm512_cmplt_epu64_mask;
-    use std::arch::x86_64::_mm512_loadu_epi64;
-    use std::arch::x86_64::_mm512_set1_epi64;
-
-    let n = hi.len();
-    let chunks = n / 8;
-    // SAFETY: this fn is only called when avx512f is detected; all intrinsics/loads below are in
-    // bounds because `chunks * 8 <= n` and `out.len() >= n.div_ceil(8)`.
-    unsafe {
-        let th_hi_v = _mm512_set1_epi64(th_hi);
-        let th_lo_v = _mm512_set1_epi64(th_lo as i64);
-
-        for chunk in 0..chunks {
-            let hv = _mm512_loadu_epi64(hi.as_ptr().add(chunk * 8));
-            let lv = _mm512_loadu_epi64(lo.as_ptr().add(chunk * 8).cast());
-            let hi_lt = _mm512_cmplt_epi64_mask(hv, th_hi_v);
-            let hi_eq = _mm512_cmpeq_epi64_mask(hv, th_hi_v);
-            let lo_lt = _mm512_cmplt_epu64_mask(lv, th_lo_v);
-            // mask = hi < th_hi | (hi == th_hi & lo < th_lo); each is a __mmask8 (one bit/lane).
-            out[chunk] = hi_lt | (hi_eq & lo_lt);
-        }
-    }
-
-    // scalar tail
-    for i in (chunks * 8)..n {
-        let lt = (hi[i] < th_hi) | ((hi[i] == th_hi) & (lo[i] < th_lo));
-        let byte = &mut out[i / 8];
-        *byte |= u8::from(lt) << (i % 8);
-    }
-}
-
-#[divan::bench(args = LENGTHS)]
-fn raw_limb_avx512(bencher: Bencher, len: usize) {
-    if !is_x86_feature_detected!("avx512f") {
-        return;
-    }
-    let (hi, lo) = limbs(&wide_values(len));
-    let mut out = vec![0u8; len.div_ceil(8)];
-    let th_hi = (THRESHOLD >> 64) as i64;
-    let th_lo = THRESHOLD as u64;
-
-    // Cross-check the AVX-512 output against the scalar reference so the timing is for a correct
-    // kernel. The u8 mask bytes alias the u64 words bit-for-bit (lane i -> bit i).
-    unsafe { lt_limb_avx512(&hi, &lo, th_hi, th_lo, &mut out) };
-    let mut reference = vec![0u64; len.div_ceil(64)];
-    lt_limb_scalar(&hi, &lo, th_hi, th_lo, &mut reference);
-    for (i, &byte) in out.iter().enumerate() {
-        assert_eq!(
-            byte,
-            (reference[i / 8] >> ((i % 8) * 8)) as u8,
-            "avx512 mask mismatch at byte {i}"
-        );
-    }
-    out.iter_mut().for_each(|b| *b = 0);
-
-    bencher.bench_local(|| {
-        // SAFETY: guarded by the avx512f feature check above.
-        unsafe { lt_limb_avx512(black_box(&hi), black_box(&lo), th_hi, th_lo, &mut out) };
-        black_box(&out);
-    });
-}

From 59e50ea6fbb1d1d648b191c5d5f99f64e3d26868 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 19:30:17 +0000
Subject: [PATCH 12/13] Consolidate two-limb compare/between plumbing into one
 helper

Both kernels duplicated the same materialize-limbs / combine-validity / wrap-as-
BoolArray boilerplate. Factor it into `two_limb::eval`, which takes a closure
mapping the high/low limb slices to a packed BitBuffer, so `compare` and
`between` reduce to extracting the bound(s) and calling it. Drop the now-internal
`materialize_limbs`/`reconstruct` from the crate-visible surface.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/decimal_byte_parts/compute/between.rs | 35 +++----------------
 .../src/decimal_byte_parts/compute/compare.rs | 32 +++--------------
 .../decimal_byte_parts/compute/two_limb.rs    | 24 ++++++++-----
 3 files changed, 24 insertions(+), 67 deletions(-)

diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
index 69033f12094..c33cd54317b 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
@@ -5,10 +5,8 @@ use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
-use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::ConstantArray;
 use vortex_array::builtins::ArrayBuiltins;
-use vortex_array::dtype::Nullability;
 use vortex_array::scalar::Scalar;
 use vortex_array::scalar_fn::fns::between::BetweenKernel;
 use vortex_array::scalar_fn::fns::between::BetweenOptions;
@@ -19,7 +17,7 @@ use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive;
 use crate::decimal_byte_parts::compute::two_limb::between_bits;
-use crate::decimal_byte_parts::compute::two_limb::materialize_limbs;
+use crate::decimal_byte_parts::compute::two_limb::eval;
 
 impl BetweenKernel for DecimalByteParts {
     fn between(
@@ -57,14 +55,9 @@ impl BetweenKernel for DecimalByteParts {
             else {
                 return Ok(None);
             };
-            return Ok(Some(two_limb_between(
-                &arr,
-                lower_i128,
-                upper_i128,
-                options,
-                nullability,
-                ctx,
-            )?));
+            return Ok(Some(eval(&arr, nullability, ctx, |high, low| {
+                between_bits(high, low, lower_i128, upper_i128, options)
+            })?));
         }
 
         let scalar_type = arr.msp().dtype().with_nullability(nullability);
@@ -98,26 +91,6 @@ impl BetweenKernel for DecimalByteParts {
     }
 }
 
-/// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation in a
-/// single fused pass, dispatching to the AVX-512 limb kernel when available (see [`between_bits`]).
-fn two_limb_between(
-    arr: &ArrayView<'_, DecimalByteParts>,
-    lower: i128,
-    upper: i128,
-    options: &BetweenOptions,
-    nullability: Nullability,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<ArrayRef> {
-    let (high, low) = materialize_limbs(arr, ctx)?;
-    let validity = high.validity()?.union_nullability(nullability);
-    let high = high.as_slice::<i64>();
-    let low = low.as_slice::<u64>();
-    assert_eq!(high.len(), low.len(), "limb lengths must match");
-
-    let bits = between_bits(high, low, lower, upper, options);
-    Ok(BoolArray::new(bits, validity).into_array())
-}
-
 #[cfg(test)]
 mod tests {
     use rstest::rstest;
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
index 7ebc50e7ac0..21223ae92bf 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
@@ -7,7 +7,6 @@ use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
 use vortex_array::IntoArray;
-use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::ConstantArray;
 use vortex_array::builtins::ArrayBuiltins;
 use vortex_array::dtype::IntegerPType;
@@ -29,7 +28,7 @@ use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 use crate::decimal_byte_parts::compute::compare::Sign::Positive;
 use crate::decimal_byte_parts::compute::two_limb::compare_bits;
-use crate::decimal_byte_parts::compute::two_limb::materialize_limbs;
+use crate::decimal_byte_parts::compute::two_limb::eval;
 
 impl CompareKernel for DecimalByteParts {
     fn compare(
@@ -56,13 +55,9 @@ impl CompareKernel for DecimalByteParts {
             let Some(rhs_i128) = rhs_decimal.cast::<i128>() else {
                 return Ok(None);
             };
-            return Ok(Some(two_limb_compare(
-                &lhs,
-                rhs_i128,
-                operator,
-                nullability,
-                ctx,
-            )?));
+            return Ok(Some(eval(&lhs, nullability, ctx, |high, low| {
+                compare_bits(high, low, rhs_i128, operator)
+            })?));
         }
 
         let scalar_type = lhs.msp().dtype().with_nullability(nullability);
@@ -101,25 +96,6 @@ impl CompareKernel for DecimalByteParts {
     }
 }
 
-/// Compare the two-limb array against the constant `rhs` in a single fused pass, dispatching to the
-/// AVX-512 limb kernel when available (see [`compare_bits`]).
-fn two_limb_compare(
-    lhs: &ArrayView<'_, DecimalByteParts>,
-    rhs: i128,
-    operator: CompareOperator,
-    nullability: Nullability,
-    ctx: &mut ExecutionCtx,
-) -> VortexResult<ArrayRef> {
-    let (high, low) = materialize_limbs(lhs, ctx)?;
-    let validity = high.validity()?.union_nullability(nullability);
-    let high = high.as_slice::<i64>();
-    let low = low.as_slice::<u64>();
-    assert_eq!(high.len(), low.len(), "limb lengths must match");
-
-    let bits = compare_bits(high, low, rhs, operator);
-    Ok(BoolArray::new(bits, validity).into_array())
-}
-
 // Used to represent the overflow direction when trying to
 // convert into the scalar type.
 #[derive(Debug)]
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
index 2a4ceea359f..272ecf003dc 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
@@ -10,9 +10,13 @@
 //! exactly one byte of the output bitmap — no serial bit-packing. A scalar reconstruct path (which
 //! is itself competitive with arrow) is used when AVX-512 is unavailable.
 
+use vortex_array::ArrayRef;
 use vortex_array::ArrayView;
 use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::BoolArray;
 use vortex_array::arrays::PrimitiveArray;
+use vortex_array::dtype::Nullability;
 use vortex_array::scalar_fn::fns::between::BetweenOptions;
 use vortex_array::scalar_fn::fns::between::StrictComparison;
 use vortex_array::scalar_fn::fns::operators::CompareOperator;
@@ -24,26 +28,30 @@ use vortex_error::VortexResult;
 use crate::DecimalByteParts;
 use crate::decimal_byte_parts::DecimalBytePartsArrayExt;
 
-/// Materialize the high (signed `i64`) and low (unsigned `u64`) limbs of a two-limb array.
-///
-/// The caller must check [`DecimalBytePartsArrayExt::lower`] is present first; this panics
-/// otherwise.
-pub(crate) fn materialize_limbs(
+/// Materialize the two limbs, run a fused bit-kernel over them, and wrap the result as a boolean
+/// array carrying the combined validity. Shared entry point for the two-limb `compare`/`between`
+/// paths: the caller supplies a kernel mapping the high (`i64`) and low (`u64`) limb slices to a
+/// packed [`BitBuffer`].
+pub(crate) fn eval(
     arr: &ArrayView<'_, DecimalByteParts>,
+    nullability: Nullability,
     ctx: &mut ExecutionCtx,
-) -> VortexResult<(PrimitiveArray, PrimitiveArray)> {
+    kernel: impl FnOnce(&[i64], &[u64]) -> BitBuffer,
+) -> VortexResult<ArrayRef> {
     let high = arr.msp().clone().execute::<PrimitiveArray>(ctx)?;
     let low = arr
         .lower()
         .vortex_expect("two-limb path requires a lower limb")
         .clone()
         .execute::<PrimitiveArray>(ctx)?;
-    Ok((high, low))
+    let validity = high.validity()?.union_nullability(nullability);
+    let bits = kernel(high.as_slice::<i64>(), low.as_slice::<u64>());
+    Ok(BoolArray::new(bits, validity).into_array())
 }
 
 /// Reconstruct the i128 value from its limbs: sign-extend the high limb, zero-extend the low limb.
 #[inline(always)]
-pub(crate) fn reconstruct(high: i64, low: u64) -> i128 {
+fn reconstruct(high: i64, low: u64) -> i128 {
     ((high as i128) << 64) | (low as i128)
 }
 

From 990d2f6ed38c71e9659a4edafa0271188c4f329d Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Tue, 26 May 2026 19:35:24 +0000
Subject: [PATCH 13/13] Share the two-limb test array builder across compute
 test modules

The mod/compare/between test modules each duplicated the same i128 limb-split
construction. Hoist it to a single `#[cfg(test)] two_limb_array` in two_limb.rs;
the two thin local adapters just add `.into_array()` / a default validity.

Signed-off-by: Joe Isaacs <joe.isaacs@live.co.uk>
---
 .../src/decimal_byte_parts/compute/between.rs | 12 ++----------
 .../src/decimal_byte_parts/compute/compare.rs | 13 ++-----------
 .../src/decimal_byte_parts/compute/mod.rs     | 14 +-------------
 .../decimal_byte_parts/compute/two_limb.rs    | 19 +++++++++++++++++++
 4 files changed, 24 insertions(+), 34 deletions(-)

diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
index c33cd54317b..3be35fbd563 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs
@@ -115,6 +115,7 @@ mod tests {
     use vortex_buffer::buffer;
     use vortex_error::VortexResult;
 
+    use super::super::two_limb::two_limb_array;
     use crate::DecimalByteParts;
 
     fn decimal_const(value: DecimalValue, decimal_type: DecimalDType, len: usize) -> ArrayRef {
@@ -125,17 +126,8 @@ mod tests {
         .into_array()
     }
 
-    #[allow(clippy::cast_possible_truncation)]
     fn two_limb(values: &[i128], decimal_type: DecimalDType) -> ArrayRef {
-        let highs: Buffer<i64> = values.iter().map(|v| (v >> 64) as i64).collect();
-        let lows: Buffer<u64> = values.iter().map(|v| *v as u64).collect();
-        DecimalByteParts::try_new_with_lower(
-            PrimitiveArray::new(highs, Validity::NonNullable).into_array(),
-            PrimitiveArray::new(lows, Validity::NonNullable).into_array(),
-            decimal_type,
-        )
-        .unwrap()
-        .into_array()
+        two_limb_array(values, Validity::NonNullable, decimal_type).into_array()
     }
 
     /// The two-limb `between` pushdown must agree with the canonical i128 implementation across
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
index 21223ae92bf..12cd8b40edd 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs
@@ -176,23 +176,14 @@ mod tests {
     use vortex_array::scalar::Scalar;
     use vortex_array::scalar_fn::fns::operators::Operator;
     use vortex_array::validity::Validity;
-    use vortex_buffer::Buffer;
     use vortex_buffer::buffer;
     use vortex_error::VortexResult;
 
+    use super::super::two_limb::two_limb_array;
     use crate::DecimalByteParts;
 
-    #[allow(clippy::cast_possible_truncation)]
     fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> ArrayRef {
-        let highs: Buffer<i64> = values.iter().map(|v| (v >> 64) as i64).collect();
-        let lows: Buffer<u64> = values.iter().map(|v| *v as u64).collect();
-        DecimalByteParts::try_new_with_lower(
-            PrimitiveArray::new(highs, validity).into_array(),
-            PrimitiveArray::new(lows, Validity::NonNullable).into_array(),
-            dt,
-        )
-        .unwrap()
-        .into_array()
+        two_limb_array(values, validity, dt).into_array()
     }
 
     #[test]
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
index 5ea11ead376..53ff8fb9dbe 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs
@@ -20,9 +20,9 @@ mod tests {
     use vortex_array::compute::conformance::consistency::test_array_consistency;
     use vortex_array::dtype::DecimalDType;
     use vortex_array::validity::Validity;
-    use vortex_buffer::Buffer;
     use vortex_buffer::buffer;
 
+    use super::two_limb::two_limb_array as two_limb;
     use crate::DecimalByteParts;
     use crate::DecimalBytePartsArray;
 
@@ -35,18 +35,6 @@ mod tests {
             .collect()
     }
 
-    #[allow(clippy::cast_possible_truncation)]
-    fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> DecimalBytePartsArray {
-        let highs: Buffer<i64> = values.iter().map(|v| (v >> 64) as i64).collect();
-        let lows: Buffer<u64> = values.iter().map(|v| *v as u64).collect();
-        DecimalByteParts::try_new_with_lower(
-            PrimitiveArray::new(highs, validity).into_array(),
-            PrimitiveArray::new(lows, Validity::NonNullable).into_array(),
-            dt,
-        )
-        .unwrap()
-    }
-
     #[rstest]
     // Basic decimal byte parts arrays
     #[case::decimal_i32(DecimalByteParts::try_new(
diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
index 272ecf003dc..f64262ff371 100644
--- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
+++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs
@@ -356,3 +356,22 @@ mod avx512 {
         }
     }
 }
+
+/// Test helper: build a two-limb [`DecimalByteParts`] array from i128 values, splitting each into a
+/// signed high limb and an unsigned low limb.
+#[cfg(test)]
+#[allow(clippy::cast_possible_truncation)]
+pub(crate) fn two_limb_array(
+    values: &[i128],
+    validity: vortex_array::validity::Validity,
+    dt: vortex_array::dtype::DecimalDType,
+) -> crate::DecimalBytePartsArray {
+    let highs: vortex_buffer::Buffer<i64> = values.iter().map(|v| (v >> 64) as i64).collect();
+    let lows: vortex_buffer::Buffer<u64> = values.iter().map(|v| *v as u64).collect();
+    DecimalByteParts::try_new_with_lower(
+        PrimitiveArray::new(highs, validity).into_array(),
+        PrimitiveArray::new(lows, vortex_array::validity::Validity::NonNullable).into_array(),
+        dt,
+    )
+    .unwrap()
+}