From c982fc848369d64ca60641141e462879d5f285f3 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 08:57:29 +0000 Subject: [PATCH 01/13] Add between pushdown kernel for DecimalByteParts DecimalByteParts already pushed `compare` against a constant down to its numeric MSP child. This adds the symmetric `between` kernel so that bounded-range predicates are evaluated directly on the compact MSP representation instead of canonicalizing to a wide DecimalArray. Both bounds are converted to the MSP's physical integer type and the comparison is delegated to the MSP's own `between`. When a bound falls outside the MSP's integer range the kernel falls back to the canonical decimal `between`, which already handles the overflow directions. Signed-off-by: Joe Isaacs --- encodings/decimal-byte-parts/public-api.lock | 4 + .../src/decimal_byte_parts/compute/between.rs | 223 ++++++++++++++++++ .../src/decimal_byte_parts/compute/compare.rs | 9 +- .../src/decimal_byte_parts/compute/kernel.rs | 2 + .../src/decimal_byte_parts/compute/mod.rs | 3 +- 5 files changed, 237 insertions(+), 4 deletions(-) create mode 100644 encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs diff --git a/encodings/decimal-byte-parts/public-api.lock b/encodings/decimal-byte-parts/public-api.lock index 985172de937..372706034dc 100644 --- a/encodings/decimal-byte-parts/public-api.lock +++ b/encodings/decimal-byte-parts/public-api.lock @@ -64,6 +64,10 @@ impl vortex_array::arrays::slice::SliceReduce for vortex_decimal_byte_parts::Dec pub fn vortex_decimal_byte_parts::DecimalByteParts::slice(vortex_array::array::view::ArrayView<'_, Self>, core::ops::range::Range) -> vortex_error::VortexResult> +impl vortex_array::scalar_fn::fns::between::kernel::BetweenKernel for vortex_decimal_byte_parts::DecimalByteParts + +pub fn vortex_decimal_byte_parts::DecimalByteParts::between(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, &vortex_array::array::erased::ArrayRef, &vortex_array::scalar_fn::fns::between::BetweenOptions, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> + impl vortex_array::scalar_fn::fns::binary::compare::CompareKernel for vortex_decimal_byte_parts::DecimalByteParts pub fn vortex_decimal_byte_parts::DecimalByteParts::compare(vortex_array::array::view::ArrayView<'_, Self>, &vortex_array::array::erased::ArrayRef, vortex_array::scalar_fn::fns::operators::CompareOperator, &mut vortex_array::executor::ExecutionCtx) -> vortex_error::VortexResult> diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs new file mode 100644 index 00000000000..78d4aa47689 --- /dev/null +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::between::BetweenKernel; +use vortex_array::scalar_fn::fns::between::BetweenOptions; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::DecimalByteParts; +use crate::decimal_byte_parts::DecimalBytePartsArrayExt; +use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive; + +impl BetweenKernel for DecimalByteParts { + fn between( + arr: ArrayView<'_, Self>, + lower: &ArrayRef, + upper: &ArrayRef, + options: &BetweenOptions, + _ctx: &mut ExecutionCtx, + ) -> VortexResult> { + // We can only push the comparison down into the MSP when both bounds are constant. + let (Some(lower_const), Some(upper_const)) = (lower.as_constant(), upper.as_constant()) + else { + return Ok(None); + }; + + // NOTE: the `between` entrypoint precondition already replaced null bounds with an + // all-null result, so both bounds are guaranteed to be non-null here. + let lower_decimal = lower_const + .as_decimal() + .decimal_value() + .vortex_expect("checked for null in entry func"); + let upper_decimal = upper_const + .as_decimal() + .decimal_value() + .vortex_expect("checked for null in entry func"); + + let nullability = + arr.dtype().nullability() | lower.dtype().nullability() | upper.dtype().nullability(); + let scalar_type = arr.msp().dtype().with_nullability(nullability); + let msp_ptype = arr.msp().as_primitive_typed().ptype(); + + // If either bound falls outside the MSP's physical integer range we cannot push the + // comparison down losslessly. Fall back to the canonical decimal `between`, which handles + // the overflow directions (all-true / all-false constraints) correctly. + let (Ok(lower_value), Ok(upper_value)) = ( + decimal_value_wrapper_to_primitive(lower_decimal, msp_ptype), + decimal_value_wrapper_to_primitive(upper_decimal, msp_ptype), + ) else { + return Ok(None); + }; + + let lower_const = ConstantArray::new( + Scalar::try_new(scalar_type.clone(), Some(lower_value))?, + arr.len(), + ); + let upper_const = + ConstantArray::new(Scalar::try_new(scalar_type, Some(upper_value))?, arr.len()); + + arr.msp() + .clone() + .between( + lower_const.into_array(), + upper_const.into_array(), + options.clone(), + ) + .map(Some) + } +} + +#[cfg(test)] +mod tests { + use vortex_array::ArrayRef; + use vortex_array::IntoArray; + use vortex_array::arrays::BoolArray; + use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::PrimitiveArray; + use vortex_array::assert_arrays_eq; + use vortex_array::builtins::ArrayBuiltins; + use vortex_array::dtype::DecimalDType; + use vortex_array::dtype::Nullability; + use vortex_array::scalar::DecimalValue; + use vortex_array::scalar::Scalar; + use vortex_array::scalar_fn::fns::between::BetweenOptions; + use vortex_array::scalar_fn::fns::between::StrictComparison; + use vortex_array::validity::Validity; + use vortex_buffer::buffer; + use vortex_error::VortexResult; + + use crate::DecimalByteParts; + + fn decimal_const(value: DecimalValue, decimal_type: DecimalDType, len: usize) -> ArrayRef { + ConstantArray::new( + Scalar::decimal(value, decimal_type, Nullability::NonNullable), + len, + ) + .into_array() + } + + #[test] + fn between_decimal_const() -> VortexResult<()> { + let decimal_type = DecimalDType::new(8, 2); + let arr = DecimalByteParts::try_new( + PrimitiveArray::new(buffer![100i32, 200, 300, 400, 500], Validity::AllValid) + .into_array(), + decimal_type, + )? + .into_array(); + + let lower = decimal_const(DecimalValue::I64(200), decimal_type, arr.len()); + let upper = decimal_const(DecimalValue::I64(400), decimal_type, arr.len()); + + // 200 <= value <= 400 + let res = arr.clone().between( + lower.clone(), + upper.clone(), + BetweenOptions { + lower_strict: StrictComparison::NonStrict, + upper_strict: StrictComparison::NonStrict, + }, + )?; + assert_arrays_eq!( + res, + BoolArray::from_iter([Some(false), Some(true), Some(true), Some(true), Some(false)]) + ); + + // 200 < value < 400 + let res = arr.between( + lower, + upper, + BetweenOptions { + lower_strict: StrictComparison::Strict, + upper_strict: StrictComparison::Strict, + }, + )?; + assert_arrays_eq!( + res, + BoolArray::from_iter([ + Some(false), + Some(false), + Some(true), + Some(false), + Some(false) + ]) + ); + + Ok(()) + } + + #[test] + fn between_decimal_nullable() -> VortexResult<()> { + let decimal_type = DecimalDType::new(8, 2); + let arr = DecimalByteParts::try_new( + PrimitiveArray::new( + buffer![100i32, 200, 300, 400], + Validity::Array(BoolArray::from_iter([false, true, true, true]).into_array()), + ) + .into_array(), + decimal_type, + )? + .into_array(); + + let lower = decimal_const(DecimalValue::I64(100), decimal_type, arr.len()); + let upper = decimal_const(DecimalValue::I64(300), decimal_type, arr.len()); + + let res = arr.between( + lower, + upper, + BetweenOptions { + lower_strict: StrictComparison::NonStrict, + upper_strict: StrictComparison::NonStrict, + }, + )?; + assert_arrays_eq!( + res, + BoolArray::from_iter([None, Some(true), Some(true), Some(false)]) + ); + + Ok(()) + } + + /// Bounds that do not fit in the MSP's physical type must fall back to the canonical decimal + /// `between`, which handles the overflow directions. Here the array uses i32 storage but the + /// upper bound only fits in i128, so the upper constraint is always satisfied. + #[test] + fn between_decimal_unconvertible_bound() -> VortexResult<()> { + let decimal_type = DecimalDType::new(38, 2); + let arr = DecimalByteParts::try_new( + PrimitiveArray::new(buffer![100i32, 200, 300], Validity::AllValid).into_array(), + decimal_type, + )? + .into_array(); + + let lower = decimal_const(DecimalValue::I64(150), decimal_type, arr.len()); + let upper = decimal_const( + DecimalValue::I128(9_999_999_999_999_999_999), + decimal_type, + arr.len(), + ); + + let res = arr.between( + lower, + upper, + BetweenOptions { + lower_strict: StrictComparison::NonStrict, + upper_strict: StrictComparison::NonStrict, + }, + )?; + assert_arrays_eq!( + res, + BoolArray::from_iter([Some(false), Some(true), Some(true)]) + ); + + Ok(()) + } +} diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs index e6cc8d4d72d..041fcd864a1 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs @@ -84,7 +84,7 @@ impl CompareKernel for DecimalByteParts { // Used to represent the overflow direction when trying to // convert into the scalar type. #[derive(Debug)] -enum Sign { +pub(crate) enum Sign { Positive, Negative, } @@ -102,8 +102,11 @@ fn unconvertible_value(sign: Sign, operator: CompareOperator, nullability: Nulla } } -// this value return None is the decimal scalar cannot be cast the ptype. -fn decimal_value_wrapper_to_primitive( +// Converts a decimal value into the integer `ScalarValue` of the MSP's physical `ptype`. +// +// Returns `Err(Sign)` when the value does not fit in `ptype`, where the sign indicates whether +// the value overflowed past `ptype`'s maximum (`Positive`) or minimum (`Negative`). +pub(crate) fn decimal_value_wrapper_to_primitive( decimal_value: DecimalValue, ptype: PType, ) -> Result { diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/kernel.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/kernel.rs index 9e16b4f81dc..1ca17f99b8f 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/kernel.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/kernel.rs @@ -3,11 +3,13 @@ use vortex_array::arrays::dict::TakeExecuteAdaptor; use vortex_array::kernel::ParentKernelSet; +use vortex_array::scalar_fn::fns::between::BetweenExecuteAdaptor; use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor; use crate::DecimalByteParts; pub(crate) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ + ParentKernelSet::lift(&BetweenExecuteAdaptor(DecimalByteParts)), ParentKernelSet::lift(&CompareExecuteAdaptor(DecimalByteParts)), ParentKernelSet::lift(&TakeExecuteAdaptor(DecimalByteParts)), ]); diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs index 35da5be40b6..8434b4f7dfb 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs @@ -1,8 +1,9 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors +mod between; mod cast; -mod compare; +pub(crate) mod compare; mod filter; pub(crate) mod is_constant; pub(crate) mod kernel; From 0c1f8f6567206224bde353fe22f0b8f29ef297d9 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 13:29:25 +0000 Subject: [PATCH 02/13] Add decimal between benchmark vs arrow-rs Benchmarks `between` over DecimalByteParts (i32/i64 MSP pushdown), the canonical i128 DecimalArray, and arrow-rs Decimal128. Demonstrates that pushing the comparison down to a narrow MSP beats arrow-rs (~2.4-2.7x at 1M rows), since arrow has no decimal storage narrower than 128 bits. Signed-off-by: Joe Isaacs --- Cargo.lock | 5 + encodings/decimal-byte-parts/Cargo.toml | 9 + .../benches/decimal_between.rs | 216 ++++++++++++++++++ 3 files changed, 230 insertions(+) create mode 100644 encodings/decimal-byte-parts/benches/decimal_between.rs diff --git a/Cargo.lock b/Cargo.lock index cbf73b2a20c..88cfe592aa2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -9586,8 +9586,13 @@ dependencies = [ name = "vortex-decimal-byte-parts" version = "0.1.0" dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-ord", + "codspeed-divan-compat", "num-traits", "prost 0.14.3", + "rand 0.10.1", "rstest", "vortex-array", "vortex-buffer", diff --git a/encodings/decimal-byte-parts/Cargo.toml b/encodings/decimal-byte-parts/Cargo.toml index 4934ec4fa27..549c0b05e7c 100644 --- a/encodings/decimal-byte-parts/Cargo.toml +++ b/encodings/decimal-byte-parts/Cargo.toml @@ -26,5 +26,14 @@ vortex-mask = { workspace = true } vortex-session = { workspace = true } [dev-dependencies] +arrow-arith = { workspace = true } +arrow-array = { workspace = true } +arrow-ord = { workspace = true } +divan = { workspace = true } +rand = { workspace = true } rstest = { workspace = true } vortex-array = { path = "../../vortex-array", features = ["_test-harness"] } + +[[bench]] +name = "decimal_between" +harness = false diff --git a/encodings/decimal-byte-parts/benches/decimal_between.rs b/encodings/decimal-byte-parts/benches/decimal_between.rs new file mode 100644 index 00000000000..f1efb39bbc2 --- /dev/null +++ b/encodings/decimal-byte-parts/benches/decimal_between.rs @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compares `between` (`lower <= x <= upper`) over decimal data for: +//! * Vortex `DecimalByteParts` with an i32 most-significant-part (pushdown kernel), +//! * Vortex `DecimalByteParts` with an i64 most-significant-part (pushdown kernel), +//! * Vortex canonical `DecimalArray` (i128 storage), +//! * arrow-rs `Decimal128Array` via `gt_eq` + `lt_eq` + `and`. +//! +//! arrow-rs has no decimal storage narrower than 128 bits, so logically-small decimals that +//! Vortex keeps in an i32/i64 MSP must be materialised as i128 in Arrow. This benchmark +//! measures the resulting throughput difference. + +use arrow_arith::boolean::and; +use arrow_array::Decimal128Array; +use arrow_array::Scalar as ArrowScalar; +use arrow_ord::cmp; +use divan::Bencher; +use divan::black_box; +use rand::RngExt; +use rand::SeedableRng; +use rand::rngs::StdRng; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DecimalDType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar::DecimalValue; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::between::BetweenOptions; +use vortex_array::scalar_fn::fns::between::StrictComparison; +use vortex_array::validity::Validity; +use vortex_decimal_byte_parts::DecimalByteParts; + +fn main() { + divan::main(); +} + +const LENGTHS: &[usize] = &[1 << 16, 1 << 20]; + +// Logical decimal range [0, 1000), precision 9 scale 2 (fits i32) and precision 18 (fits i64). +const LOWER: i64 = 250; +const UPPER: i64 = 750; + +const OPTIONS: BetweenOptions = BetweenOptions { + lower_strict: StrictComparison::NonStrict, + upper_strict: StrictComparison::NonStrict, +}; + +fn values(len: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(0x5eed); + (0..len).map(|_| rng.random_range(0..1000i64)).collect() +} + +// ---- Vortex DecimalByteParts (i32 MSP) ---- + +#[divan::bench(args = LENGTHS)] +fn vortex_byteparts_i32(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(9, 2); + let msp = PrimitiveArray::from_iter(values(len).into_iter().map(|v| v as i32)).into_array(); + let arr = DecimalByteParts::try_new(msp, dt).unwrap().into_array(); + let lower = ConstantArray::new( + Scalar::decimal( + DecimalValue::I32(LOWER as i32), + dt, + Nullability::NonNullable, + ), + len, + ) + .into_array(); + let upper = ConstantArray::new( + Scalar::decimal( + DecimalValue::I32(UPPER as i32), + dt, + Nullability::NonNullable, + ), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + lower.clone(), + upper.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, lower, upper, mut ctx)| { + black_box( + arr.between(lower, upper, OPTIONS) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +// ---- Vortex DecimalByteParts (i64 MSP) ---- + +#[divan::bench(args = LENGTHS)] +fn vortex_byteparts_i64(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(18, 2); + let msp = PrimitiveArray::from_iter(values(len)).into_array(); + let arr = DecimalByteParts::try_new(msp, dt).unwrap().into_array(); + let lower = ConstantArray::new( + Scalar::decimal(DecimalValue::I64(LOWER), dt, Nullability::NonNullable), + len, + ) + .into_array(); + let upper = ConstantArray::new( + Scalar::decimal(DecimalValue::I64(UPPER), dt, Nullability::NonNullable), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + lower.clone(), + upper.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, lower, upper, mut ctx)| { + black_box( + arr.between(lower, upper, OPTIONS) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +// ---- Vortex canonical DecimalArray (i128 storage) ---- + +#[divan::bench(args = LENGTHS)] +fn vortex_canonical_i128(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(9, 2); + let arr = DecimalArray::new( + values(len).into_iter().map(i128::from).collect(), + dt, + Validity::NonNullable, + ) + .into_array(); + let lower = ConstantArray::new( + Scalar::decimal( + DecimalValue::I128(LOWER as i128), + dt, + Nullability::NonNullable, + ), + len, + ) + .into_array(); + let upper = ConstantArray::new( + Scalar::decimal( + DecimalValue::I128(UPPER as i128), + dt, + Nullability::NonNullable, + ), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + lower.clone(), + upper.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, lower, upper, mut ctx)| { + black_box( + arr.between(lower, upper, OPTIONS) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +// ---- arrow-rs Decimal128 (gt_eq + lt_eq + and) ---- + +#[divan::bench(args = LENGTHS)] +fn arrow_decimal128(bencher: Bencher, len: usize) { + let arr = Decimal128Array::from_iter_values(values(len).into_iter().map(i128::from)) + .with_precision_and_scale(9, 2) + .unwrap(); + let lower = ArrowScalar::new( + Decimal128Array::from_iter_values([LOWER as i128]) + .with_precision_and_scale(9, 2) + .unwrap(), + ); + let upper = ArrowScalar::new( + Decimal128Array::from_iter_values([UPPER as i128]) + .with_precision_and_scale(9, 2) + .unwrap(), + ); + + bencher + .with_inputs(|| (arr.clone(), lower.clone(), upper.clone())) + .bench_values(|(arr, lower, upper)| { + let ge = cmp::gt_eq(&arr, &lower).unwrap(); + let le = cmp::lt_eq(&arr, &upper).unwrap(); + black_box(and(&ge, &le).unwrap()) + }); +} From 77a32cd94415e7e9816933ef7515ab9d061542c6 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 14:16:32 +0000 Subject: [PATCH 03/13] Cap decimal between bench sizes so all cases stay under 1ms Drop the 1M-row case (arrow/canonical exceeded 1ms there) and use 64k/128k rows, keeping all four engines under a 1ms-per-op budget while preserving the cross-engine comparison. Signed-off-by: Joe Isaacs --- encodings/decimal-byte-parts/benches/decimal_between.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/encodings/decimal-byte-parts/benches/decimal_between.rs b/encodings/decimal-byte-parts/benches/decimal_between.rs index f1efb39bbc2..5b8efdffd5f 100644 --- a/encodings/decimal-byte-parts/benches/decimal_between.rs +++ b/encodings/decimal-byte-parts/benches/decimal_between.rs @@ -41,7 +41,7 @@ fn main() { divan::main(); } -const LENGTHS: &[usize] = &[1 << 16, 1 << 20]; +const LENGTHS: &[usize] = &[1 << 16, 1 << 17]; // Logical decimal range [0, 1000), precision 9 scale 2 (fits i32) and precision 18 (fits i64). const LOWER: i64 = 250; From 1f2f901f2669006c1e8d594326625d806d403523 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 17:23:41 +0000 Subject: [PATCH 04/13] Add two-limb i128 representation and lexicographic between to DecimalByteParts Implements the previously-scaffolded second limb: a decimal i128 can now be stored as a signed i64 high limb (msp, carrying validity) plus a non-nullable u64 low limb, reconstructed as (msp as i128) << 64 | low. Wires the lower limb through the constructor, validation, serde, canonicalization, scalar_at, and slice/filter/take/mask/cast; compare and is_constant defer to the canonical path for two-limb arrays. Adds a lexicographic two-limb `between` pushdown that compares the limbs with native-width integer ops. Correctness is verified against the canonical i128 implementation across strictness modes and nulls, plus two-limb consistency cases. Benchmark note: the pushdown composes generic array ops (~11 passes with intermediate Bool allocations) and is currently slower than both arrow-rs Decimal128 (~2.7x) and the canonical i128 path on wide values. Beating arrow requires a fused single-pass SIMD kernel rather than composed expressions; the benchmark (decimal_between, wide cases) documents the current state. Signed-off-by: Joe Isaacs --- .../benches/decimal_between.rs | 127 ++++++++++ encodings/decimal-byte-parts/public-api.lock | 8 +- .../src/decimal_byte_parts/compute/between.rs | 199 +++++++++++++++- .../src/decimal_byte_parts/compute/cast.rs | 13 +- .../src/decimal_byte_parts/compute/compare.rs | 5 + .../src/decimal_byte_parts/compute/filter.rs | 22 +- .../decimal_byte_parts/compute/is_constant.rs | 5 + .../src/decimal_byte_parts/compute/mask.rs | 23 +- .../src/decimal_byte_parts/compute/mod.rs | 26 +++ .../src/decimal_byte_parts/compute/take.rs | 22 +- .../src/decimal_byte_parts/mod.rs | 219 ++++++++++++------ .../src/decimal_byte_parts/rules.rs | 5 +- .../src/decimal_byte_parts/slice.rs | 22 +- 13 files changed, 580 insertions(+), 116 deletions(-) diff --git a/encodings/decimal-byte-parts/benches/decimal_between.rs b/encodings/decimal-byte-parts/benches/decimal_between.rs index 5b8efdffd5f..e55f0866c7c 100644 --- a/encodings/decimal-byte-parts/benches/decimal_between.rs +++ b/encodings/decimal-byte-parts/benches/decimal_between.rs @@ -11,6 +11,8 @@ //! Vortex keeps in an i32/i64 MSP must be materialised as i128 in Arrow. This benchmark //! measures the resulting throughput difference. +#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] + use arrow_arith::boolean::and; use arrow_array::Decimal128Array; use arrow_array::Scalar as ArrowScalar; @@ -214,3 +216,128 @@ fn arrow_decimal128(bencher: Bencher, len: usize) { black_box(and(&ge, &le).unwrap()) }); } + +// ---- Wide i128 decimals: two-limb vs canonical i128 vs arrow Decimal128 ---- +// +// These values genuinely occupy the i128 range (the high 64-bit limb varies), so neither Vortex +// nor arrow can keep them in a narrow integer. The two-limb representation splits each value into +// a signed i64 high limb and an unsigned u64 low limb, letting `between` run as native-width +// (SIMD-friendly) integer comparisons instead of arrow's 128-bit comparison. + +fn wide_values(len: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(0x5eed); + (0..len) + .map(|_| { + let high = i128::from(rng.random_range(0..1000i64)); + let low = i128::from(rng.random_range(0..u64::MAX)); + (high << 64) | low + }) + .collect() +} + +// Bounds with non-zero low limbs so the low-limb tie-break is exercised at the high-limb edges. +const WIDE_LOWER: i128 = (250i128 << 64) | 0x1234_5678; +const WIDE_UPPER: i128 = (750i128 << 64) | 0x90ab_cdef; + +#[divan::bench(args = LENGTHS)] +fn vortex_byteparts_twolimb(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(38, 2); + let values = wide_values(len); + let highs = PrimitiveArray::from_iter(values.iter().map(|v| (v >> 64) as i64)).into_array(); + let lows = PrimitiveArray::from_iter(values.iter().map(|v| *v as u64)).into_array(); + let arr = DecimalByteParts::try_new_with_lower(highs, lows, dt) + .unwrap() + .into_array(); + let lower = ConstantArray::new( + Scalar::decimal(DecimalValue::I128(WIDE_LOWER), dt, Nullability::NonNullable), + len, + ) + .into_array(); + let upper = ConstantArray::new( + Scalar::decimal(DecimalValue::I128(WIDE_UPPER), dt, Nullability::NonNullable), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + lower.clone(), + upper.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, lower, upper, mut ctx)| { + black_box( + arr.between(lower, upper, OPTIONS) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +#[divan::bench(args = LENGTHS)] +fn vortex_canonical_i128_wide(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(38, 2); + let arr = DecimalArray::new( + wide_values(len).into_iter().collect(), + dt, + Validity::NonNullable, + ) + .into_array(); + let lower = ConstantArray::new( + Scalar::decimal(DecimalValue::I128(WIDE_LOWER), dt, Nullability::NonNullable), + len, + ) + .into_array(); + let upper = ConstantArray::new( + Scalar::decimal(DecimalValue::I128(WIDE_UPPER), dt, Nullability::NonNullable), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + lower.clone(), + upper.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, lower, upper, mut ctx)| { + black_box( + arr.between(lower, upper, OPTIONS) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +#[divan::bench(args = LENGTHS)] +fn arrow_decimal128_wide(bencher: Bencher, len: usize) { + let arr = Decimal128Array::from_iter_values(wide_values(len)) + .with_precision_and_scale(38, 2) + .unwrap(); + let lower = ArrowScalar::new( + Decimal128Array::from_iter_values([WIDE_LOWER]) + .with_precision_and_scale(38, 2) + .unwrap(), + ); + let upper = ArrowScalar::new( + Decimal128Array::from_iter_values([WIDE_UPPER]) + .with_precision_and_scale(38, 2) + .unwrap(), + ); + + bencher + .with_inputs(|| (arr.clone(), lower.clone(), upper.clone())) + .bench_values(|(arr, lower, upper)| { + let ge = cmp::gt_eq(&arr, &lower).unwrap(); + let le = cmp::lt_eq(&arr, &upper).unwrap(); + black_box(and(&ge, &le).unwrap()) + }); +} diff --git a/encodings/decimal-byte-parts/public-api.lock b/encodings/decimal-byte-parts/public-api.lock index 372706034dc..3fdd0ea6013 100644 --- a/encodings/decimal-byte-parts/public-api.lock +++ b/encodings/decimal-byte-parts/public-api.lock @@ -6,6 +6,8 @@ impl vortex_decimal_byte_parts::DecimalByteParts pub fn vortex_decimal_byte_parts::DecimalByteParts::try_new(vortex_array::array::erased::ArrayRef, vortex_array::dtype::decimal::DecimalDType) -> vortex_error::VortexResult +pub fn vortex_decimal_byte_parts::DecimalByteParts::try_new_with_lower(vortex_array::array::erased::ArrayRef, vortex_array::array::erased::ArrayRef, vortex_array::dtype::decimal::DecimalDType) -> vortex_error::VortexResult + impl core::clone::Clone for vortex_decimal_byte_parts::DecimalByteParts pub fn vortex_decimal_byte_parts::DecimalByteParts::clone(&self) -> vortex_decimal_byte_parts::DecimalByteParts @@ -84,7 +86,7 @@ pub struct vortex_decimal_byte_parts::DecimalBytePartsData impl vortex_decimal_byte_parts::DecimalBytePartsData -pub fn vortex_decimal_byte_parts::DecimalBytePartsData::validate(&vortex_array::array::erased::ArrayRef, vortex_array::dtype::decimal::DecimalDType, &vortex_array::dtype::DType, usize) -> vortex_error::VortexResult<()> +pub fn vortex_decimal_byte_parts::DecimalBytePartsData::validate(&vortex_array::array::erased::ArrayRef, core::option::Option<&vortex_array::array::erased::ArrayRef>, vortex_array::dtype::decimal::DecimalDType, &vortex_array::dtype::DType, usize) -> vortex_error::VortexResult<()> impl core::clone::Clone for vortex_decimal_byte_parts::DecimalBytePartsData @@ -138,10 +140,14 @@ pub fn vortex_decimal_byte_parts::DecimalBytesPartsMetadata::encoded_len(&self) pub trait vortex_decimal_byte_parts::DecimalBytePartsArrayExt: vortex_array::array::typed::TypedArrayRef +pub fn vortex_decimal_byte_parts::DecimalBytePartsArrayExt::lower(&self) -> core::option::Option<&vortex_array::array::erased::ArrayRef> + pub fn vortex_decimal_byte_parts::DecimalBytePartsArrayExt::msp(&self) -> &vortex_array::array::erased::ArrayRef impl> vortex_decimal_byte_parts::DecimalBytePartsArrayExt for T +pub fn T::lower(&self) -> core::option::Option<&vortex_array::array::erased::ArrayRef> + pub fn T::msp(&self) -> &vortex_array::array::erased::ArrayRef pub fn vortex_decimal_byte_parts::initialize(&vortex_session::VortexSession) diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs index 78d4aa47689..421677f3b49 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs @@ -7,14 +7,20 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::ConstantArray; use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; +use vortex_array::scalar::ScalarValue; use vortex_array::scalar_fn::fns::between::BetweenKernel; use vortex_array::scalar_fn::fns::between::BetweenOptions; +use vortex_array::scalar_fn::fns::between::StrictComparison; +use vortex_array::scalar_fn::fns::operators::Operator; use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; +use crate::decimal_byte_parts::LOWER_PTYPE; use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive; impl BetweenKernel for DecimalByteParts { @@ -25,7 +31,7 @@ impl BetweenKernel for DecimalByteParts { options: &BetweenOptions, _ctx: &mut ExecutionCtx, ) -> VortexResult> { - // We can only push the comparison down into the MSP when both bounds are constant. + // We can only push the comparison down into the limbs when both bounds are constant. let (Some(lower_const), Some(upper_const)) = (lower.as_constant(), upper.as_constant()) else { return Ok(None); @@ -44,6 +50,24 @@ impl BetweenKernel for DecimalByteParts { let nullability = arr.dtype().nullability() | lower.dtype().nullability() | upper.dtype().nullability(); + + if arr.lower().is_some() { + // Two-limb representation: a lexicographic comparison over the (signed high, unsigned + // low) limbs. Both bounds must fit in i128 to be split into limbs. + let (Some(lower_i128), Some(upper_i128)) = + (lower_decimal.cast::(), upper_decimal.cast::()) + else { + return Ok(None); + }; + return Ok(Some(two_limb_between( + &arr, + lower_i128, + upper_i128, + options, + nullability, + )?)); + } + let scalar_type = arr.msp().dtype().with_nullability(nullability); let msp_ptype = arr.msp().as_primitive_typed().ptype(); @@ -75,12 +99,93 @@ impl BetweenKernel for DecimalByteParts { } } +/// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation. +/// +/// With high limb `H` (signed i64) and low limb `L` (unsigned u64), the value `v = H<<64 | L` +/// satisfies `v >= lower` iff `H > lo_h OR (H == lo_h AND L >=' lo_l)` and `v <= upper` iff +/// `H < hi_h OR (H == hi_h AND L <=' hi_l)`, where `>='`/`<='` are strict when requested. Each +/// limb comparison is a native-width integer compare that vectorizes far better than a 128-bit +/// comparison. +fn two_limb_between( + arr: &ArrayView<'_, DecimalByteParts>, + lower: i128, + upper: i128, + options: &BetweenOptions, + nullability: Nullability, +) -> VortexResult { + let len = arr.len(); + let high = arr.msp().clone(); + let low = arr + .lower() + .vortex_expect("two-limb path requires a lower limb") + .clone(); + + let high_dtype = high.dtype().with_nullability(nullability); + let low_dtype = DType::Primitive(LOWER_PTYPE, nullability); + + let high_const = |limb: i64| -> VortexResult { + Ok(ConstantArray::new( + Scalar::try_new(high_dtype.clone(), Some(ScalarValue::from(limb)))?, + len, + ) + .into_array()) + }; + let low_const = |limb: u64| -> VortexResult { + Ok(ConstantArray::new( + Scalar::try_new(low_dtype.clone(), Some(ScalarValue::from(limb)))?, + len, + ) + .into_array()) + }; + + let lower_low_op = match options.lower_strict { + StrictComparison::Strict => Operator::Gt, + StrictComparison::NonStrict => Operator::Gte, + }; + let upper_low_op = match options.upper_strict { + StrictComparison::Strict => Operator::Lt, + StrictComparison::NonStrict => Operator::Lte, + }; + + let (lower_high, lower_low) = split_i128(lower); + let (upper_high, upper_low) = split_i128(upper); + + // value >= lower + let ge_lower = { + let h_gt = high.binary(high_const(lower_high)?, Operator::Gt)?; + let h_eq = high.binary(high_const(lower_high)?, Operator::Eq)?; + let l_cmp = low.binary(low_const(lower_low)?, lower_low_op)?; + h_gt.binary(h_eq.binary(l_cmp, Operator::And)?, Operator::Or)? + }; + + // value <= upper + let le_upper = { + let h_lt = high.binary(high_const(upper_high)?, Operator::Lt)?; + let h_eq = high.binary(high_const(upper_high)?, Operator::Eq)?; + let l_cmp = low.binary(low_const(upper_low)?, upper_low_op)?; + h_lt.binary(h_eq.binary(l_cmp, Operator::And)?, Operator::Or)? + }; + + ge_lower.binary(le_upper, Operator::And) +} + +/// Split an i128 into its signed high and unsigned low 64-bit limbs. The truncating casts are the +/// intended limb extraction: `>> 64` keeps the high 64 bits and `as u64` keeps the low 64 bits. +#[allow(clippy::cast_possible_truncation)] +fn split_i128(value: i128) -> (i64, u64) { + ((value >> 64) as i64, value as u64) +} + #[cfg(test)] mod tests { + use rstest::rstest; use vortex_array::ArrayRef; use vortex_array::IntoArray; + use vortex_array::LEGACY_SESSION; + use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; + use vortex_array::arrays::DecimalArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::assert_arrays_eq; use vortex_array::builtins::ArrayBuiltins; @@ -91,6 +196,7 @@ mod tests { use vortex_array::scalar_fn::fns::between::BetweenOptions; use vortex_array::scalar_fn::fns::between::StrictComparison; use vortex_array::validity::Validity; + use vortex_buffer::Buffer; use vortex_buffer::buffer; use vortex_error::VortexResult; @@ -104,6 +210,97 @@ mod tests { .into_array() } + #[allow(clippy::cast_possible_truncation)] + fn two_limb(values: &[i128], decimal_type: DecimalDType) -> ArrayRef { + let highs: Buffer = values.iter().map(|v| (v >> 64) as i64).collect(); + let lows: Buffer = values.iter().map(|v| *v as u64).collect(); + DecimalByteParts::try_new_with_lower( + PrimitiveArray::new(highs, Validity::NonNullable).into_array(), + PrimitiveArray::new(lows, Validity::NonNullable).into_array(), + decimal_type, + ) + .unwrap() + .into_array() + } + + /// The two-limb `between` pushdown must agree with the canonical i128 implementation across + /// values spanning the low-limb wraparound, the high limb, and negatives, for every strictness. + #[rstest] + #[case(StrictComparison::NonStrict, StrictComparison::NonStrict)] + #[case(StrictComparison::Strict, StrictComparison::NonStrict)] + #[case(StrictComparison::NonStrict, StrictComparison::Strict)] + #[case(StrictComparison::Strict, StrictComparison::Strict)] + fn two_limb_between_matches_canonical( + #[case] lower_strict: StrictComparison, + #[case] upper_strict: StrictComparison, + ) -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let decimal_type = DecimalDType::new(38, 0); + let values: Vec = vec![ + 0, + 1, + -1, + i128::from(i64::MAX), + i128::from(i64::MAX) + 1, + (5i128 << 64) | 3, + (5i128 << 64) | 5, + (5i128 << 64) | 9, + (4i128 << 64) | i128::from(u64::MAX), + (6i128 << 64), + -(7i128 << 64) | 11, + ]; + let lower = (5i128 << 64) | 3; + let upper = (5i128 << 64) | 9; + let len = values.len(); + let options = BetweenOptions { + lower_strict, + upper_strict, + }; + + let lower_arr = decimal_const(DecimalValue::I128(lower), decimal_type, len); + let upper_arr = decimal_const(DecimalValue::I128(upper), decimal_type, len); + + let got = two_limb(&values, decimal_type) + .between(lower_arr.clone(), upper_arr.clone(), options.clone())? + .execute::(&mut ctx)?; + + let canonical = DecimalArray::new( + values.iter().copied().collect::>(), + decimal_type, + Validity::NonNullable, + ) + .into_array(); + let want = canonical + .between(lower_arr, upper_arr, options)? + .execute::(&mut ctx)?; + + assert_arrays_eq!(got, want); + Ok(()) + } + + /// A two-limb array must canonicalize to the same values as a canonical i128 `DecimalArray`. + #[test] + fn two_limb_canonicalizes_to_i128() -> VortexResult<()> { + let mut ctx = LEGACY_SESSION.create_execution_ctx(); + let decimal_type = DecimalDType::new(38, 0); + let values: Vec = vec![ + 0, + -1, + i128::from(i64::MIN), + (3i128 << 64) | 42, + -(9i128 << 64) | 17, + ]; + + let got = two_limb(&values, decimal_type).execute::(&mut ctx)?; + let want = DecimalArray::new( + values.iter().copied().collect::>(), + decimal_type, + Validity::NonNullable, + ); + assert_arrays_eq!(got.into_array(), want.into_array()); + Ok(()) + } + #[test] fn between_decimal_const() -> VortexResult<()> { let decimal_type = DecimalDType::new(8, 2); diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/cast.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/cast.rs index 882d07bbaef..d0238f956d7 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/cast.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/cast.rs @@ -24,14 +24,19 @@ impl CastReduce for DecimalByteParts { return Ok(None); }; - // Cast the msp array to handle nullability change + // Cast the msp array to handle nullability change. Validity lives on the msp, so the lower + // limb (always non-nullable) is unchanged. let new_msp = array .msp() .cast(array.msp().dtype().with_nullability(*target_nullability))?; - Ok(Some( - DecimalByteParts::try_new(new_msp, *target_decimal)?.into_array(), - )) + let casted = match array.lower() { + None => DecimalByteParts::try_new(new_msp, *target_decimal)?, + Some(lower) => { + DecimalByteParts::try_new_with_lower(new_msp, lower.clone(), *target_decimal)? + } + }; + Ok(Some(casted.into_array())) } } diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs index 041fcd864a1..58a1135702e 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs @@ -39,6 +39,11 @@ impl CompareKernel for DecimalByteParts { return Ok(None); }; + // The two-limb representation only specializes `between`; defer comparisons to canonical. + if lhs.lower().is_some() { + return Ok(None); + } + let nullability = lhs.dtype().nullability() | rhs.dtype().nullability(); let scalar_type = lhs.msp().dtype().with_nullability(nullability); diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/filter.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/filter.rs index c82171b0e13..91a836319d7 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/filter.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/filter.rs @@ -13,14 +13,20 @@ use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; impl FilterReduce for DecimalByteParts { fn filter(array: ArrayView<'_, Self>, mask: &Mask) -> VortexResult> { - DecimalByteParts::try_new( - array.msp().filter(mask.clone())?, - *array - .dtype() - .as_decimal_opt() - .vortex_expect("must be a decimal dtype"), - ) - .map(|d| Some(d.into_array())) + let decimal_dtype = *array + .dtype() + .as_decimal_opt() + .vortex_expect("must be a decimal dtype"); + let msp = array.msp().filter(mask.clone())?; + let filtered = match array.lower() { + None => DecimalByteParts::try_new(msp, decimal_dtype)?, + Some(lower) => DecimalByteParts::try_new_with_lower( + msp, + lower.filter(mask.clone())?, + decimal_dtype, + )?, + }; + Ok(Some(filtered.into_array())) } } diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/is_constant.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/is_constant.rs index f4da528a9b3..ccf28594691 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/is_constant.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/is_constant.rs @@ -34,6 +34,11 @@ impl DynAggregateKernel for DecimalBytePartsIsConstantKernel { return Ok(None); }; + // Constness of a two-limb array depends on both limbs; defer to the canonical path. + if array.lower().is_some() { + return Ok(None); + } + let result = is_constant(array.msp(), ctx)?; Ok(Some(IsConstant::make_partial(batch, result, ctx)?)) } diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mask.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mask.rs index 0f007519894..efbdd02775b 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mask.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mask.rs @@ -16,20 +16,23 @@ use crate::decimal_byte_parts::DecimalBytePartsArrayExt; impl MaskReduce for DecimalByteParts { fn mask(array: ArrayView<'_, Self>, mask: &ArrayRef) -> VortexResult> { + let decimal_dtype = *array + .dtype() + .as_decimal_opt() + .vortex_expect("must be a decimal dtype"); + // Validity is carried solely by the msp, so masking the msp is sufficient; the lower + // limb's values at masked positions become unobservable. let masked_msp = MaskExpr.try_new_array( array.msp().len(), EmptyOptions, [array.msp().clone(), mask.clone()], )?; - Ok(Some( - DecimalByteParts::try_new( - masked_msp, - *array - .dtype() - .as_decimal_opt() - .vortex_expect("must be a decimal dtype"), - )? - .into_array(), - )) + let masked = match array.lower() { + None => DecimalByteParts::try_new(masked_msp, decimal_dtype)?, + Some(lower) => { + DecimalByteParts::try_new_with_lower(masked_msp, lower.clone(), decimal_dtype)? + } + }; + Ok(Some(masked.into_array())) } } diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs index 8434b4f7dfb..fca746b7eb5 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs @@ -14,14 +14,29 @@ mod take; mod tests { use rstest::rstest; use vortex_array::IntoArray; + use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; use vortex_array::compute::conformance::consistency::test_array_consistency; use vortex_array::dtype::DecimalDType; + use vortex_array::validity::Validity; + use vortex_buffer::Buffer; use vortex_buffer::buffer; use crate::DecimalByteParts; use crate::DecimalBytePartsArray; + #[allow(clippy::cast_possible_truncation)] + fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> DecimalBytePartsArray { + let highs: Buffer = values.iter().map(|v| (v >> 64) as i64).collect(); + let lows: Buffer = values.iter().map(|v| *v as u64).collect(); + DecimalByteParts::try_new_with_lower( + PrimitiveArray::new(highs, validity).into_array(), + PrimitiveArray::new(lows, Validity::NonNullable).into_array(), + dt, + ) + .unwrap() + } + #[rstest] // Basic decimal byte parts arrays #[case::decimal_i32(DecimalByteParts::try_new( @@ -68,6 +83,17 @@ mod tests { PrimitiveArray::from_iter((0..2000i64).map(|i| i * 1000000)).into_array(), DecimalDType::new(19, 6) ).unwrap())] + // Two-limb i128 representation + #[case::two_limb(two_limb( + &[0, -1, (3i128 << 64) | 42, -(9i128 << 64) | 17, i128::from(i64::MIN), (7i128 << 64)], + Validity::NonNullable, + DecimalDType::new(38, 2), + ))] + #[case::two_limb_nullable(two_limb( + &[0, -1, (3i128 << 64) | 42, -(9i128 << 64) | 17, 5], + Validity::Array(BoolArray::from_iter([true, false, true, true, false]).into_array()), + DecimalDType::new(38, 2), + ))] fn test_decimal_byte_parts_consistency(#[case] array: DecimalBytePartsArray) { test_array_consistency(&array.into_array()); diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/take.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/take.rs index 807b660ec73..807bc48e847 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/take.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/take.rs @@ -18,13 +18,19 @@ impl TakeExecute for DecimalByteParts { indices: &ArrayRef, _ctx: &mut ExecutionCtx, ) -> VortexResult> { - DecimalByteParts::try_new( - array.msp().take(indices.clone())?, - *array - .dtype() - .as_decimal_opt() - .vortex_expect("must be a decimal dtype"), - ) - .map(|a| Some(a.into_array())) + let decimal_dtype = *array + .dtype() + .as_decimal_opt() + .vortex_expect("must be a decimal dtype"); + let msp = array.msp().take(indices.clone())?; + let taken = match array.lower() { + None => DecimalByteParts::try_new(msp, decimal_dtype)?, + Some(lower) => DecimalByteParts::try_new_with_lower( + msp, + lower.take(indices.clone())?, + decimal_dtype, + )?, + }; + Ok(Some(taken.into_array())) } } diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/mod.rs index 76349f40910..f2f8eeb7a0f 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/mod.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/mod.rs @@ -27,6 +27,7 @@ use vortex_array::arrays::PrimitiveArray; use vortex_array::buffer::BufferHandle; use vortex_array::dtype::DType; use vortex_array::dtype::DecimalDType; +use vortex_array::dtype::Nullability; use vortex_array::dtype::PType; use vortex_array::match_each_signed_integer_ptype; use vortex_array::scalar::DecimalValue; @@ -38,6 +39,7 @@ use vortex_array::vtable::OperationsVTable; use vortex_array::vtable::VTable; use vortex_array::vtable::ValidityChild; use vortex_array::vtable::ValidityVTableFromChild; +use vortex_buffer::Buffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use vortex_error::vortex_bail; @@ -94,7 +96,8 @@ impl VTable for DecimalByteParts { let msp = slots[MSP_SLOT] .as_ref() .vortex_expect("DecimalBytePartsArray msp slot"); - DecimalBytePartsData::validate(msp, *decimal_dtype, dtype, len) + let lower = slots.get(LOWER_SLOT).and_then(Option::as_ref); + DecimalBytePartsData::validate(msp, lower, *decimal_dtype, dtype, len) } fn nbuffers(_array: ArrayView<'_, Self>) -> usize { @@ -116,7 +119,7 @@ impl VTable for DecimalByteParts { Ok(Some( DecimalBytesPartsMetadata { zeroth_child_ptype: PType::try_from(array.msp().dtype())? as i32, - lower_part_count: 0, + lower_part_count: u32::from(array.lower().is_some()), } .encode_to_vec(), )) @@ -140,14 +143,22 @@ impl VTable for DecimalByteParts { let msp = children.get(0, &encoded_dtype, len)?; - assert_eq!( - metadata.lower_part_count, 0, - "lower_part_count > 0 not currently supported" - ); + let slots = match metadata.lower_part_count { + 0 => smallvec![Some(msp.clone())], + 1 => { + let lower_dtype = DType::Primitive(LOWER_PTYPE, Nullability::NonNullable); + let lower = children.get(1, &lower_dtype, len)?; + smallvec![Some(msp.clone()), Some(lower)] + } + n => vortex_bail!("decimal byte parts supports at most one lower limb, got {n}"), + }; - let slots = smallvec![Some(msp.clone())]; - let data = DecimalBytePartsData::try_new(msp.dtype(), msp.len(), *decimal_dtype)?; - Ok(ArrayParts::new(self.clone(), dtype.clone(), len, data).with_slots(slots)) + let lower = slots.as_slice().get(LOWER_SLOT).and_then(Option::as_ref); + DecimalBytePartsData::validate(&msp, lower, *decimal_dtype, dtype, len)?; + Ok( + ArrayParts::new(self.clone(), dtype.clone(), len, DecimalBytePartsData) + .with_slots(slots), + ) } fn slot_name(_array: ArrayView<'_, Self>, idx: usize) -> String { @@ -176,23 +187,29 @@ impl VTable for DecimalByteParts { } } -/// The most significant parts of the decimal values. +/// The most significant part (high limb) of the decimal values. pub(super) const MSP_SLOT: usize = 0; -pub(super) const NUM_SLOTS: usize = 1; -pub(super) const SLOT_NAMES: [&str; NUM_SLOTS] = ["msp"]; - -/// This array encodes decimals as between 1-4 columns of primitive typed children. -/// The most significant part (msp) sorting the most significant decimal bits. -/// This array must be signed and is nullable iff the decimal is nullable. +/// The single lower limb, present only for the two-limb i128 representation. +pub(super) const LOWER_SLOT: usize = 1; +/// The maximum number of children an array of this encoding can hold. +pub(super) const MAX_SLOTS: usize = 2; +pub(super) const SLOT_NAMES: [&str; MAX_SLOTS] = ["msp", "lower"]; + +/// The physical type of the lower limb in the two-limb representation. +pub(super) const LOWER_PTYPE: PType = PType::U64; + +/// This array encodes decimals as 1 or 2 columns of primitive typed children. +/// The most significant part (msp) stores the most significant decimal bits and is always a +/// signed integer, nullable iff the decimal is nullable. +/// +/// With a single child the decimal value is exactly the (sign-extended) msp. With two children the +/// value is reconstructed as `(msp as i128) << 64 | (lower as u64 as i128)`, i.e. the msp is the +/// signed high 64-bit limb and `lower` is the unsigned low 64-bit limb. The lower limb is always a +/// non-nullable `u64`; validity is carried solely by the msp. /// -/// e.g. for a decimal i128 \[ 127..64 | 64..0 \] msp = 127..64 and lower_part\[0\] = 64..0 +/// e.g. for a decimal i128 \[ 127..64 | 64..0 \] msp = 127..64 and lower = 64..0 #[derive(Clone, Debug)] -pub struct DecimalBytePartsData { - // NOTE: the lower_parts is currently unused, we reserve this field so that it is properly - // read/written during serde, but provide no constructor to initialize this to anything - // other than the empty Vec. - _lower_parts: Vec, -} +pub struct DecimalBytePartsData; impl Display for DecimalBytePartsData { fn fmt(&self, _f: &mut Formatter<'_>) -> std::fmt::Result { @@ -210,6 +227,14 @@ pub trait DecimalBytePartsArrayExt: TypedArrayRef { .as_ref() .vortex_expect("DecimalBytePartsArray msp slot") } + + /// The lower (low 64-bit) limb, present only for the two-limb i128 representation. + fn lower(&self) -> Option<&ArrayRef> { + self.as_ref() + .slots() + .get(LOWER_SLOT) + .and_then(Option::as_ref) + } } impl> DecimalBytePartsArrayExt for T {} @@ -217,6 +242,7 @@ impl> DecimalBytePartsArrayExt for T {} impl DecimalBytePartsData { pub fn validate( msp: &ArrayRef, + lower: Option<&ArrayRef>, decimal_dtype: DecimalDType, dtype: &DType, len: usize, @@ -231,24 +257,25 @@ impl DecimalBytePartsData { "expected dtype {expected_dtype}, got {dtype}" ); vortex_ensure!(msp.len() == len, "expected len {len}, got {}", msp.len()); - Ok(()) - } - pub(crate) fn try_new( - msp_dtype: &DType, - msp_len: usize, - decimal_dtype: DecimalDType, - ) -> VortexResult { - let expected_dtype = DType::Decimal(decimal_dtype, msp_dtype.nullability()); - vortex_ensure!( - msp_dtype.is_signed_int(), - "decimal bytes parts, first part must be a signed array" - ); - let _ = msp_len; - drop(expected_dtype); - Ok(Self { - _lower_parts: Vec::new(), - }) + if let Some(lower) = lower { + vortex_ensure!( + matches!(msp.dtype(), DType::Primitive(PType::I64, _)), + "two-limb decimal byte parts requires an i64 high limb, got {}", + msp.dtype() + ); + vortex_ensure!( + lower.dtype() == &DType::Primitive(LOWER_PTYPE, Nullability::NonNullable), + "decimal byte parts lower limb must be a non-nullable {LOWER_PTYPE}, got {}", + lower.dtype() + ); + vortex_ensure!( + lower.len() == len, + "lower limb length {} != array length {len}", + lower.len() + ); + } + Ok(()) } } @@ -256,18 +283,41 @@ impl DecimalBytePartsData { pub struct DecimalByteParts; impl DecimalByteParts { - /// Construct a new [`DecimalBytePartsArray`] from an MSP array and decimal dtype. + /// Construct a new single-limb [`DecimalBytePartsArray`] from an MSP array and decimal dtype. pub fn try_new( msp: ArrayRef, decimal_dtype: DecimalDType, ) -> VortexResult { let len = msp.len(); let dtype = DType::Decimal(decimal_dtype, msp.dtype().nullability()); - let slots = smallvec![Some(msp.clone())]; - let data = DecimalBytePartsData::try_new(msp.dtype(), msp.len(), decimal_dtype)?; + DecimalBytePartsData::validate(&msp, None, decimal_dtype, &dtype, len)?; + let slots = smallvec![Some(msp)]; Ok(unsafe { Array::from_parts_unchecked( - ArrayParts::new(DecimalByteParts, dtype, len, data).with_slots(slots), + ArrayParts::new(DecimalByteParts, dtype, len, DecimalBytePartsData) + .with_slots(slots), + ) + }) + } + + /// Construct a two-limb [`DecimalBytePartsArray`] representing an i128 decimal. + /// + /// `msp` is the signed high 64-bit limb (carrying validity); `lower` is the non-nullable + /// unsigned low 64-bit limb. The decimal value at index `i` is + /// `(msp[i] as i128) << 64 | (lower[i] as u64 as i128)`. + pub fn try_new_with_lower( + msp: ArrayRef, + lower: ArrayRef, + decimal_dtype: DecimalDType, + ) -> VortexResult { + let len = msp.len(); + let dtype = DType::Decimal(decimal_dtype, msp.dtype().nullability()); + DecimalBytePartsData::validate(&msp, Some(&lower), decimal_dtype, &dtype, len)?; + let slots = smallvec![Some(msp), Some(lower)]; + Ok(unsafe { + Array::from_parts_unchecked( + ArrayParts::new(DecimalByteParts, dtype, len, DecimalBytePartsData) + .with_slots(slots), ) }) } @@ -278,26 +328,38 @@ fn to_canonical_decimal( array: &DecimalBytePartsArray, ctx: &mut ExecutionCtx, ) -> VortexResult { - // TODO(joe): support parts len != 1 - let prim = array.msp().clone().execute::(ctx)?; - // Depending on the decimal type and the min/max of the primitive array we can choose - // the correct buffer size - - Ok(match_each_signed_integer_ptype!(prim.ptype(), |P| { - // SAFETY: The primitive array's buffer is already validated with correct type. - // The decimal dtype matches the array's dtype, and validity is preserved. - unsafe { - DecimalArray::new_unchecked( - prim.to_buffer::

(), - *array - .dtype() - .as_decimal_opt() - .vortex_expect("must be a decimal dtype"), - prim.validity()?, - ) - } - .into_array() - })) + let decimal_dtype = *array + .dtype() + .as_decimal_opt() + .vortex_expect("must be a decimal dtype"); + let msp = array.msp().clone().execute::(ctx)?; + + let Some(lower) = array.lower() else { + // Single-limb: the decimal is exactly the sign-extended msp. + return Ok(match_each_signed_integer_ptype!(msp.ptype(), |P| { + // SAFETY: The primitive array's buffer is already validated with correct type. + // The decimal dtype matches the array's dtype, and validity is preserved. + unsafe { + DecimalArray::new_unchecked(msp.to_buffer::

(), decimal_dtype, msp.validity()?) + } + .into_array() + })); + }; + + // Two-limb: reconstruct each i128 as `(high as i128) << 64 | low`. + let lower = lower.clone().execute::(ctx)?; + let validity = msp.validity()?; + let low = lower.as_slice::(); + let values: Buffer = match_each_signed_integer_ptype!(msp.ptype(), |P| { + msp.as_slice::

() + .iter() + .zip(low) + .map(|(&high, &low)| ((high as i128) << 64) | i128::from(low)) + .collect() + }); + + // SAFETY: validity comes from the msp and the reconstructed values match the decimal dtype. + Ok(unsafe { DecimalArray::new_unchecked(values, decimal_dtype, validity) }.into_array()) } impl OperationsVTable for DecimalByteParts { @@ -306,16 +368,29 @@ impl OperationsVTable for DecimalByteParts { index: usize, ctx: &mut ExecutionCtx, ) -> VortexResult { - // TODO(joe): support parts len != 1 - let scalar = array.msp().execute_scalar(index, ctx)?; - - // Note. values in msp, can only be signed integers upto size i64. - let primitive_scalar = scalar.as_primitive(); - // TODO(joe): extend this to support multiple parts. - let value = primitive_scalar.as_::().vortex_expect("non-null"); + // Null indices are short-circuited by the validity child (the msp), so the msp scalar + // here is always non-null. + let high = array + .msp() + .execute_scalar(index, ctx)? + .as_primitive() + .as_::() + .vortex_expect("non-null"); + + let decimal_value = match array.lower() { + None => DecimalValue::I64(high), + Some(lower) => { + let low = lower + .execute_scalar(index, ctx)? + .as_primitive() + .as_::() + .vortex_expect("lower limb is non-nullable"); + DecimalValue::I128((i128::from(high) << 64) | i128::from(low)) + } + }; Scalar::try_new( array.dtype().clone(), - Some(ScalarValue::Decimal(DecimalValue::I64(value))), + Some(ScalarValue::Decimal(decimal_value)), ) } } diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs index 7fafa0d5289..3e9c74dfef8 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/rules.rs @@ -38,8 +38,9 @@ impl ArrayParentReduceRule for DecimalBytePartsFilterPushDownR _child_idx: usize, ) -> VortexResult> { // TODO(ngates): we should benchmark whether to push-down filters with "lower parts". - // For now, we only push down if there are no lower parts. - if !child._lower_parts.is_empty() { + // For now, we only push down if there are no lower parts; two-limb arrays fall back to + // the generic `FilterReduce` implementation which filters both limbs. + if child.lower().is_some() { return Ok(None); } diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/slice.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/slice.rs index a9c022d2051..530ece55892 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/slice.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/slice.rs @@ -15,15 +15,17 @@ use crate::decimal_byte_parts::DecimalBytePartsArrayExt; impl SliceReduce for DecimalByteParts { fn slice(array: ArrayView<'_, Self>, range: Range) -> VortexResult> { - Ok(Some( - DecimalByteParts::try_new( - array.msp().slice(range)?, - *array - .dtype() - .as_decimal_opt() - .vortex_expect("must be a decimal dtype"), - )? - .into_array(), - )) + let decimal_dtype = *array + .dtype() + .as_decimal_opt() + .vortex_expect("must be a decimal dtype"); + let msp = array.msp().slice(range.clone())?; + let sliced = match array.lower() { + None => DecimalByteParts::try_new(msp, decimal_dtype)?, + Some(lower) => { + DecimalByteParts::try_new_with_lower(msp, lower.slice(range)?, decimal_dtype)? + } + }; + Ok(Some(sliced.into_array())) } } From 903b11653771089d9c0cbb3d55f5c9ba40570631 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 17:31:13 +0000 Subject: [PATCH 05/13] Replace two-limb between with a fused single-pass kernel The previous two-limb `between` composed ~11 generic array ops, each a full pass allocating an intermediate Bool array, which was slower than both arrow and the canonical i128 path. This replaces it with a single fused loop that materializes the two limbs once and computes the lexicographic comparison per row with native-width (i64/u64) integer ops via BitBuffer::collect_bool, using branch-free bitwise combines so the body vectorizes. On wide i128 values this cuts two-limb `between` from ~605us to ~267us at 131k rows, making it ~1.8x faster than the canonical i128 path. Correctness is unchanged and still verified against the canonical implementation. Signed-off-by: Joe Isaacs --- .../src/decimal_byte_parts/compute/between.rs | 120 ++++++++++-------- 1 file changed, 69 insertions(+), 51 deletions(-) diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs index 421677f3b49..f326f937bf1 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs @@ -5,22 +5,21 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::PrimitiveArray; use vortex_array::builtins::ArrayBuiltins; -use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; -use vortex_array::scalar::ScalarValue; use vortex_array::scalar_fn::fns::between::BetweenKernel; use vortex_array::scalar_fn::fns::between::BetweenOptions; use vortex_array::scalar_fn::fns::between::StrictComparison; -use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_buffer::BitBuffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; -use crate::decimal_byte_parts::LOWER_PTYPE; use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive; impl BetweenKernel for DecimalByteParts { @@ -29,7 +28,7 @@ impl BetweenKernel for DecimalByteParts { lower: &ArrayRef, upper: &ArrayRef, options: &BetweenOptions, - _ctx: &mut ExecutionCtx, + ctx: &mut ExecutionCtx, ) -> VortexResult> { // We can only push the comparison down into the limbs when both bounds are constant. let (Some(lower_const), Some(upper_const)) = (lower.as_constant(), upper.as_constant()) @@ -65,6 +64,7 @@ impl BetweenKernel for DecimalByteParts { upper_i128, options, nullability, + ctx, )?)); } @@ -99,74 +99,92 @@ impl BetweenKernel for DecimalByteParts { } } -/// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation. +/// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation in a +/// single fused pass. /// /// With high limb `H` (signed i64) and low limb `L` (unsigned u64), the value `v = H<<64 | L` /// satisfies `v >= lower` iff `H > lo_h OR (H == lo_h AND L >=' lo_l)` and `v <= upper` iff /// `H < hi_h OR (H == hi_h AND L <=' hi_l)`, where `>='`/`<='` are strict when requested. Each -/// limb comparison is a native-width integer compare that vectorizes far better than a 128-bit -/// comparison. +/// row resolves to native-width integer comparisons in one loop, which vectorizes far better than +/// arrow's 128-bit comparison or a tree of generic array operations with intermediate allocations. fn two_limb_between( arr: &ArrayView<'_, DecimalByteParts>, lower: i128, upper: i128, options: &BetweenOptions, nullability: Nullability, + ctx: &mut ExecutionCtx, ) -> VortexResult { - let len = arr.len(); - let high = arr.msp().clone(); + let high = arr.msp().clone().execute::(ctx)?; let low = arr .lower() .vortex_expect("two-limb path requires a lower limb") - .clone(); + .clone() + .execute::(ctx)?; - let high_dtype = high.dtype().with_nullability(nullability); - let low_dtype = DType::Primitive(LOWER_PTYPE, nullability); - - let high_const = |limb: i64| -> VortexResult { - Ok(ConstantArray::new( - Scalar::try_new(high_dtype.clone(), Some(ScalarValue::from(limb)))?, - len, - ) - .into_array()) - }; - let low_const = |limb: u64| -> VortexResult { - Ok(ConstantArray::new( - Scalar::try_new(low_dtype.clone(), Some(ScalarValue::from(limb)))?, - len, - ) - .into_array()) - }; - - let lower_low_op = match options.lower_strict { - StrictComparison::Strict => Operator::Gt, - StrictComparison::NonStrict => Operator::Gte, - }; - let upper_low_op = match options.upper_strict { - StrictComparison::Strict => Operator::Lt, - StrictComparison::NonStrict => Operator::Lte, - }; + let validity = high.validity()?.union_nullability(nullability); + let high = high.as_slice::(); + let low = low.as_slice::(); + assert_eq!(high.len(), low.len(), "limb lengths must match"); let (lower_high, lower_low) = split_i128(lower); let (upper_high, upper_low) = split_i128(upper); - // value >= lower - let ge_lower = { - let h_gt = high.binary(high_const(lower_high)?, Operator::Gt)?; - let h_eq = high.binary(high_const(lower_high)?, Operator::Eq)?; - let l_cmp = low.binary(low_const(lower_low)?, lower_low_op)?; - h_gt.binary(h_eq.binary(l_cmp, Operator::And)?, Operator::Or)? - }; + let lower_limbs = (lower_high, lower_low); + let upper_limbs = (upper_high, upper_low); - // value <= upper - let le_upper = { - let h_lt = high.binary(high_const(upper_high)?, Operator::Lt)?; - let h_eq = high.binary(high_const(upper_high)?, Operator::Eq)?; - let l_cmp = low.binary(low_const(upper_low)?, upper_low_op)?; - h_lt.binary(h_eq.binary(l_cmp, Operator::And)?, Operator::Or)? + // Pass the low-limb comparison as a monomorphized fn so the whole per-element body inlines. + let bits = match (options.lower_strict, options.upper_strict) { + (StrictComparison::Strict, StrictComparison::Strict) => { + collect_two_limb(high, low, lower_limbs, u64_gt, upper_limbs, u64_lt) + } + (StrictComparison::Strict, StrictComparison::NonStrict) => { + collect_two_limb(high, low, lower_limbs, u64_gt, upper_limbs, u64_le) + } + (StrictComparison::NonStrict, StrictComparison::Strict) => { + collect_two_limb(high, low, lower_limbs, u64_ge, upper_limbs, u64_lt) + } + (StrictComparison::NonStrict, StrictComparison::NonStrict) => { + collect_two_limb(high, low, lower_limbs, u64_ge, upper_limbs, u64_le) + } }; - ge_lower.binary(le_upper, Operator::And) + Ok(BoolArray::new(bits, validity).into_array()) +} + +/// The fused per-element loop. `lower_low_cmp`/`upper_low_cmp` apply the (possibly strict) low-limb +/// comparison. Bitwise (non-short-circuiting) `&`/`|` keep the body branch-free for the vectorizer. +fn collect_two_limb( + high: &[i64], + low: &[u64], + lower: (i64, u64), + lower_low_cmp: impl Fn(u64, u64) -> bool, + upper: (i64, u64), + upper_low_cmp: impl Fn(u64, u64) -> bool, +) -> BitBuffer { + let (lower_high, lower_low) = lower; + let (upper_high, upper_low) = upper; + BitBuffer::collect_bool(high.len(), |idx| { + // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len(). + let h = unsafe { *high.get_unchecked(idx) }; + let l = unsafe { *low.get_unchecked(idx) }; + let ge_lower = (h > lower_high) | ((h == lower_high) & lower_low_cmp(l, lower_low)); + let le_upper = (h < upper_high) | ((h == upper_high) & upper_low_cmp(l, upper_low)); + ge_lower & le_upper + }) +} + +fn u64_ge(a: u64, b: u64) -> bool { + a >= b +} +fn u64_gt(a: u64, b: u64) -> bool { + a > b +} +fn u64_le(a: u64, b: u64) -> bool { + a <= b +} +fn u64_lt(a: u64, b: u64) -> bool { + a < b } /// Split an i128 into its signed high and unsigned low 64-bit limbs. The truncating casts are the From df2d58e7f706c0471b01114097ef76ac01124805 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 17:48:16 +0000 Subject: [PATCH 06/13] Beat arrow on two-limb between by reconstructing i128 in the fused pass The hand-rolled lexicographic comparison (6 native compares + bitwise combine per row) did more per-element work than a single hardware i128 compare, leaving two-limb between slower than arrow. Reconstructing the i128 from its signed-high / unsigned-low limbs and comparing it directly is ~2 instructions per bound, and the single fused pass reads each limb once where arrow reads its i128 array twice (one pass each for gt_eq/lt_eq). At the benchmarked cache-resident sizes this makes two-limb between faster than arrow's Decimal128 (~1.1-1.2x) and ~1.9x faster than the canonical i128 path, while staying under the 1ms-per-op budget. Correctness is unchanged and still verified against the canonical implementation. Signed-off-by: Joe Isaacs --- .../src/decimal_byte_parts/compute/between.rs | 57 +++++++------------ 1 file changed, 22 insertions(+), 35 deletions(-) diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs index f326f937bf1..8794e3316f4 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs @@ -127,73 +127,60 @@ fn two_limb_between( let low = low.as_slice::(); assert_eq!(high.len(), low.len(), "limb lengths must match"); - let (lower_high, lower_low) = split_i128(lower); - let (upper_high, upper_low) = split_i128(upper); - - let lower_limbs = (lower_high, lower_low); - let upper_limbs = (upper_high, upper_low); - - // Pass the low-limb comparison as a monomorphized fn so the whole per-element body inlines. + // Pass the comparison as a monomorphized fn so the whole per-element body inlines. let bits = match (options.lower_strict, options.upper_strict) { (StrictComparison::Strict, StrictComparison::Strict) => { - collect_two_limb(high, low, lower_limbs, u64_gt, upper_limbs, u64_lt) + collect_two_limb(high, low, lower, i128_gt, upper, i128_lt) } (StrictComparison::Strict, StrictComparison::NonStrict) => { - collect_two_limb(high, low, lower_limbs, u64_gt, upper_limbs, u64_le) + collect_two_limb(high, low, lower, i128_gt, upper, i128_le) } (StrictComparison::NonStrict, StrictComparison::Strict) => { - collect_two_limb(high, low, lower_limbs, u64_ge, upper_limbs, u64_lt) + collect_two_limb(high, low, lower, i128_ge, upper, i128_lt) } (StrictComparison::NonStrict, StrictComparison::NonStrict) => { - collect_two_limb(high, low, lower_limbs, u64_ge, upper_limbs, u64_le) + collect_two_limb(high, low, lower, i128_ge, upper, i128_le) } }; Ok(BoolArray::new(bits, validity).into_array()) } -/// The fused per-element loop. `lower_low_cmp`/`upper_low_cmp` apply the (possibly strict) low-limb -/// comparison. Bitwise (non-short-circuiting) `&`/`|` keep the body branch-free for the vectorizer. +/// The fused per-element loop. Reconstruct the i128 from its signed-high / unsigned-low limbs and +/// compare against the bounds. A native i128 comparison is ~2 instructions, cheaper than a manual +/// lexicographic decomposition, and the single pass reads each limb once (arrow reads its i128 +/// array twice, once per `gt_eq`/`lt_eq`). Bitwise `&` keeps the body branch-free. fn collect_two_limb( high: &[i64], low: &[u64], - lower: (i64, u64), - lower_low_cmp: impl Fn(u64, u64) -> bool, - upper: (i64, u64), - upper_low_cmp: impl Fn(u64, u64) -> bool, + lower: i128, + lower_cmp: impl Fn(i128, i128) -> bool, + upper: i128, + upper_cmp: impl Fn(i128, i128) -> bool, ) -> BitBuffer { - let (lower_high, lower_low) = lower; - let (upper_high, upper_low) = upper; BitBuffer::collect_bool(high.len(), |idx| { // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len(). - let h = unsafe { *high.get_unchecked(idx) }; - let l = unsafe { *low.get_unchecked(idx) }; - let ge_lower = (h > lower_high) | ((h == lower_high) & lower_low_cmp(l, lower_low)); - let le_upper = (h < upper_high) | ((h == upper_high) & upper_low_cmp(l, upper_low)); - ge_lower & le_upper + let hi = unsafe { *high.get_unchecked(idx) }; + let lo = unsafe { *low.get_unchecked(idx) }; + // Sign-extend the high limb, zero-extend the low limb: value = (hi << 64) | lo. + let value = ((hi as i128) << 64) | (lo as i128); + lower_cmp(value, lower) & upper_cmp(value, upper) }) } -fn u64_ge(a: u64, b: u64) -> bool { +fn i128_ge(a: i128, b: i128) -> bool { a >= b } -fn u64_gt(a: u64, b: u64) -> bool { +fn i128_gt(a: i128, b: i128) -> bool { a > b } -fn u64_le(a: u64, b: u64) -> bool { +fn i128_le(a: i128, b: i128) -> bool { a <= b } -fn u64_lt(a: u64, b: u64) -> bool { +fn i128_lt(a: i128, b: i128) -> bool { a < b } -/// Split an i128 into its signed high and unsigned low 64-bit limbs. The truncating casts are the -/// intended limb extraction: `>> 64` keeps the high 64 bits and `as u64` keeps the low 64 bits. -#[allow(clippy::cast_possible_truncation)] -fn split_i128(value: i128) -> (i64, u64) { - ((value >> 64) as i64, value as u64) -} - #[cfg(test)] mod tests { use rstest::rstest; From 0d3700a8b5dfed6ec16a3a065384ade061931e4d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:00:33 +0000 Subject: [PATCH 07/13] Push two-limb comparisons down with a fused i128 reconstruct kernel The two-limb representation previously deferred all `compare` operators to the canonical i128 path. Reuse the fused reconstruct approach from `between`: a single pass rebuilds each i128 from its (signed high, unsigned low) limbs and applies the comparison, reading each limb once. This specializes all six operators (eq/neq/lt/lte/gt/gte), not just lt, since they share one loop. Factor the shared limb materialization, i128 reconstruct, and i128 comparator helpers into a `two_limb` module used by both `between` and `compare`. Signed-off-by: Joe Isaacs --- .../src/decimal_byte_parts/compute/between.rs | 31 +--- .../src/decimal_byte_parts/compute/compare.rs | 151 +++++++++++++++++- .../src/decimal_byte_parts/compute/mod.rs | 1 + .../decimal_byte_parts/compute/two_limb.rs | 57 +++++++ 4 files changed, 211 insertions(+), 29 deletions(-) create mode 100644 encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs index 8794e3316f4..040b6cde847 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs @@ -7,7 +7,6 @@ use vortex_array::ExecutionCtx; use vortex_array::IntoArray; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; -use vortex_array::arrays::PrimitiveArray; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; @@ -21,6 +20,12 @@ use vortex_error::VortexResult; use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive; +use crate::decimal_byte_parts::compute::two_limb::i128_ge; +use crate::decimal_byte_parts::compute::two_limb::i128_gt; +use crate::decimal_byte_parts::compute::two_limb::i128_le; +use crate::decimal_byte_parts::compute::two_limb::i128_lt; +use crate::decimal_byte_parts::compute::two_limb::materialize_limbs; +use crate::decimal_byte_parts::compute::two_limb::reconstruct; impl BetweenKernel for DecimalByteParts { fn between( @@ -115,13 +120,7 @@ fn two_limb_between( nullability: Nullability, ctx: &mut ExecutionCtx, ) -> VortexResult { - let high = arr.msp().clone().execute::(ctx)?; - let low = arr - .lower() - .vortex_expect("two-limb path requires a lower limb") - .clone() - .execute::(ctx)?; - + let (high, low) = materialize_limbs(arr, ctx)?; let validity = high.validity()?.union_nullability(nullability); let high = high.as_slice::(); let low = low.as_slice::(); @@ -162,25 +161,11 @@ fn collect_two_limb( // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len(). let hi = unsafe { *high.get_unchecked(idx) }; let lo = unsafe { *low.get_unchecked(idx) }; - // Sign-extend the high limb, zero-extend the low limb: value = (hi << 64) | lo. - let value = ((hi as i128) << 64) | (lo as i128); + let value = reconstruct(hi, lo); lower_cmp(value, lower) & upper_cmp(value, upper) }) } -fn i128_ge(a: i128, b: i128) -> bool { - a >= b -} -fn i128_gt(a: i128, b: i128) -> bool { - a > b -} -fn i128_le(a: i128, b: i128) -> bool { - a <= b -} -fn i128_lt(a: i128, b: i128) -> bool { - a < b -} - #[cfg(test)] mod tests { use rstest::rstest; diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs index 58a1135702e..9c193401123 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs @@ -7,6 +7,7 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::IntegerPType; @@ -21,12 +22,21 @@ use vortex_array::scalar::ScalarValue; use vortex_array::scalar_fn::fns::binary::CompareKernel; use vortex_array::scalar_fn::fns::operators::CompareOperator; use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_buffer::BitBuffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; use crate::decimal_byte_parts::compute::compare::Sign::Positive; +use crate::decimal_byte_parts::compute::two_limb::i128_eq; +use crate::decimal_byte_parts::compute::two_limb::i128_ge; +use crate::decimal_byte_parts::compute::two_limb::i128_gt; +use crate::decimal_byte_parts::compute::two_limb::i128_le; +use crate::decimal_byte_parts::compute::two_limb::i128_lt; +use crate::decimal_byte_parts::compute::two_limb::i128_ne; +use crate::decimal_byte_parts::compute::two_limb::materialize_limbs; +use crate::decimal_byte_parts::compute::two_limb::reconstruct; impl CompareKernel for DecimalByteParts { fn compare( @@ -39,19 +49,31 @@ impl CompareKernel for DecimalByteParts { return Ok(None); }; - // The two-limb representation only specializes `between`; defer comparisons to canonical. - if lhs.lower().is_some() { - return Ok(None); - } - let nullability = lhs.dtype().nullability() | rhs.dtype().nullability(); - let scalar_type = lhs.msp().dtype().with_nullability(nullability); let rhs_decimal = rhs_const .as_decimal() .decimal_value() .vortex_expect("checked for null in entry func"); + if lhs.lower().is_some() { + // Two-limb representation: reconstruct each i128 and compare directly in a single fused + // pass. The constant must fit in i128, which it always does for a two-limb (<= 38 digit) + // decimal; defer to canonical otherwise. + let Some(rhs_i128) = rhs_decimal.cast::() else { + return Ok(None); + }; + return Ok(Some(two_limb_compare( + &lhs, + rhs_i128, + operator, + nullability, + ctx, + )?)); + } + + let scalar_type = lhs.msp().dtype().with_nullability(nullability); + match decimal_value_wrapper_to_primitive( rhs_decimal, lhs.msp().as_primitive_typed().ptype(), @@ -86,6 +108,49 @@ impl CompareKernel for DecimalByteParts { } } +/// Compare each reconstructed i128 value against the constant `rhs` in a single fused pass. See +/// `two_limb::reconstruct`; a native i128 compare is cheaper than a manual lexicographic +/// decomposition and reads each limb only once. +fn two_limb_compare( + lhs: &ArrayView<'_, DecimalByteParts>, + rhs: i128, + operator: CompareOperator, + nullability: Nullability, + ctx: &mut ExecutionCtx, +) -> VortexResult { + let (high, low) = materialize_limbs(lhs, ctx)?; + let validity = high.validity()?.union_nullability(nullability); + let high = high.as_slice::(); + let low = low.as_slice::(); + assert_eq!(high.len(), low.len(), "limb lengths must match"); + + let bits = match operator { + CompareOperator::Eq => collect_compare(high, low, rhs, i128_eq), + CompareOperator::NotEq => collect_compare(high, low, rhs, i128_ne), + CompareOperator::Gt => collect_compare(high, low, rhs, i128_gt), + CompareOperator::Gte => collect_compare(high, low, rhs, i128_ge), + CompareOperator::Lt => collect_compare(high, low, rhs, i128_lt), + CompareOperator::Lte => collect_compare(high, low, rhs, i128_le), + }; + + Ok(BoolArray::new(bits, validity).into_array()) +} + +fn collect_compare( + high: &[i64], + low: &[u64], + rhs: i128, + cmp: impl Fn(i128, i128) -> bool, +) -> BitBuffer { + BitBuffer::collect_bool(high.len(), |idx| { + // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len(). + let hi = unsafe { *high.get_unchecked(idx) }; + let lo = unsafe { *low.get_unchecked(idx) }; + let value = reconstruct(hi, lo); + cmp(value, rhs) + }) +} + // Used to represent the overflow direction when trying to // convert into the scalar type. #[derive(Debug)] @@ -152,6 +217,7 @@ where #[cfg(test)] mod tests { + use vortex_array::ArrayRef; use vortex_array::IntoArray; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; @@ -165,11 +231,84 @@ mod tests { use vortex_array::scalar::Scalar; use vortex_array::scalar_fn::fns::operators::Operator; use vortex_array::validity::Validity; + use vortex_buffer::Buffer; use vortex_buffer::buffer; use vortex_error::VortexResult; use crate::DecimalByteParts; + #[allow(clippy::cast_possible_truncation)] + fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> ArrayRef { + let highs: Buffer = values.iter().map(|v| (v >> 64) as i64).collect(); + let lows: Buffer = values.iter().map(|v| *v as u64).collect(); + DecimalByteParts::try_new_with_lower( + PrimitiveArray::new(highs, validity).into_array(), + PrimitiveArray::new(lows, Validity::NonNullable).into_array(), + dt, + ) + .unwrap() + .into_array() + } + + #[test] + fn two_limb_compare_lt() -> VortexResult<()> { + let dt = DecimalDType::new(38, 0); + // 0, 2^64, 2^64 + 5, -(2^64): straddle the constant 2^64 across both limbs. + let arr = two_limb( + &[0, 1i128 << 64, (1i128 << 64) | 5, -(1i128 << 64)], + Validity::AllValid, + dt, + ); + let rhs = ConstantArray::new( + Scalar::decimal( + DecimalValue::I128(1i128 << 64), + dt, + Nullability::NonNullable, + ), + arr.len(), + ) + .into_array(); + + let lt = arr.binary(rhs.clone(), Operator::Lt)?; + assert_arrays_eq!( + lt, + BoolArray::from_iter([Some(true), Some(false), Some(false), Some(true)]) + ); + + // Gte is the complement here since all values are valid. + let gte = arr.binary(rhs, Operator::Gte)?; + assert_arrays_eq!( + gte, + BoolArray::from_iter([Some(false), Some(true), Some(true), Some(false)]) + ); + + Ok(()) + } + + #[test] + fn two_limb_compare_lt_nullable() -> VortexResult<()> { + let dt = DecimalDType::new(38, 0); + let arr = two_limb( + &[0, 1i128 << 64, (1i128 << 64) | 5], + Validity::Array(BoolArray::from_iter([true, false, true]).into_array()), + dt, + ); + let rhs = ConstantArray::new( + Scalar::decimal( + DecimalValue::I128(1i128 << 64), + dt, + Nullability::NonNullable, + ), + arr.len(), + ) + .into_array(); + + let lt = arr.binary(rhs, Operator::Lt)?; + assert_arrays_eq!(lt, BoolArray::from_iter([Some(true), None, Some(false)])); + + Ok(()) + } + #[test] fn compare_decimal_const() { let decimal_dtype = DecimalDType::new(8, 2); diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs index fca746b7eb5..44e97887da3 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs @@ -9,6 +9,7 @@ pub(crate) mod is_constant; pub(crate) mod kernel; mod mask; mod take; +mod two_limb; #[cfg(test)] mod tests { diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs new file mode 100644 index 00000000000..91c7a160fd7 --- /dev/null +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Shared helpers for fused single-pass kernels over the two-limb (signed-high i64, unsigned-low +//! u64) i128 representation, used by both `between` and `compare`. + +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::arrays::PrimitiveArray; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +use crate::DecimalByteParts; +use crate::decimal_byte_parts::DecimalBytePartsArrayExt; + +/// Materialize the high (signed `i64`) and low (unsigned `u64`) limbs of a two-limb array. +/// +/// The caller is responsible for checking [`DecimalBytePartsArrayExt::lower`] is present before +/// calling; this panics otherwise. +pub(crate) fn materialize_limbs( + arr: &ArrayView<'_, DecimalByteParts>, + ctx: &mut ExecutionCtx, +) -> VortexResult<(PrimitiveArray, PrimitiveArray)> { + let high = arr.msp().clone().execute::(ctx)?; + let low = arr + .lower() + .vortex_expect("two-limb path requires a lower limb") + .clone() + .execute::(ctx)?; + Ok((high, low)) +} + +/// Reconstruct the i128 value from its limbs: sign-extend the high limb, zero-extend the low limb, +/// so `value = (high << 64) | low`. +#[inline(always)] +pub(crate) fn reconstruct(high: i64, low: u64) -> i128 { + ((high as i128) << 64) | (low as i128) +} + +pub(crate) fn i128_eq(a: i128, b: i128) -> bool { + a == b +} +pub(crate) fn i128_ne(a: i128, b: i128) -> bool { + a != b +} +pub(crate) fn i128_ge(a: i128, b: i128) -> bool { + a >= b +} +pub(crate) fn i128_gt(a: i128, b: i128) -> bool { + a > b +} +pub(crate) fn i128_le(a: i128, b: i128) -> bool { + a <= b +} +pub(crate) fn i128_lt(a: i128, b: i128) -> bool { + a < b +} From b1a4c9444910504086a0c5d1fc422cc509a0822f Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:04:35 +0000 Subject: [PATCH 08/13] Add decimal lt benchmark vs arrow-rs Benchmarks single-sided `x < threshold` across the i32/i64 MSP pushdown, the two-limb i128 kernel, the canonical i128 path, and arrow-rs cmp::lt. Unlike between, lt is a single pass for every engine, which isolates the cost of the two-limb reconstruct against a contiguous i128 read. Signed-off-by: Joe Isaacs --- encodings/decimal-byte-parts/Cargo.toml | 4 + .../decimal-byte-parts/benches/decimal_lt.rs | 287 ++++++++++++++++++ 2 files changed, 291 insertions(+) create mode 100644 encodings/decimal-byte-parts/benches/decimal_lt.rs diff --git a/encodings/decimal-byte-parts/Cargo.toml b/encodings/decimal-byte-parts/Cargo.toml index 549c0b05e7c..783fc815aa0 100644 --- a/encodings/decimal-byte-parts/Cargo.toml +++ b/encodings/decimal-byte-parts/Cargo.toml @@ -37,3 +37,7 @@ vortex-array = { path = "../../vortex-array", features = ["_test-harness"] } [[bench]] name = "decimal_between" harness = false + +[[bench]] +name = "decimal_lt" +harness = false diff --git a/encodings/decimal-byte-parts/benches/decimal_lt.rs b/encodings/decimal-byte-parts/benches/decimal_lt.rs new file mode 100644 index 00000000000..037c8800526 --- /dev/null +++ b/encodings/decimal-byte-parts/benches/decimal_lt.rs @@ -0,0 +1,287 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Compares `x < threshold` (less-than against a constant) over decimal data for: +//! * Vortex `DecimalByteParts` with an i32 most-significant-part (pushdown kernel), +//! * Vortex `DecimalByteParts` with an i64 most-significant-part (pushdown kernel), +//! * Vortex `DecimalByteParts` two-limb i128 (signed-high / unsigned-low limbs, fused kernel), +//! * Vortex canonical `DecimalArray` (i128 storage), +//! * arrow-rs `Decimal128Array` via `cmp::lt`. +//! +//! Unlike `between`, arrow evaluates `lt` in a single pass, so this isolates a one-sided +//! comparison. arrow-rs has no decimal storage narrower than 128 bits, so logically-small or +//! limb-split decimals that Vortex keeps narrower must be materialised as i128 in arrow. + +#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] + +use arrow_array::Decimal128Array; +use arrow_array::Scalar as ArrowScalar; +use arrow_ord::cmp; +use divan::Bencher; +use divan::black_box; +use rand::RngExt; +use rand::SeedableRng; +use rand::rngs::StdRng; +use vortex_array::IntoArray; +use vortex_array::LEGACY_SESSION; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::DecimalArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::builtins::ArrayBuiltins; +use vortex_array::dtype::DecimalDType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar::DecimalValue; +use vortex_array::scalar::Scalar; +use vortex_array::scalar_fn::fns::operators::Operator; +use vortex_array::validity::Validity; +use vortex_decimal_byte_parts::DecimalByteParts; + +fn main() { + divan::main(); +} + +const LENGTHS: &[usize] = &[1 << 16, 1 << 17]; + +// Logical decimal range [0, 1000); threshold in the middle so ~half the rows pass. +const THRESHOLD: i64 = 500; + +fn values(len: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(0x5eed); + (0..len).map(|_| rng.random_range(0..1000i64)).collect() +} + +// ---- Vortex DecimalByteParts (i32 MSP) ---- + +#[divan::bench(args = LENGTHS)] +fn vortex_byteparts_i32(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(9, 2); + let msp = PrimitiveArray::from_iter(values(len).into_iter().map(|v| v as i32)).into_array(); + let arr = DecimalByteParts::try_new(msp, dt).unwrap().into_array(); + let rhs = ConstantArray::new( + Scalar::decimal( + DecimalValue::I32(THRESHOLD as i32), + dt, + Nullability::NonNullable, + ), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + rhs.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, rhs, mut ctx)| { + black_box( + arr.binary(rhs, Operator::Lt) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +// ---- Vortex DecimalByteParts (i64 MSP) ---- + +#[divan::bench(args = LENGTHS)] +fn vortex_byteparts_i64(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(18, 2); + let msp = PrimitiveArray::from_iter(values(len)).into_array(); + let arr = DecimalByteParts::try_new(msp, dt).unwrap().into_array(); + let rhs = ConstantArray::new( + Scalar::decimal(DecimalValue::I64(THRESHOLD), dt, Nullability::NonNullable), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + rhs.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, rhs, mut ctx)| { + black_box( + arr.binary(rhs, Operator::Lt) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +// ---- Vortex canonical DecimalArray (i128 storage) ---- + +#[divan::bench(args = LENGTHS)] +fn vortex_canonical_i128(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(9, 2); + let arr = DecimalArray::new( + values(len).into_iter().map(i128::from).collect(), + dt, + Validity::NonNullable, + ) + .into_array(); + let rhs = ConstantArray::new( + Scalar::decimal( + DecimalValue::I128(THRESHOLD as i128), + dt, + Nullability::NonNullable, + ), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + rhs.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, rhs, mut ctx)| { + black_box( + arr.binary(rhs, Operator::Lt) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +// ---- arrow-rs Decimal128 (cmp::lt) ---- + +#[divan::bench(args = LENGTHS)] +fn arrow_decimal128(bencher: Bencher, len: usize) { + let arr = Decimal128Array::from_iter_values(values(len).into_iter().map(i128::from)) + .with_precision_and_scale(9, 2) + .unwrap(); + let rhs = ArrowScalar::new( + Decimal128Array::from_iter_values([THRESHOLD as i128]) + .with_precision_and_scale(9, 2) + .unwrap(), + ); + + bencher + .with_inputs(|| (arr.clone(), rhs.clone())) + .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap())); +} + +// ---- Wide i128 decimals: two-limb vs canonical i128 vs arrow Decimal128 ---- +// +// These values genuinely occupy the i128 range (the high 64-bit limb varies), so neither Vortex +// nor arrow can keep them in a narrow integer. The two-limb representation splits each value into a +// signed i64 high limb and an unsigned u64 low limb, letting `lt` reconstruct the i128 once and run +// a single native comparison. + +fn wide_values(len: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(0x5eed); + (0..len) + .map(|_| { + let high = i128::from(rng.random_range(0..1000i64)); + let low = i128::from(rng.random_range(0..u64::MAX)); + (high << 64) | low + }) + .collect() +} + +// Threshold with a non-zero low limb so the low-limb tie-break is exercised at the high-limb edge. +const WIDE_THRESHOLD: i128 = (500i128 << 64) | 0x90ab_cdef; + +#[divan::bench(args = LENGTHS)] +fn vortex_byteparts_twolimb(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(38, 2); + let values = wide_values(len); + let highs = PrimitiveArray::from_iter(values.iter().map(|v| (v >> 64) as i64)).into_array(); + let lows = PrimitiveArray::from_iter(values.iter().map(|v| *v as u64)).into_array(); + let arr = DecimalByteParts::try_new_with_lower(highs, lows, dt) + .unwrap() + .into_array(); + let rhs = ConstantArray::new( + Scalar::decimal( + DecimalValue::I128(WIDE_THRESHOLD), + dt, + Nullability::NonNullable, + ), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + rhs.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, rhs, mut ctx)| { + black_box( + arr.binary(rhs, Operator::Lt) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +#[divan::bench(args = LENGTHS)] +fn vortex_canonical_i128_wide(bencher: Bencher, len: usize) { + let dt = DecimalDType::new(38, 2); + let arr = DecimalArray::new( + wide_values(len).into_iter().collect(), + dt, + Validity::NonNullable, + ) + .into_array(); + let rhs = ConstantArray::new( + Scalar::decimal( + DecimalValue::I128(WIDE_THRESHOLD), + dt, + Nullability::NonNullable, + ), + len, + ) + .into_array(); + + bencher + .with_inputs(|| { + ( + arr.clone(), + rhs.clone(), + LEGACY_SESSION.create_execution_ctx(), + ) + }) + .bench_values(|(arr, rhs, mut ctx)| { + black_box( + arr.binary(rhs, Operator::Lt) + .unwrap() + .execute::(&mut ctx) + .unwrap(), + ) + }); +} + +#[divan::bench(args = LENGTHS)] +fn arrow_decimal128_wide(bencher: Bencher, len: usize) { + let arr = Decimal128Array::from_iter_values(wide_values(len)) + .with_precision_and_scale(38, 2) + .unwrap(); + let rhs = ArrowScalar::new( + Decimal128Array::from_iter_values([WIDE_THRESHOLD]) + .with_precision_and_scale(38, 2) + .unwrap(), + ); + + bencher + .with_inputs(|| (arr.clone(), rhs.clone())) + .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap())); +} From 5455f13877442f1df0eb48ff023f355da80c3198 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:11:56 +0000 Subject: [PATCH 09/13] Add limb-SIMD microbenchmark: AVX-512 limb compare vs arrow i128 Isolates the kernel cost of wide-i128 `lt` with no Vortex expression overhead. arrow's i128 has no vector compare on any x86, so it is inherently scalar (raw_i128_scalar tracks arrow). The (i64 high, u64 low) limbs are native SIMD widths: an AVX-512 kernel compares 8 lanes with vpcmpq/vpcmpuq straight to a __mmask8, which is exactly the packed-bit output, beating arrow ~1.4-2.1x at the measured sizes. The AVX-512 output is cross-checked against the scalar reference before timing. Signed-off-by: Joe Isaacs --- encodings/decimal-byte-parts/Cargo.toml | 4 + .../decimal-byte-parts/benches/limb_simd.rs | 188 ++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 encodings/decimal-byte-parts/benches/limb_simd.rs diff --git a/encodings/decimal-byte-parts/Cargo.toml b/encodings/decimal-byte-parts/Cargo.toml index 783fc815aa0..d35378ea179 100644 --- a/encodings/decimal-byte-parts/Cargo.toml +++ b/encodings/decimal-byte-parts/Cargo.toml @@ -41,3 +41,7 @@ harness = false [[bench]] name = "decimal_lt" harness = false + +[[bench]] +name = "limb_simd" +harness = false diff --git a/encodings/decimal-byte-parts/benches/limb_simd.rs b/encodings/decimal-byte-parts/benches/limb_simd.rs new file mode 100644 index 00000000000..889f12e17f1 --- /dev/null +++ b/encodings/decimal-byte-parts/benches/limb_simd.rs @@ -0,0 +1,188 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Microbenchmark isolating the *kernel* cost of `x < threshold` over wide i128 decimals, with no +//! Vortex expression-execution overhead, to answer: can comparing the (signed-high i64, unsigned-low +//! u64) limb pair beat arrow's i128 comparison? +//! +//! arrow's i128 has no SIMD compare on any x86 (there is no 128-bit-integer vector comparison), so +//! it is inherently scalar. The limbs are native i64/u64, which AVX-512 compares 8-wide with +//! `vpcmpq`/`vpcmpuq`, producing a mask register that is exactly the packed-bit output we want. +//! +//! Kernels: +//! * `arrow_cmp_lt` arrow-rs `cmp::lt` over a `Decimal128Array` (reference). +//! * `raw_i128_scalar` read one contiguous i128 slice, scalar compare, serial bit-pack. +//! * `raw_limb_scalar` read the two limb slices, lexicographic compare, serial bit-pack. +//! * `raw_limb_avx512` read the two limb slices, AVX-512 limb compare straight to a mask byte. + +#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] + +use arrow_array::Decimal128Array; +use arrow_array::Scalar as ArrowScalar; +use arrow_ord::cmp; +use divan::Bencher; +use divan::black_box; +use rand::RngExt; +use rand::SeedableRng; +use rand::rngs::StdRng; + +fn main() { + divan::main(); +} + +const LENGTHS: &[usize] = &[1 << 16, 1 << 17, 1 << 20]; + +const THRESHOLD: i128 = (500i128 << 64) | 0x90ab_cdef; + +fn wide_values(len: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(0x5eed); + (0..len) + .map(|_| { + let high = i128::from(rng.random_range(0..1000i64)); + let low = i128::from(rng.random_range(0..u64::MAX)); + (high << 64) | low + }) + .collect() +} + +fn limbs(values: &[i128]) -> (Vec, Vec) { + ( + values.iter().map(|v| (v >> 64) as i64).collect(), + values.iter().map(|v| *v as u64).collect(), + ) +} + +// ---- arrow-rs reference ---- + +#[divan::bench(args = LENGTHS)] +fn arrow_cmp_lt(bencher: Bencher, len: usize) { + let arr = Decimal128Array::from_iter_values(wide_values(len)) + .with_precision_and_scale(38, 2) + .unwrap(); + let rhs = ArrowScalar::new( + Decimal128Array::from_iter_values([THRESHOLD]) + .with_precision_and_scale(38, 2) + .unwrap(), + ); + bencher + .with_inputs(|| (arr.clone(), rhs.clone())) + .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap())); +} + +// ---- scalar i128 (one contiguous read) ---- + +fn lt_i128_scalar(vals: &[i128], th: i128, out: &mut [u64]) { + for (w, chunk) in vals.chunks(64).enumerate() { + let mut packed = 0u64; + for (b, &v) in chunk.iter().enumerate() { + packed |= u64::from(v < th) << b; + } + out[w] = packed; + } +} + +#[divan::bench(args = LENGTHS)] +fn raw_i128_scalar(bencher: Bencher, len: usize) { + let vals = wide_values(len); + let mut out = vec![0u64; len.div_ceil(64)]; + bencher.bench_local(|| { + lt_i128_scalar(black_box(&vals), black_box(THRESHOLD), &mut out); + black_box(&out); + }); +} + +// ---- scalar limb (lexicographic) ---- + +fn lt_limb_scalar(hi: &[i64], lo: &[u64], th_hi: i64, th_lo: u64, out: &mut [u64]) { + for w in 0..hi.len().div_ceil(64) { + let base = w * 64; + let end = (base + 64).min(hi.len()); + let mut packed = 0u64; + for i in base..end { + let lt = (hi[i] < th_hi) | ((hi[i] == th_hi) & (lo[i] < th_lo)); + packed |= u64::from(lt) << (i - base); + } + out[w] = packed; + } +} + +#[divan::bench(args = LENGTHS)] +fn raw_limb_scalar(bencher: Bencher, len: usize) { + let (hi, lo) = limbs(&wide_values(len)); + let mut out = vec![0u64; len.div_ceil(64)]; + let th_hi = (THRESHOLD >> 64) as i64; + let th_lo = THRESHOLD as u64; + bencher.bench_local(|| { + lt_limb_scalar(black_box(&hi), black_box(&lo), th_hi, th_lo, &mut out); + black_box(&out); + }); +} + +// ---- AVX-512 limb (8-wide compare straight to mask bytes) ---- + +#[cfg(target_arch = "x86_64")] +#[target_feature(enable = "avx512f")] +unsafe fn lt_limb_avx512(hi: &[i64], lo: &[u64], th_hi: i64, th_lo: u64, out: &mut [u8]) { + use std::arch::x86_64::_mm512_cmpeq_epi64_mask; + use std::arch::x86_64::_mm512_cmplt_epi64_mask; + use std::arch::x86_64::_mm512_cmplt_epu64_mask; + use std::arch::x86_64::_mm512_loadu_epi64; + use std::arch::x86_64::_mm512_set1_epi64; + + let n = hi.len(); + let chunks = n / 8; + // SAFETY: this fn is only called when avx512f is detected; all intrinsics/loads below are in + // bounds because `chunks * 8 <= n` and `out.len() >= n.div_ceil(8)`. + unsafe { + let th_hi_v = _mm512_set1_epi64(th_hi); + let th_lo_v = _mm512_set1_epi64(th_lo as i64); + + for chunk in 0..chunks { + let hv = _mm512_loadu_epi64(hi.as_ptr().add(chunk * 8)); + let lv = _mm512_loadu_epi64(lo.as_ptr().add(chunk * 8).cast()); + let hi_lt = _mm512_cmplt_epi64_mask(hv, th_hi_v); + let hi_eq = _mm512_cmpeq_epi64_mask(hv, th_hi_v); + let lo_lt = _mm512_cmplt_epu64_mask(lv, th_lo_v); + // mask = hi < th_hi | (hi == th_hi & lo < th_lo); each is a __mmask8 (one bit/lane). + out[chunk] = hi_lt | (hi_eq & lo_lt); + } + } + + // scalar tail + for i in (chunks * 8)..n { + let lt = (hi[i] < th_hi) | ((hi[i] == th_hi) & (lo[i] < th_lo)); + let byte = &mut out[i / 8]; + *byte |= u8::from(lt) << (i % 8); + } +} + +#[divan::bench(args = LENGTHS)] +fn raw_limb_avx512(bencher: Bencher, len: usize) { + if !is_x86_feature_detected!("avx512f") { + return; + } + let (hi, lo) = limbs(&wide_values(len)); + let mut out = vec![0u8; len.div_ceil(8)]; + let th_hi = (THRESHOLD >> 64) as i64; + let th_lo = THRESHOLD as u64; + + // Cross-check the AVX-512 output against the scalar reference so the timing is for a correct + // kernel. The u8 mask bytes alias the u64 words bit-for-bit (lane i -> bit i). + unsafe { lt_limb_avx512(&hi, &lo, th_hi, th_lo, &mut out) }; + let mut reference = vec![0u64; len.div_ceil(64)]; + lt_limb_scalar(&hi, &lo, th_hi, th_lo, &mut reference); + for (i, &byte) in out.iter().enumerate() { + assert_eq!( + byte, + (reference[i / 8] >> ((i % 8) * 8)) as u8, + "avx512 mask mismatch at byte {i}" + ); + } + out.iter_mut().for_each(|b| *b = 0); + + bencher.bench_local(|| { + // SAFETY: guarded by the avx512f feature check above. + unsafe { lt_limb_avx512(black_box(&hi), black_box(&lo), th_hi, th_lo, &mut out) }; + black_box(&out); + }); +} From c542c5280ad7cc6e92250096eda135cd6a78c664 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:37:20 +0000 Subject: [PATCH 10/13] Add AVX-512 limb kernel for two-limb decimal compare and between The two-limb compare/between kernels now dispatch to an AVX-512 path when the host supports it, falling back to the scalar i128-reconstruct path otherwise. The AVX-512 kernel compares 8 lanes of the signed-high i64 and unsigned-low u64 limbs with vpcmpq/vpcmpuq, combining them lexicographically into a __mmask8 that is written directly as one byte of the output bitmap, with no serial bit-packing. arrow's i128 has no vector-compare form on x86, so this is a comparison the limb representation can vectorize and arrow cannot. Dispatch follows the existing take/avx2 pattern (is_x86_feature_detected gate). Operators are monomorphized via const generics so the hot loop carries no operator branch. Correctness is validated against the canonical implementation including a 99-element (non-multiple-of-8) two-limb case that exercises both the vectorized main loop and the scalar tail. Signed-off-by: Joe Isaacs --- .../src/decimal_byte_parts/compute/between.rs | 54 +-- .../src/decimal_byte_parts/compute/compare.rs | 39 +-- .../src/decimal_byte_parts/compute/mod.rs | 23 ++ .../decimal_byte_parts/compute/two_limb.rs | 329 +++++++++++++++++- 4 files changed, 341 insertions(+), 104 deletions(-) diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs index 040b6cde847..69033f12094 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs @@ -12,20 +12,14 @@ use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; use vortex_array::scalar_fn::fns::between::BetweenKernel; use vortex_array::scalar_fn::fns::between::BetweenOptions; -use vortex_array::scalar_fn::fns::between::StrictComparison; -use vortex_buffer::BitBuffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive; -use crate::decimal_byte_parts::compute::two_limb::i128_ge; -use crate::decimal_byte_parts::compute::two_limb::i128_gt; -use crate::decimal_byte_parts::compute::two_limb::i128_le; -use crate::decimal_byte_parts::compute::two_limb::i128_lt; +use crate::decimal_byte_parts::compute::two_limb::between_bits; use crate::decimal_byte_parts::compute::two_limb::materialize_limbs; -use crate::decimal_byte_parts::compute::two_limb::reconstruct; impl BetweenKernel for DecimalByteParts { fn between( @@ -105,13 +99,7 @@ impl BetweenKernel for DecimalByteParts { } /// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation in a -/// single fused pass. -/// -/// With high limb `H` (signed i64) and low limb `L` (unsigned u64), the value `v = H<<64 | L` -/// satisfies `v >= lower` iff `H > lo_h OR (H == lo_h AND L >=' lo_l)` and `v <= upper` iff -/// `H < hi_h OR (H == hi_h AND L <=' hi_l)`, where `>='`/`<='` are strict when requested. Each -/// row resolves to native-width integer comparisons in one loop, which vectorizes far better than -/// arrow's 128-bit comparison or a tree of generic array operations with intermediate allocations. +/// single fused pass, dispatching to the AVX-512 limb kernel when available (see [`between_bits`]). fn two_limb_between( arr: &ArrayView<'_, DecimalByteParts>, lower: i128, @@ -126,46 +114,10 @@ fn two_limb_between( let low = low.as_slice::(); assert_eq!(high.len(), low.len(), "limb lengths must match"); - // Pass the comparison as a monomorphized fn so the whole per-element body inlines. - let bits = match (options.lower_strict, options.upper_strict) { - (StrictComparison::Strict, StrictComparison::Strict) => { - collect_two_limb(high, low, lower, i128_gt, upper, i128_lt) - } - (StrictComparison::Strict, StrictComparison::NonStrict) => { - collect_two_limb(high, low, lower, i128_gt, upper, i128_le) - } - (StrictComparison::NonStrict, StrictComparison::Strict) => { - collect_two_limb(high, low, lower, i128_ge, upper, i128_lt) - } - (StrictComparison::NonStrict, StrictComparison::NonStrict) => { - collect_two_limb(high, low, lower, i128_ge, upper, i128_le) - } - }; - + let bits = between_bits(high, low, lower, upper, options); Ok(BoolArray::new(bits, validity).into_array()) } -/// The fused per-element loop. Reconstruct the i128 from its signed-high / unsigned-low limbs and -/// compare against the bounds. A native i128 comparison is ~2 instructions, cheaper than a manual -/// lexicographic decomposition, and the single pass reads each limb once (arrow reads its i128 -/// array twice, once per `gt_eq`/`lt_eq`). Bitwise `&` keeps the body branch-free. -fn collect_two_limb( - high: &[i64], - low: &[u64], - lower: i128, - lower_cmp: impl Fn(i128, i128) -> bool, - upper: i128, - upper_cmp: impl Fn(i128, i128) -> bool, -) -> BitBuffer { - BitBuffer::collect_bool(high.len(), |idx| { - // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len(). - let hi = unsafe { *high.get_unchecked(idx) }; - let lo = unsafe { *low.get_unchecked(idx) }; - let value = reconstruct(hi, lo); - lower_cmp(value, lower) & upper_cmp(value, upper) - }) -} - #[cfg(test)] mod tests { use rstest::rstest; diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs index 9c193401123..7ebc50e7ac0 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs @@ -22,21 +22,14 @@ use vortex_array::scalar::ScalarValue; use vortex_array::scalar_fn::fns::binary::CompareKernel; use vortex_array::scalar_fn::fns::operators::CompareOperator; use vortex_array::scalar_fn::fns::operators::Operator; -use vortex_buffer::BitBuffer; use vortex_error::VortexExpect; use vortex_error::VortexResult; use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; use crate::decimal_byte_parts::compute::compare::Sign::Positive; -use crate::decimal_byte_parts::compute::two_limb::i128_eq; -use crate::decimal_byte_parts::compute::two_limb::i128_ge; -use crate::decimal_byte_parts::compute::two_limb::i128_gt; -use crate::decimal_byte_parts::compute::two_limb::i128_le; -use crate::decimal_byte_parts::compute::two_limb::i128_lt; -use crate::decimal_byte_parts::compute::two_limb::i128_ne; +use crate::decimal_byte_parts::compute::two_limb::compare_bits; use crate::decimal_byte_parts::compute::two_limb::materialize_limbs; -use crate::decimal_byte_parts::compute::two_limb::reconstruct; impl CompareKernel for DecimalByteParts { fn compare( @@ -108,9 +101,8 @@ impl CompareKernel for DecimalByteParts { } } -/// Compare each reconstructed i128 value against the constant `rhs` in a single fused pass. See -/// `two_limb::reconstruct`; a native i128 compare is cheaper than a manual lexicographic -/// decomposition and reads each limb only once. +/// Compare the two-limb array against the constant `rhs` in a single fused pass, dispatching to the +/// AVX-512 limb kernel when available (see [`compare_bits`]). fn two_limb_compare( lhs: &ArrayView<'_, DecimalByteParts>, rhs: i128, @@ -124,33 +116,10 @@ fn two_limb_compare( let low = low.as_slice::(); assert_eq!(high.len(), low.len(), "limb lengths must match"); - let bits = match operator { - CompareOperator::Eq => collect_compare(high, low, rhs, i128_eq), - CompareOperator::NotEq => collect_compare(high, low, rhs, i128_ne), - CompareOperator::Gt => collect_compare(high, low, rhs, i128_gt), - CompareOperator::Gte => collect_compare(high, low, rhs, i128_ge), - CompareOperator::Lt => collect_compare(high, low, rhs, i128_lt), - CompareOperator::Lte => collect_compare(high, low, rhs, i128_le), - }; - + let bits = compare_bits(high, low, rhs, operator); Ok(BoolArray::new(bits, validity).into_array()) } -fn collect_compare( - high: &[i64], - low: &[u64], - rhs: i128, - cmp: impl Fn(i128, i128) -> bool, -) -> BitBuffer { - BitBuffer::collect_bool(high.len(), |idx| { - // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len(). - let hi = unsafe { *high.get_unchecked(idx) }; - let lo = unsafe { *low.get_unchecked(idx) }; - let value = reconstruct(hi, lo); - cmp(value, rhs) - }) -} - // Used to represent the overflow direction when trying to // convert into the scalar type. #[derive(Debug)] diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs index 44e97887da3..5ea11ead376 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs @@ -26,6 +26,15 @@ mod tests { use crate::DecimalByteParts; use crate::DecimalBytePartsArray; + // 99 i128 values whose high (i64) limb varies in sign and whose low (u64) limb spans the full + // unsigned range, so limb tie-breaks and sign handling are covered across SIMD chunks. + #[allow(clippy::cast_possible_truncation)] + fn large_two_limb_values() -> Vec { + (0..99i128) + .map(|i| ((i - 50) << 64) | i128::from((i as u64).wrapping_mul(0x0123_4567_89ab_cdef))) + .collect() + } + #[allow(clippy::cast_possible_truncation)] fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> DecimalBytePartsArray { let highs: Buffer = values.iter().map(|v| (v >> 64) as i64).collect(); @@ -95,6 +104,20 @@ mod tests { Validity::Array(BoolArray::from_iter([true, false, true, true, false]).into_array()), DecimalDType::new(38, 2), ))] + // 99 values (not a multiple of 8) so the AVX-512 limb kernel's vectorized main loop *and* its + // scalar tail are both exercised when validating compare/between against canonical. + #[case::two_limb_large(two_limb( + &large_two_limb_values(), + Validity::NonNullable, + DecimalDType::new(38, 2), + ))] + #[case::two_limb_large_nullable(two_limb( + &large_two_limb_values(), + Validity::Array( + BoolArray::from_iter((0..99).map(|i| i % 3 != 0)).into_array(), + ), + DecimalDType::new(38, 2), + ))] fn test_decimal_byte_parts_consistency(#[case] array: DecimalBytePartsArray) { test_array_consistency(&array.into_array()); diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs index 91c7a160fd7..2a4ceea359f 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs @@ -1,12 +1,23 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -//! Shared helpers for fused single-pass kernels over the two-limb (signed-high i64, unsigned-low -//! u64) i128 representation, used by both `between` and `compare`. +//! Fused single-pass kernels over the two-limb (signed-high `i64`, unsigned-low `u64`) i128 +//! representation, shared by `between` and `compare`. +//! +//! arrow's i128 comparison has no SIMD form on any x86 (there is no 128-bit-integer vector +//! compare), so it is inherently scalar. The limbs, by contrast, are native widths: on AVX-512 we +//! compare 8 lanes of `i64`/`u64` with `vpcmpq`/`vpcmpuq`, each producing a `__mmask8` that is +//! exactly one byte of the output bitmap — no serial bit-packing. A scalar reconstruct path (which +//! is itself competitive with arrow) is used when AVX-512 is unavailable. use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::arrays::PrimitiveArray; +use vortex_array::scalar_fn::fns::between::BetweenOptions; +use vortex_array::scalar_fn::fns::between::StrictComparison; +use vortex_array::scalar_fn::fns::operators::CompareOperator; +use vortex_buffer::BitBuffer; +use vortex_buffer::BufferMut; use vortex_error::VortexExpect; use vortex_error::VortexResult; @@ -15,8 +26,8 @@ use crate::decimal_byte_parts::DecimalBytePartsArrayExt; /// Materialize the high (signed `i64`) and low (unsigned `u64`) limbs of a two-limb array. /// -/// The caller is responsible for checking [`DecimalBytePartsArrayExt::lower`] is present before -/// calling; this panics otherwise. +/// The caller must check [`DecimalBytePartsArrayExt::lower`] is present first; this panics +/// otherwise. pub(crate) fn materialize_limbs( arr: &ArrayView<'_, DecimalByteParts>, ctx: &mut ExecutionCtx, @@ -30,28 +41,310 @@ pub(crate) fn materialize_limbs( Ok((high, low)) } -/// Reconstruct the i128 value from its limbs: sign-extend the high limb, zero-extend the low limb, -/// so `value = (high << 64) | low`. +/// Reconstruct the i128 value from its limbs: sign-extend the high limb, zero-extend the low limb. #[inline(always)] pub(crate) fn reconstruct(high: i64, low: u64) -> i128 { ((high as i128) << 64) | (low as i128) } -pub(crate) fn i128_eq(a: i128, b: i128) -> bool { - a == b +// Operator codes, used as const generics to monomorphize the comparison out of the hot loop. +const OP_LT: u8 = 0; +const OP_LE: u8 = 1; +const OP_GT: u8 = 2; +const OP_GE: u8 = 3; +const OP_EQ: u8 = 4; +const OP_NE: u8 = 5; + +#[inline(always)] +fn op_code(op: CompareOperator) -> u8 { + match op { + CompareOperator::Lt => OP_LT, + CompareOperator::Lte => OP_LE, + CompareOperator::Gt => OP_GT, + CompareOperator::Gte => OP_GE, + CompareOperator::Eq => OP_EQ, + CompareOperator::NotEq => OP_NE, + } +} + +#[inline(always)] +fn cmp_i128(value: i128, bound: i128) -> bool { + match OP { + OP_LT => value < bound, + OP_LE => value <= bound, + OP_GT => value > bound, + OP_GE => value >= bound, + OP_EQ => value == bound, + OP_NE => value != bound, + _ => unreachable!(), + } +} + +/// `value bound` over the two-limb representation, returned as a packed [`BitBuffer`]. +pub(crate) fn compare_bits( + high: &[i64], + low: &[u64], + bound: i128, + op: CompareOperator, +) -> BitBuffer { + debug_assert_eq!(high.len(), low.len()); + let code = op_code(op); + + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx512f") { + let mut bytes = BufferMut::::zeroed(high.len().div_ceil(8)); + // SAFETY: avx512f is detected, and `bytes` has `len.div_ceil(8)` bytes. + unsafe { avx512::compare(high, low, bound, code, bytes.as_mut_slice()) }; + return BitBuffer::new(bytes.freeze(), high.len()); + } + + compare_scalar(high, low, bound, code) } -pub(crate) fn i128_ne(a: i128, b: i128) -> bool { - a != b + +/// `lower <= value <= upper` (respecting strictness) over the two-limb representation, returned as +/// a packed [`BitBuffer`]. +pub(crate) fn between_bits( + high: &[i64], + low: &[u64], + lower: i128, + upper: i128, + options: &BetweenOptions, +) -> BitBuffer { + debug_assert_eq!(high.len(), low.len()); + let lower_op = match options.lower_strict { + StrictComparison::Strict => OP_GT, + StrictComparison::NonStrict => OP_GE, + }; + let upper_op = match options.upper_strict { + StrictComparison::Strict => OP_LT, + StrictComparison::NonStrict => OP_LE, + }; + + #[cfg(target_arch = "x86_64")] + if std::is_x86_feature_detected!("avx512f") { + let mut bytes = BufferMut::::zeroed(high.len().div_ceil(8)); + // SAFETY: avx512f is detected, and `bytes` has `len.div_ceil(8)` bytes. + unsafe { + avx512::between( + high, + low, + lower, + upper, + lower_op, + upper_op, + bytes.as_mut_slice(), + ) + }; + return BitBuffer::new(bytes.freeze(), high.len()); + } + + between_scalar(high, low, lower, upper, lower_op, upper_op) } -pub(crate) fn i128_ge(a: i128, b: i128) -> bool { - a >= b + +// ---- Scalar reconstruct fallback ---- + +fn compare_scalar(high: &[i64], low: &[u64], bound: i128, code: u8) -> BitBuffer { + match code { + OP_LT => compare_scalar_impl::(high, low, bound), + OP_LE => compare_scalar_impl::(high, low, bound), + OP_GT => compare_scalar_impl::(high, low, bound), + OP_GE => compare_scalar_impl::(high, low, bound), + OP_EQ => compare_scalar_impl::(high, low, bound), + _ => compare_scalar_impl::(high, low, bound), + } } -pub(crate) fn i128_gt(a: i128, b: i128) -> bool { - a > b + +fn compare_scalar_impl(high: &[i64], low: &[u64], bound: i128) -> BitBuffer { + BitBuffer::collect_bool(high.len(), |idx| { + // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len(). + let value = reconstruct(unsafe { *high.get_unchecked(idx) }, unsafe { + *low.get_unchecked(idx) + }); + cmp_i128::(value, bound) + }) } -pub(crate) fn i128_le(a: i128, b: i128) -> bool { - a <= b + +fn between_scalar( + high: &[i64], + low: &[u64], + lower: i128, + upper: i128, + lower_op: u8, + upper_op: u8, +) -> BitBuffer { + match (lower_op, upper_op) { + (OP_GT, OP_LT) => between_scalar_impl::(high, low, lower, upper), + (OP_GT, _) => between_scalar_impl::(high, low, lower, upper), + (_, OP_LT) => between_scalar_impl::(high, low, lower, upper), + _ => between_scalar_impl::(high, low, lower, upper), + } } -pub(crate) fn i128_lt(a: i128, b: i128) -> bool { - a < b + +fn between_scalar_impl( + high: &[i64], + low: &[u64], + lower: i128, + upper: i128, +) -> BitBuffer { + BitBuffer::collect_bool(high.len(), |idx| { + // SAFETY: collect_bool yields idx in 0..high.len(), and low.len() == high.len(). + let value = reconstruct(unsafe { *high.get_unchecked(idx) }, unsafe { + *low.get_unchecked(idx) + }); + cmp_i128::(value, lower) & cmp_i128::(value, upper) + }) +} + +// ---- AVX-512 fast path ---- + +#[cfg(target_arch = "x86_64")] +mod avx512 { + // Extracting the low 64 bits of an i128 bound via `as u64` is intentional limb truncation. + #![allow(clippy::cast_possible_truncation)] + + use std::arch::x86_64::__m512i; + use std::arch::x86_64::_mm512_cmpeq_epi64_mask; + use std::arch::x86_64::_mm512_cmpge_epu64_mask; + use std::arch::x86_64::_mm512_cmpgt_epi64_mask; + use std::arch::x86_64::_mm512_cmpgt_epu64_mask; + use std::arch::x86_64::_mm512_cmple_epu64_mask; + use std::arch::x86_64::_mm512_cmplt_epi64_mask; + use std::arch::x86_64::_mm512_cmplt_epu64_mask; + use std::arch::x86_64::_mm512_loadu_epi64; + use std::arch::x86_64::_mm512_set1_epi64; + + use super::OP_EQ; + use super::OP_GE; + use super::OP_GT; + use super::OP_LE; + use super::OP_LT; + use super::OP_NE; + use super::cmp_i128; + use super::reconstruct; + + /// Per-chunk mask for `value bound`, combining the signed-high and unsigned-low limb + /// comparisons lexicographically. Each operand is a vector of 8 lanes; the result is a + /// `__mmask8` (one bit per lane). + /// + /// # Safety + /// The caller must ensure the `avx512f` feature is enabled. + #[target_feature(enable = "avx512f")] + fn chunk_mask(h: __m512i, l: __m512i, bh: __m512i, bl: __m512i) -> u8 { + match OP { + OP_LT => { + _mm512_cmplt_epi64_mask(h, bh) + | (_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmplt_epu64_mask(l, bl)) + } + OP_LE => { + _mm512_cmplt_epi64_mask(h, bh) + | (_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmple_epu64_mask(l, bl)) + } + OP_GT => { + _mm512_cmpgt_epi64_mask(h, bh) + | (_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmpgt_epu64_mask(l, bl)) + } + OP_GE => { + _mm512_cmpgt_epi64_mask(h, bh) + | (_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmpge_epu64_mask(l, bl)) + } + OP_EQ => _mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmpeq_epi64_mask(l, bl), + OP_NE => !(_mm512_cmpeq_epi64_mask(h, bh) & _mm512_cmpeq_epi64_mask(l, bl)), + _ => unreachable!(), + } + } + + /// # Safety + /// The caller must ensure `avx512f` is enabled and `out.len() == high.len().div_ceil(8)`. + #[target_feature(enable = "avx512f")] + pub(super) unsafe fn compare(high: &[i64], low: &[u64], bound: i128, code: u8, out: &mut [u8]) { + // SAFETY: avx512f is enabled by the target_feature on this fn and its callees. + unsafe { + match code { + OP_LT => compare_impl::(high, low, bound, out), + OP_LE => compare_impl::(high, low, bound, out), + OP_GT => compare_impl::(high, low, bound, out), + OP_GE => compare_impl::(high, low, bound, out), + OP_EQ => compare_impl::(high, low, bound, out), + _ => compare_impl::(high, low, bound, out), + } + } + } + + #[target_feature(enable = "avx512f")] + unsafe fn compare_impl(high: &[i64], low: &[u64], bound: i128, out: &mut [u8]) { + let bh = (bound >> 64) as i64; + let bl = bound as u64; + // SAFETY: avx512f enabled; loads are unaligned; indices stay within the 8-lane chunks. + unsafe { + let bhv = _mm512_set1_epi64(bh); + let blv = _mm512_set1_epi64(bl as i64); + + let chunks = high.len() / 8; + for c in 0..chunks { + let h = _mm512_loadu_epi64(high.as_ptr().add(c * 8)); + let l = _mm512_loadu_epi64(low.as_ptr().add(c * 8).cast()); + out[c] = chunk_mask::(h, l, bhv, blv); + } + + for i in (chunks * 8)..high.len() { + if cmp_i128::(reconstruct(high[i], low[i]), bound) { + out[i / 8] |= 1 << (i % 8); + } + } + } + } + + /// # Safety + /// The caller must ensure `avx512f` is enabled and `out.len() == high.len().div_ceil(8)`. + #[target_feature(enable = "avx512f")] + pub(super) unsafe fn between( + high: &[i64], + low: &[u64], + lower: i128, + upper: i128, + lower_op: u8, + upper_op: u8, + out: &mut [u8], + ) { + // SAFETY: avx512f is enabled by the target_feature on this fn and its callees. + unsafe { + match (lower_op, upper_op) { + (OP_GT, OP_LT) => between_impl::(high, low, lower, upper, out), + (OP_GT, _) => between_impl::(high, low, lower, upper, out), + (_, OP_LT) => between_impl::(high, low, lower, upper, out), + _ => between_impl::(high, low, lower, upper, out), + } + } + } + + #[target_feature(enable = "avx512f")] + unsafe fn between_impl( + high: &[i64], + low: &[u64], + lower: i128, + upper: i128, + out: &mut [u8], + ) { + // SAFETY: avx512f enabled; loads are unaligned; indices stay within the 8-lane chunks. + unsafe { + let lhv = _mm512_set1_epi64((lower >> 64) as i64); + let llv = _mm512_set1_epi64(lower as u64 as i64); + let uhv = _mm512_set1_epi64((upper >> 64) as i64); + let ulv = _mm512_set1_epi64(upper as u64 as i64); + + let chunks = high.len() / 8; + for c in 0..chunks { + let h = _mm512_loadu_epi64(high.as_ptr().add(c * 8)); + let l = _mm512_loadu_epi64(low.as_ptr().add(c * 8).cast()); + out[c] = chunk_mask::(h, l, lhv, llv) & chunk_mask::(h, l, uhv, ulv); + } + + for i in (chunks * 8)..high.len() { + let value = reconstruct(high[i], low[i]); + if cmp_i128::(value, lower) & cmp_i128::(value, upper) { + out[i / 8] |= 1 << (i % 8); + } + } + } + } } From 96b64b7ab849eb28a0c6b377d4136180d30ea083 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 18:54:36 +0000 Subject: [PATCH 11/13] Trim decimal comparison benchmarks to a justified, <1ms set Audit of the added benches: - Remove limb_simd: it duplicated the AVX-512 kernel that now lives in two_limb.rs (drift risk), benchmarked a lexicographic-scalar kernel that ships nowhere, and its 1<<20 case exceeded the 1ms-per-op budget. The productized path is now measured by decimal_lt/decimal_between. - Drop the `_wide` arrow and canonical baselines from both files: an i128 comparison's cost is independent of values and precision/scale, so they were provably identical to their narrow counterparts (measured 51.4 vs 51.4us). One arrow + one canonical i128 baseline now serves both narrow and wide. Every remaining case stays well under 1ms. Signed-off-by: Joe Isaacs --- encodings/decimal-byte-parts/Cargo.toml | 4 - .../benches/decimal_between.rs | 74 +------ .../decimal-byte-parts/benches/decimal_lt.rs | 62 +----- .../decimal-byte-parts/benches/limb_simd.rs | 188 ------------------ 4 files changed, 13 insertions(+), 315 deletions(-) delete mode 100644 encodings/decimal-byte-parts/benches/limb_simd.rs diff --git a/encodings/decimal-byte-parts/Cargo.toml b/encodings/decimal-byte-parts/Cargo.toml index d35378ea179..783fc815aa0 100644 --- a/encodings/decimal-byte-parts/Cargo.toml +++ b/encodings/decimal-byte-parts/Cargo.toml @@ -41,7 +41,3 @@ harness = false [[bench]] name = "decimal_lt" harness = false - -[[bench]] -name = "limb_simd" -harness = false diff --git a/encodings/decimal-byte-parts/benches/decimal_between.rs b/encodings/decimal-byte-parts/benches/decimal_between.rs index e55f0866c7c..f9e66c444a7 100644 --- a/encodings/decimal-byte-parts/benches/decimal_between.rs +++ b/encodings/decimal-byte-parts/benches/decimal_between.rs @@ -217,12 +217,16 @@ fn arrow_decimal128(bencher: Bencher, len: usize) { }); } -// ---- Wide i128 decimals: two-limb vs canonical i128 vs arrow Decimal128 ---- +// ---- Wide i128 decimals: two-limb ---- // // These values genuinely occupy the i128 range (the high 64-bit limb varies), so neither Vortex // nor arrow can keep them in a narrow integer. The two-limb representation splits each value into -// a signed i64 high limb and an unsigned u64 low limb, letting `between` run as native-width -// (SIMD-friendly) integer comparisons instead of arrow's 128-bit comparison. +// a signed i64 high limb and an unsigned u64 low limb, compared limb-wise (AVX-512 when available) +// instead of arrow's 128-bit comparison. +// +// The i128 baselines for this comparison are `arrow_decimal128` and `vortex_canonical_i128` above: +// an i128 comparison's cost is independent of the values and the declared precision/scale, so those +// benches measure the same kernel regardless of whether the data is logically narrow or wide. fn wide_values(len: usize) -> Vec { let mut rng = StdRng::seed_from_u64(0x5eed); @@ -277,67 +281,3 @@ fn vortex_byteparts_twolimb(bencher: Bencher, len: usize) { ) }); } - -#[divan::bench(args = LENGTHS)] -fn vortex_canonical_i128_wide(bencher: Bencher, len: usize) { - let dt = DecimalDType::new(38, 2); - let arr = DecimalArray::new( - wide_values(len).into_iter().collect(), - dt, - Validity::NonNullable, - ) - .into_array(); - let lower = ConstantArray::new( - Scalar::decimal(DecimalValue::I128(WIDE_LOWER), dt, Nullability::NonNullable), - len, - ) - .into_array(); - let upper = ConstantArray::new( - Scalar::decimal(DecimalValue::I128(WIDE_UPPER), dt, Nullability::NonNullable), - len, - ) - .into_array(); - - bencher - .with_inputs(|| { - ( - arr.clone(), - lower.clone(), - upper.clone(), - LEGACY_SESSION.create_execution_ctx(), - ) - }) - .bench_values(|(arr, lower, upper, mut ctx)| { - black_box( - arr.between(lower, upper, OPTIONS) - .unwrap() - .execute::(&mut ctx) - .unwrap(), - ) - }); -} - -#[divan::bench(args = LENGTHS)] -fn arrow_decimal128_wide(bencher: Bencher, len: usize) { - let arr = Decimal128Array::from_iter_values(wide_values(len)) - .with_precision_and_scale(38, 2) - .unwrap(); - let lower = ArrowScalar::new( - Decimal128Array::from_iter_values([WIDE_LOWER]) - .with_precision_and_scale(38, 2) - .unwrap(), - ); - let upper = ArrowScalar::new( - Decimal128Array::from_iter_values([WIDE_UPPER]) - .with_precision_and_scale(38, 2) - .unwrap(), - ); - - bencher - .with_inputs(|| (arr.clone(), lower.clone(), upper.clone())) - .bench_values(|(arr, lower, upper)| { - let ge = cmp::gt_eq(&arr, &lower).unwrap(); - let le = cmp::lt_eq(&arr, &upper).unwrap(); - black_box(and(&ge, &le).unwrap()) - }); -} diff --git a/encodings/decimal-byte-parts/benches/decimal_lt.rs b/encodings/decimal-byte-parts/benches/decimal_lt.rs index 037c8800526..da84ca4e3e0 100644 --- a/encodings/decimal-byte-parts/benches/decimal_lt.rs +++ b/encodings/decimal-byte-parts/benches/decimal_lt.rs @@ -175,12 +175,15 @@ fn arrow_decimal128(bencher: Bencher, len: usize) { .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap())); } -// ---- Wide i128 decimals: two-limb vs canonical i128 vs arrow Decimal128 ---- +// ---- Wide i128 decimals: two-limb ---- // // These values genuinely occupy the i128 range (the high 64-bit limb varies), so neither Vortex // nor arrow can keep them in a narrow integer. The two-limb representation splits each value into a -// signed i64 high limb and an unsigned u64 low limb, letting `lt` reconstruct the i128 once and run -// a single native comparison. +// signed i64 high limb and an unsigned u64 low limb, compared limb-wise (AVX-512 when available). +// +// The i128 baselines for this comparison are `arrow_decimal128` and `vortex_canonical_i128` above: +// an i128 comparison's cost is independent of the values and the declared precision/scale, so those +// benches measure the same kernel regardless of whether the data is logically narrow or wide. fn wide_values(len: usize) -> Vec { let mut rng = StdRng::seed_from_u64(0x5eed); @@ -232,56 +235,3 @@ fn vortex_byteparts_twolimb(bencher: Bencher, len: usize) { ) }); } - -#[divan::bench(args = LENGTHS)] -fn vortex_canonical_i128_wide(bencher: Bencher, len: usize) { - let dt = DecimalDType::new(38, 2); - let arr = DecimalArray::new( - wide_values(len).into_iter().collect(), - dt, - Validity::NonNullable, - ) - .into_array(); - let rhs = ConstantArray::new( - Scalar::decimal( - DecimalValue::I128(WIDE_THRESHOLD), - dt, - Nullability::NonNullable, - ), - len, - ) - .into_array(); - - bencher - .with_inputs(|| { - ( - arr.clone(), - rhs.clone(), - LEGACY_SESSION.create_execution_ctx(), - ) - }) - .bench_values(|(arr, rhs, mut ctx)| { - black_box( - arr.binary(rhs, Operator::Lt) - .unwrap() - .execute::(&mut ctx) - .unwrap(), - ) - }); -} - -#[divan::bench(args = LENGTHS)] -fn arrow_decimal128_wide(bencher: Bencher, len: usize) { - let arr = Decimal128Array::from_iter_values(wide_values(len)) - .with_precision_and_scale(38, 2) - .unwrap(); - let rhs = ArrowScalar::new( - Decimal128Array::from_iter_values([WIDE_THRESHOLD]) - .with_precision_and_scale(38, 2) - .unwrap(), - ); - - bencher - .with_inputs(|| (arr.clone(), rhs.clone())) - .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap())); -} diff --git a/encodings/decimal-byte-parts/benches/limb_simd.rs b/encodings/decimal-byte-parts/benches/limb_simd.rs deleted file mode 100644 index 889f12e17f1..00000000000 --- a/encodings/decimal-byte-parts/benches/limb_simd.rs +++ /dev/null @@ -1,188 +0,0 @@ -// SPDX-License-Identifier: Apache-2.0 -// SPDX-FileCopyrightText: Copyright the Vortex contributors - -//! Microbenchmark isolating the *kernel* cost of `x < threshold` over wide i128 decimals, with no -//! Vortex expression-execution overhead, to answer: can comparing the (signed-high i64, unsigned-low -//! u64) limb pair beat arrow's i128 comparison? -//! -//! arrow's i128 has no SIMD compare on any x86 (there is no 128-bit-integer vector comparison), so -//! it is inherently scalar. The limbs are native i64/u64, which AVX-512 compares 8-wide with -//! `vpcmpq`/`vpcmpuq`, producing a mask register that is exactly the packed-bit output we want. -//! -//! Kernels: -//! * `arrow_cmp_lt` arrow-rs `cmp::lt` over a `Decimal128Array` (reference). -//! * `raw_i128_scalar` read one contiguous i128 slice, scalar compare, serial bit-pack. -//! * `raw_limb_scalar` read the two limb slices, lexicographic compare, serial bit-pack. -//! * `raw_limb_avx512` read the two limb slices, AVX-512 limb compare straight to a mask byte. - -#![allow(clippy::unwrap_used, clippy::cast_possible_truncation)] - -use arrow_array::Decimal128Array; -use arrow_array::Scalar as ArrowScalar; -use arrow_ord::cmp; -use divan::Bencher; -use divan::black_box; -use rand::RngExt; -use rand::SeedableRng; -use rand::rngs::StdRng; - -fn main() { - divan::main(); -} - -const LENGTHS: &[usize] = &[1 << 16, 1 << 17, 1 << 20]; - -const THRESHOLD: i128 = (500i128 << 64) | 0x90ab_cdef; - -fn wide_values(len: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(0x5eed); - (0..len) - .map(|_| { - let high = i128::from(rng.random_range(0..1000i64)); - let low = i128::from(rng.random_range(0..u64::MAX)); - (high << 64) | low - }) - .collect() -} - -fn limbs(values: &[i128]) -> (Vec, Vec) { - ( - values.iter().map(|v| (v >> 64) as i64).collect(), - values.iter().map(|v| *v as u64).collect(), - ) -} - -// ---- arrow-rs reference ---- - -#[divan::bench(args = LENGTHS)] -fn arrow_cmp_lt(bencher: Bencher, len: usize) { - let arr = Decimal128Array::from_iter_values(wide_values(len)) - .with_precision_and_scale(38, 2) - .unwrap(); - let rhs = ArrowScalar::new( - Decimal128Array::from_iter_values([THRESHOLD]) - .with_precision_and_scale(38, 2) - .unwrap(), - ); - bencher - .with_inputs(|| (arr.clone(), rhs.clone())) - .bench_values(|(arr, rhs)| black_box(cmp::lt(&arr, &rhs).unwrap())); -} - -// ---- scalar i128 (one contiguous read) ---- - -fn lt_i128_scalar(vals: &[i128], th: i128, out: &mut [u64]) { - for (w, chunk) in vals.chunks(64).enumerate() { - let mut packed = 0u64; - for (b, &v) in chunk.iter().enumerate() { - packed |= u64::from(v < th) << b; - } - out[w] = packed; - } -} - -#[divan::bench(args = LENGTHS)] -fn raw_i128_scalar(bencher: Bencher, len: usize) { - let vals = wide_values(len); - let mut out = vec![0u64; len.div_ceil(64)]; - bencher.bench_local(|| { - lt_i128_scalar(black_box(&vals), black_box(THRESHOLD), &mut out); - black_box(&out); - }); -} - -// ---- scalar limb (lexicographic) ---- - -fn lt_limb_scalar(hi: &[i64], lo: &[u64], th_hi: i64, th_lo: u64, out: &mut [u64]) { - for w in 0..hi.len().div_ceil(64) { - let base = w * 64; - let end = (base + 64).min(hi.len()); - let mut packed = 0u64; - for i in base..end { - let lt = (hi[i] < th_hi) | ((hi[i] == th_hi) & (lo[i] < th_lo)); - packed |= u64::from(lt) << (i - base); - } - out[w] = packed; - } -} - -#[divan::bench(args = LENGTHS)] -fn raw_limb_scalar(bencher: Bencher, len: usize) { - let (hi, lo) = limbs(&wide_values(len)); - let mut out = vec![0u64; len.div_ceil(64)]; - let th_hi = (THRESHOLD >> 64) as i64; - let th_lo = THRESHOLD as u64; - bencher.bench_local(|| { - lt_limb_scalar(black_box(&hi), black_box(&lo), th_hi, th_lo, &mut out); - black_box(&out); - }); -} - -// ---- AVX-512 limb (8-wide compare straight to mask bytes) ---- - -#[cfg(target_arch = "x86_64")] -#[target_feature(enable = "avx512f")] -unsafe fn lt_limb_avx512(hi: &[i64], lo: &[u64], th_hi: i64, th_lo: u64, out: &mut [u8]) { - use std::arch::x86_64::_mm512_cmpeq_epi64_mask; - use std::arch::x86_64::_mm512_cmplt_epi64_mask; - use std::arch::x86_64::_mm512_cmplt_epu64_mask; - use std::arch::x86_64::_mm512_loadu_epi64; - use std::arch::x86_64::_mm512_set1_epi64; - - let n = hi.len(); - let chunks = n / 8; - // SAFETY: this fn is only called when avx512f is detected; all intrinsics/loads below are in - // bounds because `chunks * 8 <= n` and `out.len() >= n.div_ceil(8)`. - unsafe { - let th_hi_v = _mm512_set1_epi64(th_hi); - let th_lo_v = _mm512_set1_epi64(th_lo as i64); - - for chunk in 0..chunks { - let hv = _mm512_loadu_epi64(hi.as_ptr().add(chunk * 8)); - let lv = _mm512_loadu_epi64(lo.as_ptr().add(chunk * 8).cast()); - let hi_lt = _mm512_cmplt_epi64_mask(hv, th_hi_v); - let hi_eq = _mm512_cmpeq_epi64_mask(hv, th_hi_v); - let lo_lt = _mm512_cmplt_epu64_mask(lv, th_lo_v); - // mask = hi < th_hi | (hi == th_hi & lo < th_lo); each is a __mmask8 (one bit/lane). - out[chunk] = hi_lt | (hi_eq & lo_lt); - } - } - - // scalar tail - for i in (chunks * 8)..n { - let lt = (hi[i] < th_hi) | ((hi[i] == th_hi) & (lo[i] < th_lo)); - let byte = &mut out[i / 8]; - *byte |= u8::from(lt) << (i % 8); - } -} - -#[divan::bench(args = LENGTHS)] -fn raw_limb_avx512(bencher: Bencher, len: usize) { - if !is_x86_feature_detected!("avx512f") { - return; - } - let (hi, lo) = limbs(&wide_values(len)); - let mut out = vec![0u8; len.div_ceil(8)]; - let th_hi = (THRESHOLD >> 64) as i64; - let th_lo = THRESHOLD as u64; - - // Cross-check the AVX-512 output against the scalar reference so the timing is for a correct - // kernel. The u8 mask bytes alias the u64 words bit-for-bit (lane i -> bit i). - unsafe { lt_limb_avx512(&hi, &lo, th_hi, th_lo, &mut out) }; - let mut reference = vec![0u64; len.div_ceil(64)]; - lt_limb_scalar(&hi, &lo, th_hi, th_lo, &mut reference); - for (i, &byte) in out.iter().enumerate() { - assert_eq!( - byte, - (reference[i / 8] >> ((i % 8) * 8)) as u8, - "avx512 mask mismatch at byte {i}" - ); - } - out.iter_mut().for_each(|b| *b = 0); - - bencher.bench_local(|| { - // SAFETY: guarded by the avx512f feature check above. - unsafe { lt_limb_avx512(black_box(&hi), black_box(&lo), th_hi, th_lo, &mut out) }; - black_box(&out); - }); -} From 59e50ea6fbb1d1d648b191c5d5f99f64e3d26868 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 19:30:17 +0000 Subject: [PATCH 12/13] Consolidate two-limb compare/between plumbing into one helper Both kernels duplicated the same materialize-limbs / combine-validity / wrap-as- BoolArray boilerplate. Factor it into `two_limb::eval`, which takes a closure mapping the high/low limb slices to a packed BitBuffer, so `compare` and `between` reduce to extracting the bound(s) and calling it. Drop the now-internal `materialize_limbs`/`reconstruct` from the crate-visible surface. Signed-off-by: Joe Isaacs --- .../src/decimal_byte_parts/compute/between.rs | 35 +++---------------- .../src/decimal_byte_parts/compute/compare.rs | 32 +++-------------- .../decimal_byte_parts/compute/two_limb.rs | 24 ++++++++----- 3 files changed, 24 insertions(+), 67 deletions(-) diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs index 69033f12094..c33cd54317b 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs @@ -5,10 +5,8 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; use vortex_array::builtins::ArrayBuiltins; -use vortex_array::dtype::Nullability; use vortex_array::scalar::Scalar; use vortex_array::scalar_fn::fns::between::BetweenKernel; use vortex_array::scalar_fn::fns::between::BetweenOptions; @@ -19,7 +17,7 @@ use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; use crate::decimal_byte_parts::compute::compare::decimal_value_wrapper_to_primitive; use crate::decimal_byte_parts::compute::two_limb::between_bits; -use crate::decimal_byte_parts::compute::two_limb::materialize_limbs; +use crate::decimal_byte_parts::compute::two_limb::eval; impl BetweenKernel for DecimalByteParts { fn between( @@ -57,14 +55,9 @@ impl BetweenKernel for DecimalByteParts { else { return Ok(None); }; - return Ok(Some(two_limb_between( - &arr, - lower_i128, - upper_i128, - options, - nullability, - ctx, - )?)); + return Ok(Some(eval(&arr, nullability, ctx, |high, low| { + between_bits(high, low, lower_i128, upper_i128, options) + })?)); } let scalar_type = arr.msp().dtype().with_nullability(nullability); @@ -98,26 +91,6 @@ impl BetweenKernel for DecimalByteParts { } } -/// Evaluate `lower <= value <= upper` (respecting strictness) over the two-limb representation in a -/// single fused pass, dispatching to the AVX-512 limb kernel when available (see [`between_bits`]). -fn two_limb_between( - arr: &ArrayView<'_, DecimalByteParts>, - lower: i128, - upper: i128, - options: &BetweenOptions, - nullability: Nullability, - ctx: &mut ExecutionCtx, -) -> VortexResult { - let (high, low) = materialize_limbs(arr, ctx)?; - let validity = high.validity()?.union_nullability(nullability); - let high = high.as_slice::(); - let low = low.as_slice::(); - assert_eq!(high.len(), low.len(), "limb lengths must match"); - - let bits = between_bits(high, low, lower, upper, options); - Ok(BoolArray::new(bits, validity).into_array()) -} - #[cfg(test)] mod tests { use rstest::rstest; diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs index 7ebc50e7ac0..21223ae92bf 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs @@ -7,7 +7,6 @@ use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; use vortex_array::IntoArray; -use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; use vortex_array::builtins::ArrayBuiltins; use vortex_array::dtype::IntegerPType; @@ -29,7 +28,7 @@ use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; use crate::decimal_byte_parts::compute::compare::Sign::Positive; use crate::decimal_byte_parts::compute::two_limb::compare_bits; -use crate::decimal_byte_parts::compute::two_limb::materialize_limbs; +use crate::decimal_byte_parts::compute::two_limb::eval; impl CompareKernel for DecimalByteParts { fn compare( @@ -56,13 +55,9 @@ impl CompareKernel for DecimalByteParts { let Some(rhs_i128) = rhs_decimal.cast::() else { return Ok(None); }; - return Ok(Some(two_limb_compare( - &lhs, - rhs_i128, - operator, - nullability, - ctx, - )?)); + return Ok(Some(eval(&lhs, nullability, ctx, |high, low| { + compare_bits(high, low, rhs_i128, operator) + })?)); } let scalar_type = lhs.msp().dtype().with_nullability(nullability); @@ -101,25 +96,6 @@ impl CompareKernel for DecimalByteParts { } } -/// Compare the two-limb array against the constant `rhs` in a single fused pass, dispatching to the -/// AVX-512 limb kernel when available (see [`compare_bits`]). -fn two_limb_compare( - lhs: &ArrayView<'_, DecimalByteParts>, - rhs: i128, - operator: CompareOperator, - nullability: Nullability, - ctx: &mut ExecutionCtx, -) -> VortexResult { - let (high, low) = materialize_limbs(lhs, ctx)?; - let validity = high.validity()?.union_nullability(nullability); - let high = high.as_slice::(); - let low = low.as_slice::(); - assert_eq!(high.len(), low.len(), "limb lengths must match"); - - let bits = compare_bits(high, low, rhs, operator); - Ok(BoolArray::new(bits, validity).into_array()) -} - // Used to represent the overflow direction when trying to // convert into the scalar type. #[derive(Debug)] diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs index 2a4ceea359f..272ecf003dc 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs @@ -10,9 +10,13 @@ //! exactly one byte of the output bitmap — no serial bit-packing. A scalar reconstruct path (which //! is itself competitive with arrow) is used when AVX-512 is unavailable. +use vortex_array::ArrayRef; use vortex_array::ArrayView; use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; use vortex_array::arrays::PrimitiveArray; +use vortex_array::dtype::Nullability; use vortex_array::scalar_fn::fns::between::BetweenOptions; use vortex_array::scalar_fn::fns::between::StrictComparison; use vortex_array::scalar_fn::fns::operators::CompareOperator; @@ -24,26 +28,30 @@ use vortex_error::VortexResult; use crate::DecimalByteParts; use crate::decimal_byte_parts::DecimalBytePartsArrayExt; -/// Materialize the high (signed `i64`) and low (unsigned `u64`) limbs of a two-limb array. -/// -/// The caller must check [`DecimalBytePartsArrayExt::lower`] is present first; this panics -/// otherwise. -pub(crate) fn materialize_limbs( +/// Materialize the two limbs, run a fused bit-kernel over them, and wrap the result as a boolean +/// array carrying the combined validity. Shared entry point for the two-limb `compare`/`between` +/// paths: the caller supplies a kernel mapping the high (`i64`) and low (`u64`) limb slices to a +/// packed [`BitBuffer`]. +pub(crate) fn eval( arr: &ArrayView<'_, DecimalByteParts>, + nullability: Nullability, ctx: &mut ExecutionCtx, -) -> VortexResult<(PrimitiveArray, PrimitiveArray)> { + kernel: impl FnOnce(&[i64], &[u64]) -> BitBuffer, +) -> VortexResult { let high = arr.msp().clone().execute::(ctx)?; let low = arr .lower() .vortex_expect("two-limb path requires a lower limb") .clone() .execute::(ctx)?; - Ok((high, low)) + let validity = high.validity()?.union_nullability(nullability); + let bits = kernel(high.as_slice::(), low.as_slice::()); + Ok(BoolArray::new(bits, validity).into_array()) } /// Reconstruct the i128 value from its limbs: sign-extend the high limb, zero-extend the low limb. #[inline(always)] -pub(crate) fn reconstruct(high: i64, low: u64) -> i128 { +fn reconstruct(high: i64, low: u64) -> i128 { ((high as i128) << 64) | (low as i128) } From 990d2f6ed38c71e9659a4edafa0271188c4f329d Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 26 May 2026 19:35:24 +0000 Subject: [PATCH 13/13] Share the two-limb test array builder across compute test modules The mod/compare/between test modules each duplicated the same i128 limb-split construction. Hoist it to a single `#[cfg(test)] two_limb_array` in two_limb.rs; the two thin local adapters just add `.into_array()` / a default validity. Signed-off-by: Joe Isaacs --- .../src/decimal_byte_parts/compute/between.rs | 12 ++---------- .../src/decimal_byte_parts/compute/compare.rs | 13 ++----------- .../src/decimal_byte_parts/compute/mod.rs | 14 +------------- .../decimal_byte_parts/compute/two_limb.rs | 19 +++++++++++++++++++ 4 files changed, 24 insertions(+), 34 deletions(-) diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs index c33cd54317b..3be35fbd563 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/between.rs @@ -115,6 +115,7 @@ mod tests { use vortex_buffer::buffer; use vortex_error::VortexResult; + use super::super::two_limb::two_limb_array; use crate::DecimalByteParts; fn decimal_const(value: DecimalValue, decimal_type: DecimalDType, len: usize) -> ArrayRef { @@ -125,17 +126,8 @@ mod tests { .into_array() } - #[allow(clippy::cast_possible_truncation)] fn two_limb(values: &[i128], decimal_type: DecimalDType) -> ArrayRef { - let highs: Buffer = values.iter().map(|v| (v >> 64) as i64).collect(); - let lows: Buffer = values.iter().map(|v| *v as u64).collect(); - DecimalByteParts::try_new_with_lower( - PrimitiveArray::new(highs, Validity::NonNullable).into_array(), - PrimitiveArray::new(lows, Validity::NonNullable).into_array(), - decimal_type, - ) - .unwrap() - .into_array() + two_limb_array(values, Validity::NonNullable, decimal_type).into_array() } /// The two-limb `between` pushdown must agree with the canonical i128 implementation across diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs index 21223ae92bf..12cd8b40edd 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/compare.rs @@ -176,23 +176,14 @@ mod tests { use vortex_array::scalar::Scalar; use vortex_array::scalar_fn::fns::operators::Operator; use vortex_array::validity::Validity; - use vortex_buffer::Buffer; use vortex_buffer::buffer; use vortex_error::VortexResult; + use super::super::two_limb::two_limb_array; use crate::DecimalByteParts; - #[allow(clippy::cast_possible_truncation)] fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> ArrayRef { - let highs: Buffer = values.iter().map(|v| (v >> 64) as i64).collect(); - let lows: Buffer = values.iter().map(|v| *v as u64).collect(); - DecimalByteParts::try_new_with_lower( - PrimitiveArray::new(highs, validity).into_array(), - PrimitiveArray::new(lows, Validity::NonNullable).into_array(), - dt, - ) - .unwrap() - .into_array() + two_limb_array(values, validity, dt).into_array() } #[test] diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs index 5ea11ead376..53ff8fb9dbe 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/mod.rs @@ -20,9 +20,9 @@ mod tests { use vortex_array::compute::conformance::consistency::test_array_consistency; use vortex_array::dtype::DecimalDType; use vortex_array::validity::Validity; - use vortex_buffer::Buffer; use vortex_buffer::buffer; + use super::two_limb::two_limb_array as two_limb; use crate::DecimalByteParts; use crate::DecimalBytePartsArray; @@ -35,18 +35,6 @@ mod tests { .collect() } - #[allow(clippy::cast_possible_truncation)] - fn two_limb(values: &[i128], validity: Validity, dt: DecimalDType) -> DecimalBytePartsArray { - let highs: Buffer = values.iter().map(|v| (v >> 64) as i64).collect(); - let lows: Buffer = values.iter().map(|v| *v as u64).collect(); - DecimalByteParts::try_new_with_lower( - PrimitiveArray::new(highs, validity).into_array(), - PrimitiveArray::new(lows, Validity::NonNullable).into_array(), - dt, - ) - .unwrap() - } - #[rstest] // Basic decimal byte parts arrays #[case::decimal_i32(DecimalByteParts::try_new( diff --git a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs index 272ecf003dc..f64262ff371 100644 --- a/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs +++ b/encodings/decimal-byte-parts/src/decimal_byte_parts/compute/two_limb.rs @@ -356,3 +356,22 @@ mod avx512 { } } } + +/// Test helper: build a two-limb [`DecimalByteParts`] array from i128 values, splitting each into a +/// signed high limb and an unsigned low limb. +#[cfg(test)] +#[allow(clippy::cast_possible_truncation)] +pub(crate) fn two_limb_array( + values: &[i128], + validity: vortex_array::validity::Validity, + dt: vortex_array::dtype::DecimalDType, +) -> crate::DecimalBytePartsArray { + let highs: vortex_buffer::Buffer = values.iter().map(|v| (v >> 64) as i64).collect(); + let lows: vortex_buffer::Buffer = values.iter().map(|v| *v as u64).collect(); + DecimalByteParts::try_new_with_lower( + PrimitiveArray::new(highs, validity).into_array(), + PrimitiveArray::new(lows, vortex_array::validity::Validity::NonNullable).into_array(), + dt, + ) + .unwrap() +}