From 0fe87bae9dd8ea13bbe7f79a7007d95acaf7b3e9 Mon Sep 17 00:00:00 2001
From: Mikhail Kot
Date: Thu, 11 Jun 2026 16:51:01 +0100
Subject: [PATCH] FSST kernel validity for byte_length; comparison OnPair kernel

Signed-off-by: Mikhail Kot
---
Summary: registers ByteLength and Compare parent kernels for OnPair and adds
a validity() implementation to the ByteLength scalar function. The compare
kernel only handles a constant empty Utf8/Binary right-hand side, which it
answers from the stored uncompressed lengths; every other input returns
Ok(None).

NOTE(review): this copy carries no e-mail address on the From: and
Signed-off-by: lines; add them before feeding the patch to `git am`.
NOTE(review): the `Option<vortex_array::ArrayRef>` return type in
onpair/src/compute/byte_length.rs is inferred from the `Ok(Some(..))` body;
confirm it against the `ByteLengthKernel` trait signature.

 .../onpair/src/compute/byte_length.rs         |  34 +++++
 .../onpair/src/compute/compare.rs             | 143 ++++++++++++++++++
 .../experimental/onpair/src/compute/mod.rs    |   2 +
 encodings/experimental/onpair/src/kernel.rs   |   9 +-
 vortex-array/src/scalar_fn/fns/byte_length.rs |   9 ++
 5 files changed, 195 insertions(+), 2 deletions(-)
 create mode 100644 encodings/experimental/onpair/src/compute/byte_length.rs
 create mode 100644 encodings/experimental/onpair/src/compute/compare.rs

diff --git a/encodings/experimental/onpair/src/compute/byte_length.rs b/encodings/experimental/onpair/src/compute/byte_length.rs
new file mode 100644
index 00000000000..5809cd20fd9
--- /dev/null
+++ b/encodings/experimental/onpair/src/compute/byte_length.rs
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use vortex_array::IntoArray;
+use vortex_array::ValidityVTable;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::dtype::DType;
+use vortex_array::dtype::PType;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::fns::byte_length::ByteLengthKernel;
+use vortex_array::validity::Validity;
+
+use crate::OnPair;
+use crate::OnPairArraySlotsExt;
+
+impl ByteLengthKernel for OnPair {
+    fn byte_length(
+        array: vortex_array::ArrayView<'_, Self>,
+        _ctx: &mut vortex_array::ExecutionCtx,
+    ) -> vortex_error::VortexResult<Option<vortex_array::ArrayRef>> {
+        let nullable = array.dtype().nullability();
+        let dtype = DType::Primitive(PType::U64, nullable);
+        // Uncompressed lengths are non-nullable and may be less than u64 each
+        let lengths = array.uncompressed_lengths().cast(dtype.clone())?;
+        Ok(Some(match OnPair::validity(array)? {
+            Validity::NonNullable | Validity::AllValid => lengths,
+            Validity::Array(v) => lengths.mask(v)?,
+            Validity::AllInvalid => {
+                ConstantArray::new(Scalar::null(dtype), lengths.len()).into_array()
+            }
+        }))
+    }
+}
diff --git a/encodings/experimental/onpair/src/compute/compare.rs b/encodings/experimental/onpair/src/compute/compare.rs
new file mode 100644
index 00000000000..c33f1f10a08
--- /dev/null
+++ b/encodings/experimental/onpair/src/compute/compare.rs
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright the Vortex contributors
+
+use vortex_array::ArrayRef;
+use vortex_array::ArrayView;
+use vortex_array::ExecutionCtx;
+use vortex_array::IntoArray;
+use vortex_array::arrays::BoolArray;
+use vortex_array::arrays::ConstantArray;
+use vortex_array::builtins::ArrayBuiltins;
+use vortex_array::dtype::DType;
+use vortex_array::scalar::Scalar;
+use vortex_array::scalar_fn::fns::binary::CompareKernel;
+use vortex_array::scalar_fn::fns::operators::CompareOperator;
+use vortex_buffer::BitBuffer;
+use vortex_error::VortexResult;
+
+use crate::OnPair;
+use crate::OnPairArraySlotsExt;
+
+impl CompareKernel for OnPair {
+    fn compare(
+        lhs: ArrayView<'_, Self>,
+        rhs: &ArrayRef,
+        operator: CompareOperator,
+        ctx: &mut ExecutionCtx,
+    ) -> VortexResult<Option<ArrayRef>> {
+        let Some(constant) = rhs.as_constant() else {
+            return Ok(None);
+        };
+        let is_empty = match constant.dtype() {
+            DType::Utf8(_) => constant.as_utf8().is_empty(),
+            DType::Binary(_) => constant.as_binary().is_empty(),
+            _ => return Ok(None),
+        };
+        if is_empty != Some(true) {
+            return Ok(None);
+        }
+
+        let lengths = lhs.uncompressed_lengths();
+        let buffer = match operator {
+            // every value is greater than or equal to an empty string
+            CompareOperator::Gte => BitBuffer::new_set(lhs.len()),
+            // no value is less than an empty string
+            CompareOperator::Lt => BitBuffer::new_unset(lhs.len()),
+            _ => lengths
+                .binary(
+                    ConstantArray::new(Scalar::zero_value(lengths.dtype()), lengths.len())
+                        .into_array(),
+                    operator.into(),
+                )?
+                .execute(ctx)?,
+        };
+        Ok(Some(
+            BoolArray::new(
+                buffer,
+                lhs.validity()?
+                    .union_nullability(constant.dtype().nullability()),
+            )
+            .into_array(),
+        ))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::sync::LazyLock;
+
+    use rstest::rstest;
+    use vortex_array::IntoArray;
+    use vortex_array::VortexSessionExecute;
+    use vortex_array::arrays::BoolArray;
+    use vortex_array::arrays::ConstantArray;
+    use vortex_array::arrays::VarBinArray;
+    use vortex_array::assert_arrays_eq;
+    use vortex_array::builtins::ArrayBuiltins;
+    use vortex_array::dtype::DType;
+    use vortex_array::dtype::Nullability;
+    use vortex_array::scalar::Scalar;
+    use vortex_array::scalar_fn::fns::operators::Operator;
+    use vortex_array::session::ArraySession;
+    use vortex_error::VortexResult;
+    use vortex_session::VortexSession;
+
+    use crate::compress::DEFAULT_DICT12_CONFIG;
+    use crate::compress::onpair_compress;
+
+    static SESSION: LazyLock<VortexSession> =
+        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
+
+    #[cfg_attr(miri, ignore)]
+    #[rstest]
+    #[case(Operator::Eq, [true, false, true, false])]
+    #[case(Operator::NotEq, [false, true, false, true])]
+    #[case(Operator::Gt, [false, true, false, true])]
+    #[case(Operator::Gte, [true, true, true, true])]
+    #[case(Operator::Lt, [false, false, false, false])]
+    #[case(Operator::Lte, [true, false, true, false])]
+    fn compare_empty_string(#[case] op: Operator, #[case] expected: [bool; 4]) -> VortexResult<()> {
+        let input = VarBinArray::from_iter(
+            [Some(""), Some("a"), Some(""), Some("bbb")],
+            DType::Utf8(Nullability::NonNullable),
+        );
+        let arr = onpair_compress(&input, input.len(), input.dtype(), DEFAULT_DICT12_CONFIG)?
+            .into_array();
+
+        let mut ctx = SESSION.create_execution_ctx();
+        let result = arr
+            .binary(ConstantArray::new("", input.len()).into_array(), op)?
+            .execute::<BoolArray>(&mut ctx)?;
+        assert_arrays_eq!(&result, &BoolArray::from_iter(expected));
+        Ok(())
+    }
+
+    #[cfg_attr(miri, ignore)]
+    #[test]
+    fn compare_empty_string_nullable() -> VortexResult<()> {
+        let input = VarBinArray::from_iter(
+            [Some(""), None, Some("x")],
+            DType::Utf8(Nullability::Nullable),
+        );
+        let arr = onpair_compress(&input, input.len(), input.dtype(), DEFAULT_DICT12_CONFIG)?
+            .into_array();
+        let mut ctx = SESSION.create_execution_ctx();
+
+        let eq_empty = arr
+            .clone()
+            .binary(ConstantArray::new("", arr.len()).into_array(), Operator::Eq)?
+            .execute::<BoolArray>(&mut ctx)?;
+        assert_arrays_eq!(
+            &eq_empty,
+            &BoolArray::from_iter([Some(true), None, Some(false)])
+        );
+
+        let null_rhs =
+            ConstantArray::new(Scalar::null(DType::Utf8(Nullability::Nullable)), arr.len());
+        let eq_null = arr
+            .binary(null_rhs.into_array(), Operator::Eq)?
+            .execute::<BoolArray>(&mut ctx)?;
+        assert_arrays_eq!(&eq_null, &BoolArray::from_iter([None::<bool>, None, None]));
+        Ok(())
+    }
+}
diff --git a/encodings/experimental/onpair/src/compute/mod.rs b/encodings/experimental/onpair/src/compute/mod.rs
index 4cb15868625..4ad5f48f578 100644
--- a/encodings/experimental/onpair/src/compute/mod.rs
+++ b/encodings/experimental/onpair/src/compute/mod.rs
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: Apache-2.0
 // SPDX-FileCopyrightText: Copyright the Vortex contributors
 
+mod byte_length;
 mod cast;
+mod compare;
 mod filter;
 mod slice;
diff --git a/encodings/experimental/onpair/src/kernel.rs b/encodings/experimental/onpair/src/kernel.rs
index fdd521e887e..7eb7b761b4f 100644
--- a/encodings/experimental/onpair/src/kernel.rs
+++ b/encodings/experimental/onpair/src/kernel.rs
@@ -3,9 +3,14 @@
 
 use vortex_array::arrays::filter::FilterExecuteAdaptor;
 use vortex_array::kernel::ParentKernelSet;
+use vortex_array::scalar_fn::fns::binary::CompareExecuteAdaptor;
+use vortex_array::scalar_fn::fns::byte_length::ByteLengthExecuteAdaptor;
 
 use crate::OnPair;
 
 // TODO: implement ListExecute & TakeExecute for OnPair
-pub(super) const PARENT_KERNELS: ParentKernelSet<OnPair> =
-    ParentKernelSet::new(&[ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair))]);
+pub(super) const PARENT_KERNELS: ParentKernelSet<OnPair> = ParentKernelSet::new(&[
+    ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair)),
+    ParentKernelSet::lift(&CompareExecuteAdaptor(OnPair)),
+    ParentKernelSet::lift(&ByteLengthExecuteAdaptor(OnPair)),
+]);
diff --git a/vortex-array/src/scalar_fn/fns/byte_length.rs b/vortex-array/src/scalar_fn/fns/byte_length.rs
index 13a4f3158b5..aa9c508ea89 100644
--- a/vortex-array/src/scalar_fn/fns/byte_length.rs
+++ b/vortex-array/src/scalar_fn/fns/byte_length.rs
@@ -24,6 +24,7 @@ use crate::arrays::varbinview::VarBinViewArrayExt;
 use crate::dtype::DType;
 use crate::dtype::Nullability;
 use crate::dtype::PType;
+use crate::expr::Expression;
 use crate::kernel::ExecuteParentKernel;
 use crate::scalar::Scalar;
 use crate::scalar_fn::Arity;
@@ -122,6 +123,14 @@ impl ScalarFnVTable for ByteLength {
         }
     }
 
+    fn validity(
+        &self,
+        _: &Self::Options,
+        expression: &Expression,
+    ) -> VortexResult<Option<Expression>> {
+        Ok(Some(expression.child(0).validity()?))
+    }
+
     fn is_null_sensitive(&self, _options: &Self::Options) -> bool {
         false
     }