From 3ba50e26d4fceaf85b84b225f312445bf3e07208 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 14:25:24 +0000 Subject: [PATCH 1/5] feat(onpair): LIKE pushdown via per-code DFA (prefix + contains) Evaluate `prefix%` and `%needle%` LIKE patterns directly on OnPair compressed code streams, mirroring the FSST DFA pushdown. Each u16 code is lifted to a byte-level DFA transition (KMP for contains, linear for prefix) by feeding its dictionary token's bytes through the byte table; scanning a row's codes is then one table lookup per code and is exactly equivalent to byte-level matching over the decompressed row. OnPair has no escape code (the trainer always emits all 256 single-byte tokens), so the DFA is strictly simpler than FSST's: no escape sentinel and no escape table. Unsupported pattern shapes (`_`, suffix, ILIKE, needles beyond the u8 state space) return None and fall back to decompression. Wires `LikeExecuteAdaptor(OnPair)` into the parent kernel set. Adds unit tests plus a randomised cross-check against ground-truth starts_with / contains over 600 rows and 14 needles. Signed-off-by: Joe Isaacs --- .../experimental/onpair/src/compute/like.rs | 102 +++++ .../experimental/onpair/src/compute/mod.rs | 1 + .../experimental/onpair/src/dfa/contains.rs | 64 +++ encodings/experimental/onpair/src/dfa/mod.rs | 322 ++++++++++++++++ .../experimental/onpair/src/dfa/prefix.rs | 105 +++++ .../experimental/onpair/src/dfa/tests.rs | 364 ++++++++++++++++++ encodings/experimental/onpair/src/kernel.rs | 7 +- encodings/experimental/onpair/src/lib.rs | 1 + 8 files changed, 964 insertions(+), 2 deletions(-) create mode 100644 encodings/experimental/onpair/src/compute/like.rs create mode 100644 encodings/experimental/onpair/src/dfa/contains.rs create mode 100644 encodings/experimental/onpair/src/dfa/mod.rs create mode 100644 encodings/experimental/onpair/src/dfa/prefix.rs create mode 100644 encodings/experimental/onpair/src/dfa/tests.rs diff --git a/encodings/experimental/onpair/src/compute/like.rs b/encodings/experimental/onpair/src/compute/like.rs new file mode 100644 index 00000000000..eca5064dc99 --- /dev/null +++ b/encodings/experimental/onpair/src/compute/like.rs @@ -0,0 +1,102 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! `LIKE` pushdown for OnPair, evaluating `prefix%` and `%needle%` patterns +//! directly on the compressed code stream via a per-code DFA (see [`crate::dfa`]). + +use num_traits::AsPrimitive; +use vortex_array::ArrayRef; +use vortex_array::ArrayView; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::PrimitiveArray; +use vortex_array::match_each_integer_ptype; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_error::VortexResult; + +use crate::OnPair; +use crate::OnPairArraySlotsExt; +use crate::decode::collect_widened; +use crate::dfa::OnPairMatcher; +use crate::dfa::dfa_scan_to_bitbuf; + +impl LikeKernel for OnPair { + fn like( + array: ArrayView<'_, Self>, + pattern: &ArrayRef, + options: LikeOptions, + ctx: &mut ExecutionCtx, + ) -> VortexResult> { + let Some(pattern_scalar) = pattern.as_constant() else { + return Ok(None); + }; + + if options.case_insensitive { + return Ok(None); + } + + let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() { + let Some(v) = s.value() else { + return Ok(None); + }; + v.as_ref() + } else if let Some(b) = pattern_scalar.as_binary_opt() { + let Some(v) = b.value() else { + return Ok(None); + }; + v + } else { + return Ok(None); + }; + + let dict_bytes = array.dict_bytes(); + let dict_offsets = collect_widened::(array.dict_offsets(), ctx)?; + + let Some(matcher) = OnPairMatcher::try_new( + dict_bytes.as_slice(), + dict_offsets.as_slice(), + pattern_bytes, + )? + else { + return Ok(None); + }; + + let negated = options.negated; + let n = array.len(); + + // `codes_offsets` are per-row boundaries into the (possibly sliced) + // `codes` child. A sliced OnPair keeps the full `codes` child and only + // narrows these offsets, so `offsets[0]` may be > 0; slice the `codes` + // window to `[offsets[0], offsets[n])` before materialising it, exactly + // as the canonical decoder does. + let offsets = array + .codes_offsets() + .clone() + .execute::(ctx)?; + let (code_start, code_end): (usize, usize) = + match_each_integer_ptype!(offsets.ptype(), |T| { + let s = offsets.as_slice::(); + ( + AsPrimitive::::as_(s[0]), + AsPrimitive::::as_(s[n]), + ) + }); + + let codes = collect_widened::(&array.codes().slice(code_start..code_end)?, ctx)?; + let codes = codes.as_slice(); + + let result = match_each_integer_ptype!(offsets.ptype(), |T| { + let off = offsets.as_slice::(); + dfa_scan_to_bitbuf(n, off, code_start, codes, negated, |c| matcher.matches(c)) + }); + + let validity = array + .array() + .validity()? + .union_nullability(pattern_scalar.dtype().nullability()); + + Ok(Some(BoolArray::new(result, validity).into_array())) + } +} diff --git a/encodings/experimental/onpair/src/compute/mod.rs b/encodings/experimental/onpair/src/compute/mod.rs index 4cb15868625..1d506e6b377 100644 --- a/encodings/experimental/onpair/src/compute/mod.rs +++ b/encodings/experimental/onpair/src/compute/mod.rs @@ -3,4 +3,5 @@ mod cast; mod filter; +mod like; mod slice; diff --git a/encodings/experimental/onpair/src/dfa/contains.rs b/encodings/experimental/onpair/src/dfa/contains.rs new file mode 100644 index 00000000000..4c37c417ed6 --- /dev/null +++ b/encodings/experimental/onpair/src/dfa/contains.rs @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Flat per-code transition table DFA for contains matching (`LIKE '%needle%'`). +//! +//! Built from the standard KMP failure function lifted to the token level. As +//! OnPair has no escape code, there is no escape sentinel: every code maps +//! directly to its byte-derived transition. + +use vortex_error::VortexExpect; + +use super::build_code_transitions; +use super::kmp_byte_transitions; +use super::n_codes; + +/// Flat per-code transition table DFA for contains matching on OnPair codes. +/// +/// States `0..needle_len` track match progress, `needle_len` is the (sticky) +/// accept state. Transitions are stored in a flat `Vec` indexed as +/// `[state * n_codes + code]`. +pub(crate) struct FlatContainsDfa { + /// `transitions[state * n_codes + code]` -> next state. + transitions: Vec, + n_codes: usize, + accept_state: u8, +} + +impl FlatContainsDfa { + /// Maximum needle length: the accept state must fit in `u8`. + pub(crate) const MAX_NEEDLE_LEN: usize = u8::MAX as usize - 1; + + pub(crate) fn new(dict_bytes: &[u8], dict_offsets: &[u32], needle: &[u8]) -> Self { + debug_assert!(needle.len() <= Self::MAX_NEEDLE_LEN); + + let accept_state = u8::try_from(needle.len()).vortex_expect("needle length fits in u8"); + let n_states = usize::from(accept_state) + 1; + + let byte_table = kmp_byte_transitions(needle); + let transitions = build_code_transitions( + dict_bytes, + dict_offsets, + &byte_table, + n_states, + accept_state, + ); + + Self { + transitions, + n_codes: n_codes(dict_offsets), + accept_state, + } + } + + pub(crate) fn matches(&self, codes: &[u16]) -> bool { + let mut state = 0u8; + for &code in codes { + state = self.transitions[usize::from(state) * self.n_codes + usize::from(code)]; + if state == self.accept_state { + return true; + } + } + false + } +} diff --git a/encodings/experimental/onpair/src/dfa/mod.rs b/encodings/experimental/onpair/src/dfa/mod.rs new file mode 100644 index 00000000000..b6a74e462cd --- /dev/null +++ b/encodings/experimental/onpair/src/dfa/mod.rs @@ -0,0 +1,322 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! # OnPair LIKE Pushdown via DFA Construction +//! +//! This module evaluates `LIKE` patterns directly on OnPair-compressed code +//! streams, without decompressing them. It handles two pattern shapes: +//! +//! - **Prefix**: `'prefix%'` — matches strings starting with a literal prefix. +//! - **Contains**: `'%needle%'` — matches strings containing a literal substring. +//! +//! Pushdown is conservative. If the pattern shape is unsupported (e.g. `_` +//! wildcards, suffix patterns, or patterns that exceed the DFA's representable +//! state space), [`OnPairMatcher::try_new`] returns `None` and the caller falls +//! back to ordinary decompression-based LIKE evaluation. +//! +//! ## Background: OnPair encoding +//! +//! OnPair compresses strings by replacing byte sequences with single `u16` +//! **token codes**. A code indexes a variable-length (1–16 byte) token in the +//! dictionary: token `c` is `dict_bytes[dict_offsets[c]..dict_offsets[c + 1]]`. +//! A compressed row is just a sequence of codes, and the concatenation of those +//! tokens' bytes is exactly the decompressed row. +//! +//! Unlike FSST, OnPair has **no escape code**: the trainer always emits all 256 +//! single-byte tokens, so every byte is representable as a token and every code +//! is a valid dictionary index. That makes the DFA strictly simpler than the +//! FSST one — there is no escape sentinel and no separate escape table. +//! +//! ## The algorithm: byte DFA → per-code transition table → scan +//! +//! 1. Build a byte-level transition table for the needle/prefix (KMP for +//! contains, a linear table for prefix), exactly as FSST does. +//! 2. Lift it to a **per-code** table: for each `(state, code)` pair, feed the +//! code's token bytes through the byte table and record the resulting state. +//! Because a row's token bytes are its decompressed bytes, stepping this +//! table over a row's codes is equivalent to running the byte DFA over the +//! decompressed string — so the result is exactly correct regardless of how +//! the encoder tokenized the row. +//! 3. Scan: walk a row's `u16` codes, one table lookup per code, and test the +//! accept state. +//! +//! The per-code table has `n_states * n_codes` entries. For the default +//! `dict-12` preset (≤ 4096 tokens) this is at most ~1 MiB and is built once per +//! query. + +mod contains; +mod prefix; +#[cfg(test)] +mod tests; + +use std::borrow::Cow; + +use contains::FlatContainsDfa; +use prefix::FlatPrefixDfa; +use vortex_array::dtype::IntegerPType; +use vortex_buffer::BitBuffer; +use vortex_error::VortexExpect; +use vortex_error::VortexResult; + +// --------------------------------------------------------------------------- +// OnPairMatcher — unified public API +// --------------------------------------------------------------------------- + +/// A compiled matcher for LIKE patterns on OnPair-compressed code streams. +/// +/// Encapsulates pattern parsing and DFA variant selection. Returns `None` from +/// [`try_new`](Self::try_new) for patterns that cannot be evaluated without +/// decompression (e.g. `_` wildcards, suffix patterns, or patterns that exceed +/// the DFA's representable byte-length limits). +pub(crate) struct OnPairMatcher { + inner: MatcherInner, +} + +enum MatcherInner { + MatchAll, + Prefix(FlatPrefixDfa), + Contains(FlatContainsDfa), +} + +impl OnPairMatcher { + /// Try to build a matcher for the given LIKE pattern over the OnPair + /// dictionary (`dict_bytes` + `dict_offsets`). + /// + /// Returns `Ok(None)` if the pattern shape is not supported for pushdown + /// (e.g. `_` wildcards, non-bookend `%`, `prefix%` longer than 253 bytes, or + /// `%needle%` longer than 254 bytes). + pub(crate) fn try_new( + dict_bytes: &[u8], + dict_offsets: &[u32], + pattern: &[u8], + ) -> VortexResult> { + let Some(like_kind) = LikeKind::parse(pattern) else { + return Ok(None); + }; + + let inner = match like_kind { + LikeKind::Prefix(pattern) | LikeKind::Contains(pattern) if pattern.is_empty() => { + MatcherInner::MatchAll + } + LikeKind::Prefix(prefix) => { + if prefix.len() > FlatPrefixDfa::MAX_PREFIX_LEN { + return Ok(None); + } + MatcherInner::Prefix(FlatPrefixDfa::new( + dict_bytes, + dict_offsets, + prefix.as_ref(), + )) + } + LikeKind::Contains(needle) => { + if needle.len() > FlatContainsDfa::MAX_NEEDLE_LEN { + return Ok(None); + } + MatcherInner::Contains(FlatContainsDfa::new( + dict_bytes, + dict_offsets, + needle.as_ref(), + )) + } + }; + + Ok(Some(Self { inner })) + } + + /// Run the matcher on a single row's OnPair code sequence. + pub(crate) fn matches(&self, codes: &[u16]) -> bool { + match &self.inner { + MatcherInner::MatchAll => true, + MatcherInner::Prefix(dfa) => dfa.matches(codes), + MatcherInner::Contains(dfa) => dfa.matches(codes), + } + } +} + +/// The subset of LIKE patterns we can handle without decompression. +enum LikeKind<'a> { + /// `prefix%` + Prefix(Cow<'a, [u8]>), + /// `%needle%` + Contains(Cow<'a, [u8]>), +} + +impl<'a> LikeKind<'a> { + fn parse(pattern: &'a [u8]) -> Option { + Self::parse_prefix(pattern).or_else(|| Self::parse_contains(pattern)) + } + + fn parse_prefix(pattern: &'a [u8]) -> Option { + Self::parse_literal_until_final_percent(pattern, 0).map(LikeKind::Prefix) + } + + fn parse_contains(pattern: &'a [u8]) -> Option { + if !pattern.starts_with(b"%") { + return None; + } + Self::parse_literal_until_final_percent(pattern, 1).map(LikeKind::Contains) + } + + /// Parse `pattern[literal_start..]` as a literal terminated by a single + /// trailing `%`. Returns `None` if `_` or a non-final `%` is encountered. + /// + /// `literal` stays `None` until an escape forces us to materialize bytes; + /// from then on we push into the owned `Vec`. Otherwise we return a borrowed + /// slice straight from `pattern`. + fn parse_literal_until_final_percent( + pattern: &'a [u8], + literal_start: usize, + ) -> Option> { + let mut literal: Option> = None; + let mut idx = literal_start; + while idx < pattern.len() { + match pattern[idx] { + b'\\' => { + // Trailing `\` is treated as a literal backslash. + let escaped = pattern.get(idx + 1).copied().unwrap_or(b'\\'); + literal + .get_or_insert_with(|| pattern[literal_start..idx].to_vec()) + .push(escaped); + idx = (idx + 2).min(pattern.len()); + } + b'%' if idx + 1 == pattern.len() => { + return Some(match literal { + Some(buf) => Cow::Owned(buf), + None => Cow::Borrowed(&pattern[literal_start..idx]), + }); + } + b'%' | b'_' => return None, + byte => { + // No-op on the borrowed path; only push once we've started copying. + if let Some(literal) = &mut literal { + literal.push(byte); + } + idx += 1; + } + } + } + None + } +} + +// --------------------------------------------------------------------------- +// Scan helper +// --------------------------------------------------------------------------- + +/// Evaluate `matcher` against every row of an OnPair `codes` window. +/// +/// `offsets` are the per-row boundaries into the *original* `codes` child; +/// `code_start` is the absolute index the sliced `codes` window begins at, so +/// `offsets[i] - code_start` indexes `codes`. +pub(crate) fn dfa_scan_to_bitbuf( + n: usize, + offsets: &[T], + code_start: usize, + codes: &[u16], + negated: bool, + matcher: F, +) -> BitBuffer +where + T: IntegerPType, + F: Fn(&[u16]) -> bool, +{ + BitBuffer::collect_bool(n, |i| { + let start: usize = offsets[i].as_() - code_start; + let end: usize = offsets[i + 1].as_() - code_start; + matcher(&codes[start..end]) != negated + }) +} + +// --------------------------------------------------------------------------- +// Per-code transition table +// --------------------------------------------------------------------------- + +/// Number of dictionary tokens (`= dict_offsets.len() - 1`). +fn n_codes(dict_offsets: &[u32]) -> usize { + dict_offsets.len().saturating_sub(1) +} + +/// Lift a byte-level transition table to a per-code table. +/// +/// For each `(state, code)` pair, feed the code's token bytes through +/// `byte_table` and record the resulting state. `accept_state` is sticky in +/// `byte_table`, so we can short-circuit once it is reached. +/// +/// Returns a flat `Vec` indexed as `[state * n_codes + code]`. +fn build_code_transitions( + dict_bytes: &[u8], + dict_offsets: &[u32], + byte_table: &[u8], + n_states: usize, + accept_state: u8, +) -> Vec { + let n_codes = n_codes(dict_offsets); + let mut trans = vec![0u8; n_states * n_codes]; + for state in 0..n_states { + let state = u8::try_from(state).vortex_expect("state fits in u8"); + for code in 0..n_codes { + let begin = dict_offsets[code] as usize; + let end = dict_offsets[code + 1] as usize; + let mut s = state; + if s != accept_state { + for &b in &dict_bytes[begin..end] { + s = byte_table[usize::from(s) * 256 + usize::from(b)]; + if s == accept_state { + break; + } + } + } + trans[usize::from(state) * n_codes + code] = s; + } + } + trans +} + +// --------------------------------------------------------------------------- +// KMP helpers (shared with the contains DFA) +// --------------------------------------------------------------------------- + +fn kmp_byte_transitions(needle: &[u8]) -> Vec { + let n_states = u8::try_from(needle.len() + 1) + .vortex_expect("kmp_byte_transitions: must have needle.len() <= 255"); + let accept = n_states - 1; + let failure = kmp_failure_table(needle); + + let mut table = vec![0u8; usize::from(n_states) * 256]; + for state in 0..n_states { + for byte in 0..256usize { + if state == accept { + table[usize::from(state) * 256 + byte] = accept; + continue; + } + let mut s = state; + loop { + if byte == usize::from(needle[usize::from(s)]) { + s += 1; + break; + } + if s == 0 { + break; + } + s = failure[usize::from(s) - 1]; + } + table[usize::from(state) * 256 + byte] = s; + } + } + table +} + +fn kmp_failure_table(needle: &[u8]) -> Vec { + let mut failure = vec![0u8; needle.len()]; + let mut k = 0u8; + for i in 1..needle.len() { + while k > 0 && needle[usize::from(k)] != needle[i] { + k = failure[usize::from(k) - 1]; + } + if needle[usize::from(k)] == needle[i] { + k += 1; + } + failure[i] = k; + } + failure +} diff --git a/encodings/experimental/onpair/src/dfa/prefix.rs b/encodings/experimental/onpair/src/dfa/prefix.rs new file mode 100644 index 00000000000..db1efe62d02 --- /dev/null +++ b/encodings/experimental/onpair/src/dfa/prefix.rs @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Flat per-code transition table DFA for prefix matching (`LIKE 'prefix%'`). +//! +//! Supports prefixes up to 253 bytes (states: 0..N progress + accept + fail ≤ +//! 256). OnPair has no escape code, so — unlike the FSST prefix DFA — there is +//! no escape sentinel or escape table. + +use vortex_error::VortexExpect; + +use super::build_code_transitions; +use super::n_codes; + +/// Flat per-code transition table DFA for prefix matching on OnPair codes. +/// +/// States `0..prefix_len` track match progress, plus ACCEPT and FAIL. +/// Transitions are stored in a flat `Vec` indexed as +/// `[state * n_codes + code]`. +/// +/// ```text +/// Prefix: "http" (4 progress states + accept + fail) +/// +/// Token bytes feed the byte table; the lifted code table records the +/// resulting state for each dictionary token. +/// State 0 wants 'h', 1 wants 't', 2 wants 't', 3 wants 'p', 4 = accept (sticky), +/// 5 = fail (sticky). +/// ``` +pub(crate) struct FlatPrefixDfa { + /// `transitions[state * n_codes + code]` -> next state. + transitions: Vec, + n_codes: usize, + accept_state: u8, + fail_state: u8, +} + +impl FlatPrefixDfa { + /// Maximum prefix length: need progress + accept + fail to fit in `u8`. + pub(crate) const MAX_PREFIX_LEN: usize = (u8::MAX - 2) as usize; + + pub(crate) fn new(dict_bytes: &[u8], dict_offsets: &[u32], prefix: &[u8]) -> Self { + debug_assert!(prefix.len() <= Self::MAX_PREFIX_LEN); + + let accept_state = u8::try_from(prefix.len()).vortex_expect("prefix length fits in u8"); + let fail_state = accept_state + 1; + let n_states = usize::from(fail_state) + 1; + + let byte_table = build_prefix_byte_table(prefix, accept_state, fail_state); + let transitions = build_code_transitions( + dict_bytes, + dict_offsets, + &byte_table, + n_states, + accept_state, + ); + + Self { + transitions, + n_codes: n_codes(dict_offsets), + accept_state, + fail_state, + } + } + + pub(crate) fn matches(&self, codes: &[u16]) -> bool { + let mut state = 0u8; + for &code in codes { + state = self.transitions[usize::from(state) * self.n_codes + usize::from(code)]; + if state == self.accept_state { + return true; + } + if state == self.fail_state { + return false; + } + } + state == self.accept_state + } +} + +/// Build a byte-level transition table for prefix matching. +/// +/// For each state, only the correct next byte advances; everything else goes to +/// the fail state. ACCEPT and FAIL are both sticky. +fn build_prefix_byte_table(prefix: &[u8], accept_state: u8, fail_state: u8) -> Vec { + let n_states = usize::from(fail_state) + 1; + let mut table = vec![fail_state; n_states * 256]; + + for state in 0..n_states { + if state == usize::from(accept_state) { + for byte in 0..256 { + table[state * 256 + byte] = accept_state; + } + } else if state != usize::from(fail_state) { + // Only the correct next byte advances; everything else fails. + let next_byte = prefix[state]; + let next_state = if state + 1 >= prefix.len() { + accept_state + } else { + u8::try_from(state + 1).vortex_expect("progress state fits in u8") + }; + table[state * 256 + usize::from(next_byte)] = next_state; + } + } + table +} diff --git a/encodings/experimental/onpair/src/dfa/tests.rs b/encodings/experimental/onpair/src/dfa/tests.rs new file mode 100644 index 00000000000..c1c75d97f72 --- /dev/null +++ b/encodings/experimental/onpair/src/dfa/tests.rs @@ -0,0 +1,364 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +// The randomised cross-check uses a tiny LCG whose `u64 -> usize` reductions are +// bounded well below `usize::MAX`; the truncation lint is noise here. +#![allow(clippy::cast_possible_truncation)] + +use std::sync::LazyLock; + +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::BoolArray; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::assert_arrays_eq; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::session::ArraySession; +use vortex_error::VortexResult; +use vortex_session::VortexSession; + +use crate::OnPair; +use crate::OnPairArray; +use crate::compress::DEFAULT_DICT12_CONFIG; +use crate::compress::onpair_compress; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn make_onpair(strings: &[Option<&str>], nullability: Nullability) -> OnPairArray { + let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability)); + let len = varbin.len(); + let dtype = varbin.dtype().clone(); + onpair_compress(&varbin, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap() +} + +fn run_like(array: OnPairArray, pattern: &str, opts: LikeOptions) -> VortexResult { + let len = array.len(); + let arr = array.into_array(); + let pattern = ConstantArray::new(pattern, len).into_array(); + let result = Like + .try_new_array(len, opts, [arr, pattern])? + .into_array() + .execute::(&mut SESSION.create_execution_ctx())?; + Ok(result.into_bool()) +} + +fn like(array: OnPairArray, pattern: &str) -> VortexResult { + run_like(array, pattern, LikeOptions::default()) +} + +#[test] +fn test_like_prefix() -> VortexResult<()> { + let onpair = make_onpair( + &[ + Some("http://example.com"), + Some("http://test.org"), + Some("ftp://files.net"), + Some("http://vortex.dev"), + Some("ssh://server.io"), + ], + Nullability::NonNullable, + ); + let result = like(onpair, "http%")?; + assert_arrays_eq!( + &result, + &BoolArray::from_iter([true, true, false, true, false]) + ); + Ok(()) +} + +#[test] +fn test_like_prefix_with_nulls() -> VortexResult<()> { + let onpair = make_onpair( + &[Some("hello"), None, Some("help"), None, Some("goodbye")], + Nullability::Nullable, + ); + let result = like(onpair, "hel%")?; // spellchecker:disable-line + assert_arrays_eq!( + &result, + &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)]) + ); + Ok(()) +} + +#[test] +fn test_like_contains() -> VortexResult<()> { + let onpair = make_onpair( + &[ + Some("hello world"), + Some("say hello"), + Some("goodbye"), + Some("hellooo"), + ], + Nullability::NonNullable, + ); + let result = like(onpair, "%hello%")?; + assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true])); + Ok(()) +} + +#[test] +fn test_like_contains_cross_token() -> VortexResult<()> { + let onpair = make_onpair( + &[ + Some("the quick brown fox jumps over the lazy dog"), + Some("a short string"), + Some("the lazy dog sleeps"), + Some("no match"), + ], + Nullability::NonNullable, + ); + let result = like(onpair, "%lazy dog%")?; + assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false])); + Ok(()) +} + +#[test] +fn test_not_like_contains() -> VortexResult<()> { + let onpair = make_onpair( + &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")], + Nullability::NonNullable, + ); + let opts = LikeOptions { + negated: true, + case_insensitive: false, + }; + let result = run_like(onpair, "%sdf%", opts)?; + assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true])); + Ok(()) +} + +#[test] +fn test_like_match_all() -> VortexResult<()> { + let onpair = make_onpair( + &[Some("abc"), Some(""), Some("xyz")], + Nullability::NonNullable, + ); + let result = like(onpair, "%")?; + assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true])); + Ok(()) +} + +/// Call `LikeKernel::like` directly and verify it returns `Some(...)` (i.e. the +/// kernel handles it, rather than returning `None` = "fall back to decompress"). +#[test] +fn test_like_prefix_kernel_handles() -> VortexResult<()> { + let onpair = make_onpair( + &[Some("http://a.com"), Some("ftp://b.com")], + Nullability::NonNullable, + ); + let pattern = ConstantArray::new("http%", onpair.len()).into_array(); + let mut ctx = SESSION.create_execution_ctx(); + + let onpair = onpair.as_view(); + let result = ::like(onpair, &pattern, LikeOptions::default(), &mut ctx)?; + assert!(result.is_some(), "OnPair LikeKernel should handle prefix%"); + assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false])); + Ok(()) +} + +#[test] +fn test_like_contains_kernel_handles() -> VortexResult<()> { + let onpair = make_onpair( + &[Some("hello world"), Some("goodbye")], + Nullability::NonNullable, + ); + let pattern = ConstantArray::new("%world%", onpair.len()).into_array(); + let mut ctx = SESSION.create_execution_ctx(); + + let onpair = onpair.as_view(); + let result = ::like(onpair, &pattern, LikeOptions::default(), &mut ctx)?; + assert!(result.is_some(), "OnPair LikeKernel should handle %needle%"); + assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false])); + Ok(()) +} + +/// Patterns we can't handle should return `None` (fall back). +#[test] +fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> { + let onpair = make_onpair(&[Some("abc"), Some("def")], Nullability::NonNullable); + let mut ctx = SESSION.create_execution_ctx(); + + // Underscore wildcard -- not handled. + let pattern = ConstantArray::new("a_c", onpair.len()).into_array(); + let onpair_v = onpair.as_view(); + let result = + ::like(onpair_v, &pattern, LikeOptions::default(), &mut ctx)?; + assert!(result.is_none(), "underscore pattern should fall back"); + + // Case-insensitive -- not handled. + let pattern = ConstantArray::new("abc%", onpair.len()).into_array(); + let opts = LikeOptions { + negated: false, + case_insensitive: true, + }; + let result = ::like(onpair_v, &pattern, opts, &mut ctx)?; + assert!(result.is_none(), "ilike should fall back"); + + // Suffix patterns are unsupported, even when the suffix is an escaped literal. + let pattern = ConstantArray::new(r"%\%", onpair.len()).into_array(); + let result = + ::like(onpair_v, &pattern, LikeOptions::default(), &mut ctx)?; + assert!(result.is_none(), "escaped suffix pattern should fall back"); + + Ok(()) +} + +#[test] +fn test_like_kernel_handles_escaped_prefix_and_contains() -> VortexResult<()> { + let onpair = make_onpair( + &[ + Some("%front"), + Some("_front"), + Some("\\front"), + Some("middle%value"), + Some("middle_value"), + Some("middle\\value"), + Some("front"), + ], + Nullability::NonNullable, + ); + let onpair_v = onpair.as_view(); + let mut ctx = SESSION.create_execution_ctx(); + + let pattern = ConstantArray::new(r"\%%", onpair.len()).into_array(); + let result = + ::like(onpair_v, &pattern, LikeOptions::default(), &mut ctx)?; + assert!(result.is_some(), "escaped percent prefix should use OnPair"); + assert_arrays_eq!( + result.unwrap(), + BoolArray::from_iter([true, false, false, false, false, false, false]) + ); + + let pattern = ConstantArray::new(r"%\_%", onpair.len()).into_array(); + let result = + ::like(onpair_v, &pattern, LikeOptions::default(), &mut ctx)?; + assert!( + result.is_some(), + "escaped underscore contains should use OnPair" + ); + assert_arrays_eq!( + result.unwrap(), + BoolArray::from_iter([false, true, false, false, true, false, false]) + ); + + let pattern = ConstantArray::new(r"%\\%", onpair.len()).into_array(); + let result = + ::like(onpair_v, &pattern, LikeOptions::default(), &mut ctx)?; + assert!( + result.is_some(), + "escaped backslash contains should use OnPair" + ); + assert_arrays_eq!( + result.unwrap(), + BoolArray::from_iter([false, false, true, false, false, true, false]) + ); + + Ok(()) +} + +/// Longer multi-token needle that spans many dictionary tokens, validating the +/// per-code lift over a realistic dictionary. +#[test] +fn test_like_contains_long_needle() -> VortexResult<()> { + let rows: Vec> = (0..256) + .map(|i| Some(format!("https://www.example.com/path/{}/segment", i % 17))) + .collect(); + let refs: Vec> = rows.iter().map(|s| s.as_deref()).collect(); + let onpair = make_onpair(&refs, Nullability::NonNullable); + + let result = like(onpair, "%example.com/path%")?; + assert_arrays_eq!(&result, &BoolArray::from_iter(vec![true; 256])); + Ok(()) +} + +/// Randomised cross-check: the compressed-domain DFA must agree with +/// ground-truth `starts_with` / `contains` over the original strings, across +/// many rows and many needles. This is the soundness guard — a row's +/// concatenated token bytes are its decompressed bytes, so the DFA result must +/// match a byte-level match exactly. +#[test] +fn test_like_matches_ground_truth_fuzz() -> VortexResult<()> { + // Small deterministic LCG so the test is reproducible without extra deps. + let mut state = 0x2545_F491_4F6C_DD1Du64; + let mut next = move || { + state = state + .wrapping_mul(6364136223846793005) + .wrapping_add(1442695040888963407); + state + }; + + let alphabet = b"abcde /.:"; + let rng_string = |n: usize, r: &mut dyn FnMut() -> u64| -> String { + (0..n) + .map(|_| alphabet[(r() as usize) % alphabet.len()] as char) + .collect() + }; + + let rows: Vec = (0..600) + .map(|_| { + let len = 1 + (next() as usize) % 24; + rng_string(len, &mut next) + }) + .collect(); + let refs: Vec> = rows.iter().map(|s| Some(s.as_str())).collect(); + let onpair = make_onpair(&refs, Nullability::NonNullable); + + // A mix of prefix and contains needles, including ones unlikely to match. + let needles = [ + "a", "ab", "abc", "/", ".", ":", "a/", "/.", "de", "cde", "abcde", "zz", "a.b", " ", + ]; + for needle in needles { + for (kind, pattern) in [ + ("prefix", format!("{needle}%")), + ("contains", format!("%{needle}%")), + ] { + let got = like(onpair.clone(), &pattern)?; + let expected: Vec = rows + .iter() + .map(|s| { + if kind == "prefix" { + s.starts_with(needle) + } else { + s.contains(needle) + } + }) + .collect(); + assert_arrays_eq!(&got, &BoolArray::from_iter(expected.clone())); + } + } + Ok(()) +} + +/// A `%needle%` longer than the contains DFA's state space must fall back. +#[test] +fn test_like_long_contains_falls_back() -> VortexResult<()> { + let needle = "a".repeat(255); + let matching = format!("xx{needle}yy"); + let pattern = format!("%{needle}%"); + + let onpair = make_onpair(&[Some(&matching)], Nullability::NonNullable); + let onpair_v = onpair.as_view(); + let direct = ::like( + onpair_v, + &ConstantArray::new(pattern.as_str(), onpair.len()).into_array(), + LikeOptions::default(), + &mut SESSION.create_execution_ctx(), + )?; + assert!( + direct.is_none(), + "contains needles longer than 254 bytes exceed the DFA's u8 state space" + ); + + // ...but the generic fallback path still produces the right answer. + let result = like(onpair, &pattern)?; + assert_arrays_eq!(&result, &BoolArray::from_iter([true])); + Ok(()) +} diff --git a/encodings/experimental/onpair/src/kernel.rs b/encodings/experimental/onpair/src/kernel.rs index fdd521e887e..0e288cb4bb7 100644 --- a/encodings/experimental/onpair/src/kernel.rs +++ b/encodings/experimental/onpair/src/kernel.rs @@ -3,9 +3,12 @@ use vortex_array::arrays::filter::FilterExecuteAdaptor; use vortex_array::kernel::ParentKernelSet; +use vortex_array::scalar_fn::fns::like::LikeExecuteAdaptor; use crate::OnPair; // TODO: implement ListExecute & TakeExecute for OnPair -pub(super) const PARENT_KERNELS: ParentKernelSet = - ParentKernelSet::new(&[ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair))]); +pub(super) const PARENT_KERNELS: ParentKernelSet = ParentKernelSet::new(&[ + ParentKernelSet::lift(&FilterExecuteAdaptor(OnPair)), + ParentKernelSet::lift(&LikeExecuteAdaptor(OnPair)), +]); diff --git a/encodings/experimental/onpair/src/lib.rs b/encodings/experimental/onpair/src/lib.rs index 94c18b6dec8..240526fa4fe 100644 --- a/encodings/experimental/onpair/src/lib.rs +++ b/encodings/experimental/onpair/src/lib.rs @@ -15,6 +15,7 @@ mod canonical; mod compress; mod compute; mod decode; +mod dfa; mod kernel; mod ops; mod rules; From 916eb928152fe2d3f439b968f403f6334295ab24 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 15:09:33 +0000 Subject: [PATCH 2/5] bench(onpair): LIKE pushdown microbench + benchmark knobs Add a divan microbenchmark comparing the compressed-domain LIKE pushdown against the decompress-and-match fallback on a 200k-row OnPair-encoded URL column. On this corpus the pushdown is ~1.9-2.2x faster for prefix and ~2.4-3.3x for contains. Two benchmark-enablement knobs: - `VORTEX_ONPAIR_LIKE_PUSHDOWN=0` forces the OnPair LikeKernel to decline (fall back to decompression), so the same binary can A/B the pushdown end-to-end without a rebuild. Read once. - `CLICKBENCH_PARTITIONS=N` caps how many ClickBench shards are fetched and queried, for local/iterative runs (the full suite still defaults to 100). Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/Cargo.toml | 4 + encodings/experimental/onpair/benches/like.rs | 153 ++++++++++++++++++ .../experimental/onpair/src/compute/like.rs | 16 ++ vortex-bench/src/clickbench/data.rs | 13 +- 4 files changed, 184 insertions(+), 2 deletions(-) create mode 100644 encodings/experimental/onpair/benches/like.rs diff --git a/encodings/experimental/onpair/Cargo.toml b/encodings/experimental/onpair/Cargo.toml index 258568cc75a..5fd4cdfe98e 100644 --- a/encodings/experimental/onpair/Cargo.toml +++ b/encodings/experimental/onpair/Cargo.toml @@ -38,3 +38,7 @@ vortex-array = { workspace = true, features = ["_test-harness"] } [[bench]] name = "decode" harness = false + +[[bench]] +name = "like" +harness = false diff --git a/encodings/experimental/onpair/benches/like.rs b/encodings/experimental/onpair/benches/like.rs new file mode 100644 index 00000000000..4eaf345da74 --- /dev/null +++ b/encodings/experimental/onpair/benches/like.rs @@ -0,0 +1,153 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors +// +//! LIKE-pushdown microbenchmarks for the OnPair Vortex array. +//! +//! Compares two ways of evaluating `LIKE 'prefix%'` / `LIKE '%needle%'` on an +//! OnPair-encoded URL column: +//! +//! * `pushdown` — the compressed-domain per-code DFA kernel +//! (`::like`), which never decompresses. +//! * `fallback` — what the engine does when no pushdown kernel handles the +//! predicate: canonicalise to `VarBinViewArray` (decompress) and run the +//! standard string LIKE. +//! +//! Both arms produce the same boolean result; the delta is the cost the +//! pushdown avoids. + +#![allow( + clippy::cast_possible_truncation, + clippy::cast_lossless, + clippy::panic, + clippy::tests_outside_test_module, + clippy::unwrap_used, + clippy::expect_used +)] + +use std::sync::LazyLock; + +use divan::Bencher; +use divan::black_box; +use vortex_array::ArrayRef; +use vortex_array::Canonical; +use vortex_array::ExecutionCtx; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::VarBinArray; +use vortex_array::arrays::VarBinViewArray; +use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeKernel; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::session::ArraySession; +use vortex_onpair::DEFAULT_DICT12_CONFIG; +use vortex_onpair::OnPair; +use vortex_onpair::OnPairArray; +use vortex_onpair::onpair_compress; +use vortex_session::VortexSession; + +fn main() { + divan::main(); +} + +const N_ROWS: usize = 200_000; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +fn ctx() -> ExecutionCtx { + SESSION.create_execution_ctx() +} + +/// A ClickBench-shaped URL corpus: high lexical overlap, a handful of hosts and +/// path templates, so OnPair's learned dictionary applies. +static CORPUS: LazyLock = LazyLock::new(|| { + let hosts = [ + "https://www.example.com", + "http://ads.adriver.ru", + "https://yandex.ru/search", + "https://www.google.com/maps", + "http://shop.bonprix.ru", + "https://video.yandex.ru/users", + ]; + let paths = [ + "/page/", + "/product/", + "/category/", + "/search?text=", + "/index?id=", + "/cart/item/", + ]; + let rows: Vec = (0..N_ROWS) + .map(|i| { + let h = hosts[i % hosts.len()]; + let p = paths[(i / 7) % paths.len()]; + format!("{h}{p}{}", i % 9973) + }) + .collect(); + let refs: Vec> = rows.iter().map(|s| Some(s.as_str())).collect(); + let varbin = VarBinArray::from_iter(refs, DType::Utf8(Nullability::NonNullable)); + let len = varbin.len(); + let dtype = varbin.dtype().clone(); + onpair_compress(&varbin, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap() +}); + +fn pattern_array(pattern: &str) -> ArrayRef { + ConstantArray::new(pattern, CORPUS.len()).into_array() +} + +/// Compressed-domain pushdown: the DFA kernel, no decompression. +fn run_pushdown(pattern: &ArrayRef) -> ArrayRef { + let mut ctx = ctx(); + ::like(CORPUS.as_view(), pattern, LikeOptions::default(), &mut ctx) + .unwrap() + .expect("OnPair pushdown should handle this pattern") +} + +/// Fallback: decompress to `VarBinViewArray`, then run the standard LIKE. +fn run_fallback(pattern: &ArrayRef) -> ArrayRef { + let mut ctx = ctx(); + let canonical = CORPUS + .clone() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array(); + Like.try_new_array( + CORPUS.len(), + LikeOptions::default(), + [canonical, pattern.clone()], + ) + .unwrap() + .into_array() + .execute::(&mut ctx) + .unwrap() + .into_array() +} + +#[divan::bench(args = ["https://www.example.com%", "https://www.google.com/maps%"])] +fn prefix_pushdown(bencher: Bencher, pattern: &str) { + let p = pattern_array(pattern); + bencher.bench(|| black_box(run_pushdown(&p))); +} + +#[divan::bench(args = ["https://www.example.com%", "https://www.google.com/maps%"])] +fn prefix_fallback(bencher: Bencher, pattern: &str) { + let p = pattern_array(pattern); + bencher.bench(|| black_box(run_fallback(&p))); +} + +#[divan::bench(args = ["%yandex%", "%/search?text=%", "%bonprix%"])] +fn contains_pushdown(bencher: Bencher, pattern: &str) { + let p = pattern_array(pattern); + bencher.bench(|| black_box(run_pushdown(&p))); +} + +#[divan::bench(args = ["%yandex%", "%/search?text=%", "%bonprix%"])] +fn contains_fallback(bencher: Bencher, pattern: &str) { + let p = pattern_array(pattern); + bencher.bench(|| black_box(run_fallback(&p))); +} diff --git a/encodings/experimental/onpair/src/compute/like.rs b/encodings/experimental/onpair/src/compute/like.rs index eca5064dc99..bad4a78229f 100644 --- a/encodings/experimental/onpair/src/compute/like.rs +++ b/encodings/experimental/onpair/src/compute/like.rs @@ -4,6 +4,8 @@ //! `LIKE` pushdown for OnPair, evaluating `prefix%` and `%needle%` patterns //! directly on the compressed code stream via a per-code DFA (see [`crate::dfa`]). +use std::sync::LazyLock; + use num_traits::AsPrimitive; use vortex_array::ArrayRef; use vortex_array::ArrayView; @@ -22,6 +24,16 @@ use crate::decode::collect_widened; use crate::dfa::OnPairMatcher; use crate::dfa::dfa_scan_to_bitbuf; +/// Escape hatch for measuring or debugging the compressed-domain LIKE pushdown: +/// set `VORTEX_ONPAIR_LIKE_PUSHDOWN=0` (or `off`/`false`) to force the kernel to +/// decline, falling back to canonical decompression + LIKE. Read once. +static PUSHDOWN_DISABLED: LazyLock = LazyLock::new(|| { + matches!( + std::env::var("VORTEX_ONPAIR_LIKE_PUSHDOWN").as_deref(), + Ok("0") | Ok("off") | Ok("false") + ) +}); + impl LikeKernel for OnPair { fn like( array: ArrayView<'_, Self>, @@ -29,6 +41,10 @@ impl LikeKernel for OnPair { options: LikeOptions, ctx: &mut ExecutionCtx, ) -> VortexResult> { + if *PUSHDOWN_DISABLED { + return Ok(None); + } + let Some(pattern_scalar) = pattern.as_constant() else { return Ok(None); }; diff --git a/vortex-bench/src/clickbench/data.rs b/vortex-bench/src/clickbench/data.rs index ba2a2104192..62cee11d6ea 100644 --- a/vortex-bench/src/clickbench/data.rs +++ b/vortex-bench/src/clickbench/data.rs @@ -193,9 +193,18 @@ impl Flavor { Flavor::Partitioned => { // The clickbench-provided file is missing some higher-level type info, so we reprocess it // to add that info, see https://github.com/ClickHouse/ClickBench/issues/7. - info!("Downloading 100 ClickBench parquet shards"); + // + // The full benchmark uses all 100 shards. For local/iterative runs the + // `CLICKBENCH_PARTITIONS` env var caps how many shards are fetched (and, + // since the directory is converted as-is, how many are queried). + let n_shards: u32 = std::env::var("CLICKBENCH_PARTITIONS") + .ok() + .and_then(|v| v.parse().ok()) + .map(|n: u32| n.clamp(1, 100)) + .unwrap_or(100); + info!("Downloading {n_shards} ClickBench parquet shards"); let parquet_dir = basepath.join(Format::Parquet.name()); - let downloads = (0_u32..100).map(|idx| { + let downloads = (0_u32..n_shards).map(|idx| { let output_path = parquet_dir.join(format!("hits_{idx}.parquet")); let url = format!("https://pub-3ba949c0f0354ac18db1f0f14f0a2c52.r2.dev/clickbench/parquet_many/hits_{idx}.parquet"); (output_path, url) From 527ada3fc20e97c55e5b3bc5fe8a53548aa98939 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 19:42:09 +0000 Subject: [PATCH 3/5] perf(onpair): hoist LIKE matcher dispatch out of the row loop Select the DFA variant once in `OnPairMatcher::scan_to_bitbuf` instead of re-matching the matcher enum per row through a closure, mark the concrete `FlatContainsDfa`/`FlatPrefixDfa::matches` `#[inline]`, and walk row offsets with a running cursor. This lets the row scan monomorphise and inline the DFA step. Controlled microbench (same machine, back-to-back): contains pushdown ~1.16-1.26x faster (e.g. %bonprix% 1.84ms -> 1.46ms), prefix marginally faster. Also add an instrumented characterization test proving where the pushdown actually fires through the execution engine: bare OnPair and Dict(OnPair) both route the predicate to the kernel, but Dict(Shared(OnPair)) -- the shape a dict-encoded column takes when read back from a multi-chunk file -- does not, because `Shared` has no parent-reduce forwarding and canonicalizes (decompresses) instead. This is why the compressed-domain LIKE pushdown does not move end-to-end ClickBench/TPC-H numbers, and it affects FSST identically. Signed-off-by: Joe Isaacs --- .../experimental/onpair/src/compute/like.rs | 17 +++- .../experimental/onpair/src/compute/mod.rs | 2 +- .../experimental/onpair/src/dfa/contains.rs | 1 + encodings/experimental/onpair/src/dfa/mod.rs | 47 +++++++---- .../experimental/onpair/src/dfa/prefix.rs | 1 + .../experimental/onpair/src/dfa/tests.rs | 84 +++++++++++++++++++ 6 files changed, 135 insertions(+), 17 deletions(-) diff --git a/encodings/experimental/onpair/src/compute/like.rs b/encodings/experimental/onpair/src/compute/like.rs index bad4a78229f..236aa953bd4 100644 --- a/encodings/experimental/onpair/src/compute/like.rs +++ b/encodings/experimental/onpair/src/compute/like.rs @@ -5,6 +5,10 @@ //! directly on the compressed code stream via a per-code DFA (see [`crate::dfa`]). use std::sync::LazyLock; +#[cfg(test)] +use std::sync::atomic::AtomicUsize; +#[cfg(test)] +use std::sync::atomic::Ordering; use num_traits::AsPrimitive; use vortex_array::ArrayRef; @@ -22,7 +26,13 @@ use crate::OnPair; use crate::OnPairArraySlotsExt; use crate::decode::collect_widened; use crate::dfa::OnPairMatcher; -use crate::dfa::dfa_scan_to_bitbuf; + +/// Test-only counter of how many times the kernel actually committed to the +/// compressed-domain pushdown (i.e. returned `Some` via the DFA path). Lets +/// tests assert that the pushdown genuinely fires through the execution engine, +/// including through wrappers like `Dict` and `Shared`. +#[cfg(test)] +pub(crate) static PUSHDOWN_HITS: AtomicUsize = AtomicUsize::new(0); /// Escape hatch for measuring or debugging the compressed-domain LIKE pushdown: /// set `VORTEX_ONPAIR_LIKE_PUSHDOWN=0` (or `off`/`false`) to force the kernel to @@ -79,6 +89,9 @@ impl LikeKernel for OnPair { return Ok(None); }; + #[cfg(test)] + PUSHDOWN_HITS.fetch_add(1, Ordering::Relaxed); + let negated = options.negated; let n = array.len(); @@ -105,7 +118,7 @@ impl LikeKernel for OnPair { let result = match_each_integer_ptype!(offsets.ptype(), |T| { let off = offsets.as_slice::(); - dfa_scan_to_bitbuf(n, off, code_start, codes, negated, |c| matcher.matches(c)) + matcher.scan_to_bitbuf(n, off, code_start, codes, negated) }); let validity = array diff --git a/encodings/experimental/onpair/src/compute/mod.rs b/encodings/experimental/onpair/src/compute/mod.rs index 1d506e6b377..eb4ede0406b 100644 --- a/encodings/experimental/onpair/src/compute/mod.rs +++ b/encodings/experimental/onpair/src/compute/mod.rs @@ -3,5 +3,5 @@ mod cast; mod filter; -mod like; +pub(crate) mod like; mod slice; diff --git a/encodings/experimental/onpair/src/dfa/contains.rs b/encodings/experimental/onpair/src/dfa/contains.rs index 4c37c417ed6..ed562e33e19 100644 --- a/encodings/experimental/onpair/src/dfa/contains.rs +++ b/encodings/experimental/onpair/src/dfa/contains.rs @@ -51,6 +51,7 @@ impl FlatContainsDfa { } } + #[inline] pub(crate) fn matches(&self, codes: &[u16]) -> bool { let mut state = 0u8; for &code in codes { diff --git a/encodings/experimental/onpair/src/dfa/mod.rs b/encodings/experimental/onpair/src/dfa/mod.rs index b6a74e462cd..829c4ac40c2 100644 --- a/encodings/experimental/onpair/src/dfa/mod.rs +++ b/encodings/experimental/onpair/src/dfa/mod.rs @@ -123,12 +123,31 @@ impl OnPairMatcher { Ok(Some(Self { inner })) } - /// Run the matcher on a single row's OnPair code sequence. - pub(crate) fn matches(&self, codes: &[u16]) -> bool { + /// Evaluate every row of an OnPair `codes` window into a [`BitBuffer`]. + /// + /// `offsets` are the per-row boundaries into the *original* `codes` child; + /// `code_start` is the absolute index the sliced `codes` window begins at, + /// so `offsets[i] - code_start` indexes `codes`. + /// + /// The matcher variant is selected once, outside the row loop, so the + /// concrete DFA's `matches` inlines into a monomorphic scan rather than + /// re-dispatching the enum per row. + pub(crate) fn scan_to_bitbuf( + &self, + n: usize, + offsets: &[T], + code_start: usize, + codes: &[u16], + negated: bool, + ) -> BitBuffer { match &self.inner { - MatcherInner::MatchAll => true, - MatcherInner::Prefix(dfa) => dfa.matches(codes), - MatcherInner::Contains(dfa) => dfa.matches(codes), + MatcherInner::MatchAll => BitBuffer::collect_bool(n, |_| !negated), + MatcherInner::Prefix(dfa) => { + scan_rows(n, offsets, code_start, codes, negated, |c| dfa.matches(c)) + } + MatcherInner::Contains(dfa) => { + scan_rows(n, offsets, code_start, codes, negated, |c| dfa.matches(c)) + } } } } @@ -203,27 +222,27 @@ impl<'a> LikeKind<'a> { // Scan helper // --------------------------------------------------------------------------- -/// Evaluate `matcher` against every row of an OnPair `codes` window. -/// -/// `offsets` are the per-row boundaries into the *original* `codes` child; -/// `code_start` is the absolute index the sliced `codes` window begins at, so -/// `offsets[i] - code_start` indexes `codes`. -pub(crate) fn dfa_scan_to_bitbuf( +/// Walk a `codes` window row by row with a single concrete row matcher, +/// carrying a running start cursor (consecutive rows are contiguous in +/// `codes`) instead of re-reading both boundaries each row. +fn scan_rows( n: usize, offsets: &[T], code_start: usize, codes: &[u16], negated: bool, - matcher: F, + row_matches: F, ) -> BitBuffer where T: IntegerPType, F: Fn(&[u16]) -> bool, { + let mut start: usize = offsets[0].as_() - code_start; BitBuffer::collect_bool(n, |i| { - let start: usize = offsets[i].as_() - code_start; let end: usize = offsets[i + 1].as_() - code_start; - matcher(&codes[start..end]) != negated + let result = row_matches(&codes[start..end]) != negated; + start = end; + result }) } diff --git a/encodings/experimental/onpair/src/dfa/prefix.rs b/encodings/experimental/onpair/src/dfa/prefix.rs index db1efe62d02..6b108e09b1a 100644 --- a/encodings/experimental/onpair/src/dfa/prefix.rs +++ b/encodings/experimental/onpair/src/dfa/prefix.rs @@ -62,6 +62,7 @@ impl FlatPrefixDfa { } } + #[inline] pub(crate) fn matches(&self, codes: &[u16]) -> bool { let mut state = 0u8; for &code in codes { diff --git a/encodings/experimental/onpair/src/dfa/tests.rs b/encodings/experimental/onpair/src/dfa/tests.rs index c1c75d97f72..9ceb24857a6 100644 --- a/encodings/experimental/onpair/src/dfa/tests.rs +++ b/encodings/experimental/onpair/src/dfa/tests.rs @@ -6,21 +6,27 @@ #![allow(clippy::cast_possible_truncation)] use std::sync::LazyLock; +use std::sync::atomic::Ordering::Relaxed; +use vortex_array::ArrayRef; use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; use vortex_array::arrays::BoolArray; use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::DictArray; +use vortex_array::arrays::SharedArray; use vortex_array::arrays::VarBinArray; use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt; use vortex_array::assert_arrays_eq; use vortex_array::dtype::DType; use vortex_array::dtype::Nullability; +use vortex_array::optimizer::ArrayOptimizer; use vortex_array::scalar_fn::fns::like::Like; use vortex_array::scalar_fn::fns::like::LikeKernel; use vortex_array::scalar_fn::fns::like::LikeOptions; use vortex_array::session::ArraySession; +use vortex_buffer::buffer; use vortex_error::VortexResult; use vortex_session::VortexSession; @@ -28,6 +34,7 @@ use crate::OnPair; use crate::OnPairArray; use crate::compress::DEFAULT_DICT12_CONFIG; use crate::compress::onpair_compress; +use crate::compute::like::PUSHDOWN_HITS; static SESSION: LazyLock = LazyLock::new(|| VortexSession::empty().with::()); @@ -337,6 +344,83 @@ fn test_like_matches_ground_truth_fuzz() -> VortexResult<()> { Ok(()) } +/// Prove the pushdown *fires through the execution engine* — not just when the +/// kernel is called directly — for a bare OnPair array, a `Dict(OnPair)`, and a +/// `Dict(Shared(OnPair))` (the shape a dict-encoded column takes when read back +/// from a file, where the layout wraps shared dictionary values in `Shared`). +/// +/// Correct results alone can't distinguish "pushdown ran" from "decompressed and +/// matched", so this asserts on the kernel's hit counter. +#[test] +fn test_pushdown_fires_through_dict_and_shared() -> VortexResult<()> { + let values = make_onpair( + &[ + Some("https://google.com"), + Some("http://yandex.ru"), + Some("https://google.com/maps"), + ], + Nullability::NonNullable, + ); + // 5 rows referencing the 3 dictionary values; values.len() <= codes.len() so + // Dict's LikeReduce pushes the predicate down to the values. + let codes = buffer![0u8, 1, 2, 0, 1].into_array(); + // `%google%` over the 3 dictionary values, then over the 5 dict rows. + let expected_values = BoolArray::from_iter([true, false, true]); + let expected_rows = BoolArray::from_iter([true, false, true, true, false]); + + let run = |arr: ArrayRef| -> VortexResult<(BoolArray, usize)> { + let before = PUSHDOWN_HITS.load(Relaxed); + let len = arr.len(); + let pattern = ConstantArray::new("%google%", len).into_array(); + // Optimize first (as the engine does): this runs Dict's LIKE reduce, + // which pushes the predicate down to the dictionary values. + let result = Like + .try_new_array(len, LikeOptions::default(), [arr, pattern])? + .into_array() + .optimize()? + .execute::(&mut SESSION.create_execution_ctx())? + .into_bool(); + Ok((result, PUSHDOWN_HITS.load(Relaxed) - before)) + }; + + // (a) bare OnPair — the predicate dispatches straight to our kernel. + let (ra, hits_a) = run(values.clone().into_array())?; + assert_arrays_eq!(&ra, &expected_values); + + // (b) Dict(OnPair) — Dict::like pushes the predicate to the OnPair values. + let dict = DictArray::try_new(codes.clone(), values.clone().into_array())?; + let (rb, hits_b) = run(dict.into_array())?; + assert_arrays_eq!(&rb, &expected_rows); + + // (c) Dict(Shared(OnPair)) — the read-back shape. + let shared = SharedArray::new(values.into_array()).into_array(); + let dict = DictArray::try_new(codes, shared)?; + let (rc, hits_c) = run(dict.into_array())?; + assert_arrays_eq!(&rc, &expected_rows); + + eprintln!("pushdown hits: bare={hits_a} dict={hits_b} dict_shared={hits_c}"); + + // Bare OnPair and Dict(OnPair) both route the predicate to our kernel. + assert!(hits_a >= 1, "bare OnPair LIKE should fire the pushdown"); + assert!(hits_b >= 1, "Dict(OnPair) LIKE should fire the pushdown"); + + // KNOWN GAP: `Dict(Shared(OnPair))` — the shape a dict-encoded column takes + // when read back from a multi-chunk file — does NOT fire the pushdown, because + // `Shared` has no parent-reduce forwarding: a `LIKE` over it canonicalizes + // (decompresses) the source instead of pushing the predicate into the OnPair + // array. This is why the compressed-domain LIKE pushdown does not move + // end-to-end ClickBench/TPC-H numbers, and it affects FSST identically. The + // fix lives in `vortex-array`'s `Shared` (forward scalar-fn reduces to the + // source); when that lands, `hits_c` becomes >= 1 and this guard should flip. + assert_eq!( + hits_c, 0, + "Dict(Shared(OnPair)) unexpectedly fired the pushdown ({hits_c}); if Shared \ + now forwards LIKE to its source, update this characterization assertion" + ); + + Ok(()) +} + /// A `%needle%` longer than the contains DFA's state space must fall back. #[test] fn test_like_long_contains_falls_back() -> VortexResult<()> { From b6cc8bd213776878a432943aab7a910075397154 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 20:18:21 +0000 Subject: [PATCH 4/5] fix(layout): apply dict predicates to bare values so pushdown fires A dict-encoded string column reads back as `Dict(codes, Shared(values))`. `Shared` (which dedups the decoded dictionary across row splits) has no parent-reduce forwarding, so a predicate pushed to the values -- `like(Shared(onpair))` -- canonicalizes (decompresses) the source instead of reaching the OnPair/FSST LIKE kernel. Because the filter path's `values_array_uncanonical` reused the projection's `Shared`-wrapped cache, any query that both projects and filters the same column (e.g. ClickBench Q22's `MIN(URL)` + `WHERE URL LIKE`) silently lost the pushdown. Give the predicate path its own bare (non-`Shared`) values cache, built on the same underlying read as the `Shared` projection cache (values are read once). Projection keeps `Shared` for cross-split decode reuse; predicates get bare values so the optimizer can push them into the values encoding. Verified end-to-end on a ClickBench shard (OnPair-encoded `URL`): - Q22-shape (filter + project URL): kernel firings 0 -> 44, query faster. - count(*) filter: still 44 firings, result unchanged. - Q34 (GROUP BY URL, pure decode): unchanged (no decode-cache regression). Also retarget the OnPair characterization test's comment at this layout fix (the array-level `Shared`-blocks-pushdown behavior it pins is what motivates applying predicates to bare values). Signed-off-by: Joe Isaacs --- .../experimental/onpair/src/dfa/tests.rs | 16 ++--- vortex-layout/src/layouts/dict/reader.rs | 66 ++++++++++--------- 2 files changed, 43 insertions(+), 39 deletions(-) diff --git a/encodings/experimental/onpair/src/dfa/tests.rs b/encodings/experimental/onpair/src/dfa/tests.rs index 9ceb24857a6..def371d0e6e 100644 --- a/encodings/experimental/onpair/src/dfa/tests.rs +++ b/encodings/experimental/onpair/src/dfa/tests.rs @@ -404,14 +404,14 @@ fn test_pushdown_fires_through_dict_and_shared() -> VortexResult<()> { assert!(hits_a >= 1, "bare OnPair LIKE should fire the pushdown"); assert!(hits_b >= 1, "Dict(OnPair) LIKE should fire the pushdown"); - // KNOWN GAP: `Dict(Shared(OnPair))` — the shape a dict-encoded column takes - // when read back from a multi-chunk file — does NOT fire the pushdown, because - // `Shared` has no parent-reduce forwarding: a `LIKE` over it canonicalizes - // (decompresses) the source instead of pushing the predicate into the OnPair - // array. This is why the compressed-domain LIKE pushdown does not move - // end-to-end ClickBench/TPC-H numbers, and it affects FSST identically. The - // fix lives in `vortex-array`'s `Shared` (forward scalar-fn reduces to the - // source); when that lands, `hits_c` becomes >= 1 and this guard should flip. + // At the *array* level, `LIKE` over `Dict(Shared(OnPair))` still does NOT push + // down: `Shared` has no parent-reduce forwarding, so the predicate + // canonicalizes (decompresses) the source instead of reaching the OnPair + // kernel. The fix is at the *layout* level: the dict reader applies predicates + // to the bare (non-`Shared`) values so a column read back from a multi-chunk + // file fires the pushdown (verified end-to-end) — see + // `vortex-layout`'s `DictReader::values_array_uncanonical`. This guard pins the + // array-level behavior that motivates that layout choice. assert_eq!( hits_c, 0, "Dict(Shared(OnPair)) unexpectedly fired the pushdown ({hits_c}); if Shared \ diff --git a/vortex-layout/src/layouts/dict/reader.rs b/vortex-layout/src/layouts/dict/reader.rs index 002b4b1e902..937d2ff5269 100644 --- a/vortex-layout/src/layouts/dict/reader.rs +++ b/vortex-layout/src/layouts/dict/reader.rs @@ -43,8 +43,15 @@ pub struct DictReader { /// Length of the values array values_len: usize, - /// Cached dict values array + /// Cached dict values array, wrapped in `Shared` so a pure-decode projection + /// reuses one canonicalization across row splits. values_array: OnceLock, + /// Cached dict values array *without* the `Shared` wrapper. Predicate + /// pushdown (see [`Self::values_eval`]) applies the expression to this bare + /// array so the optimizer can push it into the values encoding (e.g. an + /// OnPair/FSST `LIKE` kernel) rather than canonicalizing through `Shared`, + /// which would force a full decompress. + values_array_bare: OnceLock, /// Cache of expression evaluation results on the values array by expression values_evals: DashMap, @@ -80,53 +87,50 @@ impl DictReader { session, values_len, values_array: Default::default(), + values_array_bare: Default::default(), values_evals: Default::default(), values, codes, }) } + /// Read the full dictionary values array (uncanonicalized, bare). + fn read_values(&self) -> SharedArrayFuture { + let values_len = self.values_len; + self.values + .projection_evaluation( + &(0..values_len as u64), + &root(), + MaskFuture::new_true(values_len), + ) + .vortex_expect("must construct dict values array evaluation") + .map_err(Arc::new) + .boxed() + .shared() + } + fn values_array(&self) -> SharedArrayFuture { // We capture the name, so it may be wrong if we re-use the same reader within multiple // different parent readers. But that's rare... - let values_len = self.values_len; + // + // Wrap the *same* bare values read (so the dictionary values are read + // once) in `Shared`, which caches one canonicalization across splits. + let bare = self.values_array_uncanonical(); self.values_array - .get_or_init(move || { - self.values - .projection_evaluation( - &(0..values_len as u64), - &root(), - MaskFuture::new_true(values_len), - ) - .vortex_expect("must construct dict values array evaluation") - .map_err(Arc::new) - .map(move |array| { - let array = array?; - Ok(SharedArray::new(array).into_array()) - }) + .get_or_init(|| { + bare.map(move |array| Ok(SharedArray::new(array?).into_array())) .boxed() .shared() }) .clone() } - // This is the dict values array without canonicalization, if not already canonical + /// The dict values array without the `Shared` wrapper, so a predicate applied + /// to it can be pushed into the values encoding instead of canonicalizing. fn values_array_uncanonical(&self) -> SharedArrayFuture { - // We capture the name, so it may be wrong if we re-use the same reader within multiple - // different parent readers. But that's rare... - let values_len = self.values_len; - self.values_array.get().cloned().unwrap_or_else(|| { - self.values - .projection_evaluation( - &(0..values_len as u64), - &root(), - MaskFuture::new_true(values_len), - ) - .vortex_expect("must construct dict values array evaluation") - .map_err(Arc::new) - .boxed() - .shared() - }) + self.values_array_bare + .get_or_init(|| self.read_values()) + .clone() } fn values_eval(&self, expr: Expression) -> SharedArrayFuture { From 525788872d73573c1038eb9a929bc0ef3852a4bc Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 21:39:24 +0000 Subject: [PATCH 5/5] perf(onpair): only build DFA columns for codes the pattern can match The per-call DFA table was the dominant cost of the LIKE pushdown on dict-encoded columns (~17% of ClickBench Q21 in a samply profile): it built an `n_states x n_codes` transition for every one of the (up to 4096) dictionary tokens, even though the needle/prefix can only interact with the tokens that contain one of its bytes. A token whose bytes are all absent from the pattern drives the byte table to the same reset state from every *live* state (a non-needle byte falls back to 0 via KMP from any non-accept state; a non-prefix byte fails), and the accept/fail rows are never read because the scan returns the instant it reaches them. So such a token's whole column is just the skip value. Pre-fill the table with the skip value and only compute columns for codes containing a pattern byte; for those, read the token once while advancing all `n_states` start states in lockstep (a per-byte gather). Build-heavy microbench (build + 4k-row scan): ~1.3-1.6x faster, more for rare-byte needles (most tokens skipped), less for common-byte needles like `%google%` on URLs. Randomized ground-truth fuzz test still passes. Signed-off-by: Joe Isaacs --- encodings/experimental/onpair/benches/like.rs | 38 +++++++++++ .../experimental/onpair/src/dfa/contains.rs | 6 +- encodings/experimental/onpair/src/dfa/mod.rs | 66 +++++++++++++------ .../experimental/onpair/src/dfa/prefix.rs | 6 +- 4 files changed, 93 insertions(+), 23 deletions(-) diff --git a/encodings/experimental/onpair/benches/like.rs b/encodings/experimental/onpair/benches/like.rs index 4eaf345da74..14f7516a23c 100644 --- a/encodings/experimental/onpair/benches/like.rs +++ b/encodings/experimental/onpair/benches/like.rs @@ -95,10 +95,48 @@ static CORPUS: LazyLock = LazyLock::new(|| { onpair_compress(&varbin, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap() }); +/// A small (4k-row) corpus with a full-sized dictionary, so the per-call DFA +/// table construction dominates — the dict-encoded ClickBench regime, where the +/// kernel scans only the deduplicated values. +static CORPUS_SMALL: LazyLock = LazyLock::new(|| { + let rows: Vec = (0..4000) + .map(|i| { + format!( + "https://host{}.example.com/path/{}/item/{}?ref={}", + i % 89, + (i * 7) % 1000, + (i * 13) % 5000, + i + ) + }) + .collect(); + let refs: Vec> = rows.iter().map(|s| Some(s.as_str())).collect(); + let varbin = VarBinArray::from_iter(refs, DType::Utf8(Nullability::NonNullable)); + let len = varbin.len(); + let dtype = varbin.dtype().clone(); + onpair_compress(&varbin, len, &dtype, DEFAULT_DICT12_CONFIG).unwrap() +}); + fn pattern_array(pattern: &str) -> ArrayRef { ConstantArray::new(pattern, CORPUS.len()).into_array() } +/// Run the pushdown kernel on a given corpus (build + scan). +fn run_pushdown_on(corpus: &OnPairArray, pattern: &str) -> ArrayRef { + let mut ctx = ctx(); + let p = ConstantArray::new(pattern, corpus.len()).into_array(); + ::like(corpus.as_view(), &p, LikeOptions::default(), &mut ctx) + .unwrap() + .expect("OnPair pushdown should handle this pattern") +} + +/// Build-dominated: needles range from common bytes (many relevant codes) to +/// rare bytes (almost all codes skipped). +#[divan::bench(args = ["%google%", "%example%", "%zqxj%"])] +fn contains_pushdown_buildheavy(bencher: Bencher, pattern: &str) { + bencher.bench(|| black_box(run_pushdown_on(&CORPUS_SMALL, pattern))); +} + /// Compressed-domain pushdown: the DFA kernel, no decompression. fn run_pushdown(pattern: &ArrayRef) -> ArrayRef { let mut ctx = ctx(); diff --git a/encodings/experimental/onpair/src/dfa/contains.rs b/encodings/experimental/onpair/src/dfa/contains.rs index ed562e33e19..167c03a9075 100644 --- a/encodings/experimental/onpair/src/dfa/contains.rs +++ b/encodings/experimental/onpair/src/dfa/contains.rs @@ -10,6 +10,7 @@ use vortex_error::VortexExpect; use super::build_code_transitions; +use super::byte_mask; use super::kmp_byte_transitions; use super::n_codes; @@ -36,12 +37,15 @@ impl FlatContainsDfa { let n_states = usize::from(accept_state) + 1; let byte_table = kmp_byte_transitions(needle); + // A non-needle byte resets the KMP automaton to state 0 from any live + // state, so tokens with no needle byte have an all-zero column. let transitions = build_code_transitions( dict_bytes, dict_offsets, &byte_table, n_states, - accept_state, + 0, + &byte_mask(needle), ); Self { diff --git a/encodings/experimental/onpair/src/dfa/mod.rs b/encodings/experimental/onpair/src/dfa/mod.rs index 829c4ac40c2..15950511075 100644 --- a/encodings/experimental/onpair/src/dfa/mod.rs +++ b/encodings/experimental/onpair/src/dfa/mod.rs @@ -255,42 +255,66 @@ fn n_codes(dict_offsets: &[u32]) -> usize { dict_offsets.len().saturating_sub(1) } -/// Lift a byte-level transition table to a per-code table. +/// Lift a byte-level transition table to a per-code table, indexed as +/// `[state * n_codes + code]`. /// -/// For each `(state, code)` pair, feed the code's token bytes through -/// `byte_table` and record the resulting state. `accept_state` is sticky in -/// `byte_table`, so we can short-circuit once it is reached. +/// ## Only the relevant codes are built /// -/// Returns a flat `Vec` indexed as `[state * n_codes + code]`. +/// The needle/prefix can only interact with the (usually tiny) set of dictionary +/// tokens that contain one of its bytes. A token whose bytes are **all** absent +/// from `relevant` drives the byte table from every live state to the same +/// `skip_value` (for contains: a non-needle byte falls all the way back to 0 +/// via KMP from any non-accept state; for prefix: it fails). The accept/fail +/// rows are never read — the scan returns the instant it reaches them — so such +/// a token's whole column is just `skip_value`. We pre-fill the table with +/// `skip_value` and only compute columns for codes that contain a relevant byte. +/// +/// For a column that *is* built, the token is read once while advancing all +/// `n_states` starting states in lockstep (`cur[s] = byte_table[cur[s]*256+b]`), +/// a per-byte gather over the independent states. fn build_code_transitions( dict_bytes: &[u8], dict_offsets: &[u32], byte_table: &[u8], n_states: usize, - accept_state: u8, + skip_value: u8, + relevant: &[bool; 256], ) -> Vec { let n_codes = n_codes(dict_offsets); - let mut trans = vec![0u8; n_states * n_codes]; - for state in 0..n_states { - let state = u8::try_from(state).vortex_expect("state fits in u8"); - for code in 0..n_codes { - let begin = dict_offsets[code] as usize; - let end = dict_offsets[code + 1] as usize; - let mut s = state; - if s != accept_state { - for &b in &dict_bytes[begin..end] { - s = byte_table[usize::from(s) * 256 + usize::from(b)]; - if s == accept_state { - break; - } - } + let mut trans = vec![skip_value; n_states * n_codes]; + let n_states_u8 = u8::try_from(n_states).vortex_expect("n_states fits in u8"); + let identity: Vec = (0..n_states_u8).collect(); + let mut cur = identity.clone(); + for code in 0..n_codes { + let begin = dict_offsets[code] as usize; + let end = dict_offsets[code + 1] as usize; + let token = &dict_bytes[begin..end]; + if !token.iter().any(|&b| relevant[usize::from(b)]) { + continue; // column is entirely `skip_value` (already filled) + } + cur.copy_from_slice(&identity); + for &b in token { + let col = usize::from(b); + for c in &mut cur { + *c = byte_table[usize::from(*c) * 256 + col]; } - trans[usize::from(state) * n_codes + code] = s; + } + for (s, &c) in cur.iter().enumerate() { + trans[s * n_codes + code] = c; } } trans } +/// Build a 256-entry presence mask of the bytes that appear in `bytes`. +fn byte_mask(bytes: &[u8]) -> [bool; 256] { + let mut mask = [false; 256]; + for &b in bytes { + mask[usize::from(b)] = true; + } + mask +} + // --------------------------------------------------------------------------- // KMP helpers (shared with the contains DFA) // --------------------------------------------------------------------------- diff --git a/encodings/experimental/onpair/src/dfa/prefix.rs b/encodings/experimental/onpair/src/dfa/prefix.rs index 6b108e09b1a..4a45382e772 100644 --- a/encodings/experimental/onpair/src/dfa/prefix.rs +++ b/encodings/experimental/onpair/src/dfa/prefix.rs @@ -10,6 +10,7 @@ use vortex_error::VortexExpect; use super::build_code_transitions; +use super::byte_mask; use super::n_codes; /// Flat per-code transition table DFA for prefix matching on OnPair codes. @@ -46,12 +47,15 @@ impl FlatPrefixDfa { let n_states = usize::from(fail_state) + 1; let byte_table = build_prefix_byte_table(prefix, accept_state, fail_state); + // A byte not in the prefix fails from every live state, so tokens with no + // prefix byte have an all-fail column. let transitions = build_code_transitions( dict_bytes, dict_offsets, &byte_table, n_states, - accept_state, + fail_state, + &byte_mask(prefix), ); Self {