Skip to main content

vortex_fsst/compute/
like.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4#![allow(clippy::cast_possible_truncation)]
5
6use vortex_array::ArrayRef;
7use vortex_array::ExecutionCtx;
8use vortex_array::IntoArray;
9use vortex_array::ToCanonical;
10use vortex_array::arrays::BoolArray;
11use vortex_array::match_each_integer_ptype;
12use vortex_array::scalar_fn::fns::like::LikeKernel;
13use vortex_array::scalar_fn::fns::like::LikeOptions;
14use vortex_error::VortexResult;
15
16use crate::FSST;
17use crate::FSSTArray;
18use crate::dfa::FsstMatcher;
19use crate::dfa::dfa_scan_to_bitbuf;
20
21impl LikeKernel for FSST {
22    fn like(
23        array: &FSSTArray,
24        pattern: &ArrayRef,
25        options: LikeOptions,
26        _ctx: &mut ExecutionCtx,
27    ) -> VortexResult<Option<ArrayRef>> {
28        let Some(pattern_scalar) = pattern.as_constant() else {
29            return Ok(None);
30        };
31
32        if options.case_insensitive {
33            return Ok(None);
34        }
35
36        let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
37            let Some(v) = s.value() else {
38                return Ok(None);
39            };
40            v.as_ref()
41        } else if let Some(b) = pattern_scalar.as_binary_opt() {
42            let Some(v) = b.value() else {
43                return Ok(None);
44            };
45            v
46        } else {
47            return Ok(None);
48        };
49
50        let symbols = array.symbols();
51        let symbol_lengths = array.symbol_lengths();
52
53        let Some(matcher) =
54            FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
55        else {
56            return Ok(None);
57        };
58
59        let negated = options.negated;
60        let codes = array.codes();
61        let offsets = codes.offsets().to_primitive();
62        let all_bytes = codes.bytes();
63        let all_bytes = all_bytes.as_slice();
64        let n = codes.len();
65
66        let result = match_each_integer_ptype!(offsets.ptype(), |T| {
67            let off = offsets.as_slice::<T>();
68            dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
69        });
70
71        // FSST delegates validity to its codes array, so we can read it
72        // directly without cloning the entire FSSTArray into an ArrayRef.
73        let validity = array
74            .codes()
75            .validity()?
76            .union_nullability(pattern_scalar.dtype().nullability());
77
78        Ok(Some(BoolArray::new(result, validity).into_array()))
79    }
80}
81
82#[cfg(test)]
83mod tests {
84    use std::sync::LazyLock;
85
86    use vortex_array::Canonical;
87    use vortex_array::IntoArray;
88    use vortex_array::VortexSessionExecute;
89    use vortex_array::arrays::BoolArray;
90    use vortex_array::arrays::ConstantArray;
91    use vortex_array::arrays::VarBinArray;
92    use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
93    use vortex_array::assert_arrays_eq;
94    use vortex_array::dtype::DType;
95    use vortex_array::dtype::Nullability;
96    use vortex_array::scalar_fn::fns::like::Like;
97    use vortex_array::scalar_fn::fns::like::LikeKernel;
98    use vortex_array::scalar_fn::fns::like::LikeOptions;
99    use vortex_array::session::ArraySession;
100    use vortex_error::VortexResult;
101    use vortex_session::VortexSession;
102
103    use crate::FSST;
104    use crate::FSSTArray;
105    use crate::fsst_compress;
106    use crate::fsst_train_compressor;
107
108    static SESSION: LazyLock<VortexSession> =
109        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
110
111    fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
112        let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
113        let compressor = fsst_train_compressor(&varbin);
114        fsst_compress(varbin, &compressor)
115    }
116
117    fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
118        let len = array.len();
119        let arr = array.into_array();
120        let pattern = ConstantArray::new(pattern, len).into_array();
121        let result = Like
122            .try_new_array(len, opts, [arr, pattern])?
123            .into_array()
124            .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
125        Ok(result.into_bool())
126    }
127
128    fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
129        run_like(array, pattern, LikeOptions::default())
130    }
131
132    #[test]
133    fn test_like_prefix() -> VortexResult<()> {
134        let fsst = make_fsst(
135            &[
136                Some("http://example.com"),
137                Some("http://test.org"),
138                Some("ftp://files.net"),
139                Some("http://vortex.dev"),
140                Some("ssh://server.io"),
141            ],
142            Nullability::NonNullable,
143        );
144        let result = like(fsst, "http%")?;
145        assert_arrays_eq!(
146            &result,
147            &BoolArray::from_iter([true, true, false, true, false])
148        );
149        Ok(())
150    }
151
152    #[test]
153    fn test_like_prefix_with_nulls() -> VortexResult<()> {
154        let fsst = make_fsst(
155            &[Some("hello"), None, Some("help"), None, Some("goodbye")],
156            Nullability::Nullable,
157        );
158        let result = like(fsst, "hel%")?; // spellchecker:disable-line
159        assert_arrays_eq!(
160            &result,
161            &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
162        );
163        Ok(())
164    }
165
166    #[test]
167    fn test_like_contains() -> VortexResult<()> {
168        let fsst = make_fsst(
169            &[
170                Some("hello world"),
171                Some("say hello"),
172                Some("goodbye"),
173                Some("hellooo"),
174            ],
175            Nullability::NonNullable,
176        );
177        let result = like(fsst, "%hello%")?;
178        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
179        Ok(())
180    }
181
182    #[test]
183    fn test_like_contains_cross_symbol() -> VortexResult<()> {
184        let fsst = make_fsst(
185            &[
186                Some("the quick brown fox jumps over the lazy dog"),
187                Some("a short string"),
188                Some("the lazy dog sleeps"),
189                Some("no match"),
190            ],
191            Nullability::NonNullable,
192        );
193        let result = like(fsst, "%lazy dog%")?;
194        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
195        Ok(())
196    }
197
198    #[test]
199    fn test_not_like_contains() -> VortexResult<()> {
200        let fsst = make_fsst(
201            &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
202            Nullability::NonNullable,
203        );
204        let opts = LikeOptions {
205            negated: true,
206            case_insensitive: false,
207        };
208        let result = run_like(fsst, "%sdf%", opts)?;
209        assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
210        Ok(())
211    }
212
213    #[test]
214    fn test_like_match_all() -> VortexResult<()> {
215        let fsst = make_fsst(
216            &[Some("abc"), Some(""), Some("xyz")],
217            Nullability::NonNullable,
218        );
219        let result = like(fsst, "%")?;
220        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
221        Ok(())
222    }
223
224    /// Call `LikeKernel::like` directly on the FSSTArray and verify it
225    /// returns `Some(...)` (i.e. the kernel handles it, rather than
226    /// returning `None` which would mean "fall back to decompress").
227    #[test]
228    fn test_like_prefix_kernel_handles() -> VortexResult<()> {
229        let fsst = make_fsst(
230            &[Some("http://a.com"), Some("ftp://b.com")],
231            Nullability::NonNullable,
232        );
233        let pattern = ConstantArray::new("http%", fsst.len()).into_array();
234        let mut ctx = SESSION.create_execution_ctx();
235
236        let result = <FSST as LikeKernel>::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?;
237        assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
238        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
239        Ok(())
240    }
241
242    /// Same direct-call check for the contains pattern `%needle%`.
243    #[test]
244    fn test_like_contains_kernel_handles() -> VortexResult<()> {
245        let fsst = make_fsst(
246            &[Some("hello world"), Some("goodbye")],
247            Nullability::NonNullable,
248        );
249        let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
250        let mut ctx = SESSION.create_execution_ctx();
251
252        let result = <FSST as LikeKernel>::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?;
253        assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
254        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
255        Ok(())
256    }
257
258    /// Patterns we can't handle should return `None` (fall back).
259    #[test]
260    fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
261        let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
262        let mut ctx = SESSION.create_execution_ctx();
263
264        // Underscore wildcard -- not handled.
265        let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
266        let result = <FSST as LikeKernel>::like(&fsst, &pattern, LikeOptions::default(), &mut ctx)?;
267        assert!(result.is_none(), "underscore pattern should fall back");
268
269        // Case-insensitive -- not handled.
270        let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
271        let opts = LikeOptions {
272            negated: false,
273            case_insensitive: true,
274        };
275        let result = <FSST as LikeKernel>::like(&fsst, &pattern, opts, &mut ctx)?;
276        assert!(result.is_none(), "ilike should fall back");
277
278        Ok(())
279    }
280
281    #[test]
282    fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
283        let fsst = make_fsst(
284            &[
285                Some("abcdefghijklmn-tail"),
286                Some("abcdefghijklmx-tail"),
287                Some("abcdefghijklmn"),
288            ],
289            Nullability::NonNullable,
290        );
291        let pattern = "abcdefghijklmn%";
292
293        let direct = <FSST as LikeKernel>::like(
294            &fsst,
295            &ConstantArray::new(pattern, fsst.len()).into_array(),
296            LikeOptions::default(),
297            &mut SESSION.create_execution_ctx(),
298        )?;
299        assert!(
300            direct.is_some(),
301            "14-byte prefixes are now handled by the flat prefix DFA"
302        );
303        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
304        Ok(())
305    }
306
307    #[test]
308    fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
309        let needle = "a".repeat(255);
310        let matching = format!("xx{needle}yy");
311        let non_matching = format!("xx{}byy", "a".repeat(254));
312        let exact = needle.clone();
313        let pattern = format!("%{needle}%");
314
315        let fsst = make_fsst(
316            &[Some(&matching), Some(&non_matching), Some(&exact)],
317            Nullability::NonNullable,
318        );
319
320        let direct = <FSST as LikeKernel>::like(
321            &fsst,
322            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
323            LikeOptions::default(),
324            &mut SESSION.create_execution_ctx(),
325        )?;
326        assert!(
327            direct.is_none(),
328            "contains needles longer than 254 bytes exceed the DFA's u8 state space"
329        );
330
331        let result = like(fsst, &pattern)?;
332        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
333        Ok(())
334    }
335
336    #[test]
337    fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
338        let needle = "a".repeat(254);
339        let matching = format!("xx{needle}yy");
340        let non_matching = format!("xx{}byy", "a".repeat(253));
341        let pattern = format!("%{needle}%");
342
343        let fsst = make_fsst(
344            &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
345            Nullability::NonNullable,
346        );
347
348        let direct = <FSST as LikeKernel>::like(
349            &fsst,
350            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
351            LikeOptions::default(),
352            &mut SESSION.create_execution_ctx(),
353        )?;
354        assert!(
355            direct.is_some(),
356            "254-byte contains needle should stay on the DFA path"
357        );
358        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
359        Ok(())
360    }
361}