Skip to main content

vortex_fsst/compute/
like.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use vortex_array::ArrayRef;
5use vortex_array::ArrayView;
6use vortex_array::ExecutionCtx;
7use vortex_array::IntoArray;
8use vortex_array::arrays::BoolArray;
9use vortex_array::arrays::PrimitiveArray;
10use vortex_array::arrays::varbin::VarBinArrayExt;
11use vortex_array::match_each_integer_ptype;
12use vortex_array::scalar_fn::fns::like::LikeKernel;
13use vortex_array::scalar_fn::fns::like::LikeOptions;
14use vortex_error::VortexResult;
15
16use crate::FSST;
17use crate::FSSTArrayExt;
18use crate::dfa::FsstMatcher;
19use crate::dfa::dfa_scan_to_bitbuf;
20
21impl LikeKernel for FSST {
22    fn like(
23        array: ArrayView<'_, Self>,
24        pattern: &ArrayRef,
25        options: LikeOptions,
26        ctx: &mut ExecutionCtx,
27    ) -> VortexResult<Option<ArrayRef>> {
28        let Some(pattern_scalar) = pattern.as_constant() else {
29            return Ok(None);
30        };
31
32        if options.case_insensitive {
33            return Ok(None);
34        }
35
36        let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
37            let Some(v) = s.value() else {
38                return Ok(None);
39            };
40            v.as_ref()
41        } else if let Some(b) = pattern_scalar.as_binary_opt() {
42            let Some(v) = b.value() else {
43                return Ok(None);
44            };
45            v
46        } else {
47            return Ok(None);
48        };
49
50        let symbols = array.symbols();
51        let symbol_lengths = array.symbol_lengths();
52
53        let Some(matcher) =
54            FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
55        else {
56            return Ok(None);
57        };
58
59        let negated = options.negated;
60        let codes = array.codes();
61        let offsets = codes.offsets().clone().execute::<PrimitiveArray>(ctx)?;
62        let all_bytes = codes.bytes();
63        let all_bytes = all_bytes.as_slice();
64        let n = codes.len();
65
66        let result = match_each_integer_ptype!(offsets.ptype(), |T| {
67            let off = offsets.as_slice::<T>();
68            dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
69        });
70
71        // FSST delegates validity to its codes array, so we can read it
72        // directly without cloning the entire FSSTArray into an ArrayRef.
73        let validity = array
74            .codes()
75            .validity()?
76            .union_nullability(pattern_scalar.dtype().nullability());
77
78        Ok(Some(BoolArray::new(result, validity).into_array()))
79    }
80}
81
82#[cfg(test)]
83mod tests {
84    use std::sync::LazyLock;
85
86    use vortex_array::Canonical;
87    use vortex_array::IntoArray;
88    use vortex_array::VortexSessionExecute;
89    use vortex_array::arrays::BoolArray;
90    use vortex_array::arrays::ConstantArray;
91    use vortex_array::arrays::VarBinArray;
92    use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
93    use vortex_array::assert_arrays_eq;
94    use vortex_array::dtype::DType;
95    use vortex_array::dtype::Nullability;
96    use vortex_array::scalar_fn::fns::like::Like;
97    use vortex_array::scalar_fn::fns::like::LikeKernel;
98    use vortex_array::scalar_fn::fns::like::LikeOptions;
99    use vortex_array::session::ArraySession;
100    use vortex_error::VortexResult;
101    use vortex_session::VortexSession;
102
103    use crate::FSST;
104    use crate::FSSTArray;
105    use crate::fsst_compress;
106    use crate::fsst_train_compressor;
107
108    static SESSION: LazyLock<VortexSession> =
109        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
110
111    fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
112        let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
113        let compressor = fsst_train_compressor(&varbin);
114        let len = varbin.len();
115        let dtype = varbin.dtype().clone();
116        fsst_compress(
117            varbin,
118            len,
119            &dtype,
120            &compressor,
121            &mut SESSION.create_execution_ctx(),
122        )
123    }
124
125    fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
126        let len = array.len();
127        let arr = array.into_array();
128        let pattern = ConstantArray::new(pattern, len).into_array();
129        let result = Like
130            .try_new_array(len, opts, [arr, pattern])?
131            .into_array()
132            .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
133        Ok(result.into_bool())
134    }
135
136    fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
137        run_like(array, pattern, LikeOptions::default())
138    }
139
140    #[test]
141    fn test_like_prefix() -> VortexResult<()> {
142        let fsst = make_fsst(
143            &[
144                Some("http://example.com"),
145                Some("http://test.org"),
146                Some("ftp://files.net"),
147                Some("http://vortex.dev"),
148                Some("ssh://server.io"),
149            ],
150            Nullability::NonNullable,
151        );
152        let result = like(fsst, "http%")?;
153        assert_arrays_eq!(
154            &result,
155            &BoolArray::from_iter([true, true, false, true, false])
156        );
157        Ok(())
158    }
159
160    #[test]
161    fn test_like_prefix_with_nulls() -> VortexResult<()> {
162        let fsst = make_fsst(
163            &[Some("hello"), None, Some("help"), None, Some("goodbye")],
164            Nullability::Nullable,
165        );
166        let result = like(fsst, "hel%")?; // spellchecker:disable-line
167        assert_arrays_eq!(
168            &result,
169            &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
170        );
171        Ok(())
172    }
173
174    #[test]
175    fn test_like_contains() -> VortexResult<()> {
176        let fsst = make_fsst(
177            &[
178                Some("hello world"),
179                Some("say hello"),
180                Some("goodbye"),
181                Some("hellooo"),
182            ],
183            Nullability::NonNullable,
184        );
185        let result = like(fsst, "%hello%")?;
186        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
187        Ok(())
188    }
189
190    #[test]
191    fn test_like_contains_cross_symbol() -> VortexResult<()> {
192        let fsst = make_fsst(
193            &[
194                Some("the quick brown fox jumps over the lazy dog"),
195                Some("a short string"),
196                Some("the lazy dog sleeps"),
197                Some("no match"),
198            ],
199            Nullability::NonNullable,
200        );
201        let result = like(fsst, "%lazy dog%")?;
202        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
203        Ok(())
204    }
205
206    #[test]
207    fn test_not_like_contains() -> VortexResult<()> {
208        let fsst = make_fsst(
209            &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
210            Nullability::NonNullable,
211        );
212        let opts = LikeOptions {
213            negated: true,
214            case_insensitive: false,
215        };
216        let result = run_like(fsst, "%sdf%", opts)?;
217        assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
218        Ok(())
219    }
220
221    #[test]
222    fn test_like_match_all() -> VortexResult<()> {
223        let fsst = make_fsst(
224            &[Some("abc"), Some(""), Some("xyz")],
225            Nullability::NonNullable,
226        );
227        let result = like(fsst, "%")?;
228        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
229        Ok(())
230    }
231
232    /// Call `LikeKernel::like` directly on the FSSTArray and verify it
233    /// returns `Some(...)` (i.e. the kernel handles it, rather than
234    /// returning `None` which would mean "fall back to decompress").
235    #[test]
236    fn test_like_prefix_kernel_handles() -> VortexResult<()> {
237        let fsst = make_fsst(
238            &[Some("http://a.com"), Some("ftp://b.com")],
239            Nullability::NonNullable,
240        );
241        let pattern = ConstantArray::new("http%", fsst.len()).into_array();
242        let mut ctx = SESSION.create_execution_ctx();
243
244        let fsst = fsst.as_view();
245        let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
246        assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
247        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
248        Ok(())
249    }
250
251    /// Same direct-call check for the contains pattern `%needle%`.
252    #[test]
253    fn test_like_contains_kernel_handles() -> VortexResult<()> {
254        let fsst = make_fsst(
255            &[Some("hello world"), Some("goodbye")],
256            Nullability::NonNullable,
257        );
258        let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
259        let mut ctx = SESSION.create_execution_ctx();
260
261        let fsst = fsst.as_view();
262        let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
263        assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
264        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
265        Ok(())
266    }
267
268    /// Patterns we can't handle should return `None` (fall back).
269    #[test]
270    fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
271        let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
272        let mut ctx = SESSION.create_execution_ctx();
273
274        // Underscore wildcard -- not handled.
275        let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
276        let fsst_v = fsst.as_view();
277        let result =
278            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
279        assert!(result.is_none(), "underscore pattern should fall back");
280
281        // Case-insensitive -- not handled.
282        let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
283        let opts = LikeOptions {
284            negated: false,
285            case_insensitive: true,
286        };
287        let result = <FSST as LikeKernel>::like(fsst_v, &pattern, opts, &mut ctx)?;
288        assert!(result.is_none(), "ilike should fall back");
289
290        // Suffix patterns are still unsupported, even when the suffix is an escaped literal.
291        let pattern = ConstantArray::new(r"%\%", fsst.len()).into_array();
292        let result =
293            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
294        assert!(result.is_none(), "escaped suffix pattern should fall back");
295
296        Ok(())
297    }
298
299    #[test]
300    fn test_like_kernel_handles_escaped_prefix_and_contains() -> VortexResult<()> {
301        let fsst = make_fsst(
302            &[
303                Some("%front"),
304                Some("_front"),
305                Some("\\front"),
306                Some("middle%value"),
307                Some("middle_value"),
308                Some("middle\\value"),
309                Some("front"),
310            ],
311            Nullability::NonNullable,
312        );
313        let fsst_v = fsst.as_view();
314        let mut ctx = SESSION.create_execution_ctx();
315
316        let pattern = ConstantArray::new(r"\%%", fsst.len()).into_array();
317        let result =
318            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
319        assert!(result.is_some(), "escaped percent prefix should use FSST");
320        assert_arrays_eq!(
321            result.unwrap(),
322            BoolArray::from_iter([true, false, false, false, false, false, false])
323        );
324
325        let pattern = ConstantArray::new(r"\_%", fsst.len()).into_array();
326        let result =
327            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
328        assert!(
329            result.is_some(),
330            "escaped underscore prefix should use FSST"
331        );
332        assert_arrays_eq!(
333            result.unwrap(),
334            BoolArray::from_iter([false, true, false, false, false, false, false])
335        );
336
337        let pattern = ConstantArray::new(r"\\%", fsst.len()).into_array();
338        let result =
339            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
340        assert!(result.is_some(), "escaped backslash prefix should use FSST");
341        assert_arrays_eq!(
342            result.unwrap(),
343            BoolArray::from_iter([false, false, true, false, false, false, false])
344        );
345
346        let pattern = ConstantArray::new(r"%\%%", fsst.len()).into_array();
347        let result =
348            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
349        assert!(result.is_some(), "escaped percent contains should use FSST");
350        assert_arrays_eq!(
351            result.unwrap(),
352            BoolArray::from_iter([true, false, false, true, false, false, false])
353        );
354
355        let pattern = ConstantArray::new(r"%\_%", fsst.len()).into_array();
356        let result =
357            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
358        assert!(
359            result.is_some(),
360            "escaped underscore contains should use FSST"
361        );
362        assert_arrays_eq!(
363            result.unwrap(),
364            BoolArray::from_iter([false, true, false, false, true, false, false])
365        );
366
367        let pattern = ConstantArray::new(r"%\\%", fsst.len()).into_array();
368        let result =
369            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
370        assert!(
371            result.is_some(),
372            "escaped backslash contains should use FSST"
373        );
374        assert_arrays_eq!(
375            result.unwrap(),
376            BoolArray::from_iter([false, false, true, false, false, true, false])
377        );
378
379        Ok(())
380    }
381
382    #[test]
383    fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
384        let fsst = make_fsst(
385            &[
386                Some("abcdefghijklmn-tail"),
387                Some("abcdefghijklmx-tail"),
388                Some("abcdefghijklmn"),
389            ],
390            Nullability::NonNullable,
391        );
392        let pattern = "abcdefghijklmn%";
393
394        let fsst = fsst.as_view();
395        let direct = <FSST as LikeKernel>::like(
396            fsst,
397            &ConstantArray::new(pattern, fsst.len()).into_array(),
398            LikeOptions::default(),
399            &mut SESSION.create_execution_ctx(),
400        )?;
401        assert!(
402            direct.is_some(),
403            "14-byte prefixes are now handled by the flat prefix DFA"
404        );
405        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
406        Ok(())
407    }
408
409    #[test]
410    fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
411        let needle = "a".repeat(255);
412        let matching = format!("xx{needle}yy");
413        let non_matching = format!("xx{}byy", "a".repeat(254));
414        let exact = needle.clone();
415        let pattern = format!("%{needle}%");
416
417        let fsst = make_fsst(
418            &[Some(&matching), Some(&non_matching), Some(&exact)],
419            Nullability::NonNullable,
420        );
421
422        let fsst_v = fsst.as_view();
423        let direct = <FSST as LikeKernel>::like(
424            fsst_v,
425            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
426            LikeOptions::default(),
427            &mut SESSION.create_execution_ctx(),
428        )?;
429        assert!(
430            direct.is_none(),
431            "contains needles longer than 254 bytes exceed the DFA's u8 state space"
432        );
433
434        let result = like(fsst, &pattern)?;
435        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
436        Ok(())
437    }
438
439    #[test]
440    fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
441        let needle = "a".repeat(254);
442        let matching = format!("xx{needle}yy");
443        let non_matching = format!("xx{}byy", "a".repeat(253));
444        let pattern = format!("%{needle}%");
445
446        let fsst = make_fsst(
447            &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
448            Nullability::NonNullable,
449        );
450
451        let fsst = fsst.as_view();
452        let direct = <FSST as LikeKernel>::like(
453            fsst,
454            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
455            LikeOptions::default(),
456            &mut SESSION.create_execution_ctx(),
457        )?;
458        assert!(
459            direct.is_some(),
460            "254-byte contains needle should stay on the DFA path"
461        );
462        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
463        Ok(())
464    }
465}