Skip to main content

vortex_fsst/compute/
like.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use vortex_array::ArrayRef;
5use vortex_array::ArrayView;
6use vortex_array::ExecutionCtx;
7use vortex_array::IntoArray;
8use vortex_array::ToCanonical;
9use vortex_array::arrays::BoolArray;
10use vortex_array::arrays::varbin::VarBinArrayExt;
11use vortex_array::match_each_integer_ptype;
12use vortex_array::scalar_fn::fns::like::LikeKernel;
13use vortex_array::scalar_fn::fns::like::LikeOptions;
14use vortex_error::VortexResult;
15
16use crate::FSST;
17use crate::FSSTArrayExt;
18use crate::dfa::FsstMatcher;
19use crate::dfa::dfa_scan_to_bitbuf;
20
21impl LikeKernel for FSST {
22    fn like(
23        array: ArrayView<'_, Self>,
24        pattern: &ArrayRef,
25        options: LikeOptions,
26        _ctx: &mut ExecutionCtx,
27    ) -> VortexResult<Option<ArrayRef>> {
28        let Some(pattern_scalar) = pattern.as_constant() else {
29            return Ok(None);
30        };
31
32        if options.case_insensitive {
33            return Ok(None);
34        }
35
36        let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
37            let Some(v) = s.value() else {
38                return Ok(None);
39            };
40            v.as_ref()
41        } else if let Some(b) = pattern_scalar.as_binary_opt() {
42            let Some(v) = b.value() else {
43                return Ok(None);
44            };
45            v
46        } else {
47            return Ok(None);
48        };
49
50        let symbols = array.symbols();
51        let symbol_lengths = array.symbol_lengths();
52
53        let Some(matcher) =
54            FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
55        else {
56            return Ok(None);
57        };
58
59        let negated = options.negated;
60        let codes = array.codes();
61        let offsets = codes.offsets().to_primitive();
62        let all_bytes = codes.bytes();
63        let all_bytes = all_bytes.as_slice();
64        let n = codes.len();
65
66        let result = match_each_integer_ptype!(offsets.ptype(), |T| {
67            let off = offsets.as_slice::<T>();
68            dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
69        });
70
71        // FSST delegates validity to its codes array, so we can read it
72        // directly without cloning the entire FSSTArray into an ArrayRef.
73        let validity = array
74            .codes()
75            .validity()?
76            .union_nullability(pattern_scalar.dtype().nullability());
77
78        Ok(Some(BoolArray::new(result, validity).into_array()))
79    }
80}
81
82#[cfg(test)]
83mod tests {
84    use std::sync::LazyLock;
85
86    use vortex_array::Canonical;
87    use vortex_array::IntoArray;
88    use vortex_array::VortexSessionExecute;
89    use vortex_array::arrays::BoolArray;
90    use vortex_array::arrays::ConstantArray;
91    use vortex_array::arrays::VarBinArray;
92    use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
93    use vortex_array::assert_arrays_eq;
94    use vortex_array::dtype::DType;
95    use vortex_array::dtype::Nullability;
96    use vortex_array::scalar_fn::fns::like::Like;
97    use vortex_array::scalar_fn::fns::like::LikeKernel;
98    use vortex_array::scalar_fn::fns::like::LikeOptions;
99    use vortex_array::session::ArraySession;
100    use vortex_error::VortexResult;
101    use vortex_session::VortexSession;
102
103    use crate::FSST;
104    use crate::FSSTArray;
105    use crate::fsst_compress;
106    use crate::fsst_train_compressor;
107
108    static SESSION: LazyLock<VortexSession> =
109        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
110
111    fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
112        let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
113        let compressor = fsst_train_compressor(&varbin);
114        let len = varbin.len();
115        let dtype = varbin.dtype().clone();
116        fsst_compress(varbin, len, &dtype, &compressor)
117    }
118
119    fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
120        let len = array.len();
121        let arr = array.into_array();
122        let pattern = ConstantArray::new(pattern, len).into_array();
123        let result = Like
124            .try_new_array(len, opts, [arr, pattern])?
125            .into_array()
126            .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
127        Ok(result.into_bool())
128    }
129
130    fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
131        run_like(array, pattern, LikeOptions::default())
132    }
133
134    #[test]
135    fn test_like_prefix() -> VortexResult<()> {
136        let fsst = make_fsst(
137            &[
138                Some("http://example.com"),
139                Some("http://test.org"),
140                Some("ftp://files.net"),
141                Some("http://vortex.dev"),
142                Some("ssh://server.io"),
143            ],
144            Nullability::NonNullable,
145        );
146        let result = like(fsst, "http%")?;
147        assert_arrays_eq!(
148            &result,
149            &BoolArray::from_iter([true, true, false, true, false])
150        );
151        Ok(())
152    }
153
154    #[test]
155    fn test_like_prefix_with_nulls() -> VortexResult<()> {
156        let fsst = make_fsst(
157            &[Some("hello"), None, Some("help"), None, Some("goodbye")],
158            Nullability::Nullable,
159        );
160        let result = like(fsst, "hel%")?; // spellchecker:disable-line
161        assert_arrays_eq!(
162            &result,
163            &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
164        );
165        Ok(())
166    }
167
168    #[test]
169    fn test_like_contains() -> VortexResult<()> {
170        let fsst = make_fsst(
171            &[
172                Some("hello world"),
173                Some("say hello"),
174                Some("goodbye"),
175                Some("hellooo"),
176            ],
177            Nullability::NonNullable,
178        );
179        let result = like(fsst, "%hello%")?;
180        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
181        Ok(())
182    }
183
184    #[test]
185    fn test_like_contains_cross_symbol() -> VortexResult<()> {
186        let fsst = make_fsst(
187            &[
188                Some("the quick brown fox jumps over the lazy dog"),
189                Some("a short string"),
190                Some("the lazy dog sleeps"),
191                Some("no match"),
192            ],
193            Nullability::NonNullable,
194        );
195        let result = like(fsst, "%lazy dog%")?;
196        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
197        Ok(())
198    }
199
200    #[test]
201    fn test_not_like_contains() -> VortexResult<()> {
202        let fsst = make_fsst(
203            &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
204            Nullability::NonNullable,
205        );
206        let opts = LikeOptions {
207            negated: true,
208            case_insensitive: false,
209        };
210        let result = run_like(fsst, "%sdf%", opts)?;
211        assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
212        Ok(())
213    }
214
215    #[test]
216    fn test_like_match_all() -> VortexResult<()> {
217        let fsst = make_fsst(
218            &[Some("abc"), Some(""), Some("xyz")],
219            Nullability::NonNullable,
220        );
221        let result = like(fsst, "%")?;
222        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
223        Ok(())
224    }
225
226    /// Call `LikeKernel::like` directly on the FSSTArray and verify it
227    /// returns `Some(...)` (i.e. the kernel handles it, rather than
228    /// returning `None` which would mean "fall back to decompress").
229    #[test]
230    fn test_like_prefix_kernel_handles() -> VortexResult<()> {
231        let fsst = make_fsst(
232            &[Some("http://a.com"), Some("ftp://b.com")],
233            Nullability::NonNullable,
234        );
235        let pattern = ConstantArray::new("http%", fsst.len()).into_array();
236        let mut ctx = SESSION.create_execution_ctx();
237
238        let fsst = fsst.as_view();
239        let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
240        assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
241        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
242        Ok(())
243    }
244
245    /// Same direct-call check for the contains pattern `%needle%`.
246    #[test]
247    fn test_like_contains_kernel_handles() -> VortexResult<()> {
248        let fsst = make_fsst(
249            &[Some("hello world"), Some("goodbye")],
250            Nullability::NonNullable,
251        );
252        let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
253        let mut ctx = SESSION.create_execution_ctx();
254
255        let fsst = fsst.as_view();
256        let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
257        assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
258        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
259        Ok(())
260    }
261
262    /// Patterns we can't handle should return `None` (fall back).
263    #[test]
264    fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
265        let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
266        let mut ctx = SESSION.create_execution_ctx();
267
268        // Underscore wildcard -- not handled.
269        let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
270        let fsst_v = fsst.as_view();
271        let result =
272            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
273        assert!(result.is_none(), "underscore pattern should fall back");
274
275        // Case-insensitive -- not handled.
276        let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
277        let opts = LikeOptions {
278            negated: false,
279            case_insensitive: true,
280        };
281        let result = <FSST as LikeKernel>::like(fsst_v, &pattern, opts, &mut ctx)?;
282        assert!(result.is_none(), "ilike should fall back");
283
284        Ok(())
285    }
286
287    #[test]
288    fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
289        let fsst = make_fsst(
290            &[
291                Some("abcdefghijklmn-tail"),
292                Some("abcdefghijklmx-tail"),
293                Some("abcdefghijklmn"),
294            ],
295            Nullability::NonNullable,
296        );
297        let pattern = "abcdefghijklmn%";
298
299        let fsst = fsst.as_view();
300        let direct = <FSST as LikeKernel>::like(
301            fsst,
302            &ConstantArray::new(pattern, fsst.len()).into_array(),
303            LikeOptions::default(),
304            &mut SESSION.create_execution_ctx(),
305        )?;
306        assert!(
307            direct.is_some(),
308            "14-byte prefixes are now handled by the flat prefix DFA"
309        );
310        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
311        Ok(())
312    }
313
314    #[test]
315    fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
316        let needle = "a".repeat(255);
317        let matching = format!("xx{needle}yy");
318        let non_matching = format!("xx{}byy", "a".repeat(254));
319        let exact = needle.clone();
320        let pattern = format!("%{needle}%");
321
322        let fsst = make_fsst(
323            &[Some(&matching), Some(&non_matching), Some(&exact)],
324            Nullability::NonNullable,
325        );
326
327        let fsst_v = fsst.as_view();
328        let direct = <FSST as LikeKernel>::like(
329            fsst_v,
330            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
331            LikeOptions::default(),
332            &mut SESSION.create_execution_ctx(),
333        )?;
334        assert!(
335            direct.is_none(),
336            "contains needles longer than 254 bytes exceed the DFA's u8 state space"
337        );
338
339        let result = like(fsst, &pattern)?;
340        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
341        Ok(())
342    }
343
344    #[test]
345    fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
346        let needle = "a".repeat(254);
347        let matching = format!("xx{needle}yy");
348        let non_matching = format!("xx{}byy", "a".repeat(253));
349        let pattern = format!("%{needle}%");
350
351        let fsst = make_fsst(
352            &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
353            Nullability::NonNullable,
354        );
355
356        let fsst = fsst.as_view();
357        let direct = <FSST as LikeKernel>::like(
358            fsst,
359            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
360            LikeOptions::default(),
361            &mut SESSION.create_execution_ctx(),
362        )?;
363        assert!(
364            direct.is_some(),
365            "254-byte contains needle should stay on the DFA path"
366        );
367        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
368        Ok(())
369    }
370}