Skip to main content

vortex_fsst/compute/
like.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use vortex_array::ArrayRef;
5use vortex_array::ArrayView;
6use vortex_array::ExecutionCtx;
7use vortex_array::IntoArray;
8use vortex_array::arrays::BoolArray;
9use vortex_array::arrays::PrimitiveArray;
10use vortex_array::arrays::varbin::VarBinArrayExt;
11use vortex_array::match_each_integer_ptype;
12use vortex_array::scalar_fn::fns::like::LikeKernel;
13use vortex_array::scalar_fn::fns::like::LikeOptions;
14use vortex_error::VortexResult;
15
16use crate::FSST;
17use crate::FSSTArrayExt;
18use crate::dfa::FsstMatcher;
19use crate::dfa::dfa_scan_to_bitbuf;
20
21impl LikeKernel for FSST {
22    fn like(
23        array: ArrayView<'_, Self>,
24        pattern: &ArrayRef,
25        options: LikeOptions,
26        ctx: &mut ExecutionCtx,
27    ) -> VortexResult<Option<ArrayRef>> {
28        let Some(pattern_scalar) = pattern.as_constant() else {
29            return Ok(None);
30        };
31
32        if options.case_insensitive {
33            return Ok(None);
34        }
35
36        let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
37            let Some(v) = s.value() else {
38                return Ok(None);
39            };
40            v.as_ref()
41        } else if let Some(b) = pattern_scalar.as_binary_opt() {
42            let Some(v) = b.value() else {
43                return Ok(None);
44            };
45            v
46        } else {
47            return Ok(None);
48        };
49
50        let symbols = array.symbols();
51        let symbol_lengths = array.symbol_lengths();
52
53        let Some(matcher) =
54            FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
55        else {
56            return Ok(None);
57        };
58
59        let negated = options.negated;
60        let codes = array.codes();
61        let offsets = codes.offsets().clone().execute::<PrimitiveArray>(ctx)?;
62        let all_bytes = codes.bytes();
63        let all_bytes = all_bytes.as_slice();
64        let n = codes.len();
65
66        let result = match_each_integer_ptype!(offsets.ptype(), |T| {
67            let off = offsets.as_slice::<T>();
68            dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
69        });
70
71        // FSST delegates validity to its codes array, so we can read it
72        // directly without cloning the entire FSSTArray into an ArrayRef.
73        let validity = array
74            .codes()
75            .validity()?
76            .union_nullability(pattern_scalar.dtype().nullability());
77
78        Ok(Some(BoolArray::new(result, validity).into_array()))
79    }
80}
81
82#[cfg(test)]
83mod tests {
84    use std::sync::LazyLock;
85
86    use vortex_array::Canonical;
87    use vortex_array::IntoArray;
88    use vortex_array::VortexSessionExecute;
89    use vortex_array::arrays::BoolArray;
90    use vortex_array::arrays::ConstantArray;
91    use vortex_array::arrays::VarBinArray;
92    use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
93    use vortex_array::assert_arrays_eq;
94    use vortex_array::dtype::DType;
95    use vortex_array::dtype::Nullability;
96    use vortex_array::scalar_fn::fns::like::Like;
97    use vortex_array::scalar_fn::fns::like::LikeKernel;
98    use vortex_array::scalar_fn::fns::like::LikeOptions;
99    use vortex_array::session::ArraySession;
100    use vortex_error::VortexResult;
101    use vortex_session::VortexSession;
102
103    use crate::FSST;
104    use crate::FSSTArray;
105    use crate::fsst_compress;
106    use crate::fsst_train_compressor;
107
108    static SESSION: LazyLock<VortexSession> =
109        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
110
111    fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
112        let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
113        let compressor = fsst_train_compressor(&varbin);
114        let len = varbin.len();
115        let dtype = varbin.dtype().clone();
116        fsst_compress(
117            varbin,
118            len,
119            &dtype,
120            &compressor,
121            &mut SESSION.create_execution_ctx(),
122        )
123    }
124
125    fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
126        let len = array.len();
127        let arr = array.into_array();
128        let pattern = ConstantArray::new(pattern, len).into_array();
129        let result = Like
130            .try_new_array(len, opts, [arr, pattern])?
131            .into_array()
132            .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
133        Ok(result.into_bool())
134    }
135
136    fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
137        run_like(array, pattern, LikeOptions::default())
138    }
139
140    #[test]
141    fn test_like_prefix() -> VortexResult<()> {
142        let fsst = make_fsst(
143            &[
144                Some("http://example.com"),
145                Some("http://test.org"),
146                Some("ftp://files.net"),
147                Some("http://vortex.dev"),
148                Some("ssh://server.io"),
149            ],
150            Nullability::NonNullable,
151        );
152        let result = like(fsst, "http%")?;
153        assert_arrays_eq!(
154            &result,
155            &BoolArray::from_iter([true, true, false, true, false])
156        );
157        Ok(())
158    }
159
160    #[test]
161    fn test_like_prefix_with_nulls() -> VortexResult<()> {
162        let fsst = make_fsst(
163            &[Some("hello"), None, Some("help"), None, Some("goodbye")],
164            Nullability::Nullable,
165        );
166        let result = like(fsst, "hel%")?; // spellchecker:disable-line
167        assert_arrays_eq!(
168            &result,
169            &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
170        );
171        Ok(())
172    }
173
174    #[test]
175    fn test_like_contains() -> VortexResult<()> {
176        let fsst = make_fsst(
177            &[
178                Some("hello world"),
179                Some("say hello"),
180                Some("goodbye"),
181                Some("hellooo"),
182            ],
183            Nullability::NonNullable,
184        );
185        let result = like(fsst, "%hello%")?;
186        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
187        Ok(())
188    }
189
190    #[test]
191    fn test_like_contains_cross_symbol() -> VortexResult<()> {
192        let fsst = make_fsst(
193            &[
194                Some("the quick brown fox jumps over the lazy dog"),
195                Some("a short string"),
196                Some("the lazy dog sleeps"),
197                Some("no match"),
198            ],
199            Nullability::NonNullable,
200        );
201        let result = like(fsst, "%lazy dog%")?;
202        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
203        Ok(())
204    }
205
206    #[test]
207    fn test_not_like_contains() -> VortexResult<()> {
208        let fsst = make_fsst(
209            &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
210            Nullability::NonNullable,
211        );
212        let opts = LikeOptions {
213            negated: true,
214            case_insensitive: false,
215        };
216        let result = run_like(fsst, "%sdf%", opts)?;
217        assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
218        Ok(())
219    }
220
221    #[test]
222    fn test_like_match_all() -> VortexResult<()> {
223        let fsst = make_fsst(
224            &[Some("abc"), Some(""), Some("xyz")],
225            Nullability::NonNullable,
226        );
227        let result = like(fsst, "%")?;
228        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
229        Ok(())
230    }
231
232    /// Call `LikeKernel::like` directly on the FSSTArray and verify it
233    /// returns `Some(...)` (i.e. the kernel handles it, rather than
234    /// returning `None` which would mean "fall back to decompress").
235    #[test]
236    fn test_like_prefix_kernel_handles() -> VortexResult<()> {
237        let fsst = make_fsst(
238            &[Some("http://a.com"), Some("ftp://b.com")],
239            Nullability::NonNullable,
240        );
241        let pattern = ConstantArray::new("http%", fsst.len()).into_array();
242        let mut ctx = SESSION.create_execution_ctx();
243
244        let fsst = fsst.as_view();
245        let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
246        assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
247        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
248        Ok(())
249    }
250
251    /// Same direct-call check for the contains pattern `%needle%`.
252    #[test]
253    fn test_like_contains_kernel_handles() -> VortexResult<()> {
254        let fsst = make_fsst(
255            &[Some("hello world"), Some("goodbye")],
256            Nullability::NonNullable,
257        );
258        let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
259        let mut ctx = SESSION.create_execution_ctx();
260
261        let fsst = fsst.as_view();
262        let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
263        assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
264        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
265        Ok(())
266    }
267
268    /// Patterns we can't handle should return `None` (fall back).
269    #[test]
270    fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
271        let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
272        let mut ctx = SESSION.create_execution_ctx();
273
274        // Underscore wildcard -- not handled.
275        let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
276        let fsst_v = fsst.as_view();
277        let result =
278            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
279        assert!(result.is_none(), "underscore pattern should fall back");
280
281        // Case-insensitive -- not handled.
282        let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
283        let opts = LikeOptions {
284            negated: false,
285            case_insensitive: true,
286        };
287        let result = <FSST as LikeKernel>::like(fsst_v, &pattern, opts, &mut ctx)?;
288        assert!(result.is_none(), "ilike should fall back");
289
290        Ok(())
291    }
292
293    #[test]
294    fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
295        let fsst = make_fsst(
296            &[
297                Some("abcdefghijklmn-tail"),
298                Some("abcdefghijklmx-tail"),
299                Some("abcdefghijklmn"),
300            ],
301            Nullability::NonNullable,
302        );
303        let pattern = "abcdefghijklmn%";
304
305        let fsst = fsst.as_view();
306        let direct = <FSST as LikeKernel>::like(
307            fsst,
308            &ConstantArray::new(pattern, fsst.len()).into_array(),
309            LikeOptions::default(),
310            &mut SESSION.create_execution_ctx(),
311        )?;
312        assert!(
313            direct.is_some(),
314            "14-byte prefixes are now handled by the flat prefix DFA"
315        );
316        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
317        Ok(())
318    }
319
320    #[test]
321    fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
322        let needle = "a".repeat(255);
323        let matching = format!("xx{needle}yy");
324        let non_matching = format!("xx{}byy", "a".repeat(254));
325        let exact = needle.clone();
326        let pattern = format!("%{needle}%");
327
328        let fsst = make_fsst(
329            &[Some(&matching), Some(&non_matching), Some(&exact)],
330            Nullability::NonNullable,
331        );
332
333        let fsst_v = fsst.as_view();
334        let direct = <FSST as LikeKernel>::like(
335            fsst_v,
336            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
337            LikeOptions::default(),
338            &mut SESSION.create_execution_ctx(),
339        )?;
340        assert!(
341            direct.is_none(),
342            "contains needles longer than 254 bytes exceed the DFA's u8 state space"
343        );
344
345        let result = like(fsst, &pattern)?;
346        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
347        Ok(())
348    }
349
350    #[test]
351    fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
352        let needle = "a".repeat(254);
353        let matching = format!("xx{needle}yy");
354        let non_matching = format!("xx{}byy", "a".repeat(253));
355        let pattern = format!("%{needle}%");
356
357        let fsst = make_fsst(
358            &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
359            Nullability::NonNullable,
360        );
361
362        let fsst = fsst.as_view();
363        let direct = <FSST as LikeKernel>::like(
364            fsst,
365            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
366            LikeOptions::default(),
367            &mut SESSION.create_execution_ctx(),
368        )?;
369        assert!(
370            direct.is_some(),
371            "254-byte contains needle should stay on the DFA path"
372        );
373        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
374        Ok(())
375    }
376}