Skip to main content

vortex_fsst/compute/
like.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4#![allow(clippy::cast_possible_truncation)]
5
6use vortex_array::ArrayRef;
7use vortex_array::ArrayView;
8use vortex_array::ExecutionCtx;
9use vortex_array::IntoArray;
10use vortex_array::ToCanonical;
11use vortex_array::arrays::BoolArray;
12use vortex_array::arrays::varbin::VarBinArrayExt;
13use vortex_array::match_each_integer_ptype;
14use vortex_array::scalar_fn::fns::like::LikeKernel;
15use vortex_array::scalar_fn::fns::like::LikeOptions;
16use vortex_error::VortexResult;
17
18use crate::FSST;
19use crate::FSSTArrayExt;
20use crate::dfa::FsstMatcher;
21use crate::dfa::dfa_scan_to_bitbuf;
22
23impl LikeKernel for FSST {
24    fn like(
25        array: ArrayView<'_, Self>,
26        pattern: &ArrayRef,
27        options: LikeOptions,
28        _ctx: &mut ExecutionCtx,
29    ) -> VortexResult<Option<ArrayRef>> {
30        let Some(pattern_scalar) = pattern.as_constant() else {
31            return Ok(None);
32        };
33
34        if options.case_insensitive {
35            return Ok(None);
36        }
37
38        let pattern_bytes: &[u8] = if let Some(s) = pattern_scalar.as_utf8_opt() {
39            let Some(v) = s.value() else {
40                return Ok(None);
41            };
42            v.as_ref()
43        } else if let Some(b) = pattern_scalar.as_binary_opt() {
44            let Some(v) = b.value() else {
45                return Ok(None);
46            };
47            v
48        } else {
49            return Ok(None);
50        };
51
52        let symbols = array.symbols();
53        let symbol_lengths = array.symbol_lengths();
54
55        let Some(matcher) =
56            FsstMatcher::try_new(symbols.as_slice(), symbol_lengths.as_slice(), pattern_bytes)?
57        else {
58            return Ok(None);
59        };
60
61        let negated = options.negated;
62        let codes = array.codes();
63        let offsets = codes.offsets().to_primitive();
64        let all_bytes = codes.bytes();
65        let all_bytes = all_bytes.as_slice();
66        let n = codes.len();
67
68        let result = match_each_integer_ptype!(offsets.ptype(), |T| {
69            let off = offsets.as_slice::<T>();
70            dfa_scan_to_bitbuf(n, off, all_bytes, negated, |codes| matcher.matches(codes))
71        });
72
73        // FSST delegates validity to its codes array, so we can read it
74        // directly without cloning the entire FSSTArray into an ArrayRef.
75        let validity = array
76            .codes()
77            .validity()?
78            .union_nullability(pattern_scalar.dtype().nullability());
79
80        Ok(Some(BoolArray::new(result, validity).into_array()))
81    }
82}
83
84#[cfg(test)]
85mod tests {
86    use std::sync::LazyLock;
87
88    use vortex_array::Canonical;
89    use vortex_array::IntoArray;
90    use vortex_array::VortexSessionExecute;
91    use vortex_array::arrays::BoolArray;
92    use vortex_array::arrays::ConstantArray;
93    use vortex_array::arrays::VarBinArray;
94    use vortex_array::arrays::scalar_fn::ScalarFnFactoryExt;
95    use vortex_array::assert_arrays_eq;
96    use vortex_array::dtype::DType;
97    use vortex_array::dtype::Nullability;
98    use vortex_array::scalar_fn::fns::like::Like;
99    use vortex_array::scalar_fn::fns::like::LikeKernel;
100    use vortex_array::scalar_fn::fns::like::LikeOptions;
101    use vortex_array::session::ArraySession;
102    use vortex_error::VortexResult;
103    use vortex_session::VortexSession;
104
105    use crate::FSST;
106    use crate::FSSTArray;
107    use crate::fsst_compress;
108    use crate::fsst_train_compressor;
109
110    static SESSION: LazyLock<VortexSession> =
111        LazyLock::new(|| VortexSession::empty().with::<ArraySession>());
112
113    fn make_fsst(strings: &[Option<&str>], nullability: Nullability) -> FSSTArray {
114        let varbin = VarBinArray::from_iter(strings.iter().copied(), DType::Utf8(nullability));
115        let compressor = fsst_train_compressor(&varbin);
116        let len = varbin.len();
117        let dtype = varbin.dtype().clone();
118        fsst_compress(varbin, len, &dtype, &compressor)
119    }
120
121    fn run_like(array: FSSTArray, pattern: &str, opts: LikeOptions) -> VortexResult<BoolArray> {
122        let len = array.len();
123        let arr = array.into_array();
124        let pattern = ConstantArray::new(pattern, len).into_array();
125        let result = Like
126            .try_new_array(len, opts, [arr, pattern])?
127            .into_array()
128            .execute::<Canonical>(&mut SESSION.create_execution_ctx())?;
129        Ok(result.into_bool())
130    }
131
132    fn like(array: FSSTArray, pattern: &str) -> VortexResult<BoolArray> {
133        run_like(array, pattern, LikeOptions::default())
134    }
135
136    #[test]
137    fn test_like_prefix() -> VortexResult<()> {
138        let fsst = make_fsst(
139            &[
140                Some("http://example.com"),
141                Some("http://test.org"),
142                Some("ftp://files.net"),
143                Some("http://vortex.dev"),
144                Some("ssh://server.io"),
145            ],
146            Nullability::NonNullable,
147        );
148        let result = like(fsst, "http%")?;
149        assert_arrays_eq!(
150            &result,
151            &BoolArray::from_iter([true, true, false, true, false])
152        );
153        Ok(())
154    }
155
156    #[test]
157    fn test_like_prefix_with_nulls() -> VortexResult<()> {
158        let fsst = make_fsst(
159            &[Some("hello"), None, Some("help"), None, Some("goodbye")],
160            Nullability::Nullable,
161        );
162        let result = like(fsst, "hel%")?; // spellchecker:disable-line
163        assert_arrays_eq!(
164            &result,
165            &BoolArray::from_iter([Some(true), None, Some(true), None, Some(false)])
166        );
167        Ok(())
168    }
169
170    #[test]
171    fn test_like_contains() -> VortexResult<()> {
172        let fsst = make_fsst(
173            &[
174                Some("hello world"),
175                Some("say hello"),
176                Some("goodbye"),
177                Some("hellooo"),
178            ],
179            Nullability::NonNullable,
180        );
181        let result = like(fsst, "%hello%")?;
182        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, false, true]));
183        Ok(())
184    }
185
186    #[test]
187    fn test_like_contains_cross_symbol() -> VortexResult<()> {
188        let fsst = make_fsst(
189            &[
190                Some("the quick brown fox jumps over the lazy dog"),
191                Some("a short string"),
192                Some("the lazy dog sleeps"),
193                Some("no match"),
194            ],
195            Nullability::NonNullable,
196        );
197        let result = like(fsst, "%lazy dog%")?;
198        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true, false]));
199        Ok(())
200    }
201
202    #[test]
203    fn test_not_like_contains() -> VortexResult<()> {
204        let fsst = make_fsst(
205            &[Some("foobar_sdf"), Some("sdf_start"), Some("nothing")],
206            Nullability::NonNullable,
207        );
208        let opts = LikeOptions {
209            negated: true,
210            case_insensitive: false,
211        };
212        let result = run_like(fsst, "%sdf%", opts)?;
213        assert_arrays_eq!(&result, &BoolArray::from_iter([false, false, true]));
214        Ok(())
215    }
216
217    #[test]
218    fn test_like_match_all() -> VortexResult<()> {
219        let fsst = make_fsst(
220            &[Some("abc"), Some(""), Some("xyz")],
221            Nullability::NonNullable,
222        );
223        let result = like(fsst, "%")?;
224        assert_arrays_eq!(&result, &BoolArray::from_iter([true, true, true]));
225        Ok(())
226    }
227
228    /// Call `LikeKernel::like` directly on the FSSTArray and verify it
229    /// returns `Some(...)` (i.e. the kernel handles it, rather than
230    /// returning `None` which would mean "fall back to decompress").
231    #[test]
232    fn test_like_prefix_kernel_handles() -> VortexResult<()> {
233        let fsst = make_fsst(
234            &[Some("http://a.com"), Some("ftp://b.com")],
235            Nullability::NonNullable,
236        );
237        let pattern = ConstantArray::new("http%", fsst.len()).into_array();
238        let mut ctx = SESSION.create_execution_ctx();
239
240        let fsst = fsst.as_view();
241        let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
242        assert!(result.is_some(), "FSST LikeKernel should handle prefix%");
243        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
244        Ok(())
245    }
246
247    /// Same direct-call check for the contains pattern `%needle%`.
248    #[test]
249    fn test_like_contains_kernel_handles() -> VortexResult<()> {
250        let fsst = make_fsst(
251            &[Some("hello world"), Some("goodbye")],
252            Nullability::NonNullable,
253        );
254        let pattern = ConstantArray::new("%world%", fsst.len()).into_array();
255        let mut ctx = SESSION.create_execution_ctx();
256
257        let fsst = fsst.as_view();
258        let result = <FSST as LikeKernel>::like(fsst, &pattern, LikeOptions::default(), &mut ctx)?;
259        assert!(result.is_some(), "FSST LikeKernel should handle %needle%");
260        assert_arrays_eq!(result.unwrap(), BoolArray::from_iter([true, false]));
261        Ok(())
262    }
263
264    /// Patterns we can't handle should return `None` (fall back).
265    #[test]
266    fn test_like_kernel_falls_back_for_complex_pattern() -> VortexResult<()> {
267        let fsst = make_fsst(&[Some("abc"), Some("def")], Nullability::NonNullable);
268        let mut ctx = SESSION.create_execution_ctx();
269
270        // Underscore wildcard -- not handled.
271        let pattern = ConstantArray::new("a_c", fsst.len()).into_array();
272        let fsst_v = fsst.as_view();
273        let result =
274            <FSST as LikeKernel>::like(fsst_v, &pattern, LikeOptions::default(), &mut ctx)?;
275        assert!(result.is_none(), "underscore pattern should fall back");
276
277        // Case-insensitive -- not handled.
278        let pattern = ConstantArray::new("abc%", fsst.len()).into_array();
279        let opts = LikeOptions {
280            negated: false,
281            case_insensitive: true,
282        };
283        let result = <FSST as LikeKernel>::like(fsst_v, &pattern, opts, &mut ctx)?;
284        assert!(result.is_none(), "ilike should fall back");
285
286        Ok(())
287    }
288
289    #[test]
290    fn test_like_long_prefix_handled_by_flat_dfa() -> VortexResult<()> {
291        let fsst = make_fsst(
292            &[
293                Some("abcdefghijklmn-tail"),
294                Some("abcdefghijklmx-tail"),
295                Some("abcdefghijklmn"),
296            ],
297            Nullability::NonNullable,
298        );
299        let pattern = "abcdefghijklmn%";
300
301        let fsst = fsst.as_view();
302        let direct = <FSST as LikeKernel>::like(
303            fsst,
304            &ConstantArray::new(pattern, fsst.len()).into_array(),
305            LikeOptions::default(),
306            &mut SESSION.create_execution_ctx(),
307        )?;
308        assert!(
309            direct.is_some(),
310            "14-byte prefixes are now handled by the flat prefix DFA"
311        );
312        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
313        Ok(())
314    }
315
316    #[test]
317    fn test_like_long_contains_falls_back_but_still_matches() -> VortexResult<()> {
318        let needle = "a".repeat(255);
319        let matching = format!("xx{needle}yy");
320        let non_matching = format!("xx{}byy", "a".repeat(254));
321        let exact = needle.clone();
322        let pattern = format!("%{needle}%");
323
324        let fsst = make_fsst(
325            &[Some(&matching), Some(&non_matching), Some(&exact)],
326            Nullability::NonNullable,
327        );
328
329        let fsst_v = fsst.as_view();
330        let direct = <FSST as LikeKernel>::like(
331            fsst_v,
332            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
333            LikeOptions::default(),
334            &mut SESSION.create_execution_ctx(),
335        )?;
336        assert!(
337            direct.is_none(),
338            "contains needles longer than 254 bytes exceed the DFA's u8 state space"
339        );
340
341        let result = like(fsst, &pattern)?;
342        assert_arrays_eq!(&result, &BoolArray::from_iter([true, false, true]));
343        Ok(())
344    }
345
346    #[test]
347    fn test_like_contains_len_254_kernel_handles() -> VortexResult<()> {
348        let needle = "a".repeat(254);
349        let matching = format!("xx{needle}yy");
350        let non_matching = format!("xx{}byy", "a".repeat(253));
351        let pattern = format!("%{needle}%");
352
353        let fsst = make_fsst(
354            &[Some(&matching), Some(&non_matching), Some(needle.as_str())],
355            Nullability::NonNullable,
356        );
357
358        let fsst = fsst.as_view();
359        let direct = <FSST as LikeKernel>::like(
360            fsst,
361            &ConstantArray::new(pattern.as_str(), fsst.len()).into_array(),
362            LikeOptions::default(),
363            &mut SESSION.create_execution_ctx(),
364        )?;
365        assert!(
366            direct.is_some(),
367            "254-byte contains needle should stay on the DFA path"
368        );
369        assert_arrays_eq!(direct.unwrap(), BoolArray::from_iter([true, false, true]));
370        Ok(())
371    }
372}