Skip to main content

vortex_array/scalar_fn/fns/like/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod kernel;
5
6use std::borrow::Cow;
7use std::fmt::Display;
8use std::fmt::Formatter;
9
10pub use kernel::*;
11use prost::Message;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_proto::expr as pb;
15use vortex_session::VortexSession;
16use vortex_session::registry::CachedId;
17
18use crate::ArrayRef;
19use crate::ExecutionCtx;
20use crate::arrow::Datum;
21use crate::arrow::from_arrow_columnar;
22use crate::dtype::DType;
23use crate::expr::Expression;
24use crate::expr::StatsCatalog;
25use crate::expr::and;
26use crate::expr::gt;
27use crate::expr::gt_eq;
28use crate::expr::lit;
29use crate::expr::lt;
30use crate::expr::or;
31use crate::scalar::StringLike;
32use crate::scalar_fn::Arity;
33use crate::scalar_fn::ChildName;
34use crate::scalar_fn::ExecutionArgs;
35use crate::scalar_fn::ScalarFnId;
36use crate::scalar_fn::ScalarFnVTable;
37use crate::scalar_fn::fns::literal::Literal;
38
39/// Options for SQL LIKE function
40#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
41pub struct LikeOptions {
42    pub negated: bool,
43    pub case_insensitive: bool,
44}
45
46impl Display for LikeOptions {
47    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
48        if self.negated {
49            write!(f, "NOT ")?;
50        }
51        if self.case_insensitive {
52            write!(f, "ILIKE")
53        } else {
54            write!(f, "LIKE")
55        }
56    }
57}
58
59/// Expression that performs SQL LIKE pattern matching.
60#[derive(Clone)]
61pub struct Like;
62
63impl ScalarFnVTable for Like {
64    type Options = LikeOptions;
65
66    fn id(&self) -> ScalarFnId {
67        static ID: CachedId = CachedId::new("vortex.like");
68        *ID
69    }
70
71    fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
72        Ok(Some(
73            pb::LikeOpts {
74                negated: instance.negated,
75                case_insensitive: instance.case_insensitive,
76            }
77            .encode_to_vec(),
78        ))
79    }
80
81    fn deserialize(
82        &self,
83        _metadata: &[u8],
84        _session: &VortexSession,
85    ) -> VortexResult<Self::Options> {
86        let opts = pb::LikeOpts::decode(_metadata)?;
87        Ok(LikeOptions {
88            negated: opts.negated,
89            case_insensitive: opts.case_insensitive,
90        })
91    }
92
93    fn arity(&self, _options: &Self::Options) -> Arity {
94        Arity::Exact(2)
95    }
96
97    fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
98        match child_idx {
99            0 => ChildName::from("child"),
100            1 => ChildName::from("pattern"),
101            _ => unreachable!("Invalid child index {} for Like expression", child_idx),
102        }
103    }
104
105    fn fmt_sql(
106        &self,
107        options: &Self::Options,
108        expr: &Expression,
109        f: &mut Formatter<'_>,
110    ) -> std::fmt::Result {
111        expr.child(0).fmt_sql(f)?;
112        if options.negated {
113            write!(f, " not")?;
114        }
115        if options.case_insensitive {
116            write!(f, " ilike ")?;
117        } else {
118            write!(f, " like ")?;
119        }
120        expr.child(1).fmt_sql(f)
121    }
122
123    fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
124        let input = &arg_dtypes[0];
125        let pattern = &arg_dtypes[1];
126
127        if !input.is_utf8() {
128            vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
129        }
130        if !pattern.is_utf8() {
131            vortex_bail!(
132                "LIKE expression requires UTF8 pattern dtype, got {}",
133                pattern
134            );
135        }
136
137        Ok(DType::Bool(
138            (input.is_nullable() || pattern.is_nullable()).into(),
139        ))
140    }
141
142    fn execute(
143        &self,
144        options: &Self::Options,
145        args: &dyn ExecutionArgs,
146        ctx: &mut ExecutionCtx,
147    ) -> VortexResult<ArrayRef> {
148        let child = args.get(0)?;
149        let pattern = args.get(1)?;
150
151        arrow_like(&child, &pattern, *options, ctx)
152    }
153
154    fn validity(
155        &self,
156        _options: &Self::Options,
157        expression: &Expression,
158    ) -> VortexResult<Option<Expression>> {
159        tracing::warn!("Computing validity for LIKE expression");
160        let child_validity = expression.child(0).validity()?;
161        let pattern_validity = expression.child(1).validity()?;
162        Ok(Some(and(child_validity, pattern_validity)))
163    }
164
165    fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
166        false
167    }
168
169    fn stat_falsification(
170        &self,
171        like_opts: &LikeOptions,
172        expr: &Expression,
173        catalog: &dyn StatsCatalog,
174    ) -> Option<Expression> {
175        // Attempt to do min/max pruning for LIKE 'exact' or LIKE 'prefix%'
176
177        // Don't attempt to handle ilike or negated like
178        if like_opts.negated || like_opts.case_insensitive {
179            return None;
180        }
181
182        // Extract the pattern out
183        let pat = expr.child(1).as_::<Literal>();
184
185        // LIKE NULL is nonsensical, don't try to handle it
186        let pat_str = pat.as_utf8().value()?;
187
188        let src = expr.child(0).clone();
189        let src_min = src.stat_min(catalog)?;
190        let src_max = src.stat_max(catalog)?;
191
192        match LikeVariant::from_str(pat_str)? {
193            LikeVariant::Exact(text) => {
194                // col LIKE 'exact' ==>  col.min > 'exact' || col.max < 'exact'
195                Some(or(
196                    gt(src_min, lit(text.as_ref())),
197                    lt(src_max, lit(text.as_ref())),
198                ))
199            }
200            LikeVariant::Prefix(prefix) => {
201                // col LIKE 'prefix%' ==> col.max < 'prefix' || col.min >= 'prefiy'
202                let succ = prefix.to_string().increment().ok()?;
203
204                Some(or(
205                    gt_eq(src_min, lit(succ)),
206                    lt(src_max, lit(prefix.as_ref())),
207                ))
208            }
209        }
210    }
211}
212
213/// Implementation of LIKE using the Arrow crate.
214pub(crate) fn arrow_like(
215    array: &ArrayRef,
216    pattern: &ArrayRef,
217    options: LikeOptions,
218    ctx: &mut ExecutionCtx,
219) -> VortexResult<ArrayRef> {
220    let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
221    let len = array.len();
222    assert_eq!(
223        array.len(),
224        pattern.len(),
225        "Arrow Like: length mismatch for {}",
226        array.encoding_id()
227    );
228
229    // convert the pattern to the preferred array datatype
230    let lhs = Datum::try_new(array, ctx)?;
231    let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type(), ctx)?;
232
233    let result = match (options.negated, options.case_insensitive) {
234        (false, false) => arrow_string::like::like(&lhs, &rhs)?,
235        (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
236        (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
237        (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
238    };
239
240    from_arrow_columnar(&result, len, nullable, ctx)
241}
242
243/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.
244#[derive(Debug, PartialEq)]
245pub(crate) enum LikeVariant<'a> {
246    Exact(Cow<'a, str>),
247    Prefix(Cow<'a, str>),
248}
249
250impl<'a> LikeVariant<'a> {
251    /// Parse a LIKE pattern string into its relevant variant
252    pub(crate) fn from_str(string: &'a str) -> Option<LikeVariant<'a>> {
253        let mut literal = None;
254        let mut chars = string.char_indices();
255
256        while let Some((idx, c)) = chars.next() {
257            match c {
258                '\\' => {
259                    let literal = literal.get_or_insert_with(|| string[..idx].to_string());
260                    match chars.next() {
261                        Some((_, escaped)) => literal.push(escaped),
262                        None => literal.push('\\'),
263                    }
264                }
265                '%' | '_' => {
266                    return match literal {
267                        Some(literal) => (!literal.is_empty())
268                            .then_some(LikeVariant::Prefix(Cow::Owned(literal))),
269                        None => {
270                            (idx != 0).then_some(LikeVariant::Prefix(Cow::Borrowed(&string[..idx])))
271                        }
272                    };
273                }
274                c => {
275                    if let Some(literal) = &mut literal {
276                        literal.push(c);
277                    }
278                }
279            }
280        }
281
282        Some(match literal {
283            Some(literal) => LikeVariant::Exact(Cow::Owned(literal)),
284            None => LikeVariant::Exact(Cow::Borrowed(string)),
285        })
286    }
287}
288
289#[cfg(test)]
290mod tests {
291    use std::borrow::Cow;
292
293    use crate::IntoArray;
294    use crate::arrays::BoolArray;
295    use crate::assert_arrays_eq;
296    use crate::dtype::DType;
297    use crate::dtype::Nullability;
298    use crate::expr::col;
299    use crate::expr::get_item;
300    use crate::expr::ilike;
301    use crate::expr::like;
302    use crate::expr::lit;
303    use crate::expr::not;
304    use crate::expr::not_ilike;
305    use crate::expr::not_like;
306    use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
307    use crate::expr::root;
308    use crate::scalar_fn::fns::like::LikeVariant;
309
310    #[test]
311    fn invert_booleans() {
312        let not_expr = not(root());
313        let bools = BoolArray::from_iter([false, true, false, false, true, true]);
314        assert_arrays_eq!(
315            bools.into_array().apply(&not_expr).unwrap(),
316            BoolArray::from_iter([true, false, true, true, false, false])
317        );
318    }
319
320    #[test]
321    fn dtype() {
322        let dtype = DType::Utf8(Nullability::NonNullable);
323        let like_expr = like(root(), lit("%test%"));
324        assert_eq!(
325            like_expr.return_dtype(&dtype).unwrap(),
326            DType::Bool(Nullability::NonNullable)
327        );
328    }
329
330    #[test]
331    fn test_display() {
332        let expr = like(get_item("name", root()), lit("%john%"));
333        assert_eq!(expr.to_string(), "$.name like \"%john%\"");
334
335        let expr2 = not_ilike(root(), lit("test*"));
336        assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
337    }
338
339    fn assert_borrowed_exact(pattern: &str, expected: &str) {
340        let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
341            panic!("expected borrowed exact pattern");
342        };
343        assert!(matches!(actual, Cow::Borrowed(_)));
344        assert_eq!(actual.as_ref(), expected);
345    }
346
347    fn assert_owned_exact(pattern: &str, expected: &str) {
348        let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
349            panic!("expected owned exact pattern");
350        };
351        assert!(matches!(actual, Cow::Owned(_)));
352        assert_eq!(actual.as_ref(), expected);
353    }
354
355    fn assert_borrowed_prefix(pattern: &str, expected: &str) {
356        let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
357            panic!("expected borrowed prefix pattern");
358        };
359        assert!(matches!(actual, Cow::Borrowed(_)));
360        assert_eq!(actual.as_ref(), expected);
361    }
362
363    fn assert_owned_prefix(pattern: &str, expected: &str) {
364        let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
365            panic!("expected owned prefix pattern");
366        };
367        assert!(matches!(actual, Cow::Owned(_)));
368        assert_eq!(actual.as_ref(), expected);
369    }
370
371    #[test]
372    fn test_like_variant_borrowed_patterns() {
373        assert_borrowed_exact("simple", "simple");
374        assert_borrowed_prefix("prefix%", "prefix");
375        assert_borrowed_prefix("first%rest_stuff", "first");
376    }
377
378    #[test]
379    fn test_like_variant_escaped_patterns() {
380        assert_owned_prefix(r"\%%", "%");
381        assert_owned_prefix(r"\_%", "_");
382        assert_owned_prefix(r"\\%", "\\");
383        assert_owned_exact(r"\%", "%");
384        assert_owned_exact("trailing\\", "trailing\\");
385    }
386
387    #[test]
388    fn test_like_variant_unsupported_patterns() {
389        assert_eq!(LikeVariant::from_str("%suffix"), None);
390        assert_eq!(LikeVariant::from_str(r"%\%%"), None);
391        assert_eq!(LikeVariant::from_str("_pattern"), None);
392    }
393
394    #[test]
395    fn test_like_pushdown() {
396        // Test that LIKE prefix and exactness filters can be pushed down into stats filtering
397        // at scan time.
398        let catalog = TrackingStatsCatalog::default();
399
400        let pruning_expr = like(col("a"), lit("prefix%"))
401            .stat_falsification(&catalog)
402            .expect("LIKE stat falsification");
403
404        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
405
406        let pruning_expr = like(col("a"), lit(r"\%%"))
407            .stat_falsification(&catalog)
408            .expect("LIKE stat falsification");
409        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "&") or ($.a_max < "%"))"#);
410
411        // Multiple wildcards
412        let pruning_expr = like(col("a"), lit("pref%ix%"))
413            .stat_falsification(&catalog)
414            .expect("LIKE stat falsification");
415        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
416
417        let pruning_expr = like(col("a"), lit("pref_ix_"))
418            .stat_falsification(&catalog)
419            .expect("LIKE stat falsification");
420        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
421
422        // Exact match
423        let pruning_expr = like(col("a"), lit("exactly"))
424            .stat_falsification(&catalog)
425            .expect("LIKE stat falsification");
426        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
427
428        // Suffix search skips pushdown
429        let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
430        assert_eq!(pruning_expr, None);
431
432        // NOT LIKE, ILIKE not supported currently
433        assert_eq!(
434            None,
435            not_like(col("a"), lit("a")).stat_falsification(&catalog)
436        );
437        assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
438    }
439}