Skip to main content

vortex_array/scalar_fn/fns/like/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod kernel;
5
6use std::borrow::Cow;
7use std::fmt::Display;
8use std::fmt::Formatter;
9
10pub use kernel::*;
11use prost::Message;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_proto::expr as pb;
15use vortex_session::VortexSession;
16
17use crate::ArrayRef;
18use crate::ExecutionCtx;
19use crate::arrow::Datum;
20use crate::arrow::from_arrow_array_with_len;
21use crate::dtype::DType;
22use crate::expr::Expression;
23use crate::expr::StatsCatalog;
24use crate::expr::and;
25use crate::expr::gt;
26use crate::expr::gt_eq;
27use crate::expr::lit;
28use crate::expr::lt;
29use crate::expr::or;
30use crate::scalar::StringLike;
31use crate::scalar_fn::Arity;
32use crate::scalar_fn::ChildName;
33use crate::scalar_fn::ExecutionArgs;
34use crate::scalar_fn::ScalarFnId;
35use crate::scalar_fn::ScalarFnVTable;
36use crate::scalar_fn::fns::literal::Literal;
37
38/// Options for SQL LIKE function
39#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
40pub struct LikeOptions {
41    pub negated: bool,
42    pub case_insensitive: bool,
43}
44
45impl Display for LikeOptions {
46    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
47        if self.negated {
48            write!(f, "NOT ")?;
49        }
50        if self.case_insensitive {
51            write!(f, "ILIKE")
52        } else {
53            write!(f, "LIKE")
54        }
55    }
56}
57
58/// Expression that performs SQL LIKE pattern matching.
59#[derive(Clone)]
60pub struct Like;
61
62impl ScalarFnVTable for Like {
63    type Options = LikeOptions;
64
65    fn id(&self) -> ScalarFnId {
66        ScalarFnId::new("vortex.like")
67    }
68
69    fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
70        Ok(Some(
71            pb::LikeOpts {
72                negated: instance.negated,
73                case_insensitive: instance.case_insensitive,
74            }
75            .encode_to_vec(),
76        ))
77    }
78
79    fn deserialize(
80        &self,
81        _metadata: &[u8],
82        _session: &VortexSession,
83    ) -> VortexResult<Self::Options> {
84        let opts = pb::LikeOpts::decode(_metadata)?;
85        Ok(LikeOptions {
86            negated: opts.negated,
87            case_insensitive: opts.case_insensitive,
88        })
89    }
90
91    fn arity(&self, _options: &Self::Options) -> Arity {
92        Arity::Exact(2)
93    }
94
95    fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
96        match child_idx {
97            0 => ChildName::from("child"),
98            1 => ChildName::from("pattern"),
99            _ => unreachable!("Invalid child index {} for Like expression", child_idx),
100        }
101    }
102
103    fn fmt_sql(
104        &self,
105        options: &Self::Options,
106        expr: &Expression,
107        f: &mut Formatter<'_>,
108    ) -> std::fmt::Result {
109        expr.child(0).fmt_sql(f)?;
110        if options.negated {
111            write!(f, " not")?;
112        }
113        if options.case_insensitive {
114            write!(f, " ilike ")?;
115        } else {
116            write!(f, " like ")?;
117        }
118        expr.child(1).fmt_sql(f)
119    }
120
121    fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
122        let input = &arg_dtypes[0];
123        let pattern = &arg_dtypes[1];
124
125        if !input.is_utf8() {
126            vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
127        }
128        if !pattern.is_utf8() {
129            vortex_bail!(
130                "LIKE expression requires UTF8 pattern dtype, got {}",
131                pattern
132            );
133        }
134
135        Ok(DType::Bool(
136            (input.is_nullable() || pattern.is_nullable()).into(),
137        ))
138    }
139
140    fn execute(
141        &self,
142        options: &Self::Options,
143        args: &dyn ExecutionArgs,
144        ctx: &mut ExecutionCtx,
145    ) -> VortexResult<ArrayRef> {
146        let child = args.get(0)?;
147        let pattern = args.get(1)?;
148
149        arrow_like(&child, &pattern, *options, ctx)
150    }
151
152    fn validity(
153        &self,
154        _options: &Self::Options,
155        expression: &Expression,
156    ) -> VortexResult<Option<Expression>> {
157        tracing::warn!("Computing validity for LIKE expression");
158        let child_validity = expression.child(0).validity()?;
159        let pattern_validity = expression.child(1).validity()?;
160        Ok(Some(and(child_validity, pattern_validity)))
161    }
162
163    fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
164        false
165    }
166
167    fn stat_falsification(
168        &self,
169        like_opts: &LikeOptions,
170        expr: &Expression,
171        catalog: &dyn StatsCatalog,
172    ) -> Option<Expression> {
173        // Attempt to do min/max pruning for LIKE 'exact' or LIKE 'prefix%'
174
175        // Don't attempt to handle ilike or negated like
176        if like_opts.negated || like_opts.case_insensitive {
177            return None;
178        }
179
180        // Extract the pattern out
181        let pat = expr.child(1).as_::<Literal>();
182
183        // LIKE NULL is nonsensical, don't try to handle it
184        let pat_str = pat.as_utf8().value()?;
185
186        let src = expr.child(0).clone();
187        let src_min = src.stat_min(catalog)?;
188        let src_max = src.stat_max(catalog)?;
189
190        match LikeVariant::from_str(pat_str)? {
191            LikeVariant::Exact(text) => {
192                // col LIKE 'exact' ==>  col.min > 'exact' || col.max < 'exact'
193                Some(or(
194                    gt(src_min, lit(text.as_ref())),
195                    lt(src_max, lit(text.as_ref())),
196                ))
197            }
198            LikeVariant::Prefix(prefix) => {
199                // col LIKE 'prefix%' ==> col.max < 'prefix' || col.min >= 'prefiy'
200                let succ = prefix.to_string().increment().ok()?;
201
202                Some(or(
203                    gt_eq(src_min, lit(succ)),
204                    lt(src_max, lit(prefix.as_ref())),
205                ))
206            }
207        }
208    }
209}
210
211/// Implementation of LIKE using the Arrow crate.
212pub(crate) fn arrow_like(
213    array: &ArrayRef,
214    pattern: &ArrayRef,
215    options: LikeOptions,
216    ctx: &mut ExecutionCtx,
217) -> VortexResult<ArrayRef> {
218    let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
219    let len = array.len();
220    assert_eq!(
221        array.len(),
222        pattern.len(),
223        "Arrow Like: length mismatch for {}",
224        array.encoding_id()
225    );
226
227    // convert the pattern to the preferred array datatype
228    let lhs = Datum::try_new(array, ctx)?;
229    let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type(), ctx)?;
230
231    let result = match (options.negated, options.case_insensitive) {
232        (false, false) => arrow_string::like::like(&lhs, &rhs)?,
233        (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
234        (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
235        (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
236    };
237
238    from_arrow_array_with_len(&result, len, nullable)
239}
240
241/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.s
242#[derive(Debug, PartialEq)]
243enum LikeVariant<'a> {
244    Exact(Cow<'a, str>),
245    Prefix(Cow<'a, str>),
246}
247
248impl<'a> LikeVariant<'a> {
249    /// Parse a LIKE pattern string into its relevant variant
250    fn from_str(string: &'a str) -> Option<LikeVariant<'a>> {
251        let mut literal = None;
252        let mut chars = string.char_indices();
253
254        while let Some((idx, c)) = chars.next() {
255            match c {
256                '\\' => {
257                    let literal = literal.get_or_insert_with(|| string[..idx].to_string());
258                    match chars.next() {
259                        Some((_, escaped)) => literal.push(escaped),
260                        None => literal.push('\\'),
261                    }
262                }
263                '%' | '_' => {
264                    return match literal {
265                        Some(literal) => (!literal.is_empty())
266                            .then_some(LikeVariant::Prefix(Cow::Owned(literal))),
267                        None => {
268                            (idx != 0).then_some(LikeVariant::Prefix(Cow::Borrowed(&string[..idx])))
269                        }
270                    };
271                }
272                c => {
273                    if let Some(literal) = &mut literal {
274                        literal.push(c);
275                    }
276                }
277            }
278        }
279
280        Some(match literal {
281            Some(literal) => LikeVariant::Exact(Cow::Owned(literal)),
282            None => LikeVariant::Exact(Cow::Borrowed(string)),
283        })
284    }
285}
286
287#[cfg(test)]
288mod tests {
289    use std::borrow::Cow;
290
291    use crate::IntoArray;
292    use crate::arrays::BoolArray;
293    use crate::assert_arrays_eq;
294    use crate::dtype::DType;
295    use crate::dtype::Nullability;
296    use crate::expr::col;
297    use crate::expr::get_item;
298    use crate::expr::ilike;
299    use crate::expr::like;
300    use crate::expr::lit;
301    use crate::expr::not;
302    use crate::expr::not_ilike;
303    use crate::expr::not_like;
304    use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
305    use crate::expr::root;
306    use crate::scalar_fn::fns::like::LikeVariant;
307
308    #[test]
309    fn invert_booleans() {
310        let not_expr = not(root());
311        let bools = BoolArray::from_iter([false, true, false, false, true, true]);
312        assert_arrays_eq!(
313            bools.into_array().apply(&not_expr).unwrap(),
314            BoolArray::from_iter([true, false, true, true, false, false])
315        );
316    }
317
318    #[test]
319    fn dtype() {
320        let dtype = DType::Utf8(Nullability::NonNullable);
321        let like_expr = like(root(), lit("%test%"));
322        assert_eq!(
323            like_expr.return_dtype(&dtype).unwrap(),
324            DType::Bool(Nullability::NonNullable)
325        );
326    }
327
328    #[test]
329    fn test_display() {
330        let expr = like(get_item("name", root()), lit("%john%"));
331        assert_eq!(expr.to_string(), "$.name like \"%john%\"");
332
333        let expr2 = not_ilike(root(), lit("test*"));
334        assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
335    }
336
337    fn assert_borrowed_exact(pattern: &str, expected: &str) {
338        let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
339            panic!("expected borrowed exact pattern");
340        };
341        assert!(matches!(actual, Cow::Borrowed(_)));
342        assert_eq!(actual.as_ref(), expected);
343    }
344
345    fn assert_owned_exact(pattern: &str, expected: &str) {
346        let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
347            panic!("expected owned exact pattern");
348        };
349        assert!(matches!(actual, Cow::Owned(_)));
350        assert_eq!(actual.as_ref(), expected);
351    }
352
353    fn assert_borrowed_prefix(pattern: &str, expected: &str) {
354        let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
355            panic!("expected borrowed prefix pattern");
356        };
357        assert!(matches!(actual, Cow::Borrowed(_)));
358        assert_eq!(actual.as_ref(), expected);
359    }
360
361    fn assert_owned_prefix(pattern: &str, expected: &str) {
362        let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
363            panic!("expected owned prefix pattern");
364        };
365        assert!(matches!(actual, Cow::Owned(_)));
366        assert_eq!(actual.as_ref(), expected);
367    }
368
369    #[test]
370    fn test_like_variant_borrowed_patterns() {
371        assert_borrowed_exact("simple", "simple");
372        assert_borrowed_prefix("prefix%", "prefix");
373        assert_borrowed_prefix("first%rest_stuff", "first");
374    }
375
376    #[test]
377    fn test_like_variant_escaped_patterns() {
378        assert_owned_prefix(r"\%%", "%");
379        assert_owned_prefix(r"\_%", "_");
380        assert_owned_prefix(r"\\%", "\\");
381        assert_owned_exact(r"\%", "%");
382        assert_owned_exact("trailing\\", "trailing\\");
383    }
384
385    #[test]
386    fn test_like_variant_unsupported_patterns() {
387        assert_eq!(LikeVariant::from_str("%suffix"), None);
388        assert_eq!(LikeVariant::from_str(r"%\%%"), None);
389        assert_eq!(LikeVariant::from_str("_pattern"), None);
390    }
391
392    #[test]
393    fn test_like_pushdown() {
394        // Test that LIKE prefix and exactness filters can be pushed down into stats filtering
395        // at scan time.
396        let catalog = TrackingStatsCatalog::default();
397
398        let pruning_expr = like(col("a"), lit("prefix%"))
399            .stat_falsification(&catalog)
400            .expect("LIKE stat falsification");
401
402        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
403
404        let pruning_expr = like(col("a"), lit(r"\%%"))
405            .stat_falsification(&catalog)
406            .expect("LIKE stat falsification");
407        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "&") or ($.a_max < "%"))"#);
408
409        // Multiple wildcards
410        let pruning_expr = like(col("a"), lit("pref%ix%"))
411            .stat_falsification(&catalog)
412            .expect("LIKE stat falsification");
413        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
414
415        let pruning_expr = like(col("a"), lit("pref_ix_"))
416            .stat_falsification(&catalog)
417            .expect("LIKE stat falsification");
418        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
419
420        // Exact match
421        let pruning_expr = like(col("a"), lit("exactly"))
422            .stat_falsification(&catalog)
423            .expect("LIKE stat falsification");
424        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
425
426        // Suffix search skips pushdown
427        let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
428        assert_eq!(pruning_expr, None);
429
430        // NOT LIKE, ILIKE not supported currently
431        assert_eq!(
432            None,
433            not_like(col("a"), lit("a")).stat_falsification(&catalog)
434        );
435        assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
436    }
437}