Skip to main content

vortex_array/expr/exprs/like/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod kernel;
5
6use std::fmt::Display;
7use std::fmt::Formatter;
8
9pub use kernel::*;
10use prost::Message;
11use vortex_dtype::DType;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_error::vortex_err;
15use vortex_proto::expr as pb;
16use vortex_session::VortexSession;
17
18use crate::Array;
19use crate::ArrayRef;
20use crate::arrow::Datum;
21use crate::arrow::from_arrow_array_with_len;
22use crate::expr::Arity;
23use crate::expr::ChildName;
24use crate::expr::ExecutionArgs;
25use crate::expr::ExprId;
26use crate::expr::Expression;
27use crate::expr::Literal;
28use crate::expr::StatsCatalog;
29use crate::expr::VTable;
30use crate::expr::VTableExt;
31use crate::expr::and;
32use crate::expr::gt;
33use crate::expr::gt_eq;
34use crate::expr::lit;
35use crate::expr::lt;
36use crate::expr::or;
37use crate::scalar::StringLike;
38
39/// Options for SQL LIKE function
40#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
41pub struct LikeOptions {
42    pub negated: bool,
43    pub case_insensitive: bool,
44}
45
46impl Display for LikeOptions {
47    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
48        if self.negated {
49            write!(f, "NOT ")?;
50        }
51        if self.case_insensitive {
52            write!(f, "ILIKE")
53        } else {
54            write!(f, "LIKE")
55        }
56    }
57}
58
59/// Expression that performs SQL LIKE pattern matching.
60pub struct Like;
61
62impl VTable for Like {
63    type Options = LikeOptions;
64
65    fn id(&self) -> ExprId {
66        ExprId::from("vortex.like")
67    }
68
69    fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
70        Ok(Some(
71            pb::LikeOpts {
72                negated: instance.negated,
73                case_insensitive: instance.case_insensitive,
74            }
75            .encode_to_vec(),
76        ))
77    }
78
79    fn deserialize(
80        &self,
81        _metadata: &[u8],
82        _session: &VortexSession,
83    ) -> VortexResult<Self::Options> {
84        let opts = pb::LikeOpts::decode(_metadata)?;
85        Ok(LikeOptions {
86            negated: opts.negated,
87            case_insensitive: opts.case_insensitive,
88        })
89    }
90
91    fn arity(&self, _options: &Self::Options) -> Arity {
92        Arity::Exact(2)
93    }
94
95    fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
96        match child_idx {
97            0 => ChildName::from("child"),
98            1 => ChildName::from("pattern"),
99            _ => unreachable!("Invalid child index {} for Like expression", child_idx),
100        }
101    }
102
103    fn fmt_sql(
104        &self,
105        options: &Self::Options,
106        expr: &Expression,
107        f: &mut Formatter<'_>,
108    ) -> std::fmt::Result {
109        expr.child(0).fmt_sql(f)?;
110        if options.negated {
111            write!(f, " not")?;
112        }
113        if options.case_insensitive {
114            write!(f, " ilike ")?;
115        } else {
116            write!(f, " like ")?;
117        }
118        expr.child(1).fmt_sql(f)
119    }
120
121    fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
122        let input = &arg_dtypes[0];
123        let pattern = &arg_dtypes[1];
124
125        if !input.is_utf8() {
126            vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
127        }
128        if !pattern.is_utf8() {
129            vortex_bail!(
130                "LIKE expression requires UTF8 pattern dtype, got {}",
131                pattern
132            );
133        }
134
135        Ok(DType::Bool(
136            (input.is_nullable() || pattern.is_nullable()).into(),
137        ))
138    }
139
140    fn execute(&self, options: &Self::Options, args: ExecutionArgs) -> VortexResult<ArrayRef> {
141        let [child, pattern]: [ArrayRef; _] = args
142            .inputs
143            .try_into()
144            .map_err(|_| vortex_err!("Wrong argument count"))?;
145
146        arrow_like(&child, &pattern, *options)
147    }
148
149    fn validity(
150        &self,
151        _options: &Self::Options,
152        expression: &Expression,
153    ) -> VortexResult<Option<Expression>> {
154        tracing::warn!("Computing validity for LIKE expression");
155        let child_validity = expression.child(0).validity()?;
156        let pattern_validity = expression.child(1).validity()?;
157        Ok(Some(and(child_validity, pattern_validity)))
158    }
159
160    fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
161        false
162    }
163
164    fn stat_falsification(
165        &self,
166        like_opts: &LikeOptions,
167        expr: &Expression,
168        catalog: &dyn StatsCatalog,
169    ) -> Option<Expression> {
170        // Attempt to do min/max pruning for LIKE 'exact' or LIKE 'prefix%'
171
172        // Don't attempt to handle ilike or negated like
173        if like_opts.negated || like_opts.case_insensitive {
174            return None;
175        }
176
177        // Extract the pattern out
178        let pat = expr.child(1).as_::<Literal>();
179
180        // LIKE NULL is nonsensical, don't try to handle it
181        let pat_str = pat.as_utf8().value()?;
182
183        let src = expr.child(0).clone();
184        let src_min = src.stat_min(catalog)?;
185        let src_max = src.stat_max(catalog)?;
186
187        match LikeVariant::from_str(pat_str)? {
188            LikeVariant::Exact(text) => {
189                // col LIKE 'exact' ==>  col.min > 'exact' || col.max < 'exact'
190                Some(or(gt(src_min, lit(text)), lt(src_max, lit(text))))
191            }
192            LikeVariant::Prefix(prefix) => {
193                // col LIKE 'prefix%' ==> col.max < 'prefix' || col.min >= 'prefiy'
194                let succ = prefix.to_string().increment().ok()?;
195
196                Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix))))
197            }
198        }
199    }
200}
201
202/// Implementation of LIKE using the Arrow crate.
203pub(crate) fn arrow_like(
204    array: &dyn Array,
205    pattern: &dyn Array,
206    options: LikeOptions,
207) -> VortexResult<ArrayRef> {
208    let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
209    let len = array.len();
210    assert_eq!(
211        array.len(),
212        pattern.len(),
213        "Arrow Like: length mismatch for {}",
214        array.encoding_id()
215    );
216
217    // convert the pattern to the preferred array datatype
218    let lhs = Datum::try_new(array)?;
219    let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type())?;
220
221    let result = match (options.negated, options.case_insensitive) {
222        (false, false) => arrow_string::like::like(&lhs, &rhs)?,
223        (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
224        (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
225        (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
226    };
227
228    from_arrow_array_with_len(&result, len, nullable)
229}
230
231/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.s
232#[derive(Debug, PartialEq)]
233enum LikeVariant<'a> {
234    Exact(&'a str),
235    Prefix(&'a str),
236}
237
238impl<'a> LikeVariant<'a> {
239    /// Parse a LIKE pattern string into its relevant variant
240    fn from_str(string: &str) -> Option<LikeVariant<'_>> {
241        let Some(wildcard_pos) = string.find(['%', '_']) else {
242            return Some(LikeVariant::Exact(string));
243        };
244
245        // Can't handle wildcard in the front.
246        if wildcard_pos == 0 {
247            return None;
248        }
249
250        let prefix = &string[..wildcard_pos];
251        Some(LikeVariant::Prefix(prefix))
252    }
253}
254
255pub fn like(child: Expression, pattern: Expression) -> Expression {
256    Like.new_expr(
257        LikeOptions {
258            negated: false,
259            case_insensitive: false,
260        },
261        [child, pattern],
262    )
263}
264
265pub fn ilike(child: Expression, pattern: Expression) -> Expression {
266    Like.new_expr(
267        LikeOptions {
268            negated: false,
269            case_insensitive: true,
270        },
271        [child, pattern],
272    )
273}
274
275pub fn not_like(child: Expression, pattern: Expression) -> Expression {
276    Like.new_expr(
277        LikeOptions {
278            negated: true,
279            case_insensitive: false,
280        },
281        [child, pattern],
282    )
283}
284
285pub fn not_ilike(child: Expression, pattern: Expression) -> Expression {
286    Like.new_expr(
287        LikeOptions {
288            negated: true,
289            case_insensitive: true,
290        },
291        [child, pattern],
292    )
293}
294
295#[cfg(test)]
296mod tests {
297    use vortex_dtype::DType;
298    use vortex_dtype::Nullability;
299
300    use crate::arrays::BoolArray;
301    use crate::assert_arrays_eq;
302    use crate::expr::col;
303    use crate::expr::exprs::get_item::get_item;
304    use crate::expr::exprs::like::LikeVariant;
305    use crate::expr::exprs::like::like;
306    use crate::expr::exprs::literal::lit;
307    use crate::expr::exprs::not::not;
308    use crate::expr::exprs::root::root;
309    use crate::expr::ilike;
310    use crate::expr::not_ilike;
311    use crate::expr::not_like;
312    use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
313
314    #[test]
315    fn invert_booleans() {
316        let not_expr = not(root());
317        let bools = BoolArray::from_iter([false, true, false, false, true, true]);
318        assert_arrays_eq!(
319            bools.to_array().apply(&not_expr).unwrap(),
320            BoolArray::from_iter([true, false, true, true, false, false])
321        );
322    }
323
324    #[test]
325    fn dtype() {
326        let dtype = DType::Utf8(Nullability::NonNullable);
327        let like_expr = like(root(), lit("%test%"));
328        assert_eq!(
329            like_expr.return_dtype(&dtype).unwrap(),
330            DType::Bool(Nullability::NonNullable)
331        );
332    }
333
334    #[test]
335    fn test_display() {
336        let expr = like(get_item("name", root()), lit("%john%"));
337        assert_eq!(expr.to_string(), "$.name like \"%john%\"");
338
339        let expr2 = not_ilike(root(), lit("test*"));
340        assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
341    }
342
343    #[test]
344    fn test_like_variant() {
345        // Supported patterns
346        assert_eq!(
347            LikeVariant::from_str("simple"),
348            Some(LikeVariant::Exact("simple"))
349        );
350        assert_eq!(
351            LikeVariant::from_str("prefix%"),
352            Some(LikeVariant::Prefix("prefix"))
353        );
354        assert_eq!(
355            LikeVariant::from_str("first%rest_stuff"),
356            Some(LikeVariant::Prefix("first"))
357        );
358
359        // Unsupported patterns
360        assert_eq!(LikeVariant::from_str("%suffix"), None);
361        assert_eq!(LikeVariant::from_str("_pattern"), None);
362    }
363
364    #[test]
365    fn test_like_pushdown() {
366        // Test that LIKE prefix and exactness filters can be pushed down into stats filtering
367        // at scan time.
368        let catalog = TrackingStatsCatalog::default();
369
370        let pruning_expr = like(col("a"), lit("prefix%"))
371            .stat_falsification(&catalog)
372            .expect("LIKE stat falsification");
373
374        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
375
376        // Multiple wildcards
377        let pruning_expr = like(col("a"), lit("pref%ix%"))
378            .stat_falsification(&catalog)
379            .expect("LIKE stat falsification");
380        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
381
382        let pruning_expr = like(col("a"), lit("pref_ix_"))
383            .stat_falsification(&catalog)
384            .expect("LIKE stat falsification");
385        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
386
387        // Exact match
388        let pruning_expr = like(col("a"), lit("exactly"))
389            .stat_falsification(&catalog)
390            .expect("LIKE stat falsification");
391        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
392
393        // Suffix search skips pushdown
394        let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
395        assert_eq!(pruning_expr, None);
396
397        // NOT LIKE, ILIKE not supported currently
398        assert_eq!(
399            None,
400            not_like(col("a"), lit("a")).stat_falsification(&catalog)
401        );
402        assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
403    }
404}