Skip to main content

vortex_array/scalar_fn/fns/like/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod kernel;
5
6use std::fmt::Display;
7use std::fmt::Formatter;
8
9pub use kernel::*;
10use prost::Message;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_proto::expr as pb;
14use vortex_session::VortexSession;
15
16use crate::ArrayRef;
17use crate::ExecutionCtx;
18use crate::arrow::Datum;
19use crate::arrow::from_arrow_array_with_len;
20use crate::dtype::DType;
21use crate::expr::Expression;
22use crate::expr::StatsCatalog;
23use crate::expr::and;
24use crate::expr::gt;
25use crate::expr::gt_eq;
26use crate::expr::lit;
27use crate::expr::lt;
28use crate::expr::or;
29use crate::scalar::StringLike;
30use crate::scalar_fn::Arity;
31use crate::scalar_fn::ChildName;
32use crate::scalar_fn::ExecutionArgs;
33use crate::scalar_fn::ScalarFnId;
34use crate::scalar_fn::ScalarFnVTable;
35use crate::scalar_fn::fns::literal::Literal;
36
37/// Options for SQL LIKE function
38#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
39pub struct LikeOptions {
40    pub negated: bool,
41    pub case_insensitive: bool,
42}
43
44impl Display for LikeOptions {
45    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
46        if self.negated {
47            write!(f, "NOT ")?;
48        }
49        if self.case_insensitive {
50            write!(f, "ILIKE")
51        } else {
52            write!(f, "LIKE")
53        }
54    }
55}
56
57/// Expression that performs SQL LIKE pattern matching.
58#[derive(Clone)]
59pub struct Like;
60
61impl ScalarFnVTable for Like {
62    type Options = LikeOptions;
63
64    fn id(&self) -> ScalarFnId {
65        ScalarFnId::from("vortex.like")
66    }
67
68    fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
69        Ok(Some(
70            pb::LikeOpts {
71                negated: instance.negated,
72                case_insensitive: instance.case_insensitive,
73            }
74            .encode_to_vec(),
75        ))
76    }
77
78    fn deserialize(
79        &self,
80        _metadata: &[u8],
81        _session: &VortexSession,
82    ) -> VortexResult<Self::Options> {
83        let opts = pb::LikeOpts::decode(_metadata)?;
84        Ok(LikeOptions {
85            negated: opts.negated,
86            case_insensitive: opts.case_insensitive,
87        })
88    }
89
90    fn arity(&self, _options: &Self::Options) -> Arity {
91        Arity::Exact(2)
92    }
93
94    fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
95        match child_idx {
96            0 => ChildName::from("child"),
97            1 => ChildName::from("pattern"),
98            _ => unreachable!("Invalid child index {} for Like expression", child_idx),
99        }
100    }
101
102    fn fmt_sql(
103        &self,
104        options: &Self::Options,
105        expr: &Expression,
106        f: &mut Formatter<'_>,
107    ) -> std::fmt::Result {
108        expr.child(0).fmt_sql(f)?;
109        if options.negated {
110            write!(f, " not")?;
111        }
112        if options.case_insensitive {
113            write!(f, " ilike ")?;
114        } else {
115            write!(f, " like ")?;
116        }
117        expr.child(1).fmt_sql(f)
118    }
119
120    fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
121        let input = &arg_dtypes[0];
122        let pattern = &arg_dtypes[1];
123
124        if !input.is_utf8() {
125            vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
126        }
127        if !pattern.is_utf8() {
128            vortex_bail!(
129                "LIKE expression requires UTF8 pattern dtype, got {}",
130                pattern
131            );
132        }
133
134        Ok(DType::Bool(
135            (input.is_nullable() || pattern.is_nullable()).into(),
136        ))
137    }
138
139    fn execute(
140        &self,
141        options: &Self::Options,
142        args: &dyn ExecutionArgs,
143        _ctx: &mut ExecutionCtx,
144    ) -> VortexResult<ArrayRef> {
145        let child = args.get(0)?;
146        let pattern = args.get(1)?;
147
148        arrow_like(&child, &pattern, *options)
149    }
150
151    fn validity(
152        &self,
153        _options: &Self::Options,
154        expression: &Expression,
155    ) -> VortexResult<Option<Expression>> {
156        tracing::warn!("Computing validity for LIKE expression");
157        let child_validity = expression.child(0).validity()?;
158        let pattern_validity = expression.child(1).validity()?;
159        Ok(Some(and(child_validity, pattern_validity)))
160    }
161
162    fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
163        false
164    }
165
166    fn stat_falsification(
167        &self,
168        like_opts: &LikeOptions,
169        expr: &Expression,
170        catalog: &dyn StatsCatalog,
171    ) -> Option<Expression> {
172        // Attempt to do min/max pruning for LIKE 'exact' or LIKE 'prefix%'
173
174        // Don't attempt to handle ilike or negated like
175        if like_opts.negated || like_opts.case_insensitive {
176            return None;
177        }
178
179        // Extract the pattern out
180        let pat = expr.child(1).as_::<Literal>();
181
182        // LIKE NULL is nonsensical, don't try to handle it
183        let pat_str = pat.as_utf8().value()?;
184
185        let src = expr.child(0).clone();
186        let src_min = src.stat_min(catalog)?;
187        let src_max = src.stat_max(catalog)?;
188
189        match LikeVariant::from_str(pat_str)? {
190            LikeVariant::Exact(text) => {
191                // col LIKE 'exact' ==>  col.min > 'exact' || col.max < 'exact'
192                Some(or(gt(src_min, lit(text)), lt(src_max, lit(text))))
193            }
194            LikeVariant::Prefix(prefix) => {
195                // col LIKE 'prefix%' ==> col.max < 'prefix' || col.min >= 'prefiy'
196                let succ = prefix.to_string().increment().ok()?;
197
198                Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix))))
199            }
200        }
201    }
202}
203
204/// Implementation of LIKE using the Arrow crate.
205pub(crate) fn arrow_like(
206    array: &ArrayRef,
207    pattern: &ArrayRef,
208    options: LikeOptions,
209) -> VortexResult<ArrayRef> {
210    let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
211    let len = array.len();
212    assert_eq!(
213        array.len(),
214        pattern.len(),
215        "Arrow Like: length mismatch for {}",
216        array.encoding_id()
217    );
218
219    // convert the pattern to the preferred array datatype
220    let lhs = Datum::try_new(array)?;
221    let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type())?;
222
223    let result = match (options.negated, options.case_insensitive) {
224        (false, false) => arrow_string::like::like(&lhs, &rhs)?,
225        (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
226        (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
227        (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
228    };
229
230    from_arrow_array_with_len(&result, len, nullable)
231}
232
233/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.s
234#[derive(Debug, PartialEq)]
235enum LikeVariant<'a> {
236    Exact(&'a str),
237    Prefix(&'a str),
238}
239
240impl<'a> LikeVariant<'a> {
241    /// Parse a LIKE pattern string into its relevant variant
242    fn from_str(string: &str) -> Option<LikeVariant<'_>> {
243        let Some(wildcard_pos) = string.find(['%', '_']) else {
244            return Some(LikeVariant::Exact(string));
245        };
246
247        // Can't handle wildcard in the front.
248        if wildcard_pos == 0 {
249            return None;
250        }
251
252        let prefix = &string[..wildcard_pos];
253        Some(LikeVariant::Prefix(prefix))
254    }
255}
256
257#[cfg(test)]
258mod tests {
259    use crate::IntoArray;
260    use crate::arrays::BoolArray;
261    use crate::assert_arrays_eq;
262    use crate::dtype::DType;
263    use crate::dtype::Nullability;
264    use crate::expr::col;
265    use crate::expr::get_item;
266    use crate::expr::ilike;
267    use crate::expr::like;
268    use crate::expr::lit;
269    use crate::expr::not;
270    use crate::expr::not_ilike;
271    use crate::expr::not_like;
272    use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
273    use crate::expr::root;
274    use crate::scalar_fn::fns::like::LikeVariant;
275
276    #[test]
277    fn invert_booleans() {
278        let not_expr = not(root());
279        let bools = BoolArray::from_iter([false, true, false, false, true, true]);
280        assert_arrays_eq!(
281            bools.into_array().apply(&not_expr).unwrap(),
282            BoolArray::from_iter([true, false, true, true, false, false])
283        );
284    }
285
286    #[test]
287    fn dtype() {
288        let dtype = DType::Utf8(Nullability::NonNullable);
289        let like_expr = like(root(), lit("%test%"));
290        assert_eq!(
291            like_expr.return_dtype(&dtype).unwrap(),
292            DType::Bool(Nullability::NonNullable)
293        );
294    }
295
296    #[test]
297    fn test_display() {
298        let expr = like(get_item("name", root()), lit("%john%"));
299        assert_eq!(expr.to_string(), "$.name like \"%john%\"");
300
301        let expr2 = not_ilike(root(), lit("test*"));
302        assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
303    }
304
305    #[test]
306    fn test_like_variant() {
307        // Supported patterns
308        assert_eq!(
309            LikeVariant::from_str("simple"),
310            Some(LikeVariant::Exact("simple"))
311        );
312        assert_eq!(
313            LikeVariant::from_str("prefix%"),
314            Some(LikeVariant::Prefix("prefix"))
315        );
316        assert_eq!(
317            LikeVariant::from_str("first%rest_stuff"),
318            Some(LikeVariant::Prefix("first"))
319        );
320
321        // Unsupported patterns
322        assert_eq!(LikeVariant::from_str("%suffix"), None);
323        assert_eq!(LikeVariant::from_str("_pattern"), None);
324    }
325
326    #[test]
327    fn test_like_pushdown() {
328        // Test that LIKE prefix and exactness filters can be pushed down into stats filtering
329        // at scan time.
330        let catalog = TrackingStatsCatalog::default();
331
332        let pruning_expr = like(col("a"), lit("prefix%"))
333            .stat_falsification(&catalog)
334            .expect("LIKE stat falsification");
335
336        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
337
338        // Multiple wildcards
339        let pruning_expr = like(col("a"), lit("pref%ix%"))
340            .stat_falsification(&catalog)
341            .expect("LIKE stat falsification");
342        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
343
344        let pruning_expr = like(col("a"), lit("pref_ix_"))
345            .stat_falsification(&catalog)
346            .expect("LIKE stat falsification");
347        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
348
349        // Exact match
350        let pruning_expr = like(col("a"), lit("exactly"))
351            .stat_falsification(&catalog)
352            .expect("LIKE stat falsification");
353        insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
354
355        // Suffix search skips pushdown
356        let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
357        assert_eq!(pruning_expr, None);
358
359        // NOT LIKE, ILIKE not supported currently
360        assert_eq!(
361            None,
362            not_like(col("a"), lit("a")).stat_falsification(&catalog)
363        );
364        assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
365    }
366}