Skip to main content

vortex_array/scalar_fn/fns/like/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod kernel;
5
6use std::borrow::Cow;
7use std::fmt::Display;
8use std::fmt::Formatter;
9
10pub use kernel::*;
11use prost::Message;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_proto::expr as pb;
15use vortex_session::VortexSession;
16use vortex_session::registry::CachedId;
17
18use crate::ArrayRef;
19use crate::ExecutionCtx;
20use crate::arrow::Datum;
21use crate::arrow::from_arrow_columnar;
22use crate::dtype::DType;
23use crate::expr::Expression;
24use crate::expr::and;
25use crate::scalar_fn::Arity;
26use crate::scalar_fn::ChildName;
27use crate::scalar_fn::ExecutionArgs;
28use crate::scalar_fn::ScalarFnId;
29use crate::scalar_fn::ScalarFnVTable;
30
31/// Options for SQL LIKE function
32#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
33pub struct LikeOptions {
34    pub negated: bool,
35    pub case_insensitive: bool,
36}
37
38impl Display for LikeOptions {
39    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
40        if self.negated {
41            write!(f, "NOT ")?;
42        }
43        if self.case_insensitive {
44            write!(f, "ILIKE")
45        } else {
46            write!(f, "LIKE")
47        }
48    }
49}
50
51/// Expression that performs SQL LIKE pattern matching.
52#[derive(Clone)]
53pub struct Like;
54
55impl ScalarFnVTable for Like {
56    type Options = LikeOptions;
57
58    fn id(&self) -> ScalarFnId {
59        static ID: CachedId = CachedId::new("vortex.like");
60        *ID
61    }
62
63    fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
64        Ok(Some(
65            pb::LikeOpts {
66                negated: instance.negated,
67                case_insensitive: instance.case_insensitive,
68            }
69            .encode_to_vec(),
70        ))
71    }
72
73    fn deserialize(
74        &self,
75        _metadata: &[u8],
76        _session: &VortexSession,
77    ) -> VortexResult<Self::Options> {
78        let opts = pb::LikeOpts::decode(_metadata)?;
79        Ok(LikeOptions {
80            negated: opts.negated,
81            case_insensitive: opts.case_insensitive,
82        })
83    }
84
85    fn arity(&self, _options: &Self::Options) -> Arity {
86        Arity::Exact(2)
87    }
88
89    fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
90        match child_idx {
91            0 => ChildName::from("child"),
92            1 => ChildName::from("pattern"),
93            _ => unreachable!("Invalid child index {} for Like expression", child_idx),
94        }
95    }
96
97    fn fmt_sql(
98        &self,
99        options: &Self::Options,
100        expr: &Expression,
101        f: &mut Formatter<'_>,
102    ) -> std::fmt::Result {
103        expr.child(0).fmt_sql(f)?;
104        if options.negated {
105            write!(f, " not")?;
106        }
107        if options.case_insensitive {
108            write!(f, " ilike ")?;
109        } else {
110            write!(f, " like ")?;
111        }
112        expr.child(1).fmt_sql(f)
113    }
114
115    fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
116        let input = &arg_dtypes[0];
117        let pattern = &arg_dtypes[1];
118
119        if !input.is_utf8() {
120            vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
121        }
122        if !pattern.is_utf8() {
123            vortex_bail!(
124                "LIKE expression requires UTF8 pattern dtype, got {}",
125                pattern
126            );
127        }
128
129        Ok(DType::Bool(
130            (input.is_nullable() || pattern.is_nullable()).into(),
131        ))
132    }
133
134    fn execute(
135        &self,
136        options: &Self::Options,
137        args: &dyn ExecutionArgs,
138        ctx: &mut ExecutionCtx,
139    ) -> VortexResult<ArrayRef> {
140        let child = args.get(0)?;
141        let pattern = args.get(1)?;
142
143        arrow_like(&child, &pattern, *options, ctx)
144    }
145
146    fn validity(
147        &self,
148        _options: &Self::Options,
149        expression: &Expression,
150    ) -> VortexResult<Option<Expression>> {
151        tracing::warn!("Computing validity for LIKE expression");
152        let child_validity = expression.child(0).validity()?;
153        let pattern_validity = expression.child(1).validity()?;
154        Ok(Some(and(child_validity, pattern_validity)))
155    }
156
157    fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
158        false
159    }
160
161    fn is_fallible(&self, _options: &Self::Options) -> bool {
162        false
163    }
164}
165
166/// Implementation of LIKE using the Arrow crate.
167pub(crate) fn arrow_like(
168    array: &ArrayRef,
169    pattern: &ArrayRef,
170    options: LikeOptions,
171    ctx: &mut ExecutionCtx,
172) -> VortexResult<ArrayRef> {
173    let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
174    let len = array.len();
175    assert_eq!(
176        array.len(),
177        pattern.len(),
178        "Arrow Like: length mismatch for {}",
179        array.encoding_id()
180    );
181
182    // convert the pattern to the preferred array datatype
183    let lhs = Datum::try_new(array, ctx)?;
184    let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type(), ctx)?;
185
186    let result = match (options.negated, options.case_insensitive) {
187        (false, false) => arrow_string::like::like(&lhs, &rhs)?,
188        (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
189        (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
190        (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
191    };
192
193    from_arrow_columnar(&result, len, nullable, ctx)
194}
195
196/// Variants of the LIKE filter that we know how to turn into a stats pruning predicate.
197#[derive(Debug, PartialEq)]
198pub(crate) enum LikeVariant<'a> {
199    Exact(Cow<'a, str>),
200    Prefix(Cow<'a, str>),
201}
202
203impl<'a> LikeVariant<'a> {
204    /// Parse a LIKE pattern string into its relevant variant
205    pub(crate) fn from_str(string: &'a str) -> Option<LikeVariant<'a>> {
206        let mut literal = None;
207        let mut chars = string.char_indices();
208
209        while let Some((idx, c)) = chars.next() {
210            match c {
211                '\\' => {
212                    let literal = literal.get_or_insert_with(|| string[..idx].to_string());
213                    match chars.next() {
214                        Some((_, escaped)) => literal.push(escaped),
215                        None => literal.push('\\'),
216                    }
217                }
218                '%' | '_' => {
219                    return match literal {
220                        Some(literal) => (!literal.is_empty())
221                            .then_some(LikeVariant::Prefix(Cow::Owned(literal))),
222                        None => {
223                            (idx != 0).then_some(LikeVariant::Prefix(Cow::Borrowed(&string[..idx])))
224                        }
225                    };
226                }
227                c => {
228                    if let Some(literal) = &mut literal {
229                        literal.push(c);
230                    }
231                }
232            }
233        }
234
235        Some(match literal {
236            Some(literal) => LikeVariant::Exact(Cow::Owned(literal)),
237            None => LikeVariant::Exact(Cow::Borrowed(string)),
238        })
239    }
240}
241
242#[cfg(test)]
243mod tests {
244    use std::borrow::Cow;
245
246    use crate::IntoArray;
247    use crate::VortexSessionExecute;
248    use crate::array_session;
249    use crate::arrays::BoolArray;
250    use crate::assert_arrays_eq;
251    use crate::dtype::DType;
252    use crate::dtype::Nullability;
253    use crate::expr::get_item;
254    use crate::expr::like;
255    use crate::expr::lit;
256    use crate::expr::not;
257    use crate::expr::not_ilike;
258    use crate::expr::root;
259    use crate::scalar_fn::fns::like::LikeVariant;
260
261    #[test]
262    fn invert_booleans() {
263        let not_expr = not(root());
264        let bools = BoolArray::from_iter([false, true, false, false, true, true]);
265        let mut ctx = array_session().create_execution_ctx();
266        assert_arrays_eq!(
267            bools.into_array().apply(&not_expr).unwrap(),
268            BoolArray::from_iter([true, false, true, true, false, false]),
269            &mut ctx
270        );
271    }
272
273    #[test]
274    fn dtype() {
275        let dtype = DType::Utf8(Nullability::NonNullable);
276        let like_expr = like(root(), lit("%test%"));
277        assert_eq!(
278            like_expr.return_dtype(&dtype).unwrap(),
279            DType::Bool(Nullability::NonNullable)
280        );
281    }
282
283    #[test]
284    fn signature() {
285        let like_expr = like(root(), lit("%test%"));
286        assert!(!like_expr.signature().is_null_sensitive());
287        assert!(!like_expr.signature().is_fallible());
288    }
289
290    #[test]
291    fn test_display() {
292        let expr = like(get_item("name", root()), lit("%john%"));
293        assert_eq!(expr.to_string(), "$.name like \"%john%\"");
294
295        let expr2 = not_ilike(root(), lit("test*"));
296        assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
297    }
298
299    fn assert_borrowed_exact(pattern: &str, expected: &str) {
300        let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
301            panic!("expected borrowed exact pattern");
302        };
303        assert!(matches!(actual, Cow::Borrowed(_)));
304        assert_eq!(actual.as_ref(), expected);
305    }
306
307    fn assert_owned_exact(pattern: &str, expected: &str) {
308        let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
309            panic!("expected owned exact pattern");
310        };
311        assert!(matches!(actual, Cow::Owned(_)));
312        assert_eq!(actual.as_ref(), expected);
313    }
314
315    fn assert_borrowed_prefix(pattern: &str, expected: &str) {
316        let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
317            panic!("expected borrowed prefix pattern");
318        };
319        assert!(matches!(actual, Cow::Borrowed(_)));
320        assert_eq!(actual.as_ref(), expected);
321    }
322
323    fn assert_owned_prefix(pattern: &str, expected: &str) {
324        let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
325            panic!("expected owned prefix pattern");
326        };
327        assert!(matches!(actual, Cow::Owned(_)));
328        assert_eq!(actual.as_ref(), expected);
329    }
330
331    #[test]
332    fn test_like_variant_borrowed_patterns() {
333        assert_borrowed_exact("simple", "simple");
334        assert_borrowed_prefix("prefix%", "prefix");
335        assert_borrowed_prefix("first%rest_stuff", "first");
336    }
337
338    #[test]
339    fn test_like_variant_escaped_patterns() {
340        assert_owned_prefix(r"\%%", "%");
341        assert_owned_prefix(r"\_%", "_");
342        assert_owned_prefix(r"\\%", "\\");
343        assert_owned_exact(r"\%", "%");
344        assert_owned_exact("trailing\\", "trailing\\");
345    }
346
347    #[test]
348    fn test_like_variant_unsupported_patterns() {
349        assert_eq!(LikeVariant::from_str("%suffix"), None);
350        assert_eq!(LikeVariant::from_str(r"%\%%"), None);
351        assert_eq!(LikeVariant::from_str("_pattern"), None);
352    }
353}