vortex_array/scalar_fn/fns/like/
mod.rs1mod kernel;
5
6use std::borrow::Cow;
7use std::fmt::Display;
8use std::fmt::Formatter;
9
10pub use kernel::*;
11use prost::Message;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_proto::expr as pb;
15use vortex_session::VortexSession;
16
17use crate::ArrayRef;
18use crate::ExecutionCtx;
19use crate::arrow::Datum;
20use crate::arrow::from_arrow_array_with_len;
21use crate::dtype::DType;
22use crate::expr::Expression;
23use crate::expr::StatsCatalog;
24use crate::expr::and;
25use crate::expr::gt;
26use crate::expr::gt_eq;
27use crate::expr::lit;
28use crate::expr::lt;
29use crate::expr::or;
30use crate::scalar::StringLike;
31use crate::scalar_fn::Arity;
32use crate::scalar_fn::ChildName;
33use crate::scalar_fn::ExecutionArgs;
34use crate::scalar_fn::ScalarFnId;
35use crate::scalar_fn::ScalarFnVTable;
36use crate::scalar_fn::fns::literal::Literal;
37
38#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
40pub struct LikeOptions {
41 pub negated: bool,
42 pub case_insensitive: bool,
43}
44
45impl Display for LikeOptions {
46 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
47 if self.negated {
48 write!(f, "NOT ")?;
49 }
50 if self.case_insensitive {
51 write!(f, "ILIKE")
52 } else {
53 write!(f, "LIKE")
54 }
55 }
56}
57
58#[derive(Clone)]
60pub struct Like;
61
62impl ScalarFnVTable for Like {
63 type Options = LikeOptions;
64
65 fn id(&self) -> ScalarFnId {
66 ScalarFnId::new("vortex.like")
67 }
68
69 fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
70 Ok(Some(
71 pb::LikeOpts {
72 negated: instance.negated,
73 case_insensitive: instance.case_insensitive,
74 }
75 .encode_to_vec(),
76 ))
77 }
78
79 fn deserialize(
80 &self,
81 _metadata: &[u8],
82 _session: &VortexSession,
83 ) -> VortexResult<Self::Options> {
84 let opts = pb::LikeOpts::decode(_metadata)?;
85 Ok(LikeOptions {
86 negated: opts.negated,
87 case_insensitive: opts.case_insensitive,
88 })
89 }
90
91 fn arity(&self, _options: &Self::Options) -> Arity {
92 Arity::Exact(2)
93 }
94
95 fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
96 match child_idx {
97 0 => ChildName::from("child"),
98 1 => ChildName::from("pattern"),
99 _ => unreachable!("Invalid child index {} for Like expression", child_idx),
100 }
101 }
102
103 fn fmt_sql(
104 &self,
105 options: &Self::Options,
106 expr: &Expression,
107 f: &mut Formatter<'_>,
108 ) -> std::fmt::Result {
109 expr.child(0).fmt_sql(f)?;
110 if options.negated {
111 write!(f, " not")?;
112 }
113 if options.case_insensitive {
114 write!(f, " ilike ")?;
115 } else {
116 write!(f, " like ")?;
117 }
118 expr.child(1).fmt_sql(f)
119 }
120
121 fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
122 let input = &arg_dtypes[0];
123 let pattern = &arg_dtypes[1];
124
125 if !input.is_utf8() {
126 vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
127 }
128 if !pattern.is_utf8() {
129 vortex_bail!(
130 "LIKE expression requires UTF8 pattern dtype, got {}",
131 pattern
132 );
133 }
134
135 Ok(DType::Bool(
136 (input.is_nullable() || pattern.is_nullable()).into(),
137 ))
138 }
139
140 fn execute(
141 &self,
142 options: &Self::Options,
143 args: &dyn ExecutionArgs,
144 ctx: &mut ExecutionCtx,
145 ) -> VortexResult<ArrayRef> {
146 let child = args.get(0)?;
147 let pattern = args.get(1)?;
148
149 arrow_like(&child, &pattern, *options, ctx)
150 }
151
152 fn validity(
153 &self,
154 _options: &Self::Options,
155 expression: &Expression,
156 ) -> VortexResult<Option<Expression>> {
157 tracing::warn!("Computing validity for LIKE expression");
158 let child_validity = expression.child(0).validity()?;
159 let pattern_validity = expression.child(1).validity()?;
160 Ok(Some(and(child_validity, pattern_validity)))
161 }
162
163 fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
164 false
165 }
166
167 fn stat_falsification(
168 &self,
169 like_opts: &LikeOptions,
170 expr: &Expression,
171 catalog: &dyn StatsCatalog,
172 ) -> Option<Expression> {
173 if like_opts.negated || like_opts.case_insensitive {
177 return None;
178 }
179
180 let pat = expr.child(1).as_::<Literal>();
182
183 let pat_str = pat.as_utf8().value()?;
185
186 let src = expr.child(0).clone();
187 let src_min = src.stat_min(catalog)?;
188 let src_max = src.stat_max(catalog)?;
189
190 match LikeVariant::from_str(pat_str)? {
191 LikeVariant::Exact(text) => {
192 Some(or(
194 gt(src_min, lit(text.as_ref())),
195 lt(src_max, lit(text.as_ref())),
196 ))
197 }
198 LikeVariant::Prefix(prefix) => {
199 let succ = prefix.to_string().increment().ok()?;
201
202 Some(or(
203 gt_eq(src_min, lit(succ)),
204 lt(src_max, lit(prefix.as_ref())),
205 ))
206 }
207 }
208 }
209}
210
211pub(crate) fn arrow_like(
213 array: &ArrayRef,
214 pattern: &ArrayRef,
215 options: LikeOptions,
216 ctx: &mut ExecutionCtx,
217) -> VortexResult<ArrayRef> {
218 let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
219 let len = array.len();
220 assert_eq!(
221 array.len(),
222 pattern.len(),
223 "Arrow Like: length mismatch for {}",
224 array.encoding_id()
225 );
226
227 let lhs = Datum::try_new(array, ctx)?;
229 let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type(), ctx)?;
230
231 let result = match (options.negated, options.case_insensitive) {
232 (false, false) => arrow_string::like::like(&lhs, &rhs)?,
233 (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
234 (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
235 (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
236 };
237
238 from_arrow_array_with_len(&result, len, nullable)
239}
240
241#[derive(Debug, PartialEq)]
243enum LikeVariant<'a> {
244 Exact(Cow<'a, str>),
245 Prefix(Cow<'a, str>),
246}
247
248impl<'a> LikeVariant<'a> {
249 fn from_str(string: &'a str) -> Option<LikeVariant<'a>> {
251 let mut literal = None;
252 let mut chars = string.char_indices();
253
254 while let Some((idx, c)) = chars.next() {
255 match c {
256 '\\' => {
257 let literal = literal.get_or_insert_with(|| string[..idx].to_string());
258 match chars.next() {
259 Some((_, escaped)) => literal.push(escaped),
260 None => literal.push('\\'),
261 }
262 }
263 '%' | '_' => {
264 return match literal {
265 Some(literal) => (!literal.is_empty())
266 .then_some(LikeVariant::Prefix(Cow::Owned(literal))),
267 None => {
268 (idx != 0).then_some(LikeVariant::Prefix(Cow::Borrowed(&string[..idx])))
269 }
270 };
271 }
272 c => {
273 if let Some(literal) = &mut literal {
274 literal.push(c);
275 }
276 }
277 }
278 }
279
280 Some(match literal {
281 Some(literal) => LikeVariant::Exact(Cow::Owned(literal)),
282 None => LikeVariant::Exact(Cow::Borrowed(string)),
283 })
284 }
285}
286
287#[cfg(test)]
288mod tests {
289 use std::borrow::Cow;
290
291 use crate::IntoArray;
292 use crate::arrays::BoolArray;
293 use crate::assert_arrays_eq;
294 use crate::dtype::DType;
295 use crate::dtype::Nullability;
296 use crate::expr::col;
297 use crate::expr::get_item;
298 use crate::expr::ilike;
299 use crate::expr::like;
300 use crate::expr::lit;
301 use crate::expr::not;
302 use crate::expr::not_ilike;
303 use crate::expr::not_like;
304 use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
305 use crate::expr::root;
306 use crate::scalar_fn::fns::like::LikeVariant;
307
308 #[test]
309 fn invert_booleans() {
310 let not_expr = not(root());
311 let bools = BoolArray::from_iter([false, true, false, false, true, true]);
312 assert_arrays_eq!(
313 bools.into_array().apply(¬_expr).unwrap(),
314 BoolArray::from_iter([true, false, true, true, false, false])
315 );
316 }
317
318 #[test]
319 fn dtype() {
320 let dtype = DType::Utf8(Nullability::NonNullable);
321 let like_expr = like(root(), lit("%test%"));
322 assert_eq!(
323 like_expr.return_dtype(&dtype).unwrap(),
324 DType::Bool(Nullability::NonNullable)
325 );
326 }
327
328 #[test]
329 fn test_display() {
330 let expr = like(get_item("name", root()), lit("%john%"));
331 assert_eq!(expr.to_string(), "$.name like \"%john%\"");
332
333 let expr2 = not_ilike(root(), lit("test*"));
334 assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
335 }
336
337 fn assert_borrowed_exact(pattern: &str, expected: &str) {
338 let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
339 panic!("expected borrowed exact pattern");
340 };
341 assert!(matches!(actual, Cow::Borrowed(_)));
342 assert_eq!(actual.as_ref(), expected);
343 }
344
345 fn assert_owned_exact(pattern: &str, expected: &str) {
346 let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
347 panic!("expected owned exact pattern");
348 };
349 assert!(matches!(actual, Cow::Owned(_)));
350 assert_eq!(actual.as_ref(), expected);
351 }
352
353 fn assert_borrowed_prefix(pattern: &str, expected: &str) {
354 let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
355 panic!("expected borrowed prefix pattern");
356 };
357 assert!(matches!(actual, Cow::Borrowed(_)));
358 assert_eq!(actual.as_ref(), expected);
359 }
360
361 fn assert_owned_prefix(pattern: &str, expected: &str) {
362 let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
363 panic!("expected owned prefix pattern");
364 };
365 assert!(matches!(actual, Cow::Owned(_)));
366 assert_eq!(actual.as_ref(), expected);
367 }
368
369 #[test]
370 fn test_like_variant_borrowed_patterns() {
371 assert_borrowed_exact("simple", "simple");
372 assert_borrowed_prefix("prefix%", "prefix");
373 assert_borrowed_prefix("first%rest_stuff", "first");
374 }
375
376 #[test]
377 fn test_like_variant_escaped_patterns() {
378 assert_owned_prefix(r"\%%", "%");
379 assert_owned_prefix(r"\_%", "_");
380 assert_owned_prefix(r"\\%", "\\");
381 assert_owned_exact(r"\%", "%");
382 assert_owned_exact("trailing\\", "trailing\\");
383 }
384
385 #[test]
386 fn test_like_variant_unsupported_patterns() {
387 assert_eq!(LikeVariant::from_str("%suffix"), None);
388 assert_eq!(LikeVariant::from_str(r"%\%%"), None);
389 assert_eq!(LikeVariant::from_str("_pattern"), None);
390 }
391
392 #[test]
393 fn test_like_pushdown() {
394 let catalog = TrackingStatsCatalog::default();
397
398 let pruning_expr = like(col("a"), lit("prefix%"))
399 .stat_falsification(&catalog)
400 .expect("LIKE stat falsification");
401
402 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
403
404 let pruning_expr = like(col("a"), lit(r"\%%"))
405 .stat_falsification(&catalog)
406 .expect("LIKE stat falsification");
407 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "&") or ($.a_max < "%"))"#);
408
409 let pruning_expr = like(col("a"), lit("pref%ix%"))
411 .stat_falsification(&catalog)
412 .expect("LIKE stat falsification");
413 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
414
415 let pruning_expr = like(col("a"), lit("pref_ix_"))
416 .stat_falsification(&catalog)
417 .expect("LIKE stat falsification");
418 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
419
420 let pruning_expr = like(col("a"), lit("exactly"))
422 .stat_falsification(&catalog)
423 .expect("LIKE stat falsification");
424 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
425
426 let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
428 assert_eq!(pruning_expr, None);
429
430 assert_eq!(
432 None,
433 not_like(col("a"), lit("a")).stat_falsification(&catalog)
434 );
435 assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
436 }
437}