vortex_array/scalar_fn/fns/like/
mod.rs1mod kernel;
5
6use std::borrow::Cow;
7use std::fmt::Display;
8use std::fmt::Formatter;
9
10pub use kernel::*;
11use prost::Message;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_proto::expr as pb;
15use vortex_session::VortexSession;
16use vortex_session::registry::CachedId;
17
18use crate::ArrayRef;
19use crate::ExecutionCtx;
20use crate::arrow::Datum;
21use crate::arrow::from_arrow_columnar;
22use crate::dtype::DType;
23use crate::expr::Expression;
24use crate::expr::StatsCatalog;
25use crate::expr::and;
26use crate::expr::gt;
27use crate::expr::gt_eq;
28use crate::expr::lit;
29use crate::expr::lt;
30use crate::expr::or;
31use crate::scalar::StringLike;
32use crate::scalar_fn::Arity;
33use crate::scalar_fn::ChildName;
34use crate::scalar_fn::ExecutionArgs;
35use crate::scalar_fn::ScalarFnId;
36use crate::scalar_fn::ScalarFnVTable;
37use crate::scalar_fn::fns::literal::Literal;
38
39#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
41pub struct LikeOptions {
42 pub negated: bool,
43 pub case_insensitive: bool,
44}
45
46impl Display for LikeOptions {
47 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
48 if self.negated {
49 write!(f, "NOT ")?;
50 }
51 if self.case_insensitive {
52 write!(f, "ILIKE")
53 } else {
54 write!(f, "LIKE")
55 }
56 }
57}
58
59#[derive(Clone)]
61pub struct Like;
62
63impl ScalarFnVTable for Like {
64 type Options = LikeOptions;
65
66 fn id(&self) -> ScalarFnId {
67 static ID: CachedId = CachedId::new("vortex.like");
68 *ID
69 }
70
71 fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
72 Ok(Some(
73 pb::LikeOpts {
74 negated: instance.negated,
75 case_insensitive: instance.case_insensitive,
76 }
77 .encode_to_vec(),
78 ))
79 }
80
81 fn deserialize(
82 &self,
83 _metadata: &[u8],
84 _session: &VortexSession,
85 ) -> VortexResult<Self::Options> {
86 let opts = pb::LikeOpts::decode(_metadata)?;
87 Ok(LikeOptions {
88 negated: opts.negated,
89 case_insensitive: opts.case_insensitive,
90 })
91 }
92
93 fn arity(&self, _options: &Self::Options) -> Arity {
94 Arity::Exact(2)
95 }
96
97 fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
98 match child_idx {
99 0 => ChildName::from("child"),
100 1 => ChildName::from("pattern"),
101 _ => unreachable!("Invalid child index {} for Like expression", child_idx),
102 }
103 }
104
105 fn fmt_sql(
106 &self,
107 options: &Self::Options,
108 expr: &Expression,
109 f: &mut Formatter<'_>,
110 ) -> std::fmt::Result {
111 expr.child(0).fmt_sql(f)?;
112 if options.negated {
113 write!(f, " not")?;
114 }
115 if options.case_insensitive {
116 write!(f, " ilike ")?;
117 } else {
118 write!(f, " like ")?;
119 }
120 expr.child(1).fmt_sql(f)
121 }
122
123 fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
124 let input = &arg_dtypes[0];
125 let pattern = &arg_dtypes[1];
126
127 if !input.is_utf8() {
128 vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
129 }
130 if !pattern.is_utf8() {
131 vortex_bail!(
132 "LIKE expression requires UTF8 pattern dtype, got {}",
133 pattern
134 );
135 }
136
137 Ok(DType::Bool(
138 (input.is_nullable() || pattern.is_nullable()).into(),
139 ))
140 }
141
142 fn execute(
143 &self,
144 options: &Self::Options,
145 args: &dyn ExecutionArgs,
146 ctx: &mut ExecutionCtx,
147 ) -> VortexResult<ArrayRef> {
148 let child = args.get(0)?;
149 let pattern = args.get(1)?;
150
151 arrow_like(&child, &pattern, *options, ctx)
152 }
153
154 fn validity(
155 &self,
156 _options: &Self::Options,
157 expression: &Expression,
158 ) -> VortexResult<Option<Expression>> {
159 tracing::warn!("Computing validity for LIKE expression");
160 let child_validity = expression.child(0).validity()?;
161 let pattern_validity = expression.child(1).validity()?;
162 Ok(Some(and(child_validity, pattern_validity)))
163 }
164
165 fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
166 false
167 }
168
169 fn stat_falsification(
170 &self,
171 like_opts: &LikeOptions,
172 expr: &Expression,
173 catalog: &dyn StatsCatalog,
174 ) -> Option<Expression> {
175 if like_opts.negated || like_opts.case_insensitive {
179 return None;
180 }
181
182 let pat = expr.child(1).as_::<Literal>();
184
185 let pat_str = pat.as_utf8().value()?;
187
188 let src = expr.child(0).clone();
189 let src_min = src.stat_min(catalog)?;
190 let src_max = src.stat_max(catalog)?;
191
192 match LikeVariant::from_str(pat_str)? {
193 LikeVariant::Exact(text) => {
194 Some(or(
196 gt(src_min, lit(text.as_ref())),
197 lt(src_max, lit(text.as_ref())),
198 ))
199 }
200 LikeVariant::Prefix(prefix) => {
201 let succ = prefix.to_string().increment().ok()?;
203
204 Some(or(
205 gt_eq(src_min, lit(succ)),
206 lt(src_max, lit(prefix.as_ref())),
207 ))
208 }
209 }
210 }
211}
212
213pub(crate) fn arrow_like(
215 array: &ArrayRef,
216 pattern: &ArrayRef,
217 options: LikeOptions,
218 ctx: &mut ExecutionCtx,
219) -> VortexResult<ArrayRef> {
220 let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
221 let len = array.len();
222 assert_eq!(
223 array.len(),
224 pattern.len(),
225 "Arrow Like: length mismatch for {}",
226 array.encoding_id()
227 );
228
229 let lhs = Datum::try_new(array, ctx)?;
231 let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type(), ctx)?;
232
233 let result = match (options.negated, options.case_insensitive) {
234 (false, false) => arrow_string::like::like(&lhs, &rhs)?,
235 (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
236 (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
237 (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
238 };
239
240 from_arrow_columnar(&result, len, nullable, ctx)
241}
242
243#[derive(Debug, PartialEq)]
245pub(crate) enum LikeVariant<'a> {
246 Exact(Cow<'a, str>),
247 Prefix(Cow<'a, str>),
248}
249
250impl<'a> LikeVariant<'a> {
251 pub(crate) fn from_str(string: &'a str) -> Option<LikeVariant<'a>> {
253 let mut literal = None;
254 let mut chars = string.char_indices();
255
256 while let Some((idx, c)) = chars.next() {
257 match c {
258 '\\' => {
259 let literal = literal.get_or_insert_with(|| string[..idx].to_string());
260 match chars.next() {
261 Some((_, escaped)) => literal.push(escaped),
262 None => literal.push('\\'),
263 }
264 }
265 '%' | '_' => {
266 return match literal {
267 Some(literal) => (!literal.is_empty())
268 .then_some(LikeVariant::Prefix(Cow::Owned(literal))),
269 None => {
270 (idx != 0).then_some(LikeVariant::Prefix(Cow::Borrowed(&string[..idx])))
271 }
272 };
273 }
274 c => {
275 if let Some(literal) = &mut literal {
276 literal.push(c);
277 }
278 }
279 }
280 }
281
282 Some(match literal {
283 Some(literal) => LikeVariant::Exact(Cow::Owned(literal)),
284 None => LikeVariant::Exact(Cow::Borrowed(string)),
285 })
286 }
287}
288
289#[cfg(test)]
290mod tests {
291 use std::borrow::Cow;
292
293 use crate::IntoArray;
294 use crate::arrays::BoolArray;
295 use crate::assert_arrays_eq;
296 use crate::dtype::DType;
297 use crate::dtype::Nullability;
298 use crate::expr::col;
299 use crate::expr::get_item;
300 use crate::expr::ilike;
301 use crate::expr::like;
302 use crate::expr::lit;
303 use crate::expr::not;
304 use crate::expr::not_ilike;
305 use crate::expr::not_like;
306 use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
307 use crate::expr::root;
308 use crate::scalar_fn::fns::like::LikeVariant;
309
310 #[test]
311 fn invert_booleans() {
312 let not_expr = not(root());
313 let bools = BoolArray::from_iter([false, true, false, false, true, true]);
314 assert_arrays_eq!(
315 bools.into_array().apply(¬_expr).unwrap(),
316 BoolArray::from_iter([true, false, true, true, false, false])
317 );
318 }
319
320 #[test]
321 fn dtype() {
322 let dtype = DType::Utf8(Nullability::NonNullable);
323 let like_expr = like(root(), lit("%test%"));
324 assert_eq!(
325 like_expr.return_dtype(&dtype).unwrap(),
326 DType::Bool(Nullability::NonNullable)
327 );
328 }
329
330 #[test]
331 fn test_display() {
332 let expr = like(get_item("name", root()), lit("%john%"));
333 assert_eq!(expr.to_string(), "$.name like \"%john%\"");
334
335 let expr2 = not_ilike(root(), lit("test*"));
336 assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
337 }
338
339 fn assert_borrowed_exact(pattern: &str, expected: &str) {
340 let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
341 panic!("expected borrowed exact pattern");
342 };
343 assert!(matches!(actual, Cow::Borrowed(_)));
344 assert_eq!(actual.as_ref(), expected);
345 }
346
347 fn assert_owned_exact(pattern: &str, expected: &str) {
348 let Some(LikeVariant::Exact(actual)) = LikeVariant::from_str(pattern) else {
349 panic!("expected owned exact pattern");
350 };
351 assert!(matches!(actual, Cow::Owned(_)));
352 assert_eq!(actual.as_ref(), expected);
353 }
354
355 fn assert_borrowed_prefix(pattern: &str, expected: &str) {
356 let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
357 panic!("expected borrowed prefix pattern");
358 };
359 assert!(matches!(actual, Cow::Borrowed(_)));
360 assert_eq!(actual.as_ref(), expected);
361 }
362
363 fn assert_owned_prefix(pattern: &str, expected: &str) {
364 let Some(LikeVariant::Prefix(actual)) = LikeVariant::from_str(pattern) else {
365 panic!("expected owned prefix pattern");
366 };
367 assert!(matches!(actual, Cow::Owned(_)));
368 assert_eq!(actual.as_ref(), expected);
369 }
370
371 #[test]
372 fn test_like_variant_borrowed_patterns() {
373 assert_borrowed_exact("simple", "simple");
374 assert_borrowed_prefix("prefix%", "prefix");
375 assert_borrowed_prefix("first%rest_stuff", "first");
376 }
377
378 #[test]
379 fn test_like_variant_escaped_patterns() {
380 assert_owned_prefix(r"\%%", "%");
381 assert_owned_prefix(r"\_%", "_");
382 assert_owned_prefix(r"\\%", "\\");
383 assert_owned_exact(r"\%", "%");
384 assert_owned_exact("trailing\\", "trailing\\");
385 }
386
387 #[test]
388 fn test_like_variant_unsupported_patterns() {
389 assert_eq!(LikeVariant::from_str("%suffix"), None);
390 assert_eq!(LikeVariant::from_str(r"%\%%"), None);
391 assert_eq!(LikeVariant::from_str("_pattern"), None);
392 }
393
394 #[test]
395 fn test_like_pushdown() {
396 let catalog = TrackingStatsCatalog::default();
399
400 let pruning_expr = like(col("a"), lit("prefix%"))
401 .stat_falsification(&catalog)
402 .expect("LIKE stat falsification");
403
404 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
405
406 let pruning_expr = like(col("a"), lit(r"\%%"))
407 .stat_falsification(&catalog)
408 .expect("LIKE stat falsification");
409 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "&") or ($.a_max < "%"))"#);
410
411 let pruning_expr = like(col("a"), lit("pref%ix%"))
413 .stat_falsification(&catalog)
414 .expect("LIKE stat falsification");
415 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
416
417 let pruning_expr = like(col("a"), lit("pref_ix_"))
418 .stat_falsification(&catalog)
419 .expect("LIKE stat falsification");
420 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
421
422 let pruning_expr = like(col("a"), lit("exactly"))
424 .stat_falsification(&catalog)
425 .expect("LIKE stat falsification");
426 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
427
428 let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
430 assert_eq!(pruning_expr, None);
431
432 assert_eq!(
434 None,
435 not_like(col("a"), lit("a")).stat_falsification(&catalog)
436 );
437 assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
438 }
439}