vortex_array/expr/exprs/like/
mod.rs1mod kernel;
5
6use std::fmt::Display;
7use std::fmt::Formatter;
8
9pub use kernel::*;
10use prost::Message;
11use vortex_dtype::DType;
12use vortex_error::VortexResult;
13use vortex_error::vortex_bail;
14use vortex_error::vortex_err;
15use vortex_proto::expr as pb;
16use vortex_session::VortexSession;
17
18use crate::Array;
19use crate::ArrayRef;
20use crate::arrow::Datum;
21use crate::arrow::from_arrow_array_with_len;
22use crate::expr::Arity;
23use crate::expr::ChildName;
24use crate::expr::ExecutionArgs;
25use crate::expr::ExprId;
26use crate::expr::Expression;
27use crate::expr::Literal;
28use crate::expr::StatsCatalog;
29use crate::expr::VTable;
30use crate::expr::VTableExt;
31use crate::expr::and;
32use crate::expr::gt;
33use crate::expr::gt_eq;
34use crate::expr::lit;
35use crate::expr::lt;
36use crate::expr::or;
37use crate::scalar::StringLike;
38
39#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
41pub struct LikeOptions {
42 pub negated: bool,
43 pub case_insensitive: bool,
44}
45
46impl Display for LikeOptions {
47 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
48 if self.negated {
49 write!(f, "NOT ")?;
50 }
51 if self.case_insensitive {
52 write!(f, "ILIKE")
53 } else {
54 write!(f, "LIKE")
55 }
56 }
57}
58
59pub struct Like;
61
62impl VTable for Like {
63 type Options = LikeOptions;
64
65 fn id(&self) -> ExprId {
66 ExprId::from("vortex.like")
67 }
68
69 fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
70 Ok(Some(
71 pb::LikeOpts {
72 negated: instance.negated,
73 case_insensitive: instance.case_insensitive,
74 }
75 .encode_to_vec(),
76 ))
77 }
78
79 fn deserialize(
80 &self,
81 _metadata: &[u8],
82 _session: &VortexSession,
83 ) -> VortexResult<Self::Options> {
84 let opts = pb::LikeOpts::decode(_metadata)?;
85 Ok(LikeOptions {
86 negated: opts.negated,
87 case_insensitive: opts.case_insensitive,
88 })
89 }
90
91 fn arity(&self, _options: &Self::Options) -> Arity {
92 Arity::Exact(2)
93 }
94
95 fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
96 match child_idx {
97 0 => ChildName::from("child"),
98 1 => ChildName::from("pattern"),
99 _ => unreachable!("Invalid child index {} for Like expression", child_idx),
100 }
101 }
102
103 fn fmt_sql(
104 &self,
105 options: &Self::Options,
106 expr: &Expression,
107 f: &mut Formatter<'_>,
108 ) -> std::fmt::Result {
109 expr.child(0).fmt_sql(f)?;
110 if options.negated {
111 write!(f, " not")?;
112 }
113 if options.case_insensitive {
114 write!(f, " ilike ")?;
115 } else {
116 write!(f, " like ")?;
117 }
118 expr.child(1).fmt_sql(f)
119 }
120
121 fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
122 let input = &arg_dtypes[0];
123 let pattern = &arg_dtypes[1];
124
125 if !input.is_utf8() {
126 vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
127 }
128 if !pattern.is_utf8() {
129 vortex_bail!(
130 "LIKE expression requires UTF8 pattern dtype, got {}",
131 pattern
132 );
133 }
134
135 Ok(DType::Bool(
136 (input.is_nullable() || pattern.is_nullable()).into(),
137 ))
138 }
139
140 fn execute(&self, options: &Self::Options, args: ExecutionArgs) -> VortexResult<ArrayRef> {
141 let [child, pattern]: [ArrayRef; _] = args
142 .inputs
143 .try_into()
144 .map_err(|_| vortex_err!("Wrong argument count"))?;
145
146 arrow_like(&child, &pattern, *options)
147 }
148
149 fn validity(
150 &self,
151 _options: &Self::Options,
152 expression: &Expression,
153 ) -> VortexResult<Option<Expression>> {
154 tracing::warn!("Computing validity for LIKE expression");
155 let child_validity = expression.child(0).validity()?;
156 let pattern_validity = expression.child(1).validity()?;
157 Ok(Some(and(child_validity, pattern_validity)))
158 }
159
160 fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
161 false
162 }
163
164 fn stat_falsification(
165 &self,
166 like_opts: &LikeOptions,
167 expr: &Expression,
168 catalog: &dyn StatsCatalog,
169 ) -> Option<Expression> {
170 if like_opts.negated || like_opts.case_insensitive {
174 return None;
175 }
176
177 let pat = expr.child(1).as_::<Literal>();
179
180 let pat_str = pat.as_utf8().value()?;
182
183 let src = expr.child(0).clone();
184 let src_min = src.stat_min(catalog)?;
185 let src_max = src.stat_max(catalog)?;
186
187 match LikeVariant::from_str(pat_str)? {
188 LikeVariant::Exact(text) => {
189 Some(or(gt(src_min, lit(text)), lt(src_max, lit(text))))
191 }
192 LikeVariant::Prefix(prefix) => {
193 let succ = prefix.to_string().increment().ok()?;
195
196 Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix))))
197 }
198 }
199 }
200}
201
202pub(crate) fn arrow_like(
204 array: &dyn Array,
205 pattern: &dyn Array,
206 options: LikeOptions,
207) -> VortexResult<ArrayRef> {
208 let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
209 let len = array.len();
210 assert_eq!(
211 array.len(),
212 pattern.len(),
213 "Arrow Like: length mismatch for {}",
214 array.encoding_id()
215 );
216
217 let lhs = Datum::try_new(array)?;
219 let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type())?;
220
221 let result = match (options.negated, options.case_insensitive) {
222 (false, false) => arrow_string::like::like(&lhs, &rhs)?,
223 (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
224 (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
225 (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
226 };
227
228 from_arrow_array_with_len(&result, len, nullable)
229}
230
231#[derive(Debug, PartialEq)]
233enum LikeVariant<'a> {
234 Exact(&'a str),
235 Prefix(&'a str),
236}
237
238impl<'a> LikeVariant<'a> {
239 fn from_str(string: &str) -> Option<LikeVariant<'_>> {
241 let Some(wildcard_pos) = string.find(['%', '_']) else {
242 return Some(LikeVariant::Exact(string));
243 };
244
245 if wildcard_pos == 0 {
247 return None;
248 }
249
250 let prefix = &string[..wildcard_pos];
251 Some(LikeVariant::Prefix(prefix))
252 }
253}
254
255pub fn like(child: Expression, pattern: Expression) -> Expression {
256 Like.new_expr(
257 LikeOptions {
258 negated: false,
259 case_insensitive: false,
260 },
261 [child, pattern],
262 )
263}
264
265pub fn ilike(child: Expression, pattern: Expression) -> Expression {
266 Like.new_expr(
267 LikeOptions {
268 negated: false,
269 case_insensitive: true,
270 },
271 [child, pattern],
272 )
273}
274
275pub fn not_like(child: Expression, pattern: Expression) -> Expression {
276 Like.new_expr(
277 LikeOptions {
278 negated: true,
279 case_insensitive: false,
280 },
281 [child, pattern],
282 )
283}
284
285pub fn not_ilike(child: Expression, pattern: Expression) -> Expression {
286 Like.new_expr(
287 LikeOptions {
288 negated: true,
289 case_insensitive: true,
290 },
291 [child, pattern],
292 )
293}
294
295#[cfg(test)]
296mod tests {
297 use vortex_dtype::DType;
298 use vortex_dtype::Nullability;
299
300 use crate::arrays::BoolArray;
301 use crate::assert_arrays_eq;
302 use crate::expr::col;
303 use crate::expr::exprs::get_item::get_item;
304 use crate::expr::exprs::like::LikeVariant;
305 use crate::expr::exprs::like::like;
306 use crate::expr::exprs::literal::lit;
307 use crate::expr::exprs::not::not;
308 use crate::expr::exprs::root::root;
309 use crate::expr::ilike;
310 use crate::expr::not_ilike;
311 use crate::expr::not_like;
312 use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
313
314 #[test]
315 fn invert_booleans() {
316 let not_expr = not(root());
317 let bools = BoolArray::from_iter([false, true, false, false, true, true]);
318 assert_arrays_eq!(
319 bools.to_array().apply(¬_expr).unwrap(),
320 BoolArray::from_iter([true, false, true, true, false, false])
321 );
322 }
323
324 #[test]
325 fn dtype() {
326 let dtype = DType::Utf8(Nullability::NonNullable);
327 let like_expr = like(root(), lit("%test%"));
328 assert_eq!(
329 like_expr.return_dtype(&dtype).unwrap(),
330 DType::Bool(Nullability::NonNullable)
331 );
332 }
333
334 #[test]
335 fn test_display() {
336 let expr = like(get_item("name", root()), lit("%john%"));
337 assert_eq!(expr.to_string(), "$.name like \"%john%\"");
338
339 let expr2 = not_ilike(root(), lit("test*"));
340 assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
341 }
342
343 #[test]
344 fn test_like_variant() {
345 assert_eq!(
347 LikeVariant::from_str("simple"),
348 Some(LikeVariant::Exact("simple"))
349 );
350 assert_eq!(
351 LikeVariant::from_str("prefix%"),
352 Some(LikeVariant::Prefix("prefix"))
353 );
354 assert_eq!(
355 LikeVariant::from_str("first%rest_stuff"),
356 Some(LikeVariant::Prefix("first"))
357 );
358
359 assert_eq!(LikeVariant::from_str("%suffix"), None);
361 assert_eq!(LikeVariant::from_str("_pattern"), None);
362 }
363
364 #[test]
365 fn test_like_pushdown() {
366 let catalog = TrackingStatsCatalog::default();
369
370 let pruning_expr = like(col("a"), lit("prefix%"))
371 .stat_falsification(&catalog)
372 .expect("LIKE stat falsification");
373
374 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
375
376 let pruning_expr = like(col("a"), lit("pref%ix%"))
378 .stat_falsification(&catalog)
379 .expect("LIKE stat falsification");
380 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
381
382 let pruning_expr = like(col("a"), lit("pref_ix_"))
383 .stat_falsification(&catalog)
384 .expect("LIKE stat falsification");
385 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
386
387 let pruning_expr = like(col("a"), lit("exactly"))
389 .stat_falsification(&catalog)
390 .expect("LIKE stat falsification");
391 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
392
393 let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
395 assert_eq!(pruning_expr, None);
396
397 assert_eq!(
399 None,
400 not_like(col("a"), lit("a")).stat_falsification(&catalog)
401 );
402 assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
403 }
404}