vortex_array/scalar_fn/fns/like/
mod.rs1mod kernel;
5
6use std::fmt::Display;
7use std::fmt::Formatter;
8
9pub use kernel::*;
10use prost::Message;
11use vortex_error::VortexResult;
12use vortex_error::vortex_bail;
13use vortex_proto::expr as pb;
14use vortex_session::VortexSession;
15
16use crate::ArrayRef;
17use crate::ExecutionCtx;
18use crate::arrow::Datum;
19use crate::arrow::from_arrow_array_with_len;
20use crate::dtype::DType;
21use crate::expr::Expression;
22use crate::expr::StatsCatalog;
23use crate::expr::and;
24use crate::expr::gt;
25use crate::expr::gt_eq;
26use crate::expr::lit;
27use crate::expr::lt;
28use crate::expr::or;
29use crate::scalar::StringLike;
30use crate::scalar_fn::Arity;
31use crate::scalar_fn::ChildName;
32use crate::scalar_fn::ExecutionArgs;
33use crate::scalar_fn::ScalarFnId;
34use crate::scalar_fn::ScalarFnVTable;
35use crate::scalar_fn::fns::literal::Literal;
36
37#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Hash)]
39pub struct LikeOptions {
40 pub negated: bool,
41 pub case_insensitive: bool,
42}
43
44impl Display for LikeOptions {
45 fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
46 if self.negated {
47 write!(f, "NOT ")?;
48 }
49 if self.case_insensitive {
50 write!(f, "ILIKE")
51 } else {
52 write!(f, "LIKE")
53 }
54 }
55}
56
57#[derive(Clone)]
59pub struct Like;
60
61impl ScalarFnVTable for Like {
62 type Options = LikeOptions;
63
64 fn id(&self) -> ScalarFnId {
65 ScalarFnId::from("vortex.like")
66 }
67
68 fn serialize(&self, instance: &Self::Options) -> VortexResult<Option<Vec<u8>>> {
69 Ok(Some(
70 pb::LikeOpts {
71 negated: instance.negated,
72 case_insensitive: instance.case_insensitive,
73 }
74 .encode_to_vec(),
75 ))
76 }
77
78 fn deserialize(
79 &self,
80 _metadata: &[u8],
81 _session: &VortexSession,
82 ) -> VortexResult<Self::Options> {
83 let opts = pb::LikeOpts::decode(_metadata)?;
84 Ok(LikeOptions {
85 negated: opts.negated,
86 case_insensitive: opts.case_insensitive,
87 })
88 }
89
90 fn arity(&self, _options: &Self::Options) -> Arity {
91 Arity::Exact(2)
92 }
93
94 fn child_name(&self, _instance: &Self::Options, child_idx: usize) -> ChildName {
95 match child_idx {
96 0 => ChildName::from("child"),
97 1 => ChildName::from("pattern"),
98 _ => unreachable!("Invalid child index {} for Like expression", child_idx),
99 }
100 }
101
102 fn fmt_sql(
103 &self,
104 options: &Self::Options,
105 expr: &Expression,
106 f: &mut Formatter<'_>,
107 ) -> std::fmt::Result {
108 expr.child(0).fmt_sql(f)?;
109 if options.negated {
110 write!(f, " not")?;
111 }
112 if options.case_insensitive {
113 write!(f, " ilike ")?;
114 } else {
115 write!(f, " like ")?;
116 }
117 expr.child(1).fmt_sql(f)
118 }
119
120 fn return_dtype(&self, _options: &Self::Options, arg_dtypes: &[DType]) -> VortexResult<DType> {
121 let input = &arg_dtypes[0];
122 let pattern = &arg_dtypes[1];
123
124 if !input.is_utf8() {
125 vortex_bail!("LIKE expression requires UTF8 input dtype, got {}", input);
126 }
127 if !pattern.is_utf8() {
128 vortex_bail!(
129 "LIKE expression requires UTF8 pattern dtype, got {}",
130 pattern
131 );
132 }
133
134 Ok(DType::Bool(
135 (input.is_nullable() || pattern.is_nullable()).into(),
136 ))
137 }
138
139 fn execute(
140 &self,
141 options: &Self::Options,
142 args: &dyn ExecutionArgs,
143 _ctx: &mut ExecutionCtx,
144 ) -> VortexResult<ArrayRef> {
145 let child = args.get(0)?;
146 let pattern = args.get(1)?;
147
148 arrow_like(&child, &pattern, *options)
149 }
150
151 fn validity(
152 &self,
153 _options: &Self::Options,
154 expression: &Expression,
155 ) -> VortexResult<Option<Expression>> {
156 tracing::warn!("Computing validity for LIKE expression");
157 let child_validity = expression.child(0).validity()?;
158 let pattern_validity = expression.child(1).validity()?;
159 Ok(Some(and(child_validity, pattern_validity)))
160 }
161
162 fn is_null_sensitive(&self, _instance: &Self::Options) -> bool {
163 false
164 }
165
166 fn stat_falsification(
167 &self,
168 like_opts: &LikeOptions,
169 expr: &Expression,
170 catalog: &dyn StatsCatalog,
171 ) -> Option<Expression> {
172 if like_opts.negated || like_opts.case_insensitive {
176 return None;
177 }
178
179 let pat = expr.child(1).as_::<Literal>();
181
182 let pat_str = pat.as_utf8().value()?;
184
185 let src = expr.child(0).clone();
186 let src_min = src.stat_min(catalog)?;
187 let src_max = src.stat_max(catalog)?;
188
189 match LikeVariant::from_str(pat_str)? {
190 LikeVariant::Exact(text) => {
191 Some(or(gt(src_min, lit(text)), lt(src_max, lit(text))))
193 }
194 LikeVariant::Prefix(prefix) => {
195 let succ = prefix.to_string().increment().ok()?;
197
198 Some(or(gt_eq(src_min, lit(succ)), lt(src_max, lit(prefix))))
199 }
200 }
201 }
202}
203
204pub(crate) fn arrow_like(
206 array: &ArrayRef,
207 pattern: &ArrayRef,
208 options: LikeOptions,
209) -> VortexResult<ArrayRef> {
210 let nullable = array.dtype().is_nullable() | pattern.dtype().is_nullable();
211 let len = array.len();
212 assert_eq!(
213 array.len(),
214 pattern.len(),
215 "Arrow Like: length mismatch for {}",
216 array.encoding_id()
217 );
218
219 let lhs = Datum::try_new(array)?;
221 let rhs = Datum::try_new_with_target_datatype(pattern, lhs.data_type())?;
222
223 let result = match (options.negated, options.case_insensitive) {
224 (false, false) => arrow_string::like::like(&lhs, &rhs)?,
225 (true, false) => arrow_string::like::nlike(&lhs, &rhs)?,
226 (false, true) => arrow_string::like::ilike(&lhs, &rhs)?,
227 (true, true) => arrow_string::like::nilike(&lhs, &rhs)?,
228 };
229
230 from_arrow_array_with_len(&result, len, nullable)
231}
232
233#[derive(Debug, PartialEq)]
235enum LikeVariant<'a> {
236 Exact(&'a str),
237 Prefix(&'a str),
238}
239
240impl<'a> LikeVariant<'a> {
241 fn from_str(string: &str) -> Option<LikeVariant<'_>> {
243 let Some(wildcard_pos) = string.find(['%', '_']) else {
244 return Some(LikeVariant::Exact(string));
245 };
246
247 if wildcard_pos == 0 {
249 return None;
250 }
251
252 let prefix = &string[..wildcard_pos];
253 Some(LikeVariant::Prefix(prefix))
254 }
255}
256
257#[cfg(test)]
258mod tests {
259 use crate::IntoArray;
260 use crate::arrays::BoolArray;
261 use crate::assert_arrays_eq;
262 use crate::dtype::DType;
263 use crate::dtype::Nullability;
264 use crate::expr::col;
265 use crate::expr::get_item;
266 use crate::expr::ilike;
267 use crate::expr::like;
268 use crate::expr::lit;
269 use crate::expr::not;
270 use crate::expr::not_ilike;
271 use crate::expr::not_like;
272 use crate::expr::pruning::pruning_expr::TrackingStatsCatalog;
273 use crate::expr::root;
274 use crate::scalar_fn::fns::like::LikeVariant;
275
276 #[test]
277 fn invert_booleans() {
278 let not_expr = not(root());
279 let bools = BoolArray::from_iter([false, true, false, false, true, true]);
280 assert_arrays_eq!(
281 bools.into_array().apply(¬_expr).unwrap(),
282 BoolArray::from_iter([true, false, true, true, false, false])
283 );
284 }
285
286 #[test]
287 fn dtype() {
288 let dtype = DType::Utf8(Nullability::NonNullable);
289 let like_expr = like(root(), lit("%test%"));
290 assert_eq!(
291 like_expr.return_dtype(&dtype).unwrap(),
292 DType::Bool(Nullability::NonNullable)
293 );
294 }
295
296 #[test]
297 fn test_display() {
298 let expr = like(get_item("name", root()), lit("%john%"));
299 assert_eq!(expr.to_string(), "$.name like \"%john%\"");
300
301 let expr2 = not_ilike(root(), lit("test*"));
302 assert_eq!(expr2.to_string(), "$ not ilike \"test*\"");
303 }
304
305 #[test]
306 fn test_like_variant() {
307 assert_eq!(
309 LikeVariant::from_str("simple"),
310 Some(LikeVariant::Exact("simple"))
311 );
312 assert_eq!(
313 LikeVariant::from_str("prefix%"),
314 Some(LikeVariant::Prefix("prefix"))
315 );
316 assert_eq!(
317 LikeVariant::from_str("first%rest_stuff"),
318 Some(LikeVariant::Prefix("first"))
319 );
320
321 assert_eq!(LikeVariant::from_str("%suffix"), None);
323 assert_eq!(LikeVariant::from_str("_pattern"), None);
324 }
325
326 #[test]
327 fn test_like_pushdown() {
328 let catalog = TrackingStatsCatalog::default();
331
332 let pruning_expr = like(col("a"), lit("prefix%"))
333 .stat_falsification(&catalog)
334 .expect("LIKE stat falsification");
335
336 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "prefiy") or ($.a_max < "prefix"))"#);
337
338 let pruning_expr = like(col("a"), lit("pref%ix%"))
340 .stat_falsification(&catalog)
341 .expect("LIKE stat falsification");
342 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
343
344 let pruning_expr = like(col("a"), lit("pref_ix_"))
345 .stat_falsification(&catalog)
346 .expect("LIKE stat falsification");
347 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min >= "preg") or ($.a_max < "pref"))"#);
348
349 let pruning_expr = like(col("a"), lit("exactly"))
351 .stat_falsification(&catalog)
352 .expect("LIKE stat falsification");
353 insta::assert_snapshot!(pruning_expr, @r#"(($.a_min > "exactly") or ($.a_max < "exactly"))"#);
354
355 let pruning_expr = like(col("a"), lit("%suffix")).stat_falsification(&catalog);
357 assert_eq!(pruning_expr, None);
358
359 assert_eq!(
361 None,
362 not_like(col("a"), lit("a")).stat_falsification(&catalog)
363 );
364 assert_eq!(None, ilike(col("a"), lit("a")).stat_falsification(&catalog));
365 }
366}