datafusion_functions/regex/
regexplike.rs1use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
21use arrow::compute::kernels::regexp;
22use arrow::datatypes::DataType;
23use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
24use datafusion_common::types::logical_string;
25use datafusion_common::{
26 Result, ScalarValue, arrow_datafusion_err, exec_err, internal_err, plan_err,
27};
28use datafusion_expr::{
29 Coercion, ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature,
30 TypeSignature, TypeSignatureClass, Volatility, binary_expr, cast,
31};
32use datafusion_macros::user_doc;
33
34use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
35use datafusion_expr_common::operator::Operator;
36use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
37use std::any::Any;
38use std::sync::Arc;
39
40#[user_doc(
41 doc_section(label = "Regular Expression Functions"),
42 description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
43 syntax_example = "regexp_like(str, regexp[, flags])",
44 sql_example = r#"```sql
45select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
46+--------------------------------------------------------+
47| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
48+--------------------------------------------------------+
49| true |
50+--------------------------------------------------------+
51SELECT regexp_like('aBc', '(b|d)', 'i');
52+--------------------------------------------------+
53| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
54+--------------------------------------------------+
55| true |
56+--------------------------------------------------+
57```
58Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
59"#,
60 standard_argument(name = "str", prefix = "String"),
61 standard_argument(name = "regexp", prefix = "Regular"),
62 argument(
63 name = "flags",
64 description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
65 - **i**: case-insensitive: letters match both upper and lower case
66 - **m**: multi-line mode: ^ and $ match begin/end of line
67 - **s**: allow . to match \n
68 - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
69 - **U**: swap the meaning of x* and x*?"#
70 )
71)]
72#[derive(Debug, PartialEq, Eq, Hash)]
73pub struct RegexpLikeFunc {
74 signature: Signature,
75}
76
77impl Default for RegexpLikeFunc {
78 fn default() -> Self {
79 Self::new()
80 }
81}
82
83impl RegexpLikeFunc {
84 pub fn new() -> Self {
85 Self {
86 signature: Signature::one_of(
87 vec![
88 TypeSignature::Coercible(vec![
89 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
90 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
91 ]),
92 TypeSignature::Coercible(vec![
93 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
94 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
95 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
96 ]),
97 ],
98 Volatility::Immutable,
99 ),
100 }
101 }
102}
103
104impl ScalarUDFImpl for RegexpLikeFunc {
105 fn as_any(&self) -> &dyn Any {
106 self
107 }
108
109 fn name(&self) -> &str {
110 "regexp_like"
111 }
112
113 fn signature(&self) -> &Signature {
114 &self.signature
115 }
116
117 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
118 use DataType::*;
119
120 Ok(match &arg_types[0] {
121 Null => Null,
122 _ => Boolean,
125 })
126 }
127
128 fn invoke_with_args(
129 &self,
130 args: datafusion_expr::ScalarFunctionArgs,
131 ) -> Result<ColumnarValue> {
132 let args = &args.args;
133
134 let len = args
135 .iter()
136 .fold(Option::<usize>::None, |acc, arg| match arg {
137 ColumnarValue::Scalar(_) => acc,
138 ColumnarValue::Array(a) => Some(a.len()),
139 });
140
141 let is_scalar = len.is_none();
142 let inferred_length = len.unwrap_or(1);
143 let args = args
144 .iter()
145 .map(|arg| arg.to_array(inferred_length))
146 .collect::<Result<Vec<_>>>()?;
147
148 let result = regexp_like(&args);
149 if is_scalar {
150 let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
152 result.map(ColumnarValue::Scalar)
153 } else {
154 result.map(ColumnarValue::Array)
155 }
156 }
157
158 fn simplify(
159 &self,
160 mut args: Vec<Expr>,
161 info: &dyn SimplifyInfo,
162 ) -> Result<ExprSimplifyResult> {
163 let Some(op) = derive_operator(&args) else {
168 return Ok(ExprSimplifyResult::Original(args));
169 };
170
171 let string_type = info.get_data_type(&args[0])?;
172 let regexp_type = info.get_data_type(&args[1])?;
173 let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, ®exp_type);
174 let Ok((coerced_string_type, coerced_regexp_type)) =
175 binary_type_coercer.get_input_types()
176 else {
177 return Ok(ExprSimplifyResult::Original(args));
178 };
179
180 let regexp = args.swap_remove(1);
182 let string = args.swap_remove(0);
183
184 Ok(ExprSimplifyResult::Simplified(binary_expr(
185 if string_type != coerced_string_type {
186 cast(string, coerced_string_type)
187 } else {
188 string
189 },
190 op,
191 if regexp_type != coerced_regexp_type {
192 cast(regexp, coerced_regexp_type)
193 } else {
194 regexp
195 },
196 )))
197 }
198
199 fn documentation(&self) -> Option<&Documentation> {
200 self.doc()
201 }
202}
203
204fn derive_operator(args: &[Expr]) -> Option<Operator> {
205 match args.len() {
206 3 => {
208 match &args[2] {
209 Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
210 match flags.as_str() {
211 "i" => Some(Operator::RegexIMatch),
212 "" => Some(Operator::RegexMatch),
213 _ => None,
215 }
216 }
217 _ => None,
219 }
220 }
221 2 => Some(Operator::RegexMatch),
223 _ => None,
225 }
226}
227
228pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
271 match args.len() {
272 2 => handle_regexp_like(&args[0], &args[1], None),
273 3 => {
274 let flags = match args[2].data_type() {
275 Utf8 => args[2].as_string::<i32>(),
276 LargeUtf8 => {
277 let large_string_array = args[2].as_string::<i64>();
278 let string_vec: Vec<Option<&str>> = (0..large_string_array.len())
279 .map(|i| {
280 if large_string_array.is_null(i) {
281 None
282 } else {
283 Some(large_string_array.value(i))
284 }
285 })
286 .collect();
287
288 &GenericStringArray::<i32>::from(string_vec)
289 }
290 _ => {
291 let string_view_array = args[2].as_string_view();
292 let string_vec: Vec<Option<String>> = (0..string_view_array.len())
293 .map(|i| {
294 if string_view_array.is_null(i) {
295 None
296 } else {
297 Some(string_view_array.value(i).to_string())
298 }
299 })
300 .collect();
301 &GenericStringArray::<i32>::from(string_vec)
302 }
303 };
304
305 if flags.iter().any(|s| s == Some("g")) {
306 return plan_err!("regexp_like() does not support the \"global\" option");
307 }
308
309 handle_regexp_like(&args[0], &args[1], Some(flags))
310 }
311 other => exec_err!(
312 "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
313 ),
314 }
315}
316
317fn handle_regexp_like(
318 values: &ArrayRef,
319 patterns: &ArrayRef,
320 flags: Option<&GenericStringArray<i32>>,
321) -> Result<ArrayRef> {
322 let array = match (values.data_type(), patterns.data_type()) {
323 (Utf8View, Utf8) => {
324 let value = values.as_string_view();
325 let pattern = patterns.as_string::<i32>();
326
327 regexp::regexp_is_match(value, pattern, flags)
328 .map_err(|e| arrow_datafusion_err!(e))?
329 }
330 (Utf8View, Utf8View) => {
331 let value = values.as_string_view();
332 let pattern = patterns.as_string_view();
333
334 regexp::regexp_is_match(value, pattern, flags)
335 .map_err(|e| arrow_datafusion_err!(e))?
336 }
337 (Utf8View, LargeUtf8) => {
338 let value = values.as_string_view();
339 let pattern = patterns.as_string::<i64>();
340
341 regexp::regexp_is_match(value, pattern, flags)
342 .map_err(|e| arrow_datafusion_err!(e))?
343 }
344 (Utf8, Utf8) => {
345 let value = values.as_string::<i32>();
346 let pattern = patterns.as_string::<i32>();
347
348 regexp::regexp_is_match(value, pattern, flags)
349 .map_err(|e| arrow_datafusion_err!(e))?
350 }
351 (Utf8, Utf8View) => {
352 let value = values.as_string::<i32>();
353 let pattern = patterns.as_string_view();
354
355 regexp::regexp_is_match(value, pattern, flags)
356 .map_err(|e| arrow_datafusion_err!(e))?
357 }
358 (Utf8, LargeUtf8) => {
359 let value = values.as_string_view();
360 let pattern = patterns.as_string::<i64>();
361
362 regexp::regexp_is_match(value, pattern, flags)
363 .map_err(|e| arrow_datafusion_err!(e))?
364 }
365 (LargeUtf8, Utf8) => {
366 let value = values.as_string::<i64>();
367 let pattern = patterns.as_string::<i32>();
368
369 regexp::regexp_is_match(value, pattern, flags)
370 .map_err(|e| arrow_datafusion_err!(e))?
371 }
372 (LargeUtf8, Utf8View) => {
373 let value = values.as_string::<i64>();
374 let pattern = patterns.as_string_view();
375
376 regexp::regexp_is_match(value, pattern, flags)
377 .map_err(|e| arrow_datafusion_err!(e))?
378 }
379 (LargeUtf8, LargeUtf8) => {
380 let value = values.as_string::<i64>();
381 let pattern = patterns.as_string::<i64>();
382
383 regexp::regexp_is_match(value, pattern, flags)
384 .map_err(|e| arrow_datafusion_err!(e))?
385 }
386 other => {
387 return internal_err!(
388 "Unsupported data type {other:?} for function `regexp_like`"
389 );
390 }
391 };
392
393 Ok(Arc::new(array) as ArrayRef)
394}
395
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::StringArray;
    use arrow::array::{BooleanBuilder, StringViewArray};

    use crate::regex::regexplike::regexp_like;

    /// Patterns shared by the matching tests; every test matches them against "abc".
    const PATTERNS: [&str; 5] = ["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"];

    /// Case-sensitive matching with `Utf8` values and `Utf8` patterns.
    #[test]
    fn test_case_sensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());

        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
        for matched in [true, false, true, false, false] {
            expected_builder.append_value(matched);
        }
        let expected = expected_builder.finish();

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    /// Case-sensitive matching with `Utf8View` values and `Utf8` patterns.
    #[test]
    fn test_case_sensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());

        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
        for matched in [true, false, true, false, false] {
            expected_builder.append_value(matched);
        }
        let expected = expected_builder.finish();

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    /// Case-insensitive matching (flag "i") with `Utf8` inputs.
    #[test]
    fn test_case_insensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);

        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
        for matched in [true, true, true, true, false] {
            expected_builder.append_value(matched);
        }
        let expected = expected_builder.finish();

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    /// Case-insensitive matching (flag "i") with `Utf8View` values and patterns.
    #[test]
    fn test_case_insensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringViewArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);

        let mut expected_builder: BooleanBuilder = BooleanBuilder::new();
        for matched in [true, true, true, true, false] {
            expected_builder.append_value(matched);
        }
        let expected = expected_builder.finish();

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    /// The "g" flag is rejected with a planning error.
    #[test]
    fn test_unsupported_global_flag_regexp_like() {
        let values = StringArray::from(vec!["abc"]);
        let patterns = StringArray::from(vec!["^(a)"]);
        let flags = StringArray::from(vec!["g"]);

        let re_err =
            regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
                .expect_err("unsupported flag should have failed");

        assert_eq!(
            re_err.strip_backtrace(),
            "Error during planning: regexp_like() does not support the \"global\" option"
        );
    }
}