1use arrow::array::{Array, ArrayRef, AsArray, BooleanArray, GenericStringArray};
21use arrow::compute::kernels::regexp;
22use arrow::datatypes::DataType;
23use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
24use datafusion_common::types::logical_string;
25use datafusion_common::{
26 Result, ScalarValue, arrow_datafusion_err, exec_err, internal_err, plan_err,
27};
28use datafusion_expr::{
29 Coercion, ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature,
30 TypeSignature, TypeSignatureClass, Volatility, binary_expr, cast,
31};
32use datafusion_macros::user_doc;
33
34use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyContext};
35use datafusion_expr_common::operator::Operator;
36use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
37use regex::Regex;
38use std::any::Any;
39use std::sync::Arc;
40
41#[user_doc(
42 doc_section(label = "Regular Expression Functions"),
43 description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
44 syntax_example = "regexp_like(str, regexp[, flags])",
45 sql_example = r#"```sql
46select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
47+--------------------------------------------------------+
48| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
49+--------------------------------------------------------+
50| true |
51+--------------------------------------------------------+
52SELECT regexp_like('aBc', '(b|d)', 'i');
53+--------------------------------------------------+
54| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
55+--------------------------------------------------+
56| true |
57+--------------------------------------------------+
58```
59Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
60"#,
61 standard_argument(name = "str", prefix = "String"),
62 standard_argument(name = "regexp", prefix = "Regular"),
63 argument(
64 name = "flags",
65 description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
66 - **i**: case-insensitive: letters match both upper and lower case
67 - **m**: multi-line mode: ^ and $ match begin/end of line
68 - **s**: allow . to match \n
69 - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
70 - **U**: swap the meaning of x* and x*?"#
71 )
72)]
/// Scalar UDF implementing `regexp_like(str, regexp[, flags])`.
///
/// The only state is the argument signature, which is built once in
/// `RegexpLikeFunc::new` and handed out by `ScalarUDFImpl::signature`.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct RegexpLikeFunc {
    /// Accepted argument lists: two or three string-typed arguments.
    signature: Signature,
}
77
78impl Default for RegexpLikeFunc {
79 fn default() -> Self {
80 Self::new()
81 }
82}
83
84impl RegexpLikeFunc {
85 pub fn new() -> Self {
86 Self {
87 signature: Signature::one_of(
88 vec![
89 TypeSignature::Coercible(vec![
90 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
91 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
92 ]),
93 TypeSignature::Coercible(vec![
94 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
95 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
96 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
97 ]),
98 ],
99 Volatility::Immutable,
100 ),
101 }
102 }
103}
104
105impl ScalarUDFImpl for RegexpLikeFunc {
106 fn as_any(&self) -> &dyn Any {
107 self
108 }
109
110 fn name(&self) -> &str {
111 "regexp_like"
112 }
113
114 fn signature(&self) -> &Signature {
115 &self.signature
116 }
117
118 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
119 use DataType::*;
120
121 Ok(match &arg_types[0] {
122 Null => Null,
123 _ => Boolean,
126 })
127 }
128
129 fn invoke_with_args(
130 &self,
131 args: datafusion_expr::ScalarFunctionArgs,
132 ) -> Result<ColumnarValue> {
133 let args = &args.args;
134 match args.as_slice() {
135 [ColumnarValue::Scalar(value), ColumnarValue::Scalar(pattern)] => {
136 let value = scalar_string(value)?;
137 let pattern = scalar_string(pattern)?;
138 regexp_like_scalar(value, pattern, None)
139 }
140 [
141 ColumnarValue::Scalar(value),
142 ColumnarValue::Scalar(pattern),
143 ColumnarValue::Scalar(flags),
144 ] => {
145 let value = scalar_string(value)?;
146 let pattern = scalar_string(pattern)?;
147 let flags = scalar_string(flags)?;
148 regexp_like_scalar(value, pattern, flags)
149 }
150 [ColumnarValue::Array(values), ColumnarValue::Scalar(pattern)] => {
151 let pattern = scalar_string(pattern)?;
152 let array = regexp_like_array_scalar(values, pattern, None)?;
153 Ok(ColumnarValue::Array(array))
154 }
155 [
156 ColumnarValue::Array(values),
157 ColumnarValue::Scalar(pattern),
158 ColumnarValue::Scalar(flags),
159 ] => {
160 let flags = scalar_string(flags)?;
161 if flags.is_some_and(|flagz| flagz.contains('g')) {
162 plan_err!("regexp_like() does not support the \"global\" option")
163 } else {
164 let pattern = scalar_string(pattern)?;
165 let array = regexp_like_array_scalar(values, pattern, flags)?;
166 Ok(ColumnarValue::Array(array))
167 }
168 }
169 _ => {
170 let args = ColumnarValue::values_to_arrays(args)?;
171 regexp_like(&args).map(ColumnarValue::Array)
172 }
173 }
174 }
175
176 fn simplify(
177 &self,
178 mut args: Vec<Expr>,
179 info: &SimplifyContext,
180 ) -> Result<ExprSimplifyResult> {
181 let Some(op) = derive_operator(&args) else {
186 return Ok(ExprSimplifyResult::Original(args));
187 };
188
189 let string_type = info.get_data_type(&args[0])?;
190 let regexp_type = info.get_data_type(&args[1])?;
191 let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, ®exp_type);
192 let Ok((coerced_string_type, coerced_regexp_type)) =
193 binary_type_coercer.get_input_types()
194 else {
195 return Ok(ExprSimplifyResult::Original(args));
196 };
197
198 let regexp = args.swap_remove(1);
200 let string = args.swap_remove(0);
201
202 Ok(ExprSimplifyResult::Simplified(binary_expr(
203 if string_type != coerced_string_type {
204 cast(string, coerced_string_type)
205 } else {
206 string
207 },
208 op,
209 if regexp_type != coerced_regexp_type {
210 cast(regexp, coerced_regexp_type)
211 } else {
212 regexp
213 },
214 )))
215 }
216
217 fn documentation(&self) -> Option<&Documentation> {
218 self.doc()
219 }
220}
221
222fn derive_operator(args: &[Expr]) -> Option<Operator> {
223 match args.len() {
224 3 => {
226 match &args[2] {
227 Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
228 match flags.as_str() {
229 "i" => Some(Operator::RegexIMatch),
230 "" => Some(Operator::RegexMatch),
231 _ => None,
233 }
234 }
235 _ => None,
237 }
238 }
239 2 => Some(Operator::RegexMatch),
241 _ => None,
243 }
244}
245
246pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
289 match args.len() {
290 2 => handle_regexp_like(&args[0], &args[1], None),
291 3 => {
292 let flags = match args[2].data_type() {
293 Utf8 => args[2].as_string::<i32>(),
294 LargeUtf8 => {
295 let large_string_array = args[2].as_string::<i64>();
296 let string_vec: Vec<Option<&str>> = (0..large_string_array.len())
297 .map(|i| {
298 if large_string_array.is_null(i) {
299 None
300 } else {
301 Some(large_string_array.value(i))
302 }
303 })
304 .collect();
305
306 &GenericStringArray::<i32>::from(string_vec)
307 }
308 _ => {
309 let string_view_array = args[2].as_string_view();
310 let string_vec: Vec<Option<String>> = (0..string_view_array.len())
311 .map(|i| {
312 if string_view_array.is_null(i) {
313 None
314 } else {
315 Some(string_view_array.value(i).to_string())
316 }
317 })
318 .collect();
319 &GenericStringArray::<i32>::from(string_vec)
320 }
321 };
322
323 if flags
324 .iter()
325 .any(|s| s.is_some_and(|flagz| flagz.contains('g')))
326 {
327 return plan_err!("regexp_like() does not support the \"global\" option");
328 }
329
330 handle_regexp_like(&args[0], &args[1], Some(flags))
331 }
332 other => exec_err!(
333 "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
334 ),
335 }
336}
337
338fn scalar_string(value: &ScalarValue) -> Result<Option<&str>> {
339 match value.try_as_str() {
340 Some(v) => Ok(v),
341 None => internal_err!(
342 "Unsupported data type {:?} for function `regexp_like`",
343 value.data_type()
344 ),
345 }
346}
347
348fn regexp_like_array_scalar(
349 values: &ArrayRef,
350 pattern: Option<&str>,
351 flags: Option<&str>,
352) -> Result<ArrayRef> {
353 use DataType::*;
354
355 let Some(pattern) = pattern else {
356 return Ok(Arc::new(BooleanArray::new_null(values.len())));
357 };
358 let array = match values.data_type() {
359 Utf8 => {
360 let array = values.as_string::<i32>();
361 regexp::regexp_is_match_scalar(array, pattern, flags)?
362 }
363 Utf8View => {
364 let array = values.as_string_view();
365 regexp::regexp_is_match_scalar(array, pattern, flags)?
366 }
367 LargeUtf8 => {
368 let array = values.as_string::<i64>();
369 regexp::regexp_is_match_scalar(array, pattern, flags)?
370 }
371 other => {
372 return internal_err!(
373 "Unsupported data type {other:?} for function `regexp_like`"
374 );
375 }
376 };
377
378 Ok(Arc::new(array))
379}
380
381fn regexp_like_scalar(
382 value: Option<&str>,
383 pattern: Option<&str>,
384 flags: Option<&str>,
385) -> Result<ColumnarValue> {
386 if flags.is_some_and(|flagz| flagz.contains('g')) {
387 return plan_err!("regexp_like() does not support the \"global\" option");
388 }
389
390 if value.is_none() || pattern.is_none() {
391 return Ok(ColumnarValue::Scalar(ScalarValue::Boolean(None)));
392 }
393
394 let value = value.unwrap();
395 let pattern = pattern.unwrap();
396 let pattern = match flags {
397 Some(flagz) => format!("(?{flagz}){pattern}"),
398 None => pattern.to_string(),
399 };
400
401 let result = if pattern.is_empty() {
402 true
403 } else {
404 let re = Regex::new(pattern.as_str()).map_err(|e| {
405 datafusion_common::DataFusionError::Execution(format!(
406 "Regular expression did not compile: {e:?}"
407 ))
408 })?;
409 re.is_match(value)
410 };
411
412 Ok(ColumnarValue::Scalar(ScalarValue::Boolean(Some(result))))
413}
414
415fn handle_regexp_like(
416 values: &ArrayRef,
417 patterns: &ArrayRef,
418 flags: Option<&GenericStringArray<i32>>,
419) -> Result<ArrayRef> {
420 let array = match (values.data_type(), patterns.data_type()) {
421 (Utf8View, Utf8) => {
422 let value = values.as_string_view();
423 let pattern = patterns.as_string::<i32>();
424
425 regexp::regexp_is_match(value, pattern, flags)
426 .map_err(|e| arrow_datafusion_err!(e))?
427 }
428 (Utf8View, Utf8View) => {
429 let value = values.as_string_view();
430 let pattern = patterns.as_string_view();
431
432 regexp::regexp_is_match(value, pattern, flags)
433 .map_err(|e| arrow_datafusion_err!(e))?
434 }
435 (Utf8View, LargeUtf8) => {
436 let value = values.as_string_view();
437 let pattern = patterns.as_string::<i64>();
438
439 regexp::regexp_is_match(value, pattern, flags)
440 .map_err(|e| arrow_datafusion_err!(e))?
441 }
442 (Utf8, Utf8) => {
443 let value = values.as_string::<i32>();
444 let pattern = patterns.as_string::<i32>();
445
446 regexp::regexp_is_match(value, pattern, flags)
447 .map_err(|e| arrow_datafusion_err!(e))?
448 }
449 (Utf8, Utf8View) => {
450 let value = values.as_string::<i32>();
451 let pattern = patterns.as_string_view();
452
453 regexp::regexp_is_match(value, pattern, flags)
454 .map_err(|e| arrow_datafusion_err!(e))?
455 }
456 (Utf8, LargeUtf8) => {
457 let value = values.as_string::<i32>();
458 let pattern = patterns.as_string::<i64>();
459
460 regexp::regexp_is_match(value, pattern, flags)
461 .map_err(|e| arrow_datafusion_err!(e))?
462 }
463 (LargeUtf8, Utf8) => {
464 let value = values.as_string::<i64>();
465 let pattern = patterns.as_string::<i32>();
466
467 regexp::regexp_is_match(value, pattern, flags)
468 .map_err(|e| arrow_datafusion_err!(e))?
469 }
470 (LargeUtf8, Utf8View) => {
471 let value = values.as_string::<i64>();
472 let pattern = patterns.as_string_view();
473
474 regexp::regexp_is_match(value, pattern, flags)
475 .map_err(|e| arrow_datafusion_err!(e))?
476 }
477 (LargeUtf8, LargeUtf8) => {
478 let value = values.as_string::<i64>();
479 let pattern = patterns.as_string::<i64>();
480
481 regexp::regexp_is_match(value, pattern, flags)
482 .map_err(|e| arrow_datafusion_err!(e))?
483 }
484 other => {
485 return internal_err!(
486 "Unsupported data type {other:?} for function `regexp_like`"
487 );
488 }
489 };
490
491 Ok(Arc::new(array) as ArrayRef)
492}
493
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::StringArray;
    use arrow::array::{BooleanArray, BooleanBuilder, StringViewArray};
    use arrow::datatypes::{DataType, Field};
    use datafusion_common::config::ConfigOptions;
    use datafusion_common::{Result, ScalarValue};
    use datafusion_expr::{ColumnarValue, ScalarFunctionArgs, ScalarUDFImpl};

    use crate::regex::regexplike::{RegexpLikeFunc, regexp_like};

    /// Patterns shared by the array tests; each is matched against `"abc"`.
    const PATTERNS: [&str; 5] = ["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"];

    /// Message expected whenever the `g` flag is supplied.
    const GLOBAL_FLAG_ERROR: &str =
        "Error during planning: regexp_like() does not support the \"global\" option";

    /// Builds a non-null boolean array from a slice.
    fn bool_array(values: &[bool]) -> BooleanArray {
        let mut builder = BooleanBuilder::new();
        for &value in values {
            builder.append_value(value);
        }
        builder.finish()
    }

    /// Wraps a string in a `Utf8` scalar argument.
    fn utf8_scalar(text: &str) -> ColumnarValue {
        ColumnarValue::Scalar(ScalarValue::Utf8(Some(text.to_string())))
    }

    /// Calls `RegexpLikeFunc::invoke_with_args` with the row count taken from
    /// the first array argument (1 when every argument is a scalar).
    fn invoke_regexp_like(args: Vec<ColumnarValue>) -> Result<ColumnarValue> {
        let mut number_rows = 1;
        for arg in &args {
            if let ColumnarValue::Array(array) = arg {
                number_rows = array.len();
                break;
            }
        }

        let mut arg_fields = Vec::with_capacity(args.len());
        for (idx, arg) in args.iter().enumerate() {
            let field = Field::new(format!("arg_{idx}"), arg.data_type(), true);
            arg_fields.push(Arc::new(field));
        }

        let func = RegexpLikeFunc::new();
        func.invoke_with_args(ScalarFunctionArgs {
            args,
            arg_fields,
            number_rows,
            return_field: Arc::new(Field::new("f", DataType::Boolean, true)),
            config_options: Arc::new(ConfigOptions::default()),
        })
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let expected = bool_array(&[true, false, true, false, false]);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let expected = bool_array(&[true, false, true, false, false]);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);
        let expected = bool_array(&[true, true, true, true, false]);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringViewArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);
        let expected = bool_array(&[true, true, true, true, false]);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_unsupported_global_flag_regexp_like() {
        let values = StringArray::from(vec!["abc"]);
        let patterns = StringArray::from(vec!["^(a)"]);
        let flags = StringArray::from(vec!["g"]);

        let re_err =
            regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
                .expect_err("unsupported flag should have failed");

        assert_eq!(re_err.strip_backtrace(), GLOBAL_FLAG_ERROR);
    }

    #[test]
    fn test_regexp_like_scalar_invoke() {
        let args = vec![utf8_scalar("foobarbequebaz"), utf8_scalar("(bar)(beque)")];

        let result = invoke_regexp_like(args).unwrap();

        assert!(
            matches!(
                result,
                ColumnarValue::Scalar(ScalarValue::Boolean(Some(true)))
            ),
            "Unexpected result {result:?}"
        );
    }

    #[test]
    fn test_regexp_like_array_scalar_invoke() {
        let values = Arc::new(StringArray::from(vec!["abc", "xyz"]));
        let args = vec![ColumnarValue::Array(values), utf8_scalar("^(a)")];
        let expected = bool_array(&[true, false]);

        match invoke_regexp_like(args).unwrap() {
            ColumnarValue::Array(array) => assert_eq!(array.as_ref(), &expected),
            other => panic!("Unexpected result {other:?}"),
        }
    }

    #[test]
    fn test_regexp_like_scalar_flags_with_global() {
        let args = vec![utf8_scalar("abc"), utf8_scalar("^(a)"), utf8_scalar("ig")];

        let err = invoke_regexp_like(args).expect_err("global flag should be rejected");

        assert_eq!(err.strip_backtrace(), GLOBAL_FLAG_ERROR);
    }

    #[test]
    fn test_regexp_like_array_scalar_flags_with_global() {
        let values = Arc::new(StringArray::from(vec!["abc", "xyz"]));
        let args = vec![
            ColumnarValue::Array(values),
            utf8_scalar("^(a)"),
            utf8_scalar("ig"),
        ];

        let err = invoke_regexp_like(args).expect_err("global flag should be rejected");

        assert_eq!(err.strip_backtrace(), GLOBAL_FLAG_ERROR);
    }
}