datafusion_functions/regex/
regexplike.rs1use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
21use arrow::compute::kernels::regexp;
22use arrow::datatypes::DataType;
23use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
24use datafusion_common::types::logical_string;
25use datafusion_common::{
26 arrow_datafusion_err, exec_err, internal_err, plan_err, DataFusionError, Result,
27 ScalarValue,
28};
29use datafusion_expr::{
30 binary_expr, cast, Coercion, ColumnarValue, Documentation, Expr, ScalarUDFImpl,
31 Signature, TypeSignature, TypeSignatureClass, Volatility,
32};
33use datafusion_macros::user_doc;
34
35use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
36use datafusion_expr_common::operator::Operator;
37use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
38use std::any::Any;
39use std::sync::Arc;
40
41#[user_doc(
42 doc_section(label = "Regular Expression Functions"),
43 description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
44 syntax_example = "regexp_like(str, regexp[, flags])",
45 sql_example = r#"```sql
46select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
47+--------------------------------------------------------+
48| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
49+--------------------------------------------------------+
50| true |
51+--------------------------------------------------------+
52SELECT regexp_like('aBc', '(b|d)', 'i');
53+--------------------------------------------------+
54| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
55+--------------------------------------------------+
56| true |
57+--------------------------------------------------+
58```
59Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
60"#,
61 standard_argument(name = "str", prefix = "String"),
62 standard_argument(name = "regexp", prefix = "Regular"),
63 argument(
64 name = "flags",
65 description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
66 - **i**: case-insensitive: letters match both upper and lower case
67 - **m**: multi-line mode: ^ and $ match begin/end of line
68 - **s**: allow . to match \n
69 - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
70 - **U**: swap the meaning of x* and x*?"#
71 )
72)]
73#[derive(Debug, PartialEq, Eq, Hash)]
74pub struct RegexpLikeFunc {
75 signature: Signature,
76}
77
78impl Default for RegexpLikeFunc {
79 fn default() -> Self {
80 Self::new()
81 }
82}
83
84impl RegexpLikeFunc {
85 pub fn new() -> Self {
86 Self {
87 signature: Signature::one_of(
88 vec![
89 TypeSignature::Coercible(vec![
90 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
91 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
92 ]),
93 TypeSignature::Coercible(vec![
94 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
95 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
96 Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
97 ]),
98 ],
99 Volatility::Immutable,
100 ),
101 }
102 }
103}
104
105impl ScalarUDFImpl for RegexpLikeFunc {
106 fn as_any(&self) -> &dyn Any {
107 self
108 }
109
110 fn name(&self) -> &str {
111 "regexp_like"
112 }
113
114 fn signature(&self) -> &Signature {
115 &self.signature
116 }
117
118 fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
119 use DataType::*;
120
121 Ok(match &arg_types[0] {
122 Null => Null,
123 _ => Boolean,
126 })
127 }
128
129 fn invoke_with_args(
130 &self,
131 args: datafusion_expr::ScalarFunctionArgs,
132 ) -> Result<ColumnarValue> {
133 let args = &args.args;
134
135 let len = args
136 .iter()
137 .fold(Option::<usize>::None, |acc, arg| match arg {
138 ColumnarValue::Scalar(_) => acc,
139 ColumnarValue::Array(a) => Some(a.len()),
140 });
141
142 let is_scalar = len.is_none();
143 let inferred_length = len.unwrap_or(1);
144 let args = args
145 .iter()
146 .map(|arg| arg.to_array(inferred_length))
147 .collect::<Result<Vec<_>>>()?;
148
149 let result = regexp_like(&args);
150 if is_scalar {
151 let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
153 result.map(ColumnarValue::Scalar)
154 } else {
155 result.map(ColumnarValue::Array)
156 }
157 }
158
159 fn simplify(
160 &self,
161 mut args: Vec<Expr>,
162 info: &dyn SimplifyInfo,
163 ) -> Result<ExprSimplifyResult> {
164 let Some(op) = derive_operator(&args) else {
169 return Ok(ExprSimplifyResult::Original(args));
170 };
171
172 let string_type = info.get_data_type(&args[0])?;
173 let regexp_type = info.get_data_type(&args[1])?;
174 let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, ®exp_type);
175 let Ok((coerced_string_type, coerced_regexp_type)) =
176 binary_type_coercer.get_input_types()
177 else {
178 return Ok(ExprSimplifyResult::Original(args));
179 };
180
181 let regexp = args.swap_remove(1);
183 let string = args.swap_remove(0);
184
185 Ok(ExprSimplifyResult::Simplified(binary_expr(
186 if string_type != coerced_string_type {
187 cast(string, coerced_string_type)
188 } else {
189 string
190 },
191 op,
192 if regexp_type != coerced_regexp_type {
193 cast(regexp, coerced_regexp_type)
194 } else {
195 regexp
196 },
197 )))
198 }
199
200 fn documentation(&self) -> Option<&Documentation> {
201 self.doc()
202 }
203}
204
205fn derive_operator(args: &[Expr]) -> Option<Operator> {
206 match args.len() {
207 3 => {
209 match &args[2] {
210 Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
211 match flags.as_str() {
212 "i" => Some(Operator::RegexIMatch),
213 "" => Some(Operator::RegexMatch),
214 _ => None,
216 }
217 }
218 _ => None,
220 }
221 }
222 2 => Some(Operator::RegexMatch),
224 _ => None,
226 }
227}
228
229pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
272 match args.len() {
273 2 => handle_regexp_like(&args[0], &args[1], None),
274 3 => {
275 let flags = match args[2].data_type() {
276 Utf8 => args[2].as_string::<i32>(),
277 LargeUtf8 => {
278 let large_string_array = args[2].as_string::<i64>();
279 let string_vec: Vec<Option<&str>> = (0..large_string_array.len()).map(|i| {
280 if large_string_array.is_null(i) {
281 None
282 } else {
283 Some(large_string_array.value(i))
284 }
285 })
286 .collect();
287
288 &GenericStringArray::<i32>::from(string_vec)
289 },
290 _ => {
291 let string_view_array = args[2].as_string_view();
292 let string_vec: Vec<Option<String>> = (0..string_view_array.len()).map(|i| {
293 if string_view_array.is_null(i) {
294 None
295 } else {
296 Some(string_view_array.value(i).to_string())
297 }
298 })
299 .collect();
300 &GenericStringArray::<i32>::from(string_vec)
301 },
302 };
303
304 if flags.iter().any(|s| s == Some("g")) {
305 return plan_err!("regexp_like() does not support the \"global\" option");
306 }
307
308 handle_regexp_like(&args[0], &args[1], Some(flags))
309 },
310 other => exec_err!(
311 "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
312 ),
313 }
314}
315
316fn handle_regexp_like(
317 values: &ArrayRef,
318 patterns: &ArrayRef,
319 flags: Option<&GenericStringArray<i32>>,
320) -> Result<ArrayRef> {
321 let array = match (values.data_type(), patterns.data_type()) {
322 (Utf8View, Utf8) => {
323 let value = values.as_string_view();
324 let pattern = patterns.as_string::<i32>();
325
326 regexp::regexp_is_match(value, pattern, flags)
327 .map_err(|e| arrow_datafusion_err!(e))?
328 }
329 (Utf8View, Utf8View) => {
330 let value = values.as_string_view();
331 let pattern = patterns.as_string_view();
332
333 regexp::regexp_is_match(value, pattern, flags)
334 .map_err(|e| arrow_datafusion_err!(e))?
335 }
336 (Utf8View, LargeUtf8) => {
337 let value = values.as_string_view();
338 let pattern = patterns.as_string::<i64>();
339
340 regexp::regexp_is_match(value, pattern, flags)
341 .map_err(|e| arrow_datafusion_err!(e))?
342 }
343 (Utf8, Utf8) => {
344 let value = values.as_string::<i32>();
345 let pattern = patterns.as_string::<i32>();
346
347 regexp::regexp_is_match(value, pattern, flags)
348 .map_err(|e| arrow_datafusion_err!(e))?
349 }
350 (Utf8, Utf8View) => {
351 let value = values.as_string::<i32>();
352 let pattern = patterns.as_string_view();
353
354 regexp::regexp_is_match(value, pattern, flags)
355 .map_err(|e| arrow_datafusion_err!(e))?
356 }
357 (Utf8, LargeUtf8) => {
358 let value = values.as_string_view();
359 let pattern = patterns.as_string::<i64>();
360
361 regexp::regexp_is_match(value, pattern, flags)
362 .map_err(|e| arrow_datafusion_err!(e))?
363 }
364 (LargeUtf8, Utf8) => {
365 let value = values.as_string::<i64>();
366 let pattern = patterns.as_string::<i32>();
367
368 regexp::regexp_is_match(value, pattern, flags)
369 .map_err(|e| arrow_datafusion_err!(e))?
370 }
371 (LargeUtf8, Utf8View) => {
372 let value = values.as_string::<i64>();
373 let pattern = patterns.as_string_view();
374
375 regexp::regexp_is_match(value, pattern, flags)
376 .map_err(|e| arrow_datafusion_err!(e))?
377 }
378 (LargeUtf8, LargeUtf8) => {
379 let value = values.as_string::<i64>();
380 let pattern = patterns.as_string::<i64>();
381
382 regexp::regexp_is_match(value, pattern, flags)
383 .map_err(|e| arrow_datafusion_err!(e))?
384 }
385 other => {
386 return internal_err!(
387 "Unsupported data type {other:?} for function `regexp_like`"
388 )
389 }
390 };
391
392 Ok(Arc::new(array) as ArrayRef)
393}
394
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::StringArray;
    use arrow::array::{BooleanBuilder, StringViewArray};

    use crate::regex::regexplike::regexp_like;

    /// Patterns matched against the value `"abc"` by the matching tests.
    const PATTERNS: [&str; 5] = ["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"];

    /// Expected results for `PATTERNS` without flags.
    const CASE_SENSITIVE: [bool; 5] = [true, false, true, false, false];

    /// Expected results for `PATTERNS` with the `i` flag.
    const CASE_INSENSITIVE: [bool; 5] = [true, true, true, true, false];

    /// Builds a boolean array holding `values` in order.
    fn booleans(values: &[bool]) -> arrow::array::BooleanArray {
        let mut builder: BooleanBuilder = BooleanBuilder::new();
        for &value in values {
            builder.append_value(value);
        }
        builder.finish()
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let expected = booleans(&CASE_SENSITIVE);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let expected = booleans(&CASE_SENSITIVE);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);
        let expected = booleans(&CASE_INSENSITIVE);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringViewArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);
        let expected = booleans(&CASE_INSENSITIVE);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_unsupported_global_flag_regexp_like() {
        let values = StringArray::from(vec!["abc"]);
        let patterns = StringArray::from(vec!["^(a)"]);
        let flags = StringArray::from(vec!["g"]);

        let outcome =
            regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]);
        let re_err = outcome.expect_err("unsupported flag should have failed");

        assert_eq!(
            re_err.strip_backtrace(),
            "Error during planning: regexp_like() does not support the \"global\" option"
        );
    }
}