datafusion_functions/regex/
regexplike.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Regex expressions
19
20use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
21use arrow::compute::kernels::regexp;
22use arrow::datatypes::DataType;
23use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
24use datafusion_common::types::logical_string;
25use datafusion_common::{
26    Result, ScalarValue, arrow_datafusion_err, exec_err, internal_err, plan_err,
27};
28use datafusion_expr::{
29    Coercion, ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature,
30    TypeSignature, TypeSignatureClass, Volatility, binary_expr, cast,
31};
32use datafusion_macros::user_doc;
33
34use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
35use datafusion_expr_common::operator::Operator;
36use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
37use std::any::Any;
38use std::sync::Arc;
39
40#[user_doc(
41    doc_section(label = "Regular Expression Functions"),
42    description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
43    syntax_example = "regexp_like(str, regexp[, flags])",
44    sql_example = r#"```sql
45select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
46+--------------------------------------------------------+
47| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
48+--------------------------------------------------------+
49| true                                                   |
50+--------------------------------------------------------+
51SELECT regexp_like('aBc', '(b|d)', 'i');
52+--------------------------------------------------+
53| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
54+--------------------------------------------------+
55| true                                             |
56+--------------------------------------------------+
57```
58Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
59"#,
60    standard_argument(name = "str", prefix = "String"),
61    standard_argument(name = "regexp", prefix = "Regular"),
62    argument(
63        name = "flags",
64        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
65  - **i**: case-insensitive: letters match both upper and lower case
66  - **m**: multi-line mode: ^ and $ match begin/end of line
67  - **s**: allow . to match \n
68  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
69  - **U**: swap the meaning of x* and x*?"#
70    )
71)]
72#[derive(Debug, PartialEq, Eq, Hash)]
73pub struct RegexpLikeFunc {
74    signature: Signature,
75}
76
77impl Default for RegexpLikeFunc {
78    fn default() -> Self {
79        Self::new()
80    }
81}
82
83impl RegexpLikeFunc {
84    pub fn new() -> Self {
85        Self {
86            signature: Signature::one_of(
87                vec![
88                    TypeSignature::Coercible(vec![
89                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
90                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
91                    ]),
92                    TypeSignature::Coercible(vec![
93                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
94                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
95                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
96                    ]),
97                ],
98                Volatility::Immutable,
99            ),
100        }
101    }
102}
103
104impl ScalarUDFImpl for RegexpLikeFunc {
105    fn as_any(&self) -> &dyn Any {
106        self
107    }
108
109    fn name(&self) -> &str {
110        "regexp_like"
111    }
112
113    fn signature(&self) -> &Signature {
114        &self.signature
115    }
116
117    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
118        use DataType::*;
119
120        Ok(match &arg_types[0] {
121            Null => Null,
122            // Type coercion is done by DataFusion based on signature, so if we
123            // get here, the first argument is always a string
124            _ => Boolean,
125        })
126    }
127
128    fn invoke_with_args(
129        &self,
130        args: datafusion_expr::ScalarFunctionArgs,
131    ) -> Result<ColumnarValue> {
132        let args = &args.args;
133
134        let len = args
135            .iter()
136            .fold(Option::<usize>::None, |acc, arg| match arg {
137                ColumnarValue::Scalar(_) => acc,
138                ColumnarValue::Array(a) => Some(a.len()),
139            });
140
141        let is_scalar = len.is_none();
142        let inferred_length = len.unwrap_or(1);
143        let args = args
144            .iter()
145            .map(|arg| arg.to_array(inferred_length))
146            .collect::<Result<Vec<_>>>()?;
147
148        let result = regexp_like(&args);
149        if is_scalar {
150            // If all inputs are scalar, keeps output as scalar
151            let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
152            result.map(ColumnarValue::Scalar)
153        } else {
154            result.map(ColumnarValue::Array)
155        }
156    }
157
158    fn simplify(
159        &self,
160        mut args: Vec<Expr>,
161        info: &dyn SimplifyInfo,
162    ) -> Result<ExprSimplifyResult> {
163        // Try to simplify regexp_like usage to one of the builtin operators since those have
164        // optimized code paths for the case where the regular expression pattern is a scalar.
165        // Additionally, the expression simplification optimization pass will attempt to further
166        // simplify regular expression patterns used in operator expressions.
167        let Some(op) = derive_operator(&args) else {
168            return Ok(ExprSimplifyResult::Original(args));
169        };
170
171        let string_type = info.get_data_type(&args[0])?;
172        let regexp_type = info.get_data_type(&args[1])?;
173        let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, &regexp_type);
174        let Ok((coerced_string_type, coerced_regexp_type)) =
175            binary_type_coercer.get_input_types()
176        else {
177            return Ok(ExprSimplifyResult::Original(args));
178        };
179
180        // regexp_like(str, regexp [, flags])
181        let regexp = args.swap_remove(1);
182        let string = args.swap_remove(0);
183
184        Ok(ExprSimplifyResult::Simplified(binary_expr(
185            if string_type != coerced_string_type {
186                cast(string, coerced_string_type)
187            } else {
188                string
189            },
190            op,
191            if regexp_type != coerced_regexp_type {
192                cast(regexp, coerced_regexp_type)
193            } else {
194                regexp
195            },
196        )))
197    }
198
199    fn documentation(&self) -> Option<&Documentation> {
200        self.doc()
201    }
202}
203
204fn derive_operator(args: &[Expr]) -> Option<Operator> {
205    match args.len() {
206        // regexp_like(str, regexp, flags)
207        3 => {
208            match &args[2] {
209                Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
210                    match flags.as_str() {
211                        "i" => Some(Operator::RegexIMatch),
212                        "" => Some(Operator::RegexMatch),
213                        // Any flags besides 'i' have no operator equivalent
214                        _ => None,
215                    }
216                }
217                // `flags` is not a literal, so we can't derive the correct operator statically
218                _ => None,
219            }
220        }
221        // regexp_like(str, regexp)
222        2 => Some(Operator::RegexMatch),
223        // Should never happen, but just in case
224        _ => None,
225    }
226}
227
228/// Tests a string using a regular expression returning true if at
229/// least one match, false otherwise.
230///
231/// The full list of supported features and syntax can be found at
232/// <https://docs.rs/regex/latest/regex/#syntax>
233///
234/// Supported flags can be found at
235/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
236///
237/// # Examples
238///
239/// ```ignore
240/// # use datafusion::prelude::*;
241/// # use datafusion::error::Result;
242/// # #[tokio::main]
243/// # async fn main() -> Result<()> {
244/// let ctx = SessionContext::new();
245/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?;
246///
247/// // use the regexp_like function to test col 'values',
248/// // against patterns in col 'patterns' without flags
249/// let df = df.with_column(
250///     "a",
251///     regexp_like(vec![col("values"), col("patterns")])
252/// )?;
253/// // use the regexp_like function to test col 'values',
254/// // against patterns in col 'patterns' with flags
255/// let df = df.with_column(
256///     "b",
257///     regexp_like(vec![col("values"), col("patterns"), col("flags")])
258/// )?;
259/// // literals can be used as well with dataframe calls
260/// let df = df.with_column(
261///     "c",
262///     regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
263/// )?;
264///
265/// df.show().await?;
266///
267/// # Ok(())
268/// # }
269/// ```
270pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
271    match args.len() {
272        2 => handle_regexp_like(&args[0], &args[1], None),
273        3 => {
274            let flags = match args[2].data_type() {
275                Utf8 => args[2].as_string::<i32>(),
276                LargeUtf8 => {
277                    let large_string_array = args[2].as_string::<i64>();
278                    let string_vec: Vec<Option<&str>> = (0..large_string_array.len())
279                        .map(|i| {
280                            if large_string_array.is_null(i) {
281                                None
282                            } else {
283                                Some(large_string_array.value(i))
284                            }
285                        })
286                        .collect();
287
288                    &GenericStringArray::<i32>::from(string_vec)
289                }
290                _ => {
291                    let string_view_array = args[2].as_string_view();
292                    let string_vec: Vec<Option<String>> = (0..string_view_array.len())
293                        .map(|i| {
294                            if string_view_array.is_null(i) {
295                                None
296                            } else {
297                                Some(string_view_array.value(i).to_string())
298                            }
299                        })
300                        .collect();
301                    &GenericStringArray::<i32>::from(string_vec)
302                }
303            };
304
305            if flags.iter().any(|s| s == Some("g")) {
306                return plan_err!("regexp_like() does not support the \"global\" option");
307            }
308
309            handle_regexp_like(&args[0], &args[1], Some(flags))
310        }
311        other => exec_err!(
312            "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
313        ),
314    }
315}
316
317fn handle_regexp_like(
318    values: &ArrayRef,
319    patterns: &ArrayRef,
320    flags: Option<&GenericStringArray<i32>>,
321) -> Result<ArrayRef> {
322    let array = match (values.data_type(), patterns.data_type()) {
323        (Utf8View, Utf8) => {
324            let value = values.as_string_view();
325            let pattern = patterns.as_string::<i32>();
326
327            regexp::regexp_is_match(value, pattern, flags)
328                .map_err(|e| arrow_datafusion_err!(e))?
329        }
330        (Utf8View, Utf8View) => {
331            let value = values.as_string_view();
332            let pattern = patterns.as_string_view();
333
334            regexp::regexp_is_match(value, pattern, flags)
335                .map_err(|e| arrow_datafusion_err!(e))?
336        }
337        (Utf8View, LargeUtf8) => {
338            let value = values.as_string_view();
339            let pattern = patterns.as_string::<i64>();
340
341            regexp::regexp_is_match(value, pattern, flags)
342                .map_err(|e| arrow_datafusion_err!(e))?
343        }
344        (Utf8, Utf8) => {
345            let value = values.as_string::<i32>();
346            let pattern = patterns.as_string::<i32>();
347
348            regexp::regexp_is_match(value, pattern, flags)
349                .map_err(|e| arrow_datafusion_err!(e))?
350        }
351        (Utf8, Utf8View) => {
352            let value = values.as_string::<i32>();
353            let pattern = patterns.as_string_view();
354
355            regexp::regexp_is_match(value, pattern, flags)
356                .map_err(|e| arrow_datafusion_err!(e))?
357        }
358        (Utf8, LargeUtf8) => {
359            let value = values.as_string_view();
360            let pattern = patterns.as_string::<i64>();
361
362            regexp::regexp_is_match(value, pattern, flags)
363                .map_err(|e| arrow_datafusion_err!(e))?
364        }
365        (LargeUtf8, Utf8) => {
366            let value = values.as_string::<i64>();
367            let pattern = patterns.as_string::<i32>();
368
369            regexp::regexp_is_match(value, pattern, flags)
370                .map_err(|e| arrow_datafusion_err!(e))?
371        }
372        (LargeUtf8, Utf8View) => {
373            let value = values.as_string::<i64>();
374            let pattern = patterns.as_string_view();
375
376            regexp::regexp_is_match(value, pattern, flags)
377                .map_err(|e| arrow_datafusion_err!(e))?
378        }
379        (LargeUtf8, LargeUtf8) => {
380            let value = values.as_string::<i64>();
381            let pattern = patterns.as_string::<i64>();
382
383            regexp::regexp_is_match(value, pattern, flags)
384                .map_err(|e| arrow_datafusion_err!(e))?
385        }
386        other => {
387            return internal_err!(
388                "Unsupported data type {other:?} for function `regexp_like`"
389            );
390        }
391    };
392
393    Ok(Arc::new(array) as ArrayRef)
394}
395
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::{Array, BooleanBuilder, StringArray, StringViewArray};

    use crate::regex::regexplike::regexp_like;

    /// Patterns shared by the matching tests; every test matches them against "abc".
    const PATTERNS: [&str; 5] = ["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"];

    /// Asserts that `actual` equals a boolean array holding `expected`.
    fn assert_matches(actual: &dyn Array, expected: &[bool]) {
        let mut builder = BooleanBuilder::new();
        for &bit in expected {
            builder.append_value(bit);
        }
        let expected = builder.finish();

        assert_eq!(actual, &expected);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8() {
        let values = Arc::new(StringArray::from(vec!["abc"; 5]));
        let patterns = Arc::new(StringArray::from(PATTERNS.to_vec()));

        let re = regexp_like(&[values, patterns]).unwrap();

        assert_matches(re.as_ref(), &[true, false, true, false, false]);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8view() {
        let values = Arc::new(StringViewArray::from(vec!["abc"; 5]));
        let patterns = Arc::new(StringArray::from(PATTERNS.to_vec()));

        let re = regexp_like(&[values, patterns]).unwrap();

        assert_matches(re.as_ref(), &[true, false, true, false, false]);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8() {
        let values = Arc::new(StringArray::from(vec!["abc"; 5]));
        let patterns = Arc::new(StringArray::from(PATTERNS.to_vec()));
        let flags = Arc::new(StringArray::from(vec!["i"; 5]));

        let re = regexp_like(&[values, patterns, flags]).unwrap();

        assert_matches(re.as_ref(), &[true, true, true, true, false]);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8view() {
        let values = Arc::new(StringViewArray::from(vec!["abc"; 5]));
        let patterns = Arc::new(StringViewArray::from(PATTERNS.to_vec()));
        let flags = Arc::new(StringArray::from(vec!["i"; 5]));

        let re = regexp_like(&[values, patterns, flags]).unwrap();

        assert_matches(re.as_ref(), &[true, true, true, true, false]);
    }

    #[test]
    fn test_unsupported_global_flag_regexp_like() {
        let values = Arc::new(StringArray::from(vec!["abc"]));
        let patterns = Arc::new(StringArray::from(vec!["^(a)"]));
        let flags = Arc::new(StringArray::from(vec!["g"]));

        let re_err = regexp_like(&[values, patterns, flags])
            .expect_err("unsupported flag should have failed");

        assert_eq!(
            re_err.strip_backtrace(),
            "Error during planning: regexp_like() does not support the \"global\" option"
        );
    }
}