datafusion_functions/regex/
regexplike.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Regex expressions
19
20use arrow::array::{Array, ArrayRef, AsArray, GenericStringArray};
21use arrow::compute::kernels::regexp;
22use arrow::datatypes::DataType;
23use arrow::datatypes::DataType::{LargeUtf8, Utf8, Utf8View};
24use datafusion_common::types::logical_string;
25use datafusion_common::{
26    arrow_datafusion_err, exec_err, internal_err, plan_err, DataFusionError, Result,
27    ScalarValue,
28};
29use datafusion_expr::{
30    binary_expr, cast, Coercion, ColumnarValue, Documentation, Expr, ScalarUDFImpl,
31    Signature, TypeSignature, TypeSignatureClass, Volatility,
32};
33use datafusion_macros::user_doc;
34
35use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
36use datafusion_expr_common::operator::Operator;
37use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer;
38use std::any::Any;
39use std::sync::Arc;
40
#[user_doc(
    doc_section(label = "Regular Expression Functions"),
    description = "Returns true if a [regular expression](https://docs.rs/regex/latest/regex/#syntax) has at least one match in a string, false otherwise.",
    syntax_example = "regexp_like(str, regexp[, flags])",
    sql_example = r#"```sql
select regexp_like('Köln', '[a-zA-Z]ö[a-zA-Z]{2}');
+--------------------------------------------------------+
| regexp_like(Utf8("Köln"),Utf8("[a-zA-Z]ö[a-zA-Z]{2}")) |
+--------------------------------------------------------+
| true                                                   |
+--------------------------------------------------------+
SELECT regexp_like('aBc', '(b|d)', 'i');
+--------------------------------------------------+
| regexp_like(Utf8("aBc"),Utf8("(b|d)"),Utf8("i")) |
+--------------------------------------------------+
| true                                             |
+--------------------------------------------------+
```
Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/regexp.rs)
"#,
    standard_argument(name = "str", prefix = "String"),
    standard_argument(name = "regexp", prefix = "Regular"),
    argument(
        name = "flags",
        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
  - **i**: case-insensitive: letters match both upper and lower case
  - **m**: multi-line mode: ^ and $ match begin/end of line
  - **s**: allow . to match \n
  - **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
  - **U**: swap the meaning of x* and x*?"#
    )
)]
// Scalar UDF registered under the name `regexp_like`: it reports whether a
// string has at least one match for a regular expression, optionally
// modified by a flags argument.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct RegexpLikeFunc {
    // Accepted argument lists: two or three string arguments
    // (`str`, `regexp` and, optionally, `flags`).
    signature: Signature,
}
77
78impl Default for RegexpLikeFunc {
79    fn default() -> Self {
80        Self::new()
81    }
82}
83
84impl RegexpLikeFunc {
85    pub fn new() -> Self {
86        Self {
87            signature: Signature::one_of(
88                vec![
89                    TypeSignature::Coercible(vec![
90                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
91                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
92                    ]),
93                    TypeSignature::Coercible(vec![
94                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
95                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
96                        Coercion::new_exact(TypeSignatureClass::Native(logical_string())),
97                    ]),
98                ],
99                Volatility::Immutable,
100            ),
101        }
102    }
103}
104
105impl ScalarUDFImpl for RegexpLikeFunc {
106    fn as_any(&self) -> &dyn Any {
107        self
108    }
109
110    fn name(&self) -> &str {
111        "regexp_like"
112    }
113
114    fn signature(&self) -> &Signature {
115        &self.signature
116    }
117
118    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
119        use DataType::*;
120
121        Ok(match &arg_types[0] {
122            Null => Null,
123            // Type coercion is done by DataFusion based on signature, so if we
124            // get here, the first argument is always a string
125            _ => Boolean,
126        })
127    }
128
129    fn invoke_with_args(
130        &self,
131        args: datafusion_expr::ScalarFunctionArgs,
132    ) -> Result<ColumnarValue> {
133        let args = &args.args;
134
135        let len = args
136            .iter()
137            .fold(Option::<usize>::None, |acc, arg| match arg {
138                ColumnarValue::Scalar(_) => acc,
139                ColumnarValue::Array(a) => Some(a.len()),
140            });
141
142        let is_scalar = len.is_none();
143        let inferred_length = len.unwrap_or(1);
144        let args = args
145            .iter()
146            .map(|arg| arg.to_array(inferred_length))
147            .collect::<Result<Vec<_>>>()?;
148
149        let result = regexp_like(&args);
150        if is_scalar {
151            // If all inputs are scalar, keeps output as scalar
152            let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
153            result.map(ColumnarValue::Scalar)
154        } else {
155            result.map(ColumnarValue::Array)
156        }
157    }
158
159    fn simplify(
160        &self,
161        mut args: Vec<Expr>,
162        info: &dyn SimplifyInfo,
163    ) -> Result<ExprSimplifyResult> {
164        // Try to simplify regexp_like usage to one of the builtin operators since those have
165        // optimized code paths for the case where the regular expression pattern is a scalar.
166        // Additionally, the expression simplification optimization pass will attempt to further
167        // simplify regular expression patterns used in operator expressions.
168        let Some(op) = derive_operator(&args) else {
169            return Ok(ExprSimplifyResult::Original(args));
170        };
171
172        let string_type = info.get_data_type(&args[0])?;
173        let regexp_type = info.get_data_type(&args[1])?;
174        let binary_type_coercer = BinaryTypeCoercer::new(&string_type, &op, &regexp_type);
175        let Ok((coerced_string_type, coerced_regexp_type)) =
176            binary_type_coercer.get_input_types()
177        else {
178            return Ok(ExprSimplifyResult::Original(args));
179        };
180
181        // regexp_like(str, regexp [, flags])
182        let regexp = args.swap_remove(1);
183        let string = args.swap_remove(0);
184
185        Ok(ExprSimplifyResult::Simplified(binary_expr(
186            if string_type != coerced_string_type {
187                cast(string, coerced_string_type)
188            } else {
189                string
190            },
191            op,
192            if regexp_type != coerced_regexp_type {
193                cast(regexp, coerced_regexp_type)
194            } else {
195                regexp
196            },
197        )))
198    }
199
200    fn documentation(&self) -> Option<&Documentation> {
201        self.doc()
202    }
203}
204
205fn derive_operator(args: &[Expr]) -> Option<Operator> {
206    match args.len() {
207        // regexp_like(str, regexp, flags)
208        3 => {
209            match &args[2] {
210                Expr::Literal(ScalarValue::Utf8(Some(flags)), _) => {
211                    match flags.as_str() {
212                        "i" => Some(Operator::RegexIMatch),
213                        "" => Some(Operator::RegexMatch),
214                        // Any flags besides 'i' have no operator equivalent
215                        _ => None,
216                    }
217                }
218                // `flags` is not a literal, so we can't derive the correct operator statically
219                _ => None,
220            }
221        }
222        // regexp_like(str, regexp)
223        2 => Some(Operator::RegexMatch),
224        // Should never happen, but just in case
225        _ => None,
226    }
227}
228
229/// Tests a string using a regular expression returning true if at
230/// least one match, false otherwise.
231///
232/// The full list of supported features and syntax can be found at
233/// <https://docs.rs/regex/latest/regex/#syntax>
234///
235/// Supported flags can be found at
236/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
237///
238/// # Examples
239///
240/// ```ignore
241/// # use datafusion::prelude::*;
242/// # use datafusion::error::Result;
243/// # #[tokio::main]
244/// # async fn main() -> Result<()> {
245/// let ctx = SessionContext::new();
246/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?;
247///
248/// // use the regexp_like function to test col 'values',
249/// // against patterns in col 'patterns' without flags
250/// let df = df.with_column(
251///     "a",
252///     regexp_like(vec![col("values"), col("patterns")])
253/// )?;
254/// // use the regexp_like function to test col 'values',
255/// // against patterns in col 'patterns' with flags
256/// let df = df.with_column(
257///     "b",
258///     regexp_like(vec![col("values"), col("patterns"), col("flags")])
259/// )?;
260/// // literals can be used as well with dataframe calls
261/// let df = df.with_column(
262///     "c",
263///     regexp_like(vec![lit("foobarbequebaz"), lit("(bar)(beque)")])
264/// )?;
265///
266/// df.show().await?;
267///
268/// # Ok(())
269/// # }
270/// ```
271pub fn regexp_like(args: &[ArrayRef]) -> Result<ArrayRef> {
272    match args.len() {
273        2 => handle_regexp_like(&args[0], &args[1], None),
274        3 => {
275            let flags = match args[2].data_type() {
276                Utf8 => args[2].as_string::<i32>(),
277                LargeUtf8 => {
278                    let large_string_array = args[2].as_string::<i64>();
279                    let string_vec: Vec<Option<&str>> = (0..large_string_array.len()).map(|i| {
280                        if large_string_array.is_null(i) {
281                            None
282                        } else {
283                            Some(large_string_array.value(i))
284                        }
285                    })
286                    .collect();
287
288                    &GenericStringArray::<i32>::from(string_vec)
289                },
290                _ => {
291                    let string_view_array = args[2].as_string_view();
292                    let string_vec: Vec<Option<String>> = (0..string_view_array.len()).map(|i| {
293                        if string_view_array.is_null(i) {
294                            None
295                        } else {
296                            Some(string_view_array.value(i).to_string())
297                        }
298                    })
299                    .collect();
300                    &GenericStringArray::<i32>::from(string_vec)
301                },
302            };
303
304            if flags.iter().any(|s| s == Some("g")) {
305                return plan_err!("regexp_like() does not support the \"global\" option");
306            }
307
308            handle_regexp_like(&args[0], &args[1], Some(flags))
309        },
310        other => exec_err!(
311            "`regexp_like` was called with {other} arguments. It requires at least 2 and at most 3."
312        ),
313    }
314}
315
316fn handle_regexp_like(
317    values: &ArrayRef,
318    patterns: &ArrayRef,
319    flags: Option<&GenericStringArray<i32>>,
320) -> Result<ArrayRef> {
321    let array = match (values.data_type(), patterns.data_type()) {
322        (Utf8View, Utf8) => {
323            let value = values.as_string_view();
324            let pattern = patterns.as_string::<i32>();
325
326            regexp::regexp_is_match(value, pattern, flags)
327                .map_err(|e| arrow_datafusion_err!(e))?
328        }
329        (Utf8View, Utf8View) => {
330            let value = values.as_string_view();
331            let pattern = patterns.as_string_view();
332
333            regexp::regexp_is_match(value, pattern, flags)
334                .map_err(|e| arrow_datafusion_err!(e))?
335        }
336        (Utf8View, LargeUtf8) => {
337            let value = values.as_string_view();
338            let pattern = patterns.as_string::<i64>();
339
340            regexp::regexp_is_match(value, pattern, flags)
341                .map_err(|e| arrow_datafusion_err!(e))?
342        }
343        (Utf8, Utf8) => {
344            let value = values.as_string::<i32>();
345            let pattern = patterns.as_string::<i32>();
346
347            regexp::regexp_is_match(value, pattern, flags)
348                .map_err(|e| arrow_datafusion_err!(e))?
349        }
350        (Utf8, Utf8View) => {
351            let value = values.as_string::<i32>();
352            let pattern = patterns.as_string_view();
353
354            regexp::regexp_is_match(value, pattern, flags)
355                .map_err(|e| arrow_datafusion_err!(e))?
356        }
357        (Utf8, LargeUtf8) => {
358            let value = values.as_string_view();
359            let pattern = patterns.as_string::<i64>();
360
361            regexp::regexp_is_match(value, pattern, flags)
362                .map_err(|e| arrow_datafusion_err!(e))?
363        }
364        (LargeUtf8, Utf8) => {
365            let value = values.as_string::<i64>();
366            let pattern = patterns.as_string::<i32>();
367
368            regexp::regexp_is_match(value, pattern, flags)
369                .map_err(|e| arrow_datafusion_err!(e))?
370        }
371        (LargeUtf8, Utf8View) => {
372            let value = values.as_string::<i64>();
373            let pattern = patterns.as_string_view();
374
375            regexp::regexp_is_match(value, pattern, flags)
376                .map_err(|e| arrow_datafusion_err!(e))?
377        }
378        (LargeUtf8, LargeUtf8) => {
379            let value = values.as_string::<i64>();
380            let pattern = patterns.as_string::<i64>();
381
382            regexp::regexp_is_match(value, pattern, flags)
383                .map_err(|e| arrow_datafusion_err!(e))?
384        }
385        other => {
386            return internal_err!(
387                "Unsupported data type {other:?} for function `regexp_like`"
388            )
389        }
390    };
391
392    Ok(Arc::new(array) as ArrayRef)
393}
394
#[cfg(test)]
mod tests {
    use std::sync::Arc;

    use arrow::array::StringArray;
    use arrow::array::{BooleanBuilder, StringViewArray};

    use crate::regex::regexplike::regexp_like;

    /// Patterns shared by the case-sensitivity tests; each is matched against "abc".
    const PATTERNS: [&str; 5] = ["^(a)", "^(A)", "(b|d)", "(B|D)", "^(b|c)"];

    /// Builds the expected boolean result array from a list of match outcomes.
    fn expected_matches(outcomes: &[bool]) -> arrow::array::BooleanArray {
        let mut builder: BooleanBuilder = BooleanBuilder::new();
        for &outcome in outcomes {
            builder.append_value(outcome);
        }
        builder.finish()
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());

        let expected = expected_matches(&[true, false, true, false, false]);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_sensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());

        let expected = expected_matches(&[true, false, true, false, false]);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns)]).unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8() {
        let values = StringArray::from(vec!["abc"; 5]);
        let patterns = StringArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);

        let expected = expected_matches(&[true, true, true, true, false]);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_case_insensitive_regexp_like_utf8view() {
        let values = StringViewArray::from(vec!["abc"; 5]);
        let patterns = StringViewArray::from(PATTERNS.to_vec());
        let flags = StringArray::from(vec!["i"; 5]);

        let expected = expected_matches(&[true, true, true, true, false]);

        let re = regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)])
            .unwrap();

        assert_eq!(re.as_ref(), &expected);
    }

    #[test]
    fn test_unsupported_global_flag_regexp_like() {
        let values = StringArray::from(vec!["abc"]);
        let patterns = StringArray::from(vec!["^(a)"]);
        let flags = StringArray::from(vec!["g"]);

        let outcome =
            regexp_like(&[Arc::new(values), Arc::new(patterns), Arc::new(flags)]);
        let re_err = outcome.expect_err("unsupported flag should have failed");

        assert_eq!(
            re_err.strip_backtrace(),
            "Error during planning: regexp_like() does not support the \"global\" option"
        );
    }
}