datafusion_functions/regex/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! "regex" DataFusion functions
19
20use arrow::error::ArrowError;
21use regex::Regex;
22use std::collections::hash_map::Entry;
23use std::collections::HashMap;
24use std::sync::Arc;
25pub mod regexpcount;
26pub mod regexpinstr;
27pub mod regexplike;
28pub mod regexpmatch;
29pub mod regexpreplace;
30
31// create UDFs
32make_udf_function!(regexpcount::RegexpCountFunc, regexp_count);
33make_udf_function!(regexpinstr::RegexpInstrFunc, regexp_instr);
34make_udf_function!(regexpmatch::RegexpMatchFunc, regexp_match);
35make_udf_function!(regexplike::RegexpLikeFunc, regexp_like);
36make_udf_function!(regexpreplace::RegexpReplaceFunc, regexp_replace);
37
38pub mod expr_fn {
39    use datafusion_expr::Expr;
40
41    /// Returns the number of consecutive occurrences of a regular expression in a string.
42    pub fn regexp_count(
43        values: Expr,
44        regex: Expr,
45        start: Option<Expr>,
46        flags: Option<Expr>,
47    ) -> Expr {
48        let mut args = vec![values, regex];
49        if let Some(start) = start {
50            args.push(start);
51        };
52
53        if let Some(flags) = flags {
54            args.push(flags);
55        };
56        super::regexp_count().call(args)
57    }
58
59    /// Returns a list of regular expression matches in a string.
60    pub fn regexp_match(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
61        let mut args = vec![values, regex];
62        if let Some(flags) = flags {
63            args.push(flags);
64        };
65        super::regexp_match().call(args)
66    }
67
68    /// Returns index of regular expression matches in a string.
69    pub fn regexp_instr(
70        values: Expr,
71        regex: Expr,
72        start: Option<Expr>,
73        n: Option<Expr>,
74        endoption: Option<Expr>,
75        flags: Option<Expr>,
76        subexpr: Option<Expr>,
77    ) -> Expr {
78        let mut args = vec![values, regex];
79        if let Some(start) = start {
80            args.push(start);
81        };
82        if let Some(n) = n {
83            args.push(n);
84        };
85        if let Some(endoption) = endoption {
86            args.push(endoption);
87        };
88        if let Some(flags) = flags {
89            args.push(flags);
90        };
91        if let Some(subexpr) = subexpr {
92            args.push(subexpr);
93        };
94        super::regexp_instr().call(args)
95    }
96    /// Returns true if a regex has at least one match in a string, false otherwise.
97    pub fn regexp_like(values: Expr, regex: Expr, flags: Option<Expr>) -> Expr {
98        let mut args = vec![values, regex];
99        if let Some(flags) = flags {
100            args.push(flags);
101        };
102        super::regexp_like().call(args)
103    }
104
105    /// Replaces substrings in a string that match.
106    pub fn regexp_replace(
107        string: Expr,
108        pattern: Expr,
109        replacement: Expr,
110        flags: Option<Expr>,
111    ) -> Expr {
112        let mut args = vec![string, pattern, replacement];
113        if let Some(flags) = flags {
114            args.push(flags);
115        };
116        super::regexp_replace().call(args)
117    }
118}
119
120/// Returns all DataFusion functions defined in this package
121pub fn functions() -> Vec<Arc<datafusion_expr::ScalarUDF>> {
122    vec![
123        regexp_count(),
124        regexp_match(),
125        regexp_instr(),
126        regexp_like(),
127        regexp_replace(),
128    ]
129}
130
131pub fn compile_and_cache_regex<'strings, 'cache>(
132    regex: &'strings str,
133    flags: Option<&'strings str>,
134    regex_cache: &'cache mut HashMap<(&'strings str, Option<&'strings str>), Regex>,
135) -> Result<&'cache Regex, ArrowError>
136where
137    'strings: 'cache,
138{
139    let result = match regex_cache.entry((regex, flags)) {
140        Entry::Occupied(occupied_entry) => occupied_entry.into_mut(),
141        Entry::Vacant(vacant_entry) => {
142            let compiled = compile_regex(regex, flags)?;
143            vacant_entry.insert(compiled)
144        }
145    };
146    Ok(result)
147}
148
149pub fn compile_regex(regex: &str, flags: Option<&str>) -> Result<Regex, ArrowError> {
150    let pattern = match flags {
151        None | Some("") => regex.to_string(),
152        Some(flags) => {
153            if flags.contains("g") {
154                return Err(ArrowError::ComputeError(
155                    "regexp_count()/regexp_instr() does not support the global flag"
156                        .to_string(),
157                ));
158            }
159            format!("(?{flags}){regex}")
160        }
161    };
162
163    Regex::new(&pattern).map_err(|_| {
164        ArrowError::ComputeError(format!("Regular expression did not compile: {pattern}"))
165    })
166}