Skip to main content

datafusion_functions/regex/
regexpreplace.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Regex expressions
19use arrow::array::ArrayDataBuilder;
20use arrow::array::BufferBuilder;
21use arrow::array::GenericStringArray;
22use arrow::array::StringViewBuilder;
23use arrow::array::{Array, ArrayRef, OffsetSizeTrait};
24use arrow::array::{ArrayAccessor, StringViewArray};
25use arrow::array::{ArrayIter, AsArray, new_null_array};
26use arrow::datatypes::DataType;
27use datafusion_common::ScalarValue;
28use datafusion_common::cast::{
29    as_large_string_array, as_string_array, as_string_view_array,
30};
31use datafusion_common::exec_err;
32use datafusion_common::plan_err;
33use datafusion_common::{
34    DataFusionError, Result, cast::as_generic_string_array, internal_err,
35};
36use datafusion_expr::ColumnarValue;
37use datafusion_expr::TypeSignature;
38use datafusion_expr::function::Hint;
39use datafusion_expr::{Documentation, ScalarUDFImpl, Signature, Volatility};
40use datafusion_macros::user_doc;
41use regex::Regex;
42use std::any::Any;
43use std::collections::HashMap;
44use std::sync::{Arc, LazyLock};
45
46#[user_doc(
47    doc_section(label = "Regular Expression Functions"),
48    description = "Replaces substrings in a string that match a [regular expression](https://docs.rs/regex/latest/regex/#syntax).",
49    syntax_example = "regexp_replace(str, regexp, replacement[, flags])",
50    sql_example = r#"```sql
51> select regexp_replace('foobarbaz', 'b(..)', 'X\\1Y', 'g');
52+------------------------------------------------------------------------+
53| regexp_replace(Utf8("foobarbaz"),Utf8("b(..)"),Utf8("X\1Y"),Utf8("g")) |
54+------------------------------------------------------------------------+
55| fooXarYXazY                                                            |
56+------------------------------------------------------------------------+
57SELECT regexp_replace('aBc', '(b|d)', 'Ab\\1a', 'i');
58+-------------------------------------------------------------------+
59| regexp_replace(Utf8("aBc"),Utf8("(b|d)"),Utf8("Ab\1a"),Utf8("i")) |
60+-------------------------------------------------------------------+
61| aAbBac                                                            |
62+-------------------------------------------------------------------+
63```
64Additional examples can be found [here](https://github.com/apache/datafusion/blob/main/datafusion-examples/examples/builtin_functions/regexp.rs)
65"#,
66    standard_argument(name = "str", prefix = "String"),
67    argument(
68        name = "regexp",
69        description = "Regular expression to match against.
70  Can be a constant, column, or function."
71    ),
72    argument(
73        name = "replacement",
74        description = "Replacement string expression to operate on. Can be a constant, column, or function, and any combination of operators."
75    ),
76    argument(
77        name = "flags",
78        description = r#"Optional regular expression flags that control the behavior of the regular expression. The following flags are supported:
79- **g**: (global) Search globally and don't return after the first match
80- **i**: case-insensitive: letters match both upper and lower case
81- **m**: multi-line mode: ^ and $ match begin/end of line
82- **s**: allow . to match \n
83- **R**: enables CRLF mode: when multi-line mode is enabled, \r\n is used
84- **U**: swap the meaning of x* and x*?"#
85    )
86)]
/// Scalar UDF backing the SQL `regexp_replace` function.
///
/// The struct only carries the accepted argument signature; the evaluation
/// logic lives in its `ScalarUDFImpl` implementation.
#[derive(Debug, PartialEq, Eq, Hash)]
pub struct RegexpReplaceFunc {
    /// Accepted argument types, handed out by `ScalarUDFImpl::signature`.
    signature: Signature,
}
91impl Default for RegexpReplaceFunc {
92    fn default() -> Self {
93        Self::new()
94    }
95}
96
97impl RegexpReplaceFunc {
98    pub fn new() -> Self {
99        use DataType::*;
100        use TypeSignature::*;
101        Self {
102            signature: Signature::one_of(
103                vec![
104                    Uniform(3, vec![Utf8View, LargeUtf8, Utf8]),
105                    Uniform(4, vec![Utf8View, LargeUtf8, Utf8]),
106                ],
107                Volatility::Immutable,
108            ),
109        }
110    }
111}
112
113impl ScalarUDFImpl for RegexpReplaceFunc {
114    fn as_any(&self) -> &dyn Any {
115        self
116    }
117
118    fn name(&self) -> &str {
119        "regexp_replace"
120    }
121
122    fn signature(&self) -> &Signature {
123        &self.signature
124    }
125
126    fn return_type(&self, arg_types: &[DataType]) -> Result<DataType> {
127        use DataType::*;
128        Ok(match &arg_types[0] {
129            LargeUtf8 | LargeBinary => LargeUtf8,
130            Utf8 | Binary => Utf8,
131            Utf8View | BinaryView => Utf8View,
132            Null => Null,
133            Dictionary(_, t) => match **t {
134                LargeUtf8 | LargeBinary => LargeUtf8,
135                Utf8 | Binary => Utf8,
136                Null => Null,
137                _ => {
138                    return plan_err!(
139                        "the regexp_replace can only accept strings but got {:?}",
140                        **t
141                    );
142                }
143            },
144            other => {
145                return plan_err!(
146                    "The regexp_replace function can only accept strings. Got {other}"
147                );
148            }
149        })
150    }
151
152    fn invoke_with_args(
153        &self,
154        args: datafusion_expr::ScalarFunctionArgs,
155    ) -> Result<ColumnarValue> {
156        let args = &args.args;
157
158        let len = args
159            .iter()
160            .fold(Option::<usize>::None, |acc, arg| match arg {
161                ColumnarValue::Scalar(_) => acc,
162                ColumnarValue::Array(a) => Some(a.len()),
163            });
164
165        let is_scalar = len.is_none();
166        let result = regexp_replace_func(args);
167        if is_scalar {
168            // If all inputs are scalar, keeps output as scalar
169            let result = result.and_then(|arr| ScalarValue::try_from_array(&arr, 0));
170            result.map(ColumnarValue::Scalar)
171        } else {
172            result.map(ColumnarValue::Array)
173        }
174    }
175
176    fn documentation(&self) -> Option<&Documentation> {
177        self.doc()
178    }
179}
180
181fn regexp_replace_func(args: &[ColumnarValue]) -> Result<ArrayRef> {
182    match args[0].data_type() {
183        DataType::Utf8 => specialize_regexp_replace::<i32>(args),
184        DataType::LargeUtf8 => specialize_regexp_replace::<i64>(args),
185        DataType::Utf8View => specialize_regexp_replace::<i32>(args),
186        other => {
187            internal_err!("Unsupported data type {other:?} for function regexp_replace")
188        }
189    }
190}
191
192/// replace POSIX capture groups (like \1 or \\1) with Rust Regex group (like ${1})
193/// used by regexp_replace
194/// Handles both single backslash (\1) and double backslash (\\1) which can occur
195/// when SQL strings with escaped backslashes are passed through
196///
197/// Note: \0 is converted to ${0}, which in Rust's regex replacement syntax
198/// substitutes the entire match. This is consistent with POSIX behavior where
199/// \0 (or &) refers to the entire matched string.
200fn regex_replace_posix_groups(replacement: &str) -> String {
201    static CAPTURE_GROUPS_RE_LOCK: LazyLock<Regex> =
202        LazyLock::new(|| Regex::new(r"\\{1,2}(\d+)").unwrap());
203    CAPTURE_GROUPS_RE_LOCK
204        .replace_all(replacement, "$${$1}")
205        .into_owned()
206}
207
208/// Replaces substring(s) matching a PCRE-like regular expression.
209///
210/// The full list of supported features and syntax can be found at
211/// <https://docs.rs/regex/latest/regex/#syntax>
212///
213/// Supported flags with the addition of 'g' can be found at
214/// <https://docs.rs/regex/latest/regex/#grouping-and-flags>
215///
216/// # Examples
217///
218/// ```ignore
219/// # use datafusion::prelude::*;
220/// # use datafusion::error::Result;
221/// # #[tokio::main]
222/// # async fn main() -> Result<()> {
223/// let ctx = SessionContext::new();
224/// let df = ctx.read_csv("tests/data/regex.csv", CsvReadOptions::new()).await?;
225///
226/// // use the regexp_replace function to replace substring(s) without flags
227/// let df = df.with_column(
228///     "a",
229///     regexp_replace(vec![col("values"), col("patterns"), col("replacement")])
230/// )?;
231/// // use the regexp_replace function to replace substring(s) with flags
232/// let df = df.with_column(
233///     "b",
234///     regexp_replace(vec![col("values"), col("patterns"), col("replacement"), col("flags")]),
235/// )?;
236///
237/// // literals can be used as well
238/// let df = df.with_column(
239///     "c",
240///     regexp_replace(vec![lit("foobarbequebaz"), lit("(bar)(beque)"), lit(r"\2")]),
241/// )?;
242///
243/// df.show().await?;
244///
245/// # Ok(())
246/// # }
247/// ```
248pub fn regexp_replace<'a, T: OffsetSizeTrait, U>(
249    string_array: U,
250    pattern_array: U,
251    replacement_array: U,
252    flags_array: Option<U>,
253) -> Result<ArrayRef>
254where
255    U: ArrayAccessor<Item = &'a str>,
256{
257    // Default implementation for regexp_replace, assumes all args are arrays
258    // and args is a sequence of 3 or 4 elements.
259
260    // creating Regex is expensive so create hashmap for memoization
261    let mut patterns: HashMap<String, Regex> = HashMap::new();
262
263    let datatype = string_array.data_type().to_owned();
264
265    let string_array_iter = ArrayIter::new(string_array);
266    let pattern_array_iter = ArrayIter::new(pattern_array);
267    let replacement_array_iter = ArrayIter::new(replacement_array);
268
269    match flags_array {
270        None => {
271            let result_iter = string_array_iter
272                .zip(pattern_array_iter)
273                .zip(replacement_array_iter)
274                .map(|((string, pattern), replacement)| {
275                    match (string, pattern, replacement) {
276                        (Some(string), Some(pattern), Some(replacement)) => {
277                            let replacement = regex_replace_posix_groups(replacement);
278                            // if patterns hashmap already has regexp then use else create and return
279                            let re = match patterns.get(pattern) {
280                                Some(re) => Ok(re),
281                                None => match Regex::new(pattern) {
282                                    Ok(re) => {
283                                        patterns.insert(pattern.to_string(), re);
284                                        Ok(patterns.get(pattern).unwrap())
285                                    }
286                                    Err(err) => {
287                                        Err(DataFusionError::External(Box::new(err)))
288                                    }
289                                },
290                            };
291
292                            Some(re.map(|re| re.replace(string, replacement.as_str())))
293                                .transpose()
294                        }
295                        _ => Ok(None),
296                    }
297                });
298
299            match datatype {
300                DataType::Utf8 | DataType::LargeUtf8 => {
301                    let result =
302                        result_iter.collect::<Result<GenericStringArray<T>>>()?;
303                    Ok(Arc::new(result) as ArrayRef)
304                }
305                DataType::Utf8View => {
306                    let result = result_iter.collect::<Result<StringViewArray>>()?;
307                    Ok(Arc::new(result) as ArrayRef)
308                }
309                other => {
310                    exec_err!(
311                        "Unsupported data type {other:?} for function regex_replace"
312                    )
313                }
314            }
315        }
316        Some(flags_array) => {
317            let flags_array_iter = ArrayIter::new(flags_array);
318
319            let result_iter = string_array_iter
320                .zip(pattern_array_iter)
321                .zip(replacement_array_iter)
322                .zip(flags_array_iter)
323                .map(|(((string, pattern), replacement), flags)| {
324                    match (string, pattern, replacement, flags) {
325                        (Some(string), Some(pattern), Some(replacement), Some(flags)) => {
326                            let replacement = regex_replace_posix_groups(replacement);
327
328                            // format flags into rust pattern
329                            let (pattern, replace_all) = if flags == "g" {
330                                (pattern.to_string(), true)
331                            } else if flags.contains('g') {
332                                (
333                                    format!(
334                                        "(?{}){}",
335                                        flags.to_string().replace('g', ""),
336                                        pattern
337                                    ),
338                                    true,
339                                )
340                            } else {
341                                (format!("(?{flags}){pattern}"), false)
342                            };
343
344                            // if patterns hashmap already has regexp then use else create and return
345                            let re = match patterns.get(&pattern) {
346                                Some(re) => Ok(re),
347                                None => match Regex::new(pattern.as_str()) {
348                                    Ok(re) => {
349                                        patterns.insert(pattern.clone(), re);
350                                        Ok(patterns.get(&pattern).unwrap())
351                                    }
352                                    Err(err) => {
353                                        Err(DataFusionError::External(Box::new(err)))
354                                    }
355                                },
356                            };
357
358                            Some(re.map(|re| {
359                                if replace_all {
360                                    re.replace_all(string, replacement.as_str())
361                                } else {
362                                    re.replace(string, replacement.as_str())
363                                }
364                            }))
365                            .transpose()
366                        }
367                        _ => Ok(None),
368                    }
369                });
370
371            match datatype {
372                DataType::Utf8 | DataType::LargeUtf8 => {
373                    let result =
374                        result_iter.collect::<Result<GenericStringArray<T>>>()?;
375                    Ok(Arc::new(result) as ArrayRef)
376                }
377                DataType::Utf8View => {
378                    let result = result_iter.collect::<Result<StringViewArray>>()?;
379                    Ok(Arc::new(result) as ArrayRef)
380                }
381                other => {
382                    exec_err!(
383                        "Unsupported data type {other:?} for function regex_replace"
384                    )
385                }
386            }
387        }
388    }
389}
390
/// Get the first argument from the given string array.
///
/// Expands to an expression yielding the `&str` at index 0 of `$ARG`, which
/// must be a `Utf8`, `LargeUtf8` or `Utf8View` array.
///
/// Note: If the array is empty or the first argument is null,
/// then aborts early: the *enclosing function* returns `Ok` with an all-null
/// array of `$ARRAY_SIZE` rows, typed like `$ARG`.
///
/// `$ARG` is evaluated several times, so it should be a side-effect-free
/// expression. `$NAME` is accepted but not used by the expansion. A failed
/// downcast is propagated with `?` from the enclosing function, and any other
/// data type hits `unreachable!`.
macro_rules! fetch_string_arg {
    ($ARG:expr, $NAME:expr, $ARRAY_SIZE:expr) => {{
        let string_array_type = ($ARG).data_type();
        match string_array_type {
            // This guard must stay first so that null/empty inputs never reach `value(0)`.
            dt if $ARG.len() == 0 || $ARG.is_null(0) => {
                // Mimicking the existing behavior of regexp_replace, if any of the scalar arguments
                // are actually null, then the result will be an array of the same size as the first argument with all nulls.
                //
                // Also acts like an early abort mechanism when the input array is empty.
                return Ok(new_null_array(dt, $ARRAY_SIZE));
            }
            DataType::Utf8 => {
                let array = as_string_array($ARG)?;
                array.value(0)
            }
            DataType::LargeUtf8 => {
                let array = as_large_string_array($ARG)?;
                array.value(0)
            }
            DataType::Utf8View => {
                let array = as_string_view_array($ARG)?;
                array.value(0)
            }
            _ => unreachable!(
                "Invalid data type for regexp_replace: {}",
                string_array_type
            ),
        }
    }};
}
425/// Special cased regex_replace implementation for the scenario where
426/// the pattern, replacement and flags are static (arrays that are derived
427/// from scalars). This means we can skip regex caching system and basically
428/// hold a single Regex object for the replace operation. This also speeds
429/// up the pre-processing time of the replacement string, since it only
430/// needs to processed once.
431fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
432    args: &[ArrayRef],
433) -> Result<ArrayRef> {
434    let array_size = args[0].len();
435    let pattern = fetch_string_arg!(&args[1], "pattern", array_size);
436    let replacement = fetch_string_arg!(&args[2], "replacement", array_size);
437    let flags = match args.len() {
438        3 => None,
439        4 => Some(fetch_string_arg!(&args[3], "flags", array_size)),
440        other => {
441            return exec_err!(
442                "regexp_replace was called with {other} arguments. It requires at least 3 and at most 4."
443            );
444        }
445    };
446
447    // Embed the flag (if it exists) into the pattern. Limit will determine
448    // whether this is a global match (as in replace all) or just a single
449    // replace operation.
450    let (pattern, limit) = match flags {
451        Some("g") => (pattern.to_string(), 0),
452        Some(flags) => (
453            format!("(?{}){}", flags.to_string().replace('g', ""), pattern),
454            !flags.contains('g') as usize,
455        ),
456        None => (pattern.to_string(), 1),
457    };
458
459    let re =
460        Regex::new(&pattern).map_err(|err| DataFusionError::External(Box::new(err)))?;
461
462    // Replaces the posix groups in the replacement string
463    // with rust ones.
464    let replacement = regex_replace_posix_groups(replacement);
465
466    let string_array_type = args[0].data_type();
467    match string_array_type {
468        DataType::Utf8 | DataType::LargeUtf8 => {
469            let string_array = as_generic_string_array::<T>(&args[0])?;
470
471            // We are going to create the underlying string buffer from its parts
472            // to be able to re-use the existing null buffer for sparse arrays.
473            let mut vals = BufferBuilder::<u8>::new({
474                let offsets = string_array.value_offsets();
475                (offsets[string_array.len()] - offsets[0])
476                    .to_usize()
477                    .unwrap()
478            });
479            let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
480            new_offsets.append(T::zero());
481
482            string_array.iter().for_each(|val| {
483                if let Some(val) = val {
484                    let result = re.replacen(val, limit, replacement.as_str());
485                    vals.append_slice(result.as_bytes());
486                }
487                new_offsets.append(T::from_usize(vals.len()).unwrap());
488            });
489
490            let data = ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
491                .len(string_array.len())
492                .nulls(string_array.nulls().cloned())
493                .buffers(vec![new_offsets.finish(), vals.finish()])
494                .build()?;
495            let result_array = GenericStringArray::<T>::from(data);
496            Ok(Arc::new(result_array) as ArrayRef)
497        }
498        DataType::Utf8View => {
499            let string_view_array = as_string_view_array(&args[0])?;
500
501            let mut builder = StringViewBuilder::with_capacity(string_view_array.len());
502
503            for val in string_view_array.iter() {
504                if let Some(val) = val {
505                    let result = re.replacen(val, limit, replacement.as_str());
506                    builder.append_value(result);
507                } else {
508                    builder.append_null();
509                }
510            }
511
512            let result = builder.finish();
513            Ok(Arc::new(result) as ArrayRef)
514        }
515        _ => unreachable!(
516            "Invalid data type for regexp_replace: {}",
517            string_array_type
518        ),
519    }
520}
521
522/// Determine which implementation of the regexp_replace to use based
523/// on the given set of arguments.
524fn specialize_regexp_replace<T: OffsetSizeTrait>(
525    args: &[ColumnarValue],
526) -> Result<ArrayRef> {
527    // This will serve as a dispatch table where we can
528    // leverage it in order to determine whether the scalarity
529    // of the given set of arguments fits a better specialized
530    // function.
531    let (is_source_scalar, is_pattern_scalar, is_replacement_scalar, is_flags_scalar) = (
532        matches!(args[0], ColumnarValue::Scalar(_)),
533        matches!(args[1], ColumnarValue::Scalar(_)),
534        matches!(args[2], ColumnarValue::Scalar(_)),
535        // The forth argument (flags) is optional; so in the event that
536        // it is not available, we'll claim that it is scalar.
537        matches!(args.get(3), Some(ColumnarValue::Scalar(_)) | None),
538    );
539    let len = args
540        .iter()
541        .fold(Option::<usize>::None, |acc, arg| match arg {
542            ColumnarValue::Scalar(_) => acc,
543            ColumnarValue::Array(a) => Some(a.len()),
544        });
545    let inferred_length = len.unwrap_or(1);
546    match (
547        is_source_scalar,
548        is_pattern_scalar,
549        is_replacement_scalar,
550        is_flags_scalar,
551    ) {
552        // This represents a very hot path for the case where the there is
553        // a single pattern that is being matched against and a single replacement.
554        // This is extremely important to specialize on since it removes the overhead
555        // of DF's in-house regex pattern cache (since there will be at most a single
556        // pattern) and the pre-processing of the same replacement pattern at each
557        // query.
558        //
559        // The flags needs to be a scalar as well since each pattern is actually
560        // constructed with the flags embedded into the pattern itself. This means
561        // even if the pattern itself is scalar, if the flags are an array then
562        // we will create many regexes and it is best to use the implementation
563        // that caches it. If there are no flags, we can simply ignore it here,
564        // and let the specialized function handle it.
565        (_, true, true, true) => {
566            let hints = [
567                Hint::Pad,
568                Hint::AcceptsSingular,
569                Hint::AcceptsSingular,
570                Hint::AcceptsSingular,
571            ];
572            let args = args
573                .iter()
574                .zip(hints.iter().chain(std::iter::repeat(&Hint::Pad)))
575                .map(|(arg, hint)| {
576                    // Decide on the length to expand this scalar to depending
577                    // on the given hints.
578                    let expansion_len = match hint {
579                        Hint::AcceptsSingular => 1,
580                        Hint::Pad => inferred_length,
581                    };
582                    arg.to_array(expansion_len)
583                })
584                .collect::<Result<Vec<_>>>()?;
585            _regexp_replace_static_pattern_replace::<T>(&args)
586        }
587
588        // If there are no specialized implementations, we'll fall back to the
589        // generic implementation.
590        (_, _, _, _) => {
591            let args = args
592                .iter()
593                .map(|arg| arg.to_array(inferred_length))
594                .collect::<Result<Vec<_>>>()?;
595
596            match (
597                args[0].data_type(),
598                args[1].data_type(),
599                args[2].data_type(),
600                args.get(3).map(|a| a.data_type()),
601            ) {
602                (
603                    DataType::Utf8,
604                    DataType::Utf8,
605                    DataType::Utf8,
606                    Some(DataType::Utf8) | None,
607                ) => {
608                    let string_array = args[0].as_string::<i32>();
609                    let pattern_array = args[1].as_string::<i32>();
610                    let replacement_array = args[2].as_string::<i32>();
611                    let flags_array = args.get(3).map(|a| a.as_string::<i32>());
612                    regexp_replace::<i32, _>(
613                        string_array,
614                        pattern_array,
615                        replacement_array,
616                        flags_array,
617                    )
618                }
619                (
620                    DataType::Utf8View,
621                    DataType::Utf8View,
622                    DataType::Utf8View,
623                    Some(DataType::Utf8View) | None,
624                ) => {
625                    let string_array = args[0].as_string_view();
626                    let pattern_array = args[1].as_string_view();
627                    let replacement_array = args[2].as_string_view();
628                    let flags_array = args.get(3).map(|a| a.as_string_view());
629                    regexp_replace::<i32, _>(
630                        string_array,
631                        pattern_array,
632                        replacement_array,
633                        flags_array,
634                    )
635                }
636                (
637                    DataType::LargeUtf8,
638                    DataType::LargeUtf8,
639                    DataType::LargeUtf8,
640                    Some(DataType::LargeUtf8) | None,
641                ) => {
642                    let string_array = args[0].as_string::<i64>();
643                    let pattern_array = args[1].as_string::<i64>();
644                    let replacement_array = args[2].as_string::<i64>();
645                    let flags_array = args.get(3).map(|a| a.as_string::<i64>());
646                    regexp_replace::<i64, _>(
647                        string_array,
648                        pattern_array,
649                        replacement_array,
650                        flags_array,
651                    )
652                }
653                other => {
654                    exec_err!(
655                        "Unsupported data type {other:?} for function regex_replace"
656                    )
657                }
658            }
659        }
660    }
661}
662#[cfg(test)]
663mod tests {
664    use arrow::array::*;
665
666    use super::*;
667
668    #[test]
669    fn test_regex_replace_posix_groups() {
670        // Test that \1, \2, etc. are replaced with ${1}, ${2}, etc.
671        assert_eq!(regex_replace_posix_groups(r"\1"), "${1}");
672        assert_eq!(regex_replace_posix_groups(r"\12"), "${12}");
673        assert_eq!(regex_replace_posix_groups(r"X\1Y"), "X${1}Y");
674        assert_eq!(regex_replace_posix_groups(r"\1\2"), "${1}${2}");
675
676        // Test double backslash (from SQL escaped strings like '\\1')
677        assert_eq!(regex_replace_posix_groups(r"\\1"), "${1}");
678        assert_eq!(regex_replace_posix_groups(r"X\\1Y"), "X${1}Y");
679        assert_eq!(regex_replace_posix_groups(r"\\1\\2"), "${1}${2}");
680
681        // Test 3 or 4 backslashes before digits to document expected behavior
682        assert_eq!(regex_replace_posix_groups(r"\\\1"), r"\${1}");
683        assert_eq!(regex_replace_posix_groups(r"\\\\1"), r"\\${1}");
684        assert_eq!(regex_replace_posix_groups(r"\\\1\\\\2"), r"\${1}\\${2}");
685
686        // Test that a lone backslash is NOT replaced (requires at least one digit)
687        assert_eq!(regex_replace_posix_groups(r"\"), r"\");
688        assert_eq!(regex_replace_posix_groups(r"foo\bar"), r"foo\bar");
689
690        // Test that backslash followed by non-digit is preserved
691        assert_eq!(regex_replace_posix_groups(r"\n"), r"\n");
692        assert_eq!(regex_replace_posix_groups(r"\t"), r"\t");
693
694        // Test \0 behavior: \0 is converted to ${0}, which in Rust's regex
695        // replacement syntax substitutes the entire match. This is consistent
696        // with POSIX behavior where \0 (or &) refers to the entire matched string.
697        assert_eq!(regex_replace_posix_groups(r"\0"), "${0}");
698        assert_eq!(
699            regex_replace_posix_groups(r"prefix\0suffix"),
700            "prefix${0}suffix"
701        );
702    }
703
    /// Generates a test named `$name` that runs the static-pattern
    /// implementation without flags.
    ///
    /// `$T` is the array type used for every argument and for the expected
    /// result; `$O` is the offset type passed as the generic parameter.
    macro_rules! static_pattern_regexp_replace {
        ($name:ident, $T:ty, $O:ty) => {
            #[test]
            fn $name() {
                let values = vec!["abc", "acd", "abcd1234567890123", "123456789012abc"];
                let patterns = vec!["b"; 4];
                let replacement = vec!["foo"; 4];
                // Without flags only the first "b" of each value is replaced.
                let expected =
                    vec!["afooc", "acd", "afoocd1234567890123", "123456789012afooc"];

                let values = <$T>::from(values);
                let patterns = <$T>::from(patterns);
                let replacements = <$T>::from(replacement);
                let expected = <$T>::from(expected);

                let re = _regexp_replace_static_pattern_replace::<$O>(&[
                    Arc::new(values),
                    Arc::new(patterns),
                    Arc::new(replacements),
                ])
                .unwrap();

                assert_eq!(re.as_ref(), &expected);
            }
        };
    }
730
731    static_pattern_regexp_replace!(string_array, StringArray, i32);
732    static_pattern_regexp_replace!(string_view_array, StringViewArray, i32);
733    static_pattern_regexp_replace!(large_string_array, LargeStringArray, i64);
734
/// Generates a `#[test]` that runs `_regexp_replace_static_pattern_replace`
/// with the pattern `b`, the replacement `foo` and the flag `i`.
///
/// * `$name` - name of the generated test function
/// * `$T` - array type used for the input values and the expected output
/// * `$O` - type parameter forwarded to the function under test
///
/// The pattern, replacement and flags columns are always `StringArray`,
/// independent of `$T`.
macro_rules! static_pattern_regexp_replace_with_flags {
    ($name:ident, $T:ty, $O:ty) => {
        #[test]
        fn $name() {
            // Each matching input appears with a lower-case `b` and an
            // upper-case `B`; the expected output shows both being replaced.
            let values = vec![
                "abc",
                "aBc",
                "acd",
                "abcd1234567890123",
                "aBcd1234567890123",
                "123456789012abc",
                "123456789012aBc",
            ];
            let expected = vec![
                "afooc",
                "afooc",
                "acd",
                "afoocd1234567890123",
                "afoocd1234567890123",
                "123456789012afooc",
                "123456789012afooc",
            ];

            // Size every argument column from `values` so the four inputs
            // always have the same number of rows and cannot drift apart.
            let row_count = values.len();

            let values = <$T>::from(values);
            let patterns = StringArray::from(vec!["b"; row_count]);
            let replacements = StringArray::from(vec!["foo"; row_count]);
            let flags = StringArray::from(vec!["i"; row_count]);
            let expected = <$T>::from(expected);

            let re = _regexp_replace_static_pattern_replace::<$O>(&[
                Arc::new(values),
                Arc::new(patterns),
                Arc::new(replacements),
                Arc::new(flags),
            ])
            .unwrap();

            assert_eq!(re.as_ref(), &expected);
        }
    };
}

// One generated test per supported string array type.
static_pattern_regexp_replace_with_flags!(string_array_with_flags, StringArray, i32);
static_pattern_regexp_replace_with_flags!(
    string_view_array_with_flags,
    StringViewArray,
    i32
);
static_pattern_regexp_replace_with_flags!(
    large_string_array_with_flags,
    LargeStringArray,
    i64
);
788
/// A pattern column made up entirely of nulls is expected to produce a result
/// with one null per input row.
#[test]
fn test_static_pattern_regexp_replace_early_abort() {
    const ROWS: usize = 5;

    let input = Arc::new(StringArray::from(vec!["abc"; ROWS]));
    let null_patterns = Arc::new(StringArray::from(vec![None::<&str>; ROWS]));
    let substitutes = Arc::new(StringArray::from(vec!["foo"; ROWS]));
    let all_null = StringArray::from(vec![None::<&str>; ROWS]);

    let outcome =
        _regexp_replace_static_pattern_replace::<i32>(&[input, null_patterns, substitutes])
            .unwrap();

    assert_eq!(outcome.as_ref(), &all_null);
}
805
/// Zero-row inputs are expected to succeed and produce a zero-row result.
#[test]
fn test_static_pattern_regexp_replace_early_abort_when_empty() {
    // Every column in this test, as well as the expected output, is an empty
    // string array.
    let empty = || StringArray::from(Vec::<Option<&str>>::new());

    let input = empty();
    let needles = empty();
    let substitutes = empty();
    let want = empty();

    let got = _regexp_replace_static_pattern_replace::<i32>(&[
        Arc::new(input),
        Arc::new(needles),
        Arc::new(substitutes),
    ])
    .unwrap();

    assert_eq!(got.as_ref(), &want);
}
822
/// A flags column made up entirely of nulls is expected to produce a result
/// with one null per input row, even though the pattern itself is valid.
#[test]
fn test_static_pattern_regexp_replace_early_abort_flags() {
    const ROWS: usize = 5;

    let input = Arc::new(StringArray::from(vec!["abc"; ROWS]));
    let needles = Arc::new(StringArray::from(vec!["a"; ROWS]));
    let substitutes = Arc::new(StringArray::from(vec!["foo"; ROWS]));
    let null_flags = Arc::new(StringArray::from(vec![None::<&str>; ROWS]));
    let all_null = StringArray::from(vec![None::<&str>; ROWS]);

    let outcome = _regexp_replace_static_pattern_replace::<i32>(&[
        input,
        needles,
        substitutes,
        null_flags,
    ])
    .unwrap();

    assert_eq!(outcome.as_ref(), &all_null);
}
841
/// An invalid pattern is expected to surface as an error whose message is the
/// regex parse failure.
#[test]
fn test_static_pattern_regexp_replace_pattern_error() {
    // `[` is deliberately invalid; the assertion below expects it to be
    // reported as an unclosed character class.
    let broken_pattern = "[";

    let outcome = _regexp_replace_static_pattern_replace::<i32>(&[
        Arc::new(StringArray::from(vec!["abc"; 5])),
        Arc::new(StringArray::from(vec![broken_pattern; 5])),
        Arc::new(StringArray::from(vec!["foo"; 5])),
    ]);

    let pattern_err = outcome.expect_err("broken pattern should have failed");
    assert_eq!(
        pattern_err.strip_backtrace(),
        "External error: regex parse error:\n    [\n    ^\nerror: unclosed character class"
    );
}
861
/// Null input rows are expected to stay null while the non-null rows are
/// rewritten, using single-row pattern and replacement columns.
#[test]
fn test_static_pattern_regexp_replace_with_null_buffers() {
    // Eight rows, four of them null (positions 1, 3, 5 and 6).
    let input = StringArray::from(vec![
        Some("a"),
        None,
        Some("b"),
        None,
        Some("a"),
        None,
        None,
        Some("c"),
    ]);
    // Pattern and replacement have a single row each, unlike the input.
    let needle = StringArray::from(vec!["a"; 1]);
    let substitute = StringArray::from(vec!["foo"; 1]);
    let want = StringArray::from(vec![
        Some("foo"),
        None,
        Some("b"),
        None,
        Some("foo"),
        None,
        None,
        Some("c"),
    ]);

    let got = _regexp_replace_static_pattern_replace::<i32>(&[
        Arc::new(input),
        Arc::new(needle),
        Arc::new(substitute),
    ])
    .unwrap();

    assert_eq!(got.as_ref(), &want);
    // The result must report the same four nulls as the input.
    assert_eq!(got.null_count(), 4);
}
897
/// Same scenario as the null-buffer test, but the input is a slice of a larger
/// array, so its nulls no longer start at the beginning of the backing array.
#[test]
fn test_static_pattern_regexp_replace_with_sliced_null_buffer() {
    let backing = StringArray::from(vec![
        Some("a"),
        None,
        Some("b"),
        None,
        Some("a"),
        None,
        None,
        Some("c"),
    ]);
    // Keep five rows starting at index 2: "b", null, "a", null, null.
    let window = backing.slice(2, 5);
    let needle = StringArray::from(vec!["a"; 1]);
    let substitute = StringArray::from(vec!["foo"; 1]);
    let want = StringArray::from(vec![Some("b"), None, Some("foo"), None, None]);

    let got = _regexp_replace_static_pattern_replace::<i32>(&[
        Arc::new(window),
        Arc::new(needle),
        Arc::new(substitute),
    ])
    .unwrap();

    assert_eq!(got.as_ref(), &want);
    // Only the three nulls inside the sliced window may be counted.
    assert_eq!(got.null_count(), 3);
}
924}