Skip to main content

ferray_strings/
regex_ops.rs

// ferray-strings: Regex operations (REQ-12, REQ-13)
//
// Implements `match_` and `extract`, plus their pre-compiled-regex variants
// `match_compiled` and `extract_compiled`, using the `regex` crate.
4
5use ferray_core::Array;
6use ferray_core::dimension::{Dimension, Ix1};
7use ferray_core::error::{FerrayError, FerrayResult};
8use regex::Regex;
9
10use crate::string_array::{StringArray, StringArray1};
11
12/// Test whether each string element matches the given regex pattern.
13///
14/// Returns an `Array<bool, D>` preserving the input shape, where each element
15/// indicates whether the corresponding string contains a match for the pattern.
16///
17/// Compiles the pattern once per call. Call sites that reuse the same
18/// pattern across many arrays should compile a `Regex` themselves and
19/// pass it to [`match_compiled`] to avoid repeated compilation
20/// (see issue #521).
21///
22/// # Errors
23/// Returns `FerrayError::InvalidValue` if the regex pattern is invalid.
24/// Returns an error if the internal array construction fails.
25pub fn match_<D: Dimension>(a: &StringArray<D>, pattern: &str) -> FerrayResult<Array<bool, D>> {
26    let re = Regex::new(pattern)
27        .map_err(|e| FerrayError::invalid_value(format!("invalid regex pattern: {e}")))?;
28    match_compiled(a, &re)
29}
30
31/// Like [`match_`] but takes a pre-compiled `Regex`. Callers that run
32/// the same pattern against many arrays can compile the pattern once
33/// and reuse it across all calls, avoiding the regex engine's
34/// per-call compilation overhead (#521).
35pub fn match_compiled<D: Dimension>(
36    a: &StringArray<D>,
37    re: &Regex,
38) -> FerrayResult<Array<bool, D>> {
39    let data: Vec<bool> = a.map_to_vec(|s| re.is_match(s));
40    Array::from_vec(a.dim().clone(), data)
41}
42
43/// Extract the first capture group from each string element.
44///
45/// For each string, finds the first match of the pattern and returns
46/// the contents of capture group 1. If there is no match or no capture
47/// group, an empty string is returned.
48///
49/// The pattern must contain at least one capture group `(...)`.
50///
51/// Compiles the pattern once per call. Use [`extract_compiled`] for a
52/// pre-compiled-regex entry point (#521).
53///
54/// # Errors
55/// Returns `FerrayError::InvalidValue` if the regex pattern is invalid.
56/// Returns an error if the internal array construction fails.
57pub fn extract<D: Dimension>(a: &StringArray<D>, pattern: &str) -> FerrayResult<StringArray1> {
58    let re = Regex::new(pattern)
59        .map_err(|e| FerrayError::invalid_value(format!("invalid regex pattern: {e}")))?;
60    extract_compiled(a, &re)
61}
62
63/// Like [`extract`] but takes a pre-compiled `Regex` (#521).
64pub fn extract_compiled<D: Dimension>(
65    a: &StringArray<D>,
66    re: &Regex,
67) -> FerrayResult<StringArray1> {
68    let data: Vec<String> = a
69        .iter()
70        .map(|s| {
71            re.captures(s)
72                .and_then(|caps| caps.get(1))
73                .map(|m| m.as_str().to_string())
74                .unwrap_or_default()
75        })
76        .collect();
77
78    let dim = Ix1::new([data.len()]);
79    StringArray1::from_vec(dim, data)
80}
81
/// Unit tests for the regex operations: the pattern-string entry points
/// (`match_`, `extract`) and the pre-compiled entry points
/// (`match_compiled`, `extract_compiled`).
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    #[test]
    fn test_match_basic() {
        let a = array(&["hello123", "world", "foo42"]).unwrap();
        let result = match_(&a, r"\d+").unwrap();
        let data = result.as_slice().unwrap();
        assert_eq!(data, &[true, false, true]);
    }

    #[test]
    fn test_match_full_pattern() {
        // Anchors turn the search into a whole-string comparison.
        let a = array(&["abc", "def", "abcdef"]).unwrap();
        let result = match_(&a, r"^abc$").unwrap();
        let data = result.as_slice().unwrap();
        assert_eq!(data, &[true, false, false]);
    }

    #[test]
    fn test_match_invalid_regex() {
        let a = array(&["hello"]).unwrap();
        let result = match_(&a, r"[invalid");
        assert!(result.is_err());
    }

    #[test]
    fn test_match_compiled_reuses_regex() {
        // One compiled regex is applied to two different arrays.
        let re = Regex::new(r"\d+").unwrap();

        let a = array(&["hello123", "world", "foo42"]).unwrap();
        let first = match_compiled(&a, &re).unwrap();
        let first_data = first.as_slice().unwrap();
        assert_eq!(first_data, &[true, false, true]);

        let b = array(&["7", "seven"]).unwrap();
        let second = match_compiled(&b, &re).unwrap();
        let second_data = second.as_slice().unwrap();
        assert_eq!(second_data, &[true, false]);
    }

    #[test]
    fn test_extract_capture_group() {
        let a = array(&["hello123world", "foo42bar", "nodigits"]).unwrap();
        let result = extract(&a, r"(\d+)").unwrap();
        assert_eq!(result.as_slice(), &["123", "42", ""]);
    }

    #[test]
    fn test_extract_named_group() {
        // A named group is still numbered, so it is reachable as group 1.
        let a = array(&["user:alice", "user:bob", "invalid"]).unwrap();
        let result = extract(&a, r"user:(?P<name>\w+)").unwrap();
        assert_eq!(result.as_slice(), &["alice", "bob", ""]);
    }

    #[test]
    fn test_extract_no_match() {
        let a = array(&["no match here"]).unwrap();
        let result = extract(&a, r"(\d+)").unwrap();
        assert_eq!(result.as_slice(), &[""]);
    }

    #[test]
    fn test_extract_without_capture_group() {
        // The pattern matches but defines no group 1, so every element
        // falls back to the empty string.
        let a = array(&["abc123", "456"]).unwrap();
        let result = extract(&a, r"\d+").unwrap();
        assert_eq!(result.as_slice(), &["", ""]);
    }

    #[test]
    fn test_extract_invalid_regex() {
        let a = array(&["hello"]).unwrap();
        let result = extract(&a, r"[invalid");
        assert!(result.is_err());
    }

    #[test]
    fn test_extract_compiled_reuses_regex() {
        // One compiled regex is applied to two different arrays.
        let re = Regex::new(r"(\d+)").unwrap();

        let a = array(&["hello123world", "nodigits"]).unwrap();
        let first = extract_compiled(&a, &re).unwrap();
        assert_eq!(first.as_slice(), &["123", ""]);

        let b = array(&["foo42bar"]).unwrap();
        let second = extract_compiled(&b, &re).unwrap();
        assert_eq!(second.as_slice(), &["42"]);
    }

    #[test]
    fn test_match_and_extract_ac5() {
        // AC-5: Regex match_ and extract work correctly with capture groups
        let a = array(&["abc123", "def", "ghi456"]).unwrap();

        let matched = match_(&a, r"\d+").unwrap();
        let matched_data = matched.as_slice().unwrap();
        assert_eq!(matched_data, &[true, false, true]);

        let extracted = extract(&a, r"([a-z]+)(\d+)").unwrap();
        // "def" has no digits, so no match => empty string
        assert_eq!(extracted.as_slice(), &["abc", "", "ghi"]);
    }

    #[test]
    fn test_extract_empty_string() {
        let a = array(&["", "abc"]).unwrap();
        let result = extract(&a, r"(abc)").unwrap();
        assert_eq!(result.as_slice(), &["", "abc"]);
    }
}