Skip to main content

ferray_strings/
regex_ops.rs

1// ferray-strings: Regex operations (REQ-12, REQ-13)
2//
3// Implements match_ and extract using the `regex` crate.
4
5use ferray_core::Array;
6use ferray_core::dimension::{Dimension, Ix1};
7use ferray_core::error::{FerrayError, FerrayResult};
8use regex::Regex;
9
10use crate::string_array::{StringArray, StringArray1};
11
12/// Test whether each string element matches the given regex pattern.
13///
14/// Returns an `Array<bool, D>` preserving the input shape, where each element
15/// indicates whether the corresponding string contains a match for the pattern.
16///
17/// # Errors
18/// Returns `FerrayError::InvalidValue` if the regex pattern is invalid.
19/// Returns an error if the internal array construction fails.
20pub fn match_<D: Dimension>(a: &StringArray<D>, pattern: &str) -> FerrayResult<Array<bool, D>> {
21    let re = Regex::new(pattern)
22        .map_err(|e| FerrayError::invalid_value(format!("invalid regex pattern: {e}")))?;
23
24    let data: Vec<bool> = a.map_to_vec(|s| re.is_match(s));
25    Array::from_vec(a.dim().clone(), data)
26}
27
28/// Extract the first capture group from each string element.
29///
30/// For each string, finds the first match of the pattern and returns
31/// the contents of capture group 1. If there is no match or no capture
32/// group, an empty string is returned.
33///
34/// The pattern must contain at least one capture group `(...)`.
35///
36/// # Errors
37/// Returns `FerrayError::InvalidValue` if the regex pattern is invalid.
38/// Returns an error if the internal array construction fails.
39pub fn extract<D: Dimension>(a: &StringArray<D>, pattern: &str) -> FerrayResult<StringArray1> {
40    let re = Regex::new(pattern)
41        .map_err(|e| FerrayError::invalid_value(format!("invalid regex pattern: {e}")))?;
42
43    let data: Vec<String> = a
44        .iter()
45        .map(|s| {
46            re.captures(s)
47                .and_then(|caps| caps.get(1))
48                .map(|m| m.as_str().to_string())
49                .unwrap_or_default()
50        })
51        .collect();
52
53    let dim = Ix1::new([data.len()]);
54    StringArray1::from_vec(dim, data)
55}
56
#[cfg(test)]
mod tests {
    //! Unit tests for `match_` and `extract`.

    use super::*;
    use crate::string_array::array;

    // An unanchored pattern matches when it occurs anywhere in the string.
    #[test]
    fn test_match_basic() {
        let a = array(&["hello123", "world", "foo42"]).unwrap();
        let result = match_(&a, r"\d+").unwrap();
        let data = result.as_slice().unwrap();
        assert_eq!(data, &[true, false, true]);
    }

    // `^...$` anchors restrict the match to the whole string.
    #[test]
    fn test_match_full_pattern() {
        let a = array(&["abc", "def", "abcdef"]).unwrap();
        let result = match_(&a, r"^abc$").unwrap();
        let data = result.as_slice().unwrap();
        assert_eq!(data, &[true, false, false]);
    }

    // A malformed pattern is reported as an error instead of panicking.
    #[test]
    fn test_match_invalid_regex() {
        let a = array(&["hello"]).unwrap();
        let result = match_(&a, r"[invalid");
        assert!(result.is_err());
    }

    // Group 1 is returned for matching elements, "" for non-matching ones.
    #[test]
    fn test_extract_capture_group() {
        let a = array(&["hello123world", "foo42bar", "nodigits"]).unwrap();
        let result = extract(&a, r"(\d+)").unwrap();
        assert_eq!(result.as_slice(), &["123", "42", ""]);
    }

    // A *named* group is still numbered, so it is reachable as group 1.
    #[test]
    fn test_extract_named_group() {
        let a = array(&["user:alice", "user:bob", "invalid"]).unwrap();
        let result = extract(&a, r"user:(?P<name>\w+)").unwrap();
        assert_eq!(result.as_slice(), &["alice", "bob", ""]);
    }

    #[test]
    fn test_extract_no_match() {
        let a = array(&["no match here"]).unwrap();
        let result = extract(&a, r"(\d+)").unwrap();
        assert_eq!(result.as_slice(), &[""]);
    }

    // A pattern without any capture group matches but has no group 1,
    // so every element yields an empty string.
    #[test]
    fn test_extract_no_capture_group() {
        let a = array(&["abc123", "def"]).unwrap();
        let result = extract(&a, r"\d+").unwrap();
        assert_eq!(result.as_slice(), &["", ""]);
    }

    // An optional group that does not participate in the match yields "".
    #[test]
    fn test_extract_optional_group_unmatched() {
        let a = array(&["a1", "a"]).unwrap();
        let result = extract(&a, r"a(\d)?").unwrap();
        assert_eq!(result.as_slice(), &["1", ""]);
    }

    #[test]
    fn test_extract_invalid_regex() {
        let a = array(&["hello"]).unwrap();
        let result = extract(&a, r"[invalid");
        assert!(result.is_err());
    }

    #[test]
    fn test_match_and_extract_ac5() {
        // AC-5: Regex match_ and extract work correctly with capture groups
        let a = array(&["abc123", "def", "ghi456"]).unwrap();

        let matched = match_(&a, r"\d+").unwrap();
        let matched_data = matched.as_slice().unwrap();
        assert_eq!(matched_data, &[true, false, true]);

        // Only the first of the two groups is returned.
        let extracted = extract(&a, r"([a-z]+)(\d+)").unwrap();
        // "def" has no digits, so no match => empty string
        assert_eq!(extracted.as_slice(), &["abc", "", "ghi"]);
    }

    #[test]
    fn test_extract_empty_string() {
        let a = array(&["", "abc"]).unwrap();
        let result = extract(&a, r"(abc)").unwrap();
        assert_eq!(result.as_slice(), &["", "abc"]);
    }
}