Skip to main content

ferray_strings/
regex_ops.rs

1// ferray-strings: Regex operations (REQ-12, REQ-13)
2//
3// Implements match_ and extract using the `regex` crate.
4
5use ferray_core::Array;
6use ferray_core::dimension::{Dimension, Ix1};
7use ferray_core::error::{FerrayError, FerrayResult};
8use regex::Regex;
9
10use crate::string_array::{StringArray, StringArray1};
11
12/// Test whether each string element matches the given regex pattern.
13///
14/// Returns an `Array<bool>` where each element indicates whether the
15/// corresponding string contains a match for the pattern.
16///
17/// # Errors
18/// Returns `FerrayError::InvalidValue` if the regex pattern is invalid.
19/// Returns an error if the internal array construction fails.
20pub fn match_<D: Dimension>(a: &StringArray<D>, pattern: &str) -> FerrayResult<Array<bool, Ix1>> {
21    let re = Regex::new(pattern)
22        .map_err(|e| FerrayError::invalid_value(format!("invalid regex pattern: {e}")))?;
23
24    let data: Vec<bool> = a.map_to_vec(|s| re.is_match(s));
25    let dim = Ix1::new([data.len()]);
26    Array::from_vec(dim, data)
27}
28
29/// Extract the first capture group from each string element.
30///
31/// For each string, finds the first match of the pattern and returns
32/// the contents of capture group 1. If there is no match or no capture
33/// group, an empty string is returned.
34///
35/// The pattern must contain at least one capture group `(...)`.
36///
37/// # Errors
38/// Returns `FerrayError::InvalidValue` if the regex pattern is invalid.
39/// Returns an error if the internal array construction fails.
40pub fn extract<D: Dimension>(a: &StringArray<D>, pattern: &str) -> FerrayResult<StringArray1> {
41    let re = Regex::new(pattern)
42        .map_err(|e| FerrayError::invalid_value(format!("invalid regex pattern: {e}")))?;
43
44    let data: Vec<String> = a
45        .iter()
46        .map(|s| {
47            re.captures(s)
48                .and_then(|caps| caps.get(1))
49                .map(|m| m.as_str().to_string())
50                .unwrap_or_default()
51        })
52        .collect();
53
54    let dim = Ix1::new([data.len()]);
55    StringArray1::from_vec(dim, data)
56}
57
#[cfg(test)]
mod tests {
    // Unit tests for `match_` and `extract`, covering plain matches,
    // anchored patterns, invalid patterns, and the capture-group cases.
    use super::*;
    use crate::string_array::array;

    /// An unanchored pattern matches anywhere inside the element.
    #[test]
    fn test_match_basic() {
        let a = array(&["hello123", "world", "foo42"]).unwrap();
        let result = match_(&a, r"\d+").unwrap();
        let data = result.as_slice().unwrap();
        assert_eq!(data, &[true, false, true]);
    }

    /// `^...$` anchors restrict the match to the whole element.
    #[test]
    fn test_match_full_pattern() {
        let a = array(&["abc", "def", "abcdef"]).unwrap();
        let result = match_(&a, r"^abc$").unwrap();
        let data = result.as_slice().unwrap();
        assert_eq!(data, &[true, false, false]);
    }

    /// A pattern that fails to compile surfaces as an error from `match_`.
    #[test]
    fn test_match_invalid_regex() {
        let a = array(&["hello"]).unwrap();
        let result = match_(&a, r"[invalid");
        assert!(result.is_err());
    }

    /// Group 1 is returned on a match; non-matching elements give "".
    #[test]
    fn test_extract_capture_group() {
        let a = array(&["hello123world", "foo42bar", "nodigits"]).unwrap();
        let result = extract(&a, r"(\d+)").unwrap();
        assert_eq!(result.as_slice(), &["123", "42", ""]);
    }

    /// A named group is still addressable as group 1, so `extract` returns
    /// its contents just like an unnamed group.
    #[test]
    fn test_extract_named_group() {
        let a = array(&["user:alice", "user:bob", "invalid"]).unwrap();
        let result = extract(&a, r"user:(?P<name>\w+)").unwrap();
        assert_eq!(result.as_slice(), &["alice", "bob", ""]);
    }

    /// A pattern without any capture group is accepted and yields an empty
    /// string for every element, even when the pattern itself matches.
    #[test]
    fn test_extract_without_capture_group() {
        let a = array(&["abc123", "def"]).unwrap();
        let result = extract(&a, r"\d+").unwrap();
        assert_eq!(result.as_slice(), &["", ""]);
    }

    /// No match at all gives an empty string.
    #[test]
    fn test_extract_no_match() {
        let a = array(&["no match here"]).unwrap();
        let result = extract(&a, r"(\d+)").unwrap();
        assert_eq!(result.as_slice(), &[""]);
    }

    /// A pattern that fails to compile surfaces as an error from `extract`.
    #[test]
    fn test_extract_invalid_regex() {
        let a = array(&["hello"]).unwrap();
        let result = extract(&a, r"[invalid");
        assert!(result.is_err());
    }

    #[test]
    fn test_match_and_extract_ac5() {
        // AC-5: Regex match_ and extract work correctly with capture groups
        let a = array(&["abc123", "def", "ghi456"]).unwrap();

        let matched = match_(&a, r"\d+").unwrap();
        let matched_data = matched.as_slice().unwrap();
        assert_eq!(matched_data, &[true, false, true]);

        // Only the first of the two groups is returned.
        let extracted = extract(&a, r"([a-z]+)(\d+)").unwrap();
        // "def" has no digits, so no match => empty string
        assert_eq!(extracted.as_slice(), &["abc", "", "ghi"]);
    }

    /// An empty element cannot match a non-empty literal and gives "".
    #[test]
    fn test_extract_empty_string() {
        let a = array(&["", "abc"]).unwrap();
        let result = extract(&a, r"(abc)").unwrap();
        assert_eq!(result.as_slice(), &["", "abc"]);
    }
}