Skip to main content

ferray_strings/
search.rs

1// ferray-strings: Search operations (REQ-8, REQ-9, REQ-10)
2//
3// Implements find, rfind, index, rindex, count, startswith, endswith, replace —
4// elementwise on StringArray.
5
// `find`, `rfind`, `index` and `rindex` return `i64` arrays following NumPy's
// `numpy.strings` contract (`find`/`rfind` use `-1` for "not found");
// converting the `usize` char count to `i64` is the published return type,
// not a precision bug.
9#![allow(clippy::cast_possible_wrap)]
10
11use ferray_core::Array;
12use ferray_core::dimension::Dimension;
13use ferray_core::error::{FerrayError, FerrayResult};
14
15use crate::string_array::StringArray;
16
17/// Find the lowest index of `sub` in each string element.
18///
19/// Returns an `Array<i64, D>` preserving the input shape, where each element
20/// is the index of the first occurrence of `sub`, or -1 if not found.
21///
22/// # Errors
23/// Returns an error if the internal array construction fails.
24pub fn find<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
25    let data: Vec<i64> = a.map_to_vec(|s| {
26        match s.find(sub) {
27            Some(byte_idx) => {
28                // Convert byte index to character index
29                s[..byte_idx].chars().count() as i64
30            }
31            None => -1,
32        }
33    });
34    Array::from_vec(a.dim().clone(), data)
35}
36
37/// Count non-overlapping occurrences of `sub` in each string element.
38///
39/// Returns an `Array<u64, D>` preserving the input shape.
40///
41/// # Errors
42/// Returns an error if the internal array construction fails.
43pub fn count<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<u64, D>> {
44    let data: Vec<u64> = a.map_to_vec(|s| s.matches(sub).count() as u64);
45    Array::from_vec(a.dim().clone(), data)
46}
47
48/// Test whether each string element starts with the given prefix.
49///
50/// Returns an `Array<bool, D>` preserving the input shape.
51///
52/// # Errors
53/// Returns an error if the internal array construction fails.
54pub fn startswith<D: Dimension>(a: &StringArray<D>, prefix: &str) -> FerrayResult<Array<bool, D>> {
55    let data: Vec<bool> = a.map_to_vec(|s| s.starts_with(prefix));
56    Array::from_vec(a.dim().clone(), data)
57}
58
59/// Test whether each string element ends with the given suffix.
60///
61/// Returns an `Array<bool, D>` preserving the input shape.
62///
63/// # Errors
64/// Returns an error if the internal array construction fails.
65pub fn endswith<D: Dimension>(a: &StringArray<D>, suffix: &str) -> FerrayResult<Array<bool, D>> {
66    let data: Vec<bool> = a.map_to_vec(|s| s.ends_with(suffix));
67    Array::from_vec(a.dim().clone(), data)
68}
69
70/// Replace occurrences of `old` with `new` in each string element.
71///
72/// If `max_count` is `Some(n)`, only the first `n` occurrences are replaced.
73/// If `None`, all occurrences are replaced.
74///
75/// # Errors
76/// Returns an error if the internal array construction fails.
77pub fn replace<D: Dimension>(
78    a: &StringArray<D>,
79    old: &str,
80    new: &str,
81    max_count: Option<usize>,
82) -> FerrayResult<StringArray<D>> {
83    a.map(|s| match max_count {
84        None => s.replace(old, new),
85        Some(n) => s.replacen(old, new, n),
86    })
87}
88
89/// Find the highest character index of `sub` in each string element.
90///
91/// Returns `-1` for elements where `sub` is not found. Mirrors
92/// `numpy.strings.rfind`.
93///
94/// # Errors
95/// Returns an error if the internal array construction fails.
96pub fn rfind<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
97    let data: Vec<i64> = a.map_to_vec(|s| match s.rfind(sub) {
98        Some(byte_idx) => s[..byte_idx].chars().count() as i64,
99        None => -1,
100    });
101    Array::from_vec(a.dim().clone(), data)
102}
103
104/// Find the lowest character index of `sub` in each string element, raising
105/// when any element does not contain `sub`.
106///
107/// Mirrors `numpy.strings.index`. The first miss surfaces as
108/// `FerrayError::InvalidValue`.
109///
110/// # Errors
111/// - `FerrayError::InvalidValue` if any element does not contain `sub`.
112pub fn index<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
113    let mut data = Vec::with_capacity(a.iter().len());
114    for s in a.iter() {
115        match s.find(sub) {
116            Some(byte_idx) => data.push(s[..byte_idx].chars().count() as i64),
117            None => {
118                return Err(FerrayError::invalid_value(format!(
119                    "index: substring {sub:?} not found in element {s:?}"
120                )));
121            }
122        }
123    }
124    Array::from_vec(a.dim().clone(), data)
125}
126
127/// Find the highest character index of `sub` in each string element, raising
128/// when any element does not contain `sub`.
129///
130/// Mirrors `numpy.strings.rindex`.
131///
132/// # Errors
133/// - `FerrayError::InvalidValue` if any element does not contain `sub`.
134pub fn rindex<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
135    let mut data = Vec::with_capacity(a.iter().len());
136    for s in a.iter() {
137        match s.rfind(sub) {
138            Some(byte_idx) => data.push(s[..byte_idx].chars().count() as i64),
139            None => {
140                return Err(FerrayError::invalid_value(format!(
141                    "rindex: substring {sub:?} not found in element {s:?}"
142                )));
143            }
144        }
145    }
146    Array::from_vec(a.dim().clone(), data)
147}
148
/// Unit tests for the elementwise search operations.
///
/// Besides the basic ASCII cases, these cover `rfind`, `index`, `rindex`,
/// and multi-byte input, where byte offsets and character offsets differ.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    #[test]
    fn test_find() {
        let a = array(&["hello", "world", "bell"]).unwrap();
        let b = find(&a, "ll").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2, -1, 2]);
    }

    #[test]
    fn test_find_at_start() {
        let a = array(&["abc", "def"]).unwrap();
        let b = find(&a, "abc").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[0, -1]);
    }

    #[test]
    fn test_find_empty_sub() {
        let a = array(&["hello"]).unwrap();
        let b = find(&a, "").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[0]);
    }

    #[test]
    fn test_find_multibyte() {
        // 'é' occupies two bytes, so the byte offset of 'l' is 3 while its
        // character offset is 2; the result must be the character offset.
        let a = array(&["héllo", "日本語"]).unwrap();
        let b = find(&a, "l").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2, -1]);
    }

    #[test]
    fn test_count() {
        let a = array(&["abcabc", "abc", "xyz"]).unwrap();
        let b = count(&a, "abc").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2_u64, 1, 0]);
    }

    #[test]
    fn test_count_non_overlapping() {
        // "aaaa" contains "aa" at offsets 0 and 2 when matches may not overlap.
        let a = array(&["aaaa"]).unwrap();
        let b = count(&a, "aa").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2_u64]);
    }

    #[test]
    fn test_startswith() {
        let a = array(&["hello", "world", "help"]).unwrap();
        let b = startswith(&a, "hel").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[true, false, true]);
    }

    #[test]
    fn test_endswith() {
        let a = array(&["hello", "world", "bello"]).unwrap();
        let b = endswith(&a, "llo").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[true, false, true]);
    }

    #[test]
    fn test_replace_all() {
        let a = array(&["aabbcc", "aabba"]).unwrap();
        let b = replace(&a, "aa", "XX", None).unwrap();
        assert_eq!(b.as_slice(), &["XXbbcc", "XXbba"]);
    }

    #[test]
    fn test_replace_with_count() {
        let a = array(&["ababab"]).unwrap();
        let b = replace(&a, "ab", "X", Some(2)).unwrap();
        assert_eq!(b.as_slice(), &["XXab"]);
    }

    #[test]
    fn test_rfind() {
        let a = array(&["hello", "world", "abc"]).unwrap();
        let b = rfind(&a, "l").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[3, 3, -1]);
    }

    #[test]
    fn test_rfind_multibyte() {
        // Characters: é, a, é, a — the last 'a' is character 3 (byte 5).
        let a = array(&["éaéa"]).unwrap();
        let b = rfind(&a, "a").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[3]);
    }

    #[test]
    fn test_index_found() {
        let a = array(&["hello", "bell"]).unwrap();
        let b = index(&a, "ll").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2, 2]);
    }

    #[test]
    fn test_index_missing_is_error() {
        // "world" has no "ll", so the whole call must fail.
        let a = array(&["hello", "world"]).unwrap();
        assert!(index(&a, "ll").is_err());
    }

    #[test]
    fn test_rindex_found() {
        let a = array(&["hello", "bell"]).unwrap();
        let b = rindex(&a, "l").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[3, 3]);
    }

    #[test]
    fn test_rindex_missing_is_error() {
        // "world" has no "ll", so the whole call must fail.
        let a = array(&["hello", "world"]).unwrap();
        assert!(rindex(&a, "ll").is_err());
    }

    #[test]
    fn test_find_ac3() {
        // AC-3: strings::find(&a, "ll") returns correct indices (2 for "hello", -1 for "world")
        let a = array(&["hello", "world"]).unwrap();
        let b = find(&a, "ll").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2, -1]);
    }
}