Skip to main content

ferray_strings/
search.rs

1// ferray-strings: Search operations (REQ-8, REQ-9, REQ-10)
2//
3// Implements find, rfind, index, rindex, count, startswith, endswith, replace —
4// elementwise on StringArray.
5//
6// ## REQ status
7//
8// SHIPPED:
9//   - REQ-8 replace — `replace` (`pub fn`): elementwise substring
10//     replacement with an optional count, matching `numpy.strings.replace`.
11//   - REQ-9 search predicates — `startswith`, `endswith` (`pub fn`) ->
12//     `Array<bool, D>`, matching `numpy.strings.startswith`/`endswith`.
13//   - REQ-10 search indices — `find`, `count` (`pub fn`) -> `Array<i64, D>`
14//     (`find` returns `-1` when absent; both return **character** indices /
15//     counts, the published `numpy.strings.find`/`count` contract writing a
16//     signed `npy_intp`). The full search family `rfind`, `index`, `rindex`
17//     ships alongside (`index`/`rindex` raise `ValueError` when `sub` is
18//     absent, like CPython `str.index`/`rindex`). Audited NO DIVERGENCE.
19//
20// Consumers (non-test): re-exported from the crate root
21// (`ferray-strings/src/lib.rs` `pub use search::{count, endswith, find,
22// index, replace, rfind, rindex, startswith}`) and bound at the Python
23// surface by the `#[pyfunction]` shims `count`, `find`, `startswith`,
24// `endswith`, `replace`, `rfind` in `ferray-python/src/char.rs` (each
25// calling the matching `fs::` function); the `index`/`rindex` shims build on
26// `fs::find`/`fs::rfind`. These back `numpy.char`/`numpy.strings`.
27
// `find`, `rfind`, `index`, `rindex` and `count` return `i64` arrays
// following NumPy's `numpy.strings` contract (with `-1` for "not found" in
// `find`/`rfind`); converting the `usize` char index or count to `i64` is the
// published return type, not a precision bug.
31#![allow(clippy::cast_possible_wrap)]
32
33use ferray_core::Array;
34use ferray_core::dimension::Dimension;
35use ferray_core::error::{FerrayError, FerrayResult};
36
37use crate::string_array::StringArray;
38
39/// Find the lowest index of `sub` in each string element.
40///
41/// Returns an `Array<i64, D>` preserving the input shape, where each element
42/// is the index of the first occurrence of `sub`, or -1 if not found.
43///
44/// # Errors
45/// Returns an error if the internal array construction fails.
46pub fn find<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
47    let data: Vec<i64> = a.map_to_vec(|s| {
48        match s.find(sub) {
49            Some(byte_idx) => {
50                // Convert byte index to character index
51                s[..byte_idx].chars().count() as i64
52            }
53            None => -1,
54        }
55    });
56    Array::from_vec(a.dim().clone(), data)
57}
58
59/// Count non-overlapping occurrences of `sub` in each string element.
60///
61/// Returns an `Array<i64, D>` preserving the input shape. numpy's `count`
62/// ufunc writes a signed `npy_intp` (int64) result, so the element type is
63/// signed `i64`.
64///
65/// # Errors
66/// Returns an error if the internal array construction fails.
67pub fn count<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
68    let data: Vec<i64> = a.map_to_vec(|s| s.matches(sub).count() as i64);
69    Array::from_vec(a.dim().clone(), data)
70}
71
72/// Test whether each string element starts with the given prefix.
73///
74/// Returns an `Array<bool, D>` preserving the input shape.
75///
76/// # Errors
77/// Returns an error if the internal array construction fails.
78pub fn startswith<D: Dimension>(a: &StringArray<D>, prefix: &str) -> FerrayResult<Array<bool, D>> {
79    let data: Vec<bool> = a.map_to_vec(|s| s.starts_with(prefix));
80    Array::from_vec(a.dim().clone(), data)
81}
82
83/// Test whether each string element ends with the given suffix.
84///
85/// Returns an `Array<bool, D>` preserving the input shape.
86///
87/// # Errors
88/// Returns an error if the internal array construction fails.
89pub fn endswith<D: Dimension>(a: &StringArray<D>, suffix: &str) -> FerrayResult<Array<bool, D>> {
90    let data: Vec<bool> = a.map_to_vec(|s| s.ends_with(suffix));
91    Array::from_vec(a.dim().clone(), data)
92}
93
94/// Replace occurrences of `old` with `new` in each string element.
95///
96/// If `max_count` is `Some(n)`, only the first `n` occurrences are replaced.
97/// If `None`, all occurrences are replaced.
98///
99/// # Errors
100/// Returns an error if the internal array construction fails.
101pub fn replace<D: Dimension>(
102    a: &StringArray<D>,
103    old: &str,
104    new: &str,
105    max_count: Option<usize>,
106) -> FerrayResult<StringArray<D>> {
107    a.map(|s| match max_count {
108        None => s.replace(old, new),
109        Some(n) => s.replacen(old, new, n),
110    })
111}
112
113/// Find the highest character index of `sub` in each string element.
114///
115/// Returns `-1` for elements where `sub` is not found. Mirrors
116/// `numpy.strings.rfind`.
117///
118/// # Errors
119/// Returns an error if the internal array construction fails.
120pub fn rfind<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
121    let data: Vec<i64> = a.map_to_vec(|s| match s.rfind(sub) {
122        Some(byte_idx) => s[..byte_idx].chars().count() as i64,
123        None => -1,
124    });
125    Array::from_vec(a.dim().clone(), data)
126}
127
128/// Find the lowest character index of `sub` in each string element, raising
129/// when any element does not contain `sub`.
130///
131/// Mirrors `numpy.strings.index`. The first miss surfaces as
132/// `FerrayError::InvalidValue`.
133///
134/// # Errors
135/// - `FerrayError::InvalidValue` if any element does not contain `sub`.
136pub fn index<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
137    let mut data = Vec::with_capacity(a.iter().len());
138    for s in a.iter() {
139        match s.find(sub) {
140            Some(byte_idx) => data.push(s[..byte_idx].chars().count() as i64),
141            None => {
142                return Err(FerrayError::invalid_value(format!(
143                    "index: substring {sub:?} not found in element {s:?}"
144                )));
145            }
146        }
147    }
148    Array::from_vec(a.dim().clone(), data)
149}
150
151/// Find the highest character index of `sub` in each string element, raising
152/// when any element does not contain `sub`.
153///
154/// Mirrors `numpy.strings.rindex`.
155///
156/// # Errors
157/// - `FerrayError::InvalidValue` if any element does not contain `sub`.
158pub fn rindex<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
159    let mut data = Vec::with_capacity(a.iter().len());
160    for s in a.iter() {
161        match s.rfind(sub) {
162            Some(byte_idx) => data.push(s[..byte_idx].chars().count() as i64),
163            None => {
164                return Err(FerrayError::invalid_value(format!(
165                    "rindex: substring {sub:?} not found in element {s:?}"
166                )));
167            }
168        }
169    }
170    Array::from_vec(a.dim().clone(), data)
171}
172
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    // First-match position per element, with -1 marking a miss.
    #[test]
    fn test_find() {
        let words = array(&["hello", "world", "bell"]).unwrap();
        let positions = find(&words, "ll").unwrap();
        assert_eq!(positions.as_slice().unwrap(), &[2, -1, 2]);
    }

    // A match that covers the whole element is reported at index 0.
    #[test]
    fn test_find_at_start() {
        let words = array(&["abc", "def"]).unwrap();
        let positions = find(&words, "abc").unwrap();
        assert_eq!(positions.as_slice().unwrap(), &[0, -1]);
    }

    // The empty substring is found at the very beginning.
    #[test]
    fn test_find_empty_sub() {
        let words = array(&["hello"]).unwrap();
        let positions = find(&words, "").unwrap();
        assert_eq!(positions.as_slice().unwrap(), &[0]);
    }

    // Non-overlapping occurrence counts come back as signed 64-bit integers.
    #[test]
    fn test_count() {
        let words = array(&["abcabc", "abc", "xyz"]).unwrap();
        let tallies = count(&words, "abc").unwrap();
        assert_eq!(tallies.as_slice().unwrap(), &[2_i64, 1, 0]);
    }

    // Prefix predicate, one flag per element.
    #[test]
    fn test_startswith() {
        let words = array(&["hello", "world", "help"]).unwrap();
        let flags = startswith(&words, "hel").unwrap();
        assert_eq!(flags.as_slice().unwrap(), &[true, false, true]);
    }

    // Suffix predicate, one flag per element.
    #[test]
    fn test_endswith() {
        let words = array(&["hello", "world", "bello"]).unwrap();
        let flags = endswith(&words, "llo").unwrap();
        assert_eq!(flags.as_slice().unwrap(), &[true, false, true]);
    }

    // With no limit, every occurrence is substituted.
    #[test]
    fn test_replace_all() {
        let words = array(&["aabbcc", "aabba"]).unwrap();
        let replaced = replace(&words, "aa", "XX", None).unwrap();
        assert_eq!(replaced.as_slice(), &["XXbbcc", "XXbba"]);
    }

    // With a limit of two, the third occurrence is left untouched.
    #[test]
    fn test_replace_with_count() {
        let words = array(&["ababab"]).unwrap();
        let replaced = replace(&words, "ab", "X", Some(2)).unwrap();
        assert_eq!(replaced.as_slice(), &["XXab"]);
    }

    #[test]
    fn test_find_ac3() {
        // AC-3: strings::find(&a, "ll") returns correct indices (2 for "hello", -1 for "world")
        let words = array(&["hello", "world"]).unwrap();
        let positions = find(&words, "ll").unwrap();
        assert_eq!(positions.as_slice().unwrap(), &[2, -1]);
    }
}