ferray_strings/search.rs
1// ferray-strings: Search operations (REQ-8, REQ-9, REQ-10)
2//
3// Implements find, rfind, index, rindex, count, startswith, endswith, replace —
4// elementwise on StringArray.
5//
6// ## REQ status
7//
8// SHIPPED:
9// - REQ-8 replace — `replace` (`pub fn`): elementwise substring
10// replacement with an optional count, matching `numpy.strings.replace`.
11// - REQ-9 search predicates — `startswith`, `endswith` (`pub fn`) ->
12// `Array<bool, D>`, matching `numpy.strings.startswith`/`endswith`.
13// - REQ-10 search indices — `find`, `count` (`pub fn`) -> `Array<i64, D>`
14// (`find` returns `-1` when absent; both return **character** indices /
15// counts, the published `numpy.strings.find`/`count` contract writing a
16// signed `npy_intp`). The full search family `rfind`, `index`, `rindex`
17// ships alongside (`index`/`rindex` raise `ValueError` when `sub` is
18// absent, like CPython `str.index`/`rindex`). Audited NO DIVERGENCE.
19//
20// Consumers (non-test): re-exported from the crate root
21// (`ferray-strings/src/lib.rs` `pub use search::{count, endswith, find,
22// index, replace, rfind, rindex, startswith}`) and bound at the Python
23// surface by the `#[pyfunction]` shims `count`, `find`, `startswith`,
24// `endswith`, `replace`, `rfind` in `ferray-python/src/char.rs` (each
25// calling the matching `fs::` function); the `index`/`rindex` shims build on
26// `fs::find`/`fs::rfind`. These back `numpy.char`/`numpy.strings`.
27
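// The index- and count-returning functions (`find`, `rfind`, `index`,
// `rindex`, `count`) produce `i64` arrays, following NumPy's signed
// `npy_intp` contract (with `-1` meaning "not found" for `find`/`rfind`).
// Converting the `usize` character count to `i64` is therefore the published
// return type, not a precision bug.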
31#![allow(clippy::cast_possible_wrap)]
32
33use ferray_core::Array;
34use ferray_core::dimension::Dimension;
35use ferray_core::error::{FerrayError, FerrayResult};
36
37use crate::string_array::StringArray;
38
39/// Find the lowest index of `sub` in each string element.
40///
41/// Returns an `Array<i64, D>` preserving the input shape, where each element
42/// is the index of the first occurrence of `sub`, or -1 if not found.
43///
44/// # Errors
45/// Returns an error if the internal array construction fails.
46pub fn find<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
47 let data: Vec<i64> = a.map_to_vec(|s| {
48 match s.find(sub) {
49 Some(byte_idx) => {
50 // Convert byte index to character index
51 s[..byte_idx].chars().count() as i64
52 }
53 None => -1,
54 }
55 });
56 Array::from_vec(a.dim().clone(), data)
57}
58
59/// Count non-overlapping occurrences of `sub` in each string element.
60///
61/// Returns an `Array<i64, D>` preserving the input shape. numpy's `count`
62/// ufunc writes a signed `npy_intp` (int64) result, so the element type is
63/// signed `i64`.
64///
65/// # Errors
66/// Returns an error if the internal array construction fails.
67pub fn count<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
68 let data: Vec<i64> = a.map_to_vec(|s| s.matches(sub).count() as i64);
69 Array::from_vec(a.dim().clone(), data)
70}
71
72/// Test whether each string element starts with the given prefix.
73///
74/// Returns an `Array<bool, D>` preserving the input shape.
75///
76/// # Errors
77/// Returns an error if the internal array construction fails.
78pub fn startswith<D: Dimension>(a: &StringArray<D>, prefix: &str) -> FerrayResult<Array<bool, D>> {
79 let data: Vec<bool> = a.map_to_vec(|s| s.starts_with(prefix));
80 Array::from_vec(a.dim().clone(), data)
81}
82
83/// Test whether each string element ends with the given suffix.
84///
85/// Returns an `Array<bool, D>` preserving the input shape.
86///
87/// # Errors
88/// Returns an error if the internal array construction fails.
89pub fn endswith<D: Dimension>(a: &StringArray<D>, suffix: &str) -> FerrayResult<Array<bool, D>> {
90 let data: Vec<bool> = a.map_to_vec(|s| s.ends_with(suffix));
91 Array::from_vec(a.dim().clone(), data)
92}
93
94/// Replace occurrences of `old` with `new` in each string element.
95///
96/// If `max_count` is `Some(n)`, only the first `n` occurrences are replaced.
97/// If `None`, all occurrences are replaced.
98///
99/// # Errors
100/// Returns an error if the internal array construction fails.
101pub fn replace<D: Dimension>(
102 a: &StringArray<D>,
103 old: &str,
104 new: &str,
105 max_count: Option<usize>,
106) -> FerrayResult<StringArray<D>> {
107 a.map(|s| match max_count {
108 None => s.replace(old, new),
109 Some(n) => s.replacen(old, new, n),
110 })
111}
112
113/// Find the highest character index of `sub` in each string element.
114///
115/// Returns `-1` for elements where `sub` is not found. Mirrors
116/// `numpy.strings.rfind`.
117///
118/// # Errors
119/// Returns an error if the internal array construction fails.
120pub fn rfind<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
121 let data: Vec<i64> = a.map_to_vec(|s| match s.rfind(sub) {
122 Some(byte_idx) => s[..byte_idx].chars().count() as i64,
123 None => -1,
124 });
125 Array::from_vec(a.dim().clone(), data)
126}
127
128/// Find the lowest character index of `sub` in each string element, raising
129/// when any element does not contain `sub`.
130///
131/// Mirrors `numpy.strings.index`. The first miss surfaces as
132/// `FerrayError::InvalidValue`.
133///
134/// # Errors
135/// - `FerrayError::InvalidValue` if any element does not contain `sub`.
136pub fn index<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
137 let mut data = Vec::with_capacity(a.iter().len());
138 for s in a.iter() {
139 match s.find(sub) {
140 Some(byte_idx) => data.push(s[..byte_idx].chars().count() as i64),
141 None => {
142 return Err(FerrayError::invalid_value(format!(
143 "index: substring {sub:?} not found in element {s:?}"
144 )));
145 }
146 }
147 }
148 Array::from_vec(a.dim().clone(), data)
149}
150
151/// Find the highest character index of `sub` in each string element, raising
152/// when any element does not contain `sub`.
153///
154/// Mirrors `numpy.strings.rindex`.
155///
156/// # Errors
157/// - `FerrayError::InvalidValue` if any element does not contain `sub`.
158pub fn rindex<D: Dimension>(a: &StringArray<D>, sub: &str) -> FerrayResult<Array<i64, D>> {
159 let mut data = Vec::with_capacity(a.iter().len());
160 for s in a.iter() {
161 match s.rfind(sub) {
162 Some(byte_idx) => data.push(s[..byte_idx].chars().count() as i64),
163 None => {
164 return Err(FerrayError::invalid_value(format!(
165 "rindex: substring {sub:?} not found in element {s:?}"
166 )));
167 }
168 }
169 }
170 Array::from_vec(a.dim().clone(), data)
171}
172
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    /// `find` reports the index of the first match and `-1` on a miss.
    #[test]
    fn test_find() {
        let input = array(&["hello", "world", "bell"]).unwrap();
        let found = find(&input, "ll").unwrap();
        assert_eq!(found.as_slice().unwrap(), &[2, -1, 2]);
    }

    /// A match at the very beginning of an element yields index `0`.
    #[test]
    fn test_find_at_start() {
        let input = array(&["abc", "def"]).unwrap();
        let found = find(&input, "abc").unwrap();
        assert_eq!(found.as_slice().unwrap(), &[0, -1]);
    }

    /// An empty needle is found at index `0`.
    #[test]
    fn test_find_empty_sub() {
        let input = array(&["hello"]).unwrap();
        let found = find(&input, "").unwrap();
        assert_eq!(found.as_slice().unwrap(), &[0]);
    }

    /// `count` tallies occurrences per element.
    #[test]
    fn test_count() {
        let input = array(&["abcabc", "abc", "xyz"]).unwrap();
        let tallies = count(&input, "abc").unwrap();
        assert_eq!(tallies.as_slice().unwrap(), &[2_i64, 1, 0]);
    }

    /// `startswith` flags the elements that begin with the prefix.
    #[test]
    fn test_startswith() {
        let input = array(&["hello", "world", "help"]).unwrap();
        let flags = startswith(&input, "hel").unwrap();
        assert_eq!(flags.as_slice().unwrap(), &[true, false, true]);
    }

    /// `endswith` flags the elements that finish with the suffix.
    #[test]
    fn test_endswith() {
        let input = array(&["hello", "world", "bello"]).unwrap();
        let flags = endswith(&input, "llo").unwrap();
        assert_eq!(flags.as_slice().unwrap(), &[true, false, true]);
    }

    /// With no limit, every occurrence is replaced.
    #[test]
    fn test_replace_all() {
        let input = array(&["aabbcc", "aabba"]).unwrap();
        let replaced = replace(&input, "aa", "XX", None).unwrap();
        assert_eq!(replaced.as_slice(), &["XXbbcc", "XXbba"]);
    }

    /// With a limit of two, only the first two occurrences are replaced.
    #[test]
    fn test_replace_with_count() {
        let input = array(&["ababab"]).unwrap();
        let replaced = replace(&input, "ab", "X", Some(2)).unwrap();
        assert_eq!(replaced.as_slice(), &["XXab"]);
    }

    /// AC-3: `find(&a, "ll")` returns 2 for "hello" and -1 for "world".
    #[test]
    fn test_find_ac3() {
        let input = array(&["hello", "world"]).unwrap();
        let found = find(&input, "ll").unwrap();
        assert_eq!(found.as_slice().unwrap(), &[2, -1]);
    }
}