Skip to main content

ferray_strings/
lib.rs

1// ferray-strings: Vectorized string operations on arrays of strings
2//
3// Implements `numpy.strings` (NumPy 2.0+): vectorized elementwise string
4// operations on arrays of strings with broadcasting. Covers case manipulation,
5// alignment/padding, stripping, find/replace, splitting/joining, and regex
6// support. Operates on `StringArray` — a separate array type backed by
7// `Vec<String>`.
8//
9// ## REQ status
10//
11// This crate root is the re-export / namespace-registration surface: it
12// re-exports every operation under a flat namespace (mirroring
13// `numpy.strings.upper`, `numpy.strings.find`, ...) and is itself the
14// production consumer that the `ferray-python` `#[pyfunction]` shims import
15// (`use ferray_strings as fs;` in `ferray-python/src/char.rs`). Per-op REQ
16// evidence lives in each module's own `## REQ status` block; this block
17// records which design-doc REQ each `pub use` here satisfies.
18//
19// SHIPPED (all re-exported below; module impl + Python consumer cited in
20// the named module's `## REQ status`):
21//   - REQ-1/REQ-2 StringArray type + `array` constructor — `pub use
22//     string_array::{StringArray, StringArray1, StringArray2, array}`.
23//   - REQ-3/REQ-4 concat/repeat — `pub use concat::{add, add_same,
24//     multiply}` (see `concat.rs`).
25//   - REQ-5 case — `pub use case::{capitalize, lower, title, upper}`
26//     (see `case.rs`).
27//   - REQ-6 alignment — `pub use align::{center, ljust, ljust_with, rjust,
28//     rjust_with, zfill}` (see `align.rs`).
29//   - REQ-7 stripping — `pub use strip::{lstrip, rstrip, strip}`
30//     (see `strip.rs`).
31//   - REQ-8/REQ-9/REQ-10 replace + search predicates/indices — `pub use
32//     search::{count, endswith, find, index, replace, rfind, rindex,
33//     startswith}` (see `search.rs`).
34//   - REQ-11 split/join — `pub use split_join::{join, join_array, rsplit,
35//     split, split_ragged, splitlines}` (see `split_join.rs`).
36//   - REQ-12/REQ-13 regex match/extract — `pub use regex_ops::{extract,
37//     extract_compiled, match_, match_compiled}` (see `regex_ops.rs`).
38//   - REQ-14 classification — `pub use classify::{isalnum, isalpha,
39//     isdecimal, isdigit, islower, isnumeric, isspace, istitle, isupper}`
40//     (see `classify.rs`).
41//   - Extras (#515/#516/#518) — `pub use str_ops::{equal, greater,
42//     greater_equal, less, less_equal, not_equal, str_len, swapcase}`
43//     and `pub use extras::{decode, encode, expandtabs, mod_, partition,
44//     rpartition, slice, translate}`.
45
46//! # ferray-strings
47//!
48//! Vectorized string operations on arrays of strings, analogous to
49//! `numpy.strings` in `NumPy` 2.0+.
50//!
51//! The primary type is [`StringArray`], a specialized N-dimensional array
52//! backed by `Vec<String>`. Since `String` does not implement
53//! [`ferray_core::Element`], this type is separate from `NdArray<T, D>`.
54//!
55//! # Quick Start
56//!
57//! ```ignore
58//! use ferray_strings::*;
59//!
60//! let a = array(&["hello", "world"]).unwrap();
61//! let b = upper(&a).unwrap();
62//! assert_eq!(b.as_slice(), &["HELLO", "WORLD"]);
63//! ```
64
65// Workspace convention: every public function returns FerrayResult<T> and
66// the FerrayError variants are documented once on the type, not on every
67// returning function.
68#![allow(
69    clippy::missing_errors_doc,
70    clippy::missing_panics_doc,
71    clippy::many_single_char_names,
72    clippy::similar_names,
73    clippy::items_after_statements,
74    clippy::option_if_let_else,
75    clippy::too_long_first_doc_paragraph,
76    clippy::needless_pass_by_value,
77    clippy::match_same_arms
78)]
79
80pub mod align;
81pub mod case;
82pub mod classify;
83#[cfg(feature = "compact-storage")]
84pub mod compact;
85pub mod concat;
86pub mod extras;
87pub mod regex_ops;
88pub mod search;
89pub mod serde_impl;
90pub mod split_join;
91pub mod str_ops;
92pub mod string_array;
93pub mod strip;
94
95// Re-export types
96pub use string_array::{StringArray, StringArray1, StringArray2, array};
97
98// Compact (Arrow-style) backend prototype (#736).
99#[cfg(feature = "compact-storage")]
100pub use compact::{CompactStringArray, CompactStringIter, estimated_string_array_bytes};
101
102// Re-export operations for flat namespace (like numpy.strings.upper etc.)
103pub use align::{center, ljust, ljust_with, rjust, rjust_with, zfill};
104pub use case::{capitalize, lower, title, upper};
105pub use classify::{
106    isalnum, isalpha, isdecimal, isdigit, islower, isnumeric, isspace, istitle, isupper,
107};
108pub use concat::{add, add_same, multiply};
109pub use extras::{decode, encode, expandtabs, mod_, partition, rpartition, slice, translate};
110pub use regex_ops::{extract, extract_compiled, match_, match_compiled};
111// Re-export `regex::Regex` so callers of `match_compiled`/`extract_compiled`
112// don't have to add a direct `regex` dependency to construct one.
113pub use regex::Regex;
114pub use search::{count, endswith, find, index, replace, rfind, rindex, startswith};
115pub use split_join::{join, join_array, rsplit, split, split_ragged, splitlines};
116pub use str_ops::{equal, greater, greater_equal, less, less_equal, not_equal, str_len, swapcase};
117pub use strip::{lstrip, rstrip, strip};
118
#[cfg(test)]
mod integration_tests {
    //! Crate-level integration tests exercising the flat re-export surface.
    //!
    //! Everything is reached through `use super::*`, so these tests also
    //! confirm that each operation is actually re-exported from the crate
    //! root. Groups, in order: acceptance criteria (`ac*`), per-family smoke
    //! tests, N-D shape preservation (#520), Unicode handling, and empty
    //! inputs (#282).

    use super::*;

    // -----------------------------------------------------------------
    // Shared fixtures
    // -----------------------------------------------------------------

    /// Builds a 2x2 `StringArray2` whose rows are `[vals[0], vals[1]]` and
    /// `[vals[2], vals[3]]`.
    fn two_by_two(vals: &[&str; 4]) -> StringArray2 {
        StringArray2::from_rows(&[&[vals[0], vals[1]], &[vals[2], vals[3]]]).unwrap()
    }

    /// Builds a zero-length 1-D string array for the empty-input tests.
    fn empty_1d() -> StringArray1 {
        StringArray1::from_vec(ferray_core::dimension::Ix1::new([0]), vec![]).unwrap()
    }

    // -----------------------------------------------------------------
    // Acceptance criteria
    // -----------------------------------------------------------------

    #[test]
    fn ac1_upper() {
        // AC-1: strings::upper(&["hello", "world"]) produces ["HELLO", "WORLD"]
        let a = array(&["hello", "world"]).unwrap();
        let b = upper(&a).unwrap();
        assert_eq!(b.as_slice(), &["HELLO", "WORLD"]);
    }

    #[test]
    fn ac2_add_broadcast_scalar() {
        // AC-2: strings::add broadcasts a scalar string against an array correctly
        let a = array(&["hello", "world"]).unwrap();
        let b = array(&["!"]).unwrap();
        let c = add(&a, &b).unwrap();
        assert_eq!(c.as_slice(), &["hello!", "world!"]);
    }

    #[test]
    fn ac3_find_indices() {
        // AC-3: strings::find(&a, "ll") returns correct indices
        // (-1 marks "not found").
        let a = array(&["hello", "world"]).unwrap();
        let b = find(&a, "ll").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2_i64, -1_i64]);
    }

    #[test]
    fn ac4_split() {
        // AC-4 (#277): strings::split returns a 2-D StringArray of
        // shape (n_inputs, max_parts). split_ragged keeps the
        // ragged Vec<Vec<String>> form for callers that need it.
        let a = array(&["a-b", "c-d"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[2, 2]);
        assert_eq!(result.as_slice(), &["a", "b", "c", "d"]);
    }

    #[test]
    fn ac5_regex() {
        // AC-5: Regex match_ and extract work correctly with capture groups
        let a = array(&["abc123", "def", "ghi456"]).unwrap();

        let matched = match_(&a, r"\d+").unwrap();
        let matched_data = matched.as_slice().unwrap();
        assert_eq!(matched_data, &[true, false, true]);

        // A non-matching element yields an empty string.
        let extracted = extract(&a, r"(\d+)").unwrap();
        assert_eq!(extracted.as_slice(), &["123", "", "456"]);
    }

    // -----------------------------------------------------------------
    // Per-family smoke tests
    // -----------------------------------------------------------------

    #[test]
    fn full_pipeline() {
        // End-to-end: strip, upper, add suffix, search
        let raw = array(&["  Hello  ", " World "]).unwrap();
        let stripped = strip(&raw, None).unwrap();
        let uppered = upper(&stripped).unwrap();
        let suffix = array(&["!"]).unwrap();
        let result = add(&uppered, &suffix).unwrap();
        assert_eq!(result.as_slice(), &["HELLO!", "WORLD!"]);

        let has_excl = endswith(&result, "!").unwrap();
        let data = has_excl.as_slice().unwrap();
        assert_eq!(data, &[true, true]);
    }

    #[test]
    fn case_round_trip() {
        // lower followed by title restores an already title-cased input.
        let a = array(&["Hello World"]).unwrap();
        let low = lower(&a).unwrap();
        let titled = title(&low).unwrap();
        assert_eq!(titled.as_slice(), &["Hello World"]);
    }

    #[test]
    fn alignment_operations() {
        let a = array(&["hi"]).unwrap();
        let c = center(&a, 6, '-').unwrap();
        assert_eq!(c.as_slice(), &["--hi--"]);

        let l = ljust(&a, 6).unwrap();
        assert_eq!(l.as_slice(), &["hi    "]);

        let r = rjust(&a, 6).unwrap();
        assert_eq!(r.as_slice(), &["    hi"]);

        let z = zfill(&array(&["42"]).unwrap(), 5).unwrap();
        assert_eq!(z.as_slice(), &["00042"]);
    }

    #[test]
    fn strip_operations() {
        // `None` selects the default strip set; these cases only cover
        // leading/trailing spaces.
        let a = array(&["  hello  "]).unwrap();
        assert_eq!(strip(&a, None).unwrap().as_slice(), &["hello"]);
        assert_eq!(lstrip(&a, None).unwrap().as_slice(), &["hello  "]);
        assert_eq!(rstrip(&a, None).unwrap().as_slice(), &["  hello"]);
    }

    #[test]
    fn search_operations() {
        let a = array(&["hello world", "foo bar"]).unwrap();
        let c = count(&a, "o").unwrap();
        let data = c.as_slice().unwrap();
        // "hello world" has 2 'o's, "foo bar" has 2 'o's
        assert_eq!(data, &[2_i64, 2]);
    }

    #[test]
    fn replace_operation() {
        let a = array(&["hello world"]).unwrap();
        let b = replace(&a, "world", "rust", None).unwrap();
        assert_eq!(b.as_slice(), &["hello rust"]);
    }

    #[test]
    fn multiply_operation() {
        let a = array(&["ab"]).unwrap();
        let b = multiply(&a, 3).unwrap();
        assert_eq!(b.as_slice(), &["ababab"]);
    }

    #[test]
    fn join_operation() {
        // One output string per inner list of parts.
        let parts = vec![
            vec!["a".to_string(), "b".to_string()],
            vec!["c".to_string(), "d".to_string()],
        ];
        let result = join("-", &parts).unwrap();
        assert_eq!(result.as_slice(), &["a-b", "c-d"]);
    }

    #[test]
    fn capitalize_operation() {
        // First character upper-cased, the remainder lower-cased.
        let a = array(&["hello world", "RUST"]).unwrap();
        let b = capitalize(&a).unwrap();
        assert_eq!(b.as_slice(), &["Hello world", "Rust"]);
    }

    #[test]
    fn string_array_2d() {
        let a = StringArray2::from_rows(&[&["a", "b"], &["c", "d"]]).unwrap();
        assert_eq!(a.shape(), &[2, 2]);
        let b = upper(&a).unwrap();
        assert_eq!(b.as_slice(), &["A", "B", "C", "D"]);
        assert_eq!(b.shape(), &[2, 2]);
    }

    // -----------------------------------------------------------------
    // #520 — shape preservation for ND StringArrays across every op.
    //
    // Every operation that takes `StringArray<D>` must thread `D`
    // through the output. These tests pin that contract on a 2x2
    // input so regressions (e.g. an accidental `Ix1::new([len])` in
    // the output constructor) are caught immediately.
    // -----------------------------------------------------------------

    #[test]
    fn shape_preserved_case_ops_2d() {
        let a = two_by_two(&["Hello", "World", "foo", "Bar"]);
        assert_eq!(upper(&a).unwrap().shape(), &[2, 2]);
        assert_eq!(lower(&a).unwrap().shape(), &[2, 2]);
        assert_eq!(capitalize(&a).unwrap().shape(), &[2, 2]);
        assert_eq!(title(&a).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_align_ops_2d() {
        let a = two_by_two(&["a", "bb", "ccc", "dddd"]);
        assert_eq!(center(&a, 6, ' ').unwrap().shape(), &[2, 2]);
        assert_eq!(ljust(&a, 6).unwrap().shape(), &[2, 2]);
        assert_eq!(rjust(&a, 6).unwrap().shape(), &[2, 2]);
        assert_eq!(zfill(&a, 6).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_strip_ops_2d() {
        let a = two_by_two(&["  a  ", "  b  ", "  c  ", "  d  "]);
        assert_eq!(strip(&a, None).unwrap().shape(), &[2, 2]);
        assert_eq!(lstrip(&a, None).unwrap().shape(), &[2, 2]);
        assert_eq!(rstrip(&a, None).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_concat_ops_2d() {
        let a = two_by_two(&["ab", "cd", "ef", "gh"]);
        // `add` supports cross-rank broadcasting, so its result is
        // dynamically ranked (IxDyn) rather than carrying the input's
        // static `D`. The runtime shape must nevertheless be the broadcast
        // shape, which for two 2x2 operands is still 2x2.
        let b = two_by_two(&["!", "!", "!", "!"]);
        let ab = add(&a, &b).unwrap();
        assert_eq!(ab.shape(), &[2, 2]);
        // multiply preserves D
        assert_eq!(multiply(&a, 2).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_search_ops_2d() {
        let a = two_by_two(&["hello", "help", "world", "word"]);
        // Each of these returns Array<T, D> — verify D is preserved.
        assert_eq!(find(&a, "ell").unwrap().shape(), &[2, 2]);
        assert_eq!(count(&a, "l").unwrap().shape(), &[2, 2]);
        assert_eq!(startswith(&a, "he").unwrap().shape(), &[2, 2]);
        assert_eq!(endswith(&a, "d").unwrap().shape(), &[2, 2]);
        assert_eq!(replace(&a, "l", "L", None).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_regex_ops_2d() {
        let a = two_by_two(&["abc123", "x", "y42", "zzz"]);
        // match_ preserves D; extract flattens by design (ragged results),
        // so only match_ is checked here.
        assert_eq!(match_(&a, r"\d+").unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_case_ops_3d() {
        // Just to confirm we aren't special-casing Ix2 somewhere — bump to Ix3.
        use ferray_core::dimension::Ix3;
        let data: Vec<String> = (0..8).map(|i| format!("s{i}")).collect();
        let a = StringArray::<Ix3>::from_vec(Ix3::new([2, 2, 2]), data).unwrap();
        assert_eq!(upper(&a).unwrap().shape(), &[2, 2, 2]);
        assert_eq!(lower(&a).unwrap().shape(), &[2, 2, 2]);
    }

    // --- Unicode / multi-byte character tests ---

    #[test]
    fn unicode_upper_lower() {
        let a = array(&["café", "naïve", "über"]).unwrap();
        let u = upper(&a).unwrap();
        assert_eq!(u.as_slice(), &["CAFÉ", "NAÏVE", "ÜBER"]);
        let l = lower(&u).unwrap();
        assert_eq!(l.as_slice(), &["café", "naïve", "über"]);
    }

    #[test]
    fn unicode_capitalize() {
        let a = array(&["ñoño", "straße"]).unwrap();
        let c = capitalize(&a).unwrap();
        assert_eq!(c.as_slice()[0], "Ñoño");
        // Only the first character is upper-cased, so the interior 'ß'
        // is left untouched: "straße" -> "Straße".
        assert_eq!(c.as_slice()[1], "Straße");
    }

    #[test]
    fn unicode_find() {
        let a = array(&["日本語テスト", "こんにちは"]).unwrap();
        let r = find(&a, "テスト").unwrap();
        let data = r.as_slice().unwrap();
        // Pins character-based indexing: three characters precede "テスト",
        // so the expected index is 3. The UTF-8 byte offset (what
        // `str::find` would report) is 9, because each of those characters
        // is three bytes long.
        assert_eq!(data[0], 3);
        assert_eq!(data[1], -1); // not found
    }

    #[test]
    fn unicode_strip() {
        let a = array(&["  héllo  ", "  wörld  "]).unwrap();
        let s = strip(&a, None).unwrap();
        assert_eq!(s.as_slice(), &["héllo", "wörld"]);
    }

    #[test]
    fn unicode_replace() {
        let a = array(&["café latte"]).unwrap();
        let r = replace(&a, "café", "tea", None).unwrap();
        assert_eq!(r.as_slice(), &["tea latte"]);
    }

    #[test]
    fn emoji_operations() {
        // Characters without a case mapping must pass through `upper` unchanged.
        let a = array(&["hello 🌍", "rust 🦀"]).unwrap();
        let u = upper(&a).unwrap();
        assert_eq!(u.as_slice(), &["HELLO 🌍", "RUST 🦀"]);
        let c = count(&a, "🌍").unwrap();
        assert_eq!(c.as_slice().unwrap(), &[1, 0]);
    }

    #[test]
    fn cjk_characters() {
        let a = array(&["你好世界", "こんにちは"]).unwrap();
        let starts = startswith(&a, "你好").unwrap();
        assert_eq!(starts.as_slice().unwrap(), &[true, false]);
        let ends = endswith(&a, "世界").unwrap();
        assert_eq!(ends.as_slice().unwrap(), &[true, false]);
    }

    // ----- Empty array tests (#282) -----

    #[test]
    fn empty_array_upper() {
        let a = empty_1d();
        let u = upper(&a).unwrap();
        assert_eq!(u.len(), 0);
    }

    #[test]
    fn empty_array_str_len() {
        let a = empty_1d();
        let l = str_len(&a).unwrap();
        assert_eq!(l.size(), 0);
    }

    #[test]
    fn empty_array_find() {
        let a = empty_1d();
        let f = find(&a, "x").unwrap();
        assert_eq!(f.size(), 0);
    }
}