Skip to main content

ferray_strings/
lib.rs

1// ferray-strings: Vectorized string operations on arrays of strings
2//
3// Implements `numpy.strings` (NumPy 2.0+): vectorized elementwise string
4// operations on arrays of strings with broadcasting. Covers case manipulation,
5// alignment/padding, stripping, find/replace, splitting/joining, and regex
6// support. Operates on `StringArray` — a separate array type backed by
7// `Vec<String>`.
8
9//! # ferray-strings
10//!
11//! Vectorized string operations on arrays of strings, analogous to
12//! `numpy.strings` in `NumPy` 2.0+.
13//!
14//! The primary type is [`StringArray`], a specialized N-dimensional array
15//! backed by `Vec<String>`. Since `String` does not implement
16//! [`ferray_core::Element`], this type is separate from `NdArray<T, D>`.
17//!
18//! # Quick Start
19//!
20//! ```ignore
21//! use ferray_strings::*;
22//!
23//! let a = array(&["hello", "world"]).unwrap();
24//! let b = upper(&a).unwrap();
25//! assert_eq!(b.as_slice(), &["HELLO", "WORLD"]);
26//! ```
27
28// Workspace convention: every public function returns FerrayResult<T> and
29// the FerrayError variants are documented once on the type, not on every
30// returning function.
31#![allow(
32    clippy::missing_errors_doc,
33    clippy::missing_panics_doc,
34    clippy::many_single_char_names,
35    clippy::similar_names,
36    clippy::items_after_statements,
37    clippy::option_if_let_else,
38    clippy::too_long_first_doc_paragraph,
39    clippy::needless_pass_by_value,
40    clippy::match_same_arms
41)]
42
43pub mod align;
44pub mod case;
45pub mod classify;
46#[cfg(feature = "compact-storage")]
47pub mod compact;
48pub mod concat;
49pub mod extras;
50pub mod regex_ops;
51pub mod search;
52pub mod serde_impl;
53pub mod split_join;
54pub mod str_ops;
55pub mod string_array;
56pub mod strip;
57
58// Re-export types
59pub use string_array::{StringArray, StringArray1, StringArray2, array};
60
61// Compact (Arrow-style) backend prototype (#736).
62#[cfg(feature = "compact-storage")]
63pub use compact::{CompactStringArray, CompactStringIter, estimated_string_array_bytes};
64
65// Re-export operations for flat namespace (like numpy.strings.upper etc.)
66pub use align::{center, ljust, ljust_with, rjust, rjust_with, zfill};
67pub use case::{capitalize, lower, title, upper};
68pub use classify::{
69    isalnum, isalpha, isdecimal, isdigit, islower, isnumeric, isspace, istitle, isupper,
70};
71pub use concat::{add, add_same, multiply};
72pub use extras::{decode, encode, expandtabs, mod_, partition, rpartition, slice, translate};
73pub use regex_ops::{extract, extract_compiled, match_, match_compiled};
74// Re-export `regex::Regex` so callers of `match_compiled`/`extract_compiled`
75// don't have to add a direct `regex` dependency to construct one.
76pub use regex::Regex;
77pub use search::{count, endswith, find, index, replace, rfind, rindex, startswith};
78pub use split_join::{join, join_array, rsplit, split, split_ragged, splitlines};
79pub use str_ops::{equal, greater, greater_equal, less, less_equal, not_equal, str_len, swapcase};
80pub use strip::{lstrip, rstrip, strip};
81
#[cfg(test)]
mod integration_tests {
    // Crate-level integration tests. Everything is exercised through the flat
    // re-exported namespace (`use super::*`), the same way a downstream caller
    // would use the crate.
    use super::*;

    // ----- Acceptance criteria (AC-1 .. AC-5) -----

    #[test]
    fn ac1_upper() {
        // AC-1: strings::upper(&["hello", "world"]) produces ["HELLO", "WORLD"]
        let a = array(&["hello", "world"]).unwrap();
        let b = upper(&a).unwrap();
        assert_eq!(b.as_slice(), &["HELLO", "WORLD"]);
    }

    #[test]
    fn ac2_add_broadcast_scalar() {
        // AC-2: strings::add broadcasts a scalar string against an array correctly
        let a = array(&["hello", "world"]).unwrap();
        let b = array(&["!"]).unwrap();
        let c = add(&a, &b).unwrap();
        assert_eq!(c.as_slice(), &["hello!", "world!"]);
    }

    #[test]
    fn ac3_find_indices() {
        // AC-3: strings::find(&a, "ll") returns correct indices
        // (-1 marks "not found").
        let a = array(&["hello", "world"]).unwrap();
        let b = find(&a, "ll").unwrap();
        let data = b.as_slice().unwrap();
        assert_eq!(data, &[2_i64, -1_i64]);
    }

    #[test]
    fn ac4_split() {
        // AC-4 (#277): strings::split returns a 2-D StringArray of
        // shape (n_inputs, max_parts). split_ragged keeps the
        // ragged Vec<Vec<String>> form for callers that need it.
        let a = array(&["a-b", "c-d"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[2, 2]);
        assert_eq!(result.as_slice(), &["a", "b", "c", "d"]);
    }

    #[test]
    fn ac5_regex() {
        // AC-5: Regex match_ and extract work correctly with capture groups
        let a = array(&["abc123", "def", "ghi456"]).unwrap();

        let matched = match_(&a, r"\d+").unwrap();
        let matched_data = matched.as_slice().unwrap();
        assert_eq!(matched_data, &[true, false, true]);

        // Elements without a match yield an empty string.
        let extracted = extract(&a, r"(\d+)").unwrap();
        assert_eq!(extracted.as_slice(), &["123", "", "456"]);
    }

    // ----- Per-family smoke tests -----

    #[test]
    fn full_pipeline() {
        // End-to-end: strip, upper, add suffix, search
        let raw = array(&["  Hello  ", " World "]).unwrap();
        let stripped = strip(&raw, None).unwrap();
        let uppered = upper(&stripped).unwrap();
        let suffix = array(&["!"]).unwrap();
        let result = add(&uppered, &suffix).unwrap();
        assert_eq!(result.as_slice(), &["HELLO!", "WORLD!"]);

        let has_excl = endswith(&result, "!").unwrap();
        let data = has_excl.as_slice().unwrap();
        assert_eq!(data, &[true, true]);
    }

    #[test]
    fn case_round_trip() {
        // lower followed by title must reproduce an already title-cased input.
        let a = array(&["Hello World"]).unwrap();
        let low = lower(&a).unwrap();
        let titled = title(&low).unwrap();
        assert_eq!(titled.as_slice(), &["Hello World"]);
    }

    #[test]
    fn alignment_operations() {
        let a = array(&["hi"]).unwrap();
        let c = center(&a, 6, '-').unwrap();
        assert_eq!(c.as_slice(), &["--hi--"]);

        let l = ljust(&a, 6).unwrap();
        assert_eq!(l.as_slice(), &["hi    "]);

        let r = rjust(&a, 6).unwrap();
        assert_eq!(r.as_slice(), &["    hi"]);

        let z = zfill(&array(&["42"]).unwrap(), 5).unwrap();
        assert_eq!(z.as_slice(), &["00042"]);
    }

    #[test]
    fn strip_operations() {
        // `None` is passed as the strip-set argument; the expectations below
        // show that whitespace is what gets removed in that case.
        let a = array(&["  hello  "]).unwrap();
        assert_eq!(strip(&a, None).unwrap().as_slice(), &["hello"]);
        assert_eq!(lstrip(&a, None).unwrap().as_slice(), &["hello  "]);
        assert_eq!(rstrip(&a, None).unwrap().as_slice(), &["  hello"]);
    }

    #[test]
    fn search_operations() {
        let a = array(&["hello world", "foo bar"]).unwrap();
        let c = count(&a, "o").unwrap();
        let data = c.as_slice().unwrap();
        // "hello world" has 2 'o's, "foo bar" has 2 'o's
        assert_eq!(data, &[2_u64, 2]);
    }

    #[test]
    fn replace_operation() {
        let a = array(&["hello world"]).unwrap();
        let b = replace(&a, "world", "rust", None).unwrap();
        assert_eq!(b.as_slice(), &["hello rust"]);
    }

    #[test]
    fn multiply_operation() {
        let a = array(&["ab"]).unwrap();
        let b = multiply(&a, 3).unwrap();
        assert_eq!(b.as_slice(), &["ababab"]);
    }

    #[test]
    fn join_operation() {
        // `join` takes the ragged Vec<Vec<String>> form, one inner Vec per
        // output element.
        let parts = vec![
            vec!["a".to_string(), "b".to_string()],
            vec!["c".to_string(), "d".to_string()],
        ];
        let result = join("-", &parts).unwrap();
        assert_eq!(result.as_slice(), &["a-b", "c-d"]);
    }

    #[test]
    fn capitalize_operation() {
        // First character upper-cased, the remainder lower-cased.
        let a = array(&["hello world", "RUST"]).unwrap();
        let b = capitalize(&a).unwrap();
        assert_eq!(b.as_slice(), &["Hello world", "Rust"]);
    }

    #[test]
    fn string_array_2d() {
        let a = StringArray2::from_rows(&[&["a", "b"], &["c", "d"]]).unwrap();
        assert_eq!(a.shape(), &[2, 2]);
        let b = upper(&a).unwrap();
        assert_eq!(b.as_slice(), &["A", "B", "C", "D"]);
        assert_eq!(b.shape(), &[2, 2]);
    }

    // -----------------------------------------------------------------
    // #520 — shape preservation for ND StringArrays across every op.
    //
    // Every operation that takes `StringArray<D>` must thread `D`
    // through the output. These tests pin that contract on a 2x2
    // input so regressions (e.g. an accidental `Ix1::new([len])` in
    // the output constructor) are caught immediately.
    // -----------------------------------------------------------------

    /// Builds a 2x2 `StringArray2` from four values given in row-major order.
    fn two_by_two(vals: &[&str; 4]) -> StringArray2 {
        StringArray2::from_rows(&[&[vals[0], vals[1]], &[vals[2], vals[3]]]).unwrap()
    }

    #[test]
    fn shape_preserved_case_ops_2d() {
        let a = two_by_two(&["Hello", "World", "foo", "Bar"]);
        assert_eq!(upper(&a).unwrap().shape(), &[2, 2]);
        assert_eq!(lower(&a).unwrap().shape(), &[2, 2]);
        assert_eq!(capitalize(&a).unwrap().shape(), &[2, 2]);
        assert_eq!(title(&a).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_align_ops_2d() {
        let a = two_by_two(&["a", "bb", "ccc", "dddd"]);
        assert_eq!(center(&a, 6, ' ').unwrap().shape(), &[2, 2]);
        assert_eq!(ljust(&a, 6).unwrap().shape(), &[2, 2]);
        assert_eq!(rjust(&a, 6).unwrap().shape(), &[2, 2]);
        assert_eq!(zfill(&a, 6).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_strip_ops_2d() {
        let a = two_by_two(&["  a  ", "  b  ", "  c  ", "  d  "]);
        assert_eq!(strip(&a, None).unwrap().shape(), &[2, 2]);
        assert_eq!(lstrip(&a, None).unwrap().shape(), &[2, 2]);
        assert_eq!(rstrip(&a, None).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_concat_ops_2d() {
        let a = two_by_two(&["ab", "cd", "ef", "gh"]);
        // `add` supports cross-rank broadcasting, so its result is not
        // expected to carry the static dimension type `D` of its inputs
        // (NOTE(review): earlier notes say it returns an IxDyn-ranked array —
        // confirm against `concat::add`). The runtime shape of two 2x2
        // operands must nevertheless still be 2x2, which is what is pinned.
        let b = two_by_two(&["!", "!", "!", "!"]);
        let ab = add(&a, &b).unwrap();
        assert_eq!(ab.shape(), &[2, 2]);
        // multiply preserves D
        assert_eq!(multiply(&a, 2).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_search_ops_2d() {
        let a = two_by_two(&["hello", "help", "world", "word"]);
        // Each of these returns Array<T, D> — verify D is preserved.
        assert_eq!(find(&a, "ell").unwrap().shape(), &[2, 2]);
        assert_eq!(count(&a, "l").unwrap().shape(), &[2, 2]);
        assert_eq!(startswith(&a, "he").unwrap().shape(), &[2, 2]);
        assert_eq!(endswith(&a, "d").unwrap().shape(), &[2, 2]);
        assert_eq!(replace(&a, "l", "L", None).unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_regex_ops_2d() {
        let a = two_by_two(&["abc123", "x", "y42", "zzz"]);
        // match_ preserves D; extract flattens by design (ragged results),
        // so only match_ is pinned here.
        assert_eq!(match_(&a, r"\d+").unwrap().shape(), &[2, 2]);
    }

    #[test]
    fn shape_preserved_case_ops_3d() {
        // Just to confirm we aren't special-casing Ix2 somewhere — bump to Ix3.
        use ferray_core::dimension::Ix3;
        let data: Vec<String> = (0..8).map(|i| format!("s{i}")).collect();
        let a = StringArray::<Ix3>::from_vec(Ix3::new([2, 2, 2]), data).unwrap();
        assert_eq!(upper(&a).unwrap().shape(), &[2, 2, 2]);
        assert_eq!(lower(&a).unwrap().shape(), &[2, 2, 2]);
    }

    // --- Unicode / multi-byte character tests ---

    #[test]
    fn unicode_upper_lower() {
        let a = array(&["café", "naïve", "über"]).unwrap();
        let u = upper(&a).unwrap();
        assert_eq!(u.as_slice(), &["CAFÉ", "NAÏVE", "ÜBER"]);
        let l = lower(&u).unwrap();
        assert_eq!(l.as_slice(), &["café", "naïve", "über"]);
    }

    #[test]
    fn unicode_capitalize() {
        let a = array(&["ñoño", "straße"]).unwrap();
        let c = capitalize(&a).unwrap();
        assert_eq!(c.as_slice()[0], "Ñoño");
        // Only the leading 's' changes; the 'ß' further in is left untouched.
        assert_eq!(c.as_slice()[1], "Straße");
    }

    #[test]
    fn unicode_find() {
        let a = array(&["日本語テスト", "こんにちは"]).unwrap();
        let r = find(&a, "テスト").unwrap();
        let data = r.as_slice().unwrap();
        // Indices are counted in characters, not UTF-8 bytes: "テスト" begins
        // at character 3 of "日本語テスト" (its byte offset would be 9).
        // The second element does not contain the needle, hence -1.
        assert_eq!(data, &[3_i64, -1_i64]);
    }

    #[test]
    fn unicode_strip() {
        let a = array(&["  héllo  ", "  wörld  "]).unwrap();
        let s = strip(&a, None).unwrap();
        assert_eq!(s.as_slice(), &["héllo", "wörld"]);
    }

    #[test]
    fn unicode_replace() {
        let a = array(&["café latte"]).unwrap();
        let r = replace(&a, "café", "tea", None).unwrap();
        assert_eq!(r.as_slice(), &["tea latte"]);
    }

    #[test]
    fn emoji_operations() {
        // Emoji have no case mapping, so `upper` must pass them through.
        let a = array(&["hello 🌍", "rust 🦀"]).unwrap();
        let u = upper(&a).unwrap();
        assert_eq!(u.as_slice(), &["HELLO 🌍", "RUST 🦀"]);
        let c = count(&a, "🌍").unwrap();
        assert_eq!(c.as_slice().unwrap(), &[1_u64, 0]);
    }

    #[test]
    fn cjk_characters() {
        let a = array(&["你好世界", "こんにちは"]).unwrap();
        let starts = startswith(&a, "你好").unwrap();
        assert_eq!(starts.as_slice().unwrap(), &[true, false]);
        let ends = endswith(&a, "世界").unwrap();
        assert_eq!(ends.as_slice().unwrap(), &[true, false]);
    }

    // ----- Empty array tests (#282) -----

    /// Builds a zero-length 1-D string array shared by the empty-input tests.
    fn empty_1d() -> StringArray1 {
        StringArray1::from_vec(ferray_core::dimension::Ix1::new([0]), vec![]).unwrap()
    }

    #[test]
    fn empty_array_upper() {
        let a = empty_1d();
        let u = upper(&a).unwrap();
        assert_eq!(u.len(), 0);
    }

    #[test]
    fn empty_array_str_len() {
        let a = empty_1d();
        let l = str_len(&a).unwrap();
        assert_eq!(l.size(), 0);
    }

    #[test]
    fn empty_array_find() {
        let a = empty_1d();
        let f = find(&a, "x").unwrap();
        assert_eq!(f.size(), 0);
    }
}