Skip to main content

ferray_strings/
split_join.rs

1// ferray-strings: Split and join operations (REQ-11)
2//
3// Implements split and join — elementwise on StringArray.
4//
5// ## REQ status
6//
7// SHIPPED:
8//   - REQ-11 split/join — `split`, `rsplit`, `splitlines` (`pub fn`) produce
9//     a rectangular 2-D `StringArray2` (right-padded with empty strings to
10//     the widest row); `split_ragged` (`pub fn`) keeps the ragged
11//     `Vec<Vec<String>>` form for callers that need it. `join` /
12//     `join_array` (`pub fn`) concatenate parts with a separator. An empty
13//     separator is rejected (CPython/`np.char.split` raise on it), and
14//     `splitlines` recognizes the full CPython line-boundary set. Matches
15//     `numpy.strings.split`/`rsplit`/`splitlines`/`join`.
16//
17// Consumers (non-test): re-exported from the crate root
18// (`ferray-strings/src/lib.rs` `pub use split_join::{join, join_array,
19// rsplit, split, split_ragged, splitlines}`) and bound at the Python surface
20// in `ferray-python/src/char.rs` (`fs::split_ragged` backs the `split` shim,
21// `fs::rsplit` the `rsplit` shim, `fs::splitlines` the `splitlines` shim, and
22// the `join` shim), which back `numpy.char`/`numpy.strings`.
23
24use ferray_core::dimension::{Dimension, Ix1, Ix2};
25use ferray_core::error::{FerrayError, FerrayResult};
26
27use crate::string_array::{StringArray, StringArray1, StringArray2};
28
29/// Reject the empty separator: Rust's `str::split("")` returns the
30/// surprising `["", "a", "b", "c", ""]` pattern (one empty token per
31/// boundary including the ends), but numpy's `np.char.split` raises
32/// ValueError on an empty separator. Reject up front to match (#283).
33fn validate_separator(sep: &str) -> FerrayResult<()> {
34    if sep.is_empty() {
35        return Err(FerrayError::invalid_value(
36            "split separator must not be empty",
37        ));
38    }
39    Ok(())
40}
41
42/// Split each string element by the given separator.
43///
44/// Returns a 2-D `StringArray` of shape `(n_inputs, max_parts)` where row
45/// `i` contains the parts produced by splitting element `i`. Rows shorter
46/// than `max_parts` are padded with empty strings (#277). Use
47/// [`split_ragged`] when you need the unpadded `Vec<Vec<String>>` form.
48///
49/// # Errors
50/// Returns an error if the internal array construction fails.
51pub fn split<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<StringArray2> {
52    validate_separator(sep)?;
53    let parts: Vec<Vec<String>> = a
54        .iter()
55        .map(|s| s.split(sep).map(String::from).collect())
56        .collect();
57    let n_inputs = parts.len();
58    let max_parts = parts.iter().map(Vec::len).max().unwrap_or(0);
59    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
60    for row in &parts {
61        for j in 0..max_parts {
62            flat.push(row.get(j).cloned().unwrap_or_default());
63        }
64    }
65    StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
66}
67
68/// Right-to-left counterpart of [`split`] (#515).
69///
70/// Splits each element on `sep` starting from the right. With the
71/// optional `maxsplit` cap, only the rightmost `maxsplit` separators
72/// produce splits — leading remainder is kept as one piece. Mirrors
73/// `numpy.strings.rsplit`.
74///
75/// # Errors
76/// Returns an error if `sep` is empty or array construction fails.
77pub fn rsplit<D: Dimension>(
78    a: &StringArray<D>,
79    sep: &str,
80    maxsplit: Option<usize>,
81) -> FerrayResult<StringArray2> {
82    validate_separator(sep)?;
83    let parts: Vec<Vec<String>> = a
84        .iter()
85        .map(|s| match maxsplit {
86            None => s.rsplit(sep).map(String::from).collect::<Vec<_>>(),
87            Some(n) => s.rsplitn(n + 1, sep).map(String::from).collect::<Vec<_>>(),
88        })
89        .map(|mut v| {
90            v.reverse();
91            v
92        })
93        .collect();
94    let n_inputs = parts.len();
95    let max_parts = parts.iter().map(Vec::len).max().unwrap_or(0);
96    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
97    for row in &parts {
98        for j in 0..max_parts {
99            flat.push(row.get(j).cloned().unwrap_or_default());
100        }
101    }
102    StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
103}
104
105/// Split each element on universal newlines (#515). Equivalent to
106/// `numpy.strings.splitlines`.
107///
108/// Returns a 2-D `StringArray` shaped `(n_inputs, max_lines)` with
109/// trailing empty padding when the per-element line count differs.
110/// `keepends = true` retains the line terminator on each kept line,
111/// matching Python/NumPy behavior.
112///
113/// # Errors
114/// Returns an error if array construction fails.
115pub fn splitlines<D: Dimension>(a: &StringArray<D>, keepends: bool) -> FerrayResult<StringArray2> {
116    let parts: Vec<Vec<String>> = a
117        .iter()
118        .map(|s| split_universal_newlines(s, keepends))
119        .collect();
120    let n_inputs = parts.len();
121    let max_lines = parts.iter().map(Vec::len).max().unwrap_or(0);
122    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_lines);
123    for row in &parts {
124        for j in 0..max_lines {
125            flat.push(row.get(j).cloned().unwrap_or_default());
126        }
127    }
128    StringArray2::from_vec(Ix2::new([n_inputs, max_lines]), flat)
129}
130
/// Reports whether `c` terminates a line for `splitlines`, following Python
/// `str.splitlines` / `numpy.char.splitlines`.
///
/// The accepted set is the ten single-character universal-newline
/// boundaries: `\n`, `\v` (U+000B), `\f` (U+000C), `\r`, FS/GS/RS
/// (U+001C, U+001D, U+001E), NEL (U+0085), LINE SEPARATOR (U+2028) and
/// PARAGRAPH SEPARATOR (U+2029). U+001F (unit separator) is deliberately
/// absent. The two-character `\r\n` terminator is not handled here; the
/// caller is responsible for collapsing it into a single boundary.
/// Upstream: numpy/_core/strings.py:1528 splitlines -> `str.splitlines`.
fn is_line_boundary(c: char) -> bool {
    const LINE_BOUNDARIES: [char; 10] = [
        '\n',       // LF
        '\u{0B}',   // VT
        '\u{0C}',   // FF
        '\r',       // CR
        '\u{1C}',   // FS
        '\u{1D}',   // GS
        '\u{1E}',   // RS
        '\u{85}',   // NEL
        '\u{2028}', // LINE SEPARATOR
        '\u{2029}', // PARAGRAPH SEPARATOR
    ];
    LINE_BOUNDARIES.contains(&c)
}
154
155/// Universal-newline split: a `\r\n` pair is a single boundary; every other
156/// boundary character in [`is_line_boundary`] terminates a line on its own.
157/// The result mirrors Python's `str.splitlines`.
158fn split_universal_newlines(s: &str, keepends: bool) -> Vec<String> {
159    let mut out = Vec::new();
160    let mut line_start = 0usize;
161    let mut chars = s.char_indices().peekable();
162    while let Some((idx, c)) = chars.next() {
163        if !is_line_boundary(c) {
164            continue;
165        }
166        // `\r\n` collapses into one boundary; consume the trailing `\n`.
167        let mut eol_end = idx + c.len_utf8();
168        if c == '\r' {
169            if let Some(&(_, '\n')) = chars.peek() {
170                chars.next();
171                eol_end += '\n'.len_utf8();
172            }
173        }
174        let line_end = if keepends { eol_end } else { idx };
175        out.push(s[line_start..line_end].to_string());
176        line_start = eol_end;
177    }
178    if line_start < s.len() {
179        out.push(s[line_start..].to_string());
180    }
181    out
182}
183
184/// Ragged-result variant of [`split`]: returns a `Vec<Vec<String>>` so
185/// callers that need the unpadded splits per element don't have to
186/// strip empty padding from the 2-D result (#277).
187///
188/// # Errors
189/// Returns an error only for internal failures.
190pub fn split_ragged<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<Vec<Vec<String>>> {
191    validate_separator(sep)?;
192    let result: Vec<Vec<String>> = a
193        .iter()
194        .map(|s| s.split(sep).map(String::from).collect())
195        .collect();
196    Ok(result)
197}
198
199/// Join a collection of string vectors using the given separator.
200///
201/// Each element in the input is a `Vec<String>` which is joined into
202/// a single string. Returns a 1-D `StringArray`.
203///
204/// # Errors
205/// Returns an error if the internal array construction fails.
206pub fn join(sep: &str, items: &[Vec<String>]) -> FerrayResult<StringArray1> {
207    let data: Vec<String> = items.iter().map(|parts| parts.join(sep)).collect();
208    let dim = Ix1::new([data.len()]);
209    StringArray1::from_vec(dim, data)
210}
211
212/// Join each string element of a `StringArray` using the given separator.
213///
214/// This variant takes a `StringArray` and joins all elements into a single
215/// string. Returns a 1-D `StringArray` with one element.
216///
217/// # Errors
218/// Returns an error if the internal array construction fails.
219pub fn join_array<D: Dimension>(sep: &str, a: &StringArray<D>) -> FerrayResult<StringArray1> {
220    let joined: String = a
221        .iter()
222        .map(std::string::String::as_str)
223        .collect::<Vec<&str>>()
224        .join(sep);
225    let dim = Ix1::new([1]);
226    StringArray1::from_vec(dim, vec![joined])
227}
228
/// Unit tests for the split/join family. Expected values in the comments
/// are the results of the corresponding Python `str` / `numpy.char` calls.
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    #[test]
    fn test_split() {
        let a = array(&["a-b", "c-d"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[2, 2]);
        let s = result.as_slice();
        assert_eq!(s, &["a", "b", "c", "d"]);
    }

    // ---- rsplit / splitlines (#515) ------------------------------------

    #[test]
    fn rsplit_basic_no_limit() {
        let a = array(&["a-b-c", "x-y"]).unwrap();
        let r = rsplit(&a, "-", None).unwrap();
        assert_eq!(r.shape(), &[2, 3]);
        let s = r.as_slice();
        // Trailing empty pads (matches `split`'s padding convention).
        // Row 0: ["a","b","c"], Row 1: ["x","y",""]
        assert_eq!(s, &["a", "b", "c", "x", "y", ""]);
    }

    #[test]
    fn rsplit_with_maxsplit_one() {
        // maxsplit=1: only the rightmost separator splits.
        let a = array(&["a-b-c-d"]).unwrap();
        let r = rsplit(&a, "-", Some(1)).unwrap();
        assert_eq!(r.shape(), &[1, 2]);
        let s = r.as_slice();
        assert_eq!(s, &["a-b-c", "d"]);
    }

    #[test]
    fn rsplit_with_maxsplit_zero_keeps_whole_string() {
        // maxsplit=0: no separator splits; "a-b-c".rsplit("-", 0) == ["a-b-c"].
        let a = array(&["a-b-c"]).unwrap();
        let r = rsplit(&a, "-", Some(0)).unwrap();
        assert_eq!(r.shape(), &[1, 1]);
        assert_eq!(r.as_slice(), &["a-b-c"]);
    }

    #[test]
    fn rsplit_empty_separator_errs() {
        // rsplit shares the empty-separator rejection with split (#283).
        let a = array(&["abc"]).unwrap();
        assert!(rsplit(&a, "", None).is_err());
    }

    #[test]
    fn splitlines_with_lf_and_crlf() {
        let a = array(&["one\ntwo\r\nthree", "single"]).unwrap();
        let r = splitlines(&a, false).unwrap();
        // Row 0 has 3 lines, row 1 has 1 line. Padding to 3.
        assert_eq!(r.shape(), &[2, 3]);
        let s = r.as_slice();
        assert_eq!(s, &["one", "two", "three", "single", "", ""]);
    }

    #[test]
    fn splitlines_keepends_retains_terminator() {
        let a = array(&["x\ny\r\nz"]).unwrap();
        let r = splitlines(&a, true).unwrap();
        let s = r.as_slice();
        assert_eq!(s, &["x\n", "y\r\n", "z"]);
    }

    #[test]
    fn splitlines_handles_solo_carriage_return() {
        let a = array(&["a\rb"]).unwrap();
        let r = splitlines(&a, false).unwrap();
        let s = r.as_slice();
        assert_eq!(s, &["a", "b"]);
    }

    #[test]
    fn splitlines_empty_input_and_trailing_terminator() {
        // "".splitlines() == []
        assert_eq!(split_universal_newlines("", false), Vec::<String>::new());
        // "a\n".splitlines() == ["a"] — a trailing terminator adds no line.
        assert_eq!(
            split_universal_newlines("a\n", false),
            vec!["a".to_string()]
        );
        // "a\n\nb".splitlines() == ["a", "", "b"] — blank interior lines stay.
        assert_eq!(
            split_universal_newlines("a\n\nb", false),
            vec!["a".to_string(), String::new(), "b".to_string()]
        );
        // "a\r\n".splitlines(True) == ["a\r\n"] — CRLF is kept as one unit.
        assert_eq!(
            split_universal_newlines("a\r\n", true),
            vec!["a\r\n".to_string()]
        );
    }

    // ---- Universal-newline boundary divergence regression (#917) -------
    // Python `str.splitlines` / `numpy.char.splitlines` break on the full
    // set; ferray previously only broke on \n/\r/\r\n. Expected values mirror
    // e.g. `"a\x0bb\x0cc".splitlines() == ["a", "b", "c"]` (R-CHAR-3). These
    // exercise the boundary logic directly via `split_universal_newlines`.

    #[test]
    fn splitlines_breaks_on_vtab_and_formfeed() {
        // "a\x0bb\x0cc".splitlines() == ["a", "b", "c"]
        assert_eq!(
            split_universal_newlines("a\u{0B}b\u{0C}c", false),
            vec!["a".to_string(), "b".to_string(), "c".to_string()]
        );
    }

    #[test]
    fn splitlines_breaks_on_c0_separators() {
        // "a\x1cb\x1dc\x1ed".splitlines() == ["a", "b", "c", "d"]
        // (FS/GS/RS are boundaries; US \x1f is NOT, per Python.)
        assert_eq!(
            split_universal_newlines("a\u{1C}b\u{1D}c\u{1E}d", false),
            vec![
                "a".to_string(),
                "b".to_string(),
                "c".to_string(),
                "d".to_string()
            ]
        );
    }

    #[test]
    fn splitlines_unit_separator_is_not_a_boundary() {
        // "a\x1fb".splitlines() == ["a\x1fb"] — U+001F is NOT a boundary.
        assert_eq!(
            split_universal_newlines("a\u{1F}b", false),
            vec!["a\u{1F}b".to_string()]
        );
    }

    #[test]
    fn splitlines_breaks_on_nel_and_unicode_separators() {
        // "a\x85b\u2028c\u2029d".splitlines() == ["a", "b", "c", "d"]
        // (NEL U+0085, LINE SEP U+2028, PARA SEP U+2029.)
        assert_eq!(
            split_universal_newlines("a\u{85}b\u{2028}c\u{2029}d", false),
            vec![
                "a".to_string(),
                "b".to_string(),
                "c".to_string(),
                "d".to_string()
            ]
        );
    }

    #[test]
    fn splitlines_keepends_retains_unicode_terminators() {
        // keepends keeps each (multi-byte) terminator on its line, matching
        // `"a\x85b\u2028c".splitlines(True) == ["a\x85", "b\u2028", "c"]`.
        assert_eq!(
            split_universal_newlines("a\u{85}b\u{2028}c", true),
            vec![
                "a\u{85}".to_string(),
                "b\u{2028}".to_string(),
                "c".to_string()
            ]
        );
    }

    #[test]
    fn test_split_multiple_parts() {
        let a = array(&["a-b-c"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[1, 3]);
        assert_eq!(result.as_slice(), &["a", "b", "c"]);
    }

    #[test]
    fn test_split_no_separator_found() {
        let a = array(&["hello"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[1, 1]);
        assert_eq!(result.as_slice(), &["hello"]);
    }

    #[test]
    fn test_split_pads_short_rows_with_empty_strings() {
        // #277: rows shorter than max_parts must be padded with "".
        let a = array(&["a-b", "x-y-z"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[2, 3]);
        // Row 0: ["a", "b", ""] (padded), Row 1: ["x", "y", "z"]
        assert_eq!(result.as_slice(), &["a", "b", "", "x", "y", "z"]);
    }

    #[test]
    fn test_split_ragged_returns_unpadded() {
        // #277: split_ragged keeps the per-element variable length.
        let a = array(&["a-b", "x-y-z"]).unwrap();
        let result = split_ragged(&a, "-").unwrap();
        assert_eq!(
            result,
            vec![
                vec!["a".to_string(), "b".to_string()],
                vec!["x".to_string(), "y".to_string(), "z".to_string()],
            ]
        );
    }

    #[test]
    fn test_split_ragged_keeps_empty_tokens() {
        // Adjacent and edge separators produce genuine empty tokens:
        // "a--b".split("-") == ["a", "", "b"], "-".split("-") == ["", ""].
        let a = array(&["a--b", "-"]).unwrap();
        let result = split_ragged(&a, "-").unwrap();
        assert_eq!(result[0], vec!["a", "", "b"]);
        assert_eq!(result[1], vec!["", ""]);
    }

    #[test]
    fn test_join() {
        let items = vec![
            vec!["a".to_string(), "b".to_string()],
            vec!["c".to_string(), "d".to_string()],
        ];
        let result = join("-", &items).unwrap();
        assert_eq!(result.as_slice(), &["a-b", "c-d"]);
    }

    #[test]
    fn test_join_empty_and_single_part_rows() {
        // A row with no parts joins to "", a single part is returned as-is.
        let items: Vec<Vec<String>> = vec![vec![], vec!["solo".to_string()]];
        let result = join("-", &items).unwrap();
        assert_eq!(result.as_slice(), &["", "solo"]);
    }

    #[test]
    fn test_join_array() {
        let a = array(&["hello", "world"]).unwrap();
        let result = join_array(" ", &a).unwrap();
        assert_eq!(result.as_slice(), &["hello world"]);
    }

    #[test]
    fn test_split_ac4() {
        // AC-4: strings::split_ragged(&["a-b", "c-d"], "-") returns
        // [vec!["a","b"], vec!["c","d"]] — the ragged form preserves
        // the original AC behavior.
        let a = array(&["a-b", "c-d"]).unwrap();
        let result = split_ragged(&a, "-").unwrap();
        assert_eq!(
            result,
            vec![
                vec!["a".to_string(), "b".to_string()],
                vec!["c".to_string(), "d".to_string()],
            ]
        );
    }

    // ----- Empty separator rejection (#283) ------------------------------

    #[test]
    fn test_split_empty_separator_errs() {
        // #283: Rust's str::split("") returns the surprising
        // ["", "a", "b", "c", ""] pattern with empty tokens around
        // every char boundary. numpy's np.char.split raises ValueError
        // for an empty separator. Match numpy's strict path.
        let a = array(&["abc", "def"]).unwrap();
        let err = split(&a, "").unwrap_err();
        assert!(
            err.to_string().contains("separator must not be empty"),
            "expected empty-separator error, got: {err}"
        );
    }

    #[test]
    fn test_split_ragged_empty_separator_errs() {
        let a = array(&["abc"]).unwrap();
        assert!(split_ragged(&a, "").is_err());
    }

    #[test]
    fn test_split_single_char_separator_works() {
        // Sanity check: a single-char separator still splits correctly
        // — the validation gates only the empty-string case.
        let a = array(&["a,b,c"]).unwrap();
        let result = split_ragged(&a, ",").unwrap();
        assert_eq!(result[0], vec!["a", "b", "c"]);
    }

    #[test]
    fn test_split_multichar_separator_works() {
        // Multi-character separator: "::" should split exactly on the
        // 2-byte sequence, not on each byte.
        let a = array(&["a::b::c"]).unwrap();
        let result = split_ragged(&a, "::").unwrap();
        assert_eq!(result[0], vec!["a", "b", "c"]);
    }
}