// ferray_strings/split_join.rs
//
// ferray-strings: Split and join operations (REQ-11)
//
// Implements split, rsplit, splitlines and join for StringArray.

use ferray_core::dimension::{Dimension, Ix1, Ix2};
use ferray_core::error::{FerrayError, FerrayResult};

use crate::string_array::{StringArray, StringArray1, StringArray2};

10/// Reject the empty separator: Rust's `str::split("")` returns the
11/// surprising `["", "a", "b", "c", ""]` pattern (one empty token per
12/// boundary including the ends), but numpy's `np.char.split` raises
13/// ValueError on an empty separator. Reject up front to match (#283).
14fn validate_separator(sep: &str) -> FerrayResult<()> {
15    if sep.is_empty() {
16        return Err(FerrayError::invalid_value(
17            "split separator must not be empty",
18        ));
19    }
20    Ok(())
21}
22
23/// Split each string element by the given separator.
24///
25/// Returns a 2-D `StringArray` of shape `(n_inputs, max_parts)` where row
26/// `i` contains the parts produced by splitting element `i`. Rows shorter
27/// than `max_parts` are padded with empty strings (#277). Use
28/// [`split_ragged`] when you need the unpadded `Vec<Vec<String>>` form.
29///
30/// # Errors
31/// Returns an error if the internal array construction fails.
32pub fn split<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<StringArray2> {
33    validate_separator(sep)?;
34    let parts: Vec<Vec<String>> = a
35        .iter()
36        .map(|s| s.split(sep).map(String::from).collect())
37        .collect();
38    let n_inputs = parts.len();
39    let max_parts = parts.iter().map(Vec::len).max().unwrap_or(0);
40    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
41    for row in &parts {
42        for j in 0..max_parts {
43            flat.push(row.get(j).cloned().unwrap_or_default());
44        }
45    }
46    StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
47}
48
49/// Right-to-left counterpart of [`split`] (#515).
50///
51/// Splits each element on `sep` starting from the right. With the
52/// optional `maxsplit` cap, only the rightmost `maxsplit` separators
53/// produce splits — leading remainder is kept as one piece. Mirrors
54/// `numpy.strings.rsplit`.
55///
56/// # Errors
57/// Returns an error if `sep` is empty or array construction fails.
58pub fn rsplit<D: Dimension>(
59    a: &StringArray<D>,
60    sep: &str,
61    maxsplit: Option<usize>,
62) -> FerrayResult<StringArray2> {
63    validate_separator(sep)?;
64    let parts: Vec<Vec<String>> = a
65        .iter()
66        .map(|s| match maxsplit {
67            None => s.rsplit(sep).map(String::from).collect::<Vec<_>>(),
68            Some(n) => s.rsplitn(n + 1, sep).map(String::from).collect::<Vec<_>>(),
69        })
70        .map(|mut v| {
71            v.reverse();
72            v
73        })
74        .collect();
75    let n_inputs = parts.len();
76    let max_parts = parts.iter().map(Vec::len).max().unwrap_or(0);
77    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
78    for row in &parts {
79        for j in 0..max_parts {
80            flat.push(row.get(j).cloned().unwrap_or_default());
81        }
82    }
83    StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
84}
85
86/// Split each element on universal newlines (`\n`, `\r\n`, `\r`)
87/// (#515). Equivalent to `numpy.strings.splitlines`.
88///
89/// Returns a 2-D `StringArray` shaped `(n_inputs, max_lines)` with
90/// trailing empty padding when the per-element line count differs.
91/// `keepends = true` retains the line terminator on each kept line,
92/// matching Python/NumPy behavior.
93///
94/// # Errors
95/// Returns an error if array construction fails.
96pub fn splitlines<D: Dimension>(a: &StringArray<D>, keepends: bool) -> FerrayResult<StringArray2> {
97    let parts: Vec<Vec<String>> = a
98        .iter()
99        .map(|s| split_universal_newlines(s, keepends))
100        .collect();
101    let n_inputs = parts.len();
102    let max_lines = parts.iter().map(Vec::len).max().unwrap_or(0);
103    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_lines);
104    for row in &parts {
105        for j in 0..max_lines {
106            flat.push(row.get(j).cloned().unwrap_or_default());
107        }
108    }
109    StringArray2::from_vec(Ix2::new([n_inputs, max_lines]), flat)
110}
111
/// Split `s` into lines at `\n`, `\r` and `\r\n`.
///
/// `\r\n` counts as a single terminator. With `keepends = true` each
/// returned line includes its terminator; otherwise the terminator is
/// dropped. A terminator at the very end of `s` does not produce a
/// trailing empty line, and an empty input yields an empty vector — the
/// same conventions as Python's `str.splitlines`.
///
/// Only these three terminators are recognised; the additional line
/// boundaries Python's `str.splitlines` honours (such as `\x0b`, `\x0c`
/// or `\u{2028}`) are treated as ordinary characters here.
fn split_universal_newlines(s: &str, keepends: bool) -> Vec<String> {
    let bytes = s.as_bytes();
    let mut lines = Vec::new();
    let mut line_start = 0;
    let mut pos = 0;
    while pos < bytes.len() {
        // Length of the terminator starting at `pos`, if there is one.
        let eol_len = match bytes[pos] {
            b'\r' if bytes.get(pos + 1) == Some(&b'\n') => 2,
            b'\n' | b'\r' => 1,
            _ => {
                pos += 1;
                continue;
            }
        };
        let line_end = if keepends { pos + eol_len } else { pos };
        // `\n` and `\r` are ASCII and never occur inside a multi-byte UTF-8
        // sequence, so both offsets are char boundaries and slicing the
        // `str` directly cannot panic.
        lines.push(s[line_start..line_end].to_owned());
        pos += eol_len;
        line_start = pos;
    }
    // Text after the last terminator (or the whole input if none was found).
    if line_start < s.len() {
        lines.push(s[line_start..].to_owned());
    }
    lines
}

149/// Ragged-result variant of [`split`]: returns a `Vec<Vec<String>>` so
150/// callers that need the unpadded splits per element don't have to
151/// strip empty padding from the 2-D result (#277).
152///
153/// # Errors
154/// Returns an error only for internal failures.
155pub fn split_ragged<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<Vec<Vec<String>>> {
156    validate_separator(sep)?;
157    let result: Vec<Vec<String>> = a
158        .iter()
159        .map(|s| s.split(sep).map(String::from).collect())
160        .collect();
161    Ok(result)
162}
163
164/// Join a collection of string vectors using the given separator.
165///
166/// Each element in the input is a `Vec<String>` which is joined into
167/// a single string. Returns a 1-D `StringArray`.
168///
169/// # Errors
170/// Returns an error if the internal array construction fails.
171pub fn join(sep: &str, items: &[Vec<String>]) -> FerrayResult<StringArray1> {
172    let data: Vec<String> = items.iter().map(|parts| parts.join(sep)).collect();
173    let dim = Ix1::new([data.len()]);
174    StringArray1::from_vec(dim, data)
175}
176
177/// Join each string element of a `StringArray` using the given separator.
178///
179/// This variant takes a `StringArray` and joins all elements into a single
180/// string. Returns a 1-D `StringArray` with one element.
181///
182/// # Errors
183/// Returns an error if the internal array construction fails.
184pub fn join_array<D: Dimension>(sep: &str, a: &StringArray<D>) -> FerrayResult<StringArray1> {
185    let joined: String = a
186        .iter()
187        .map(std::string::String::as_str)
188        .collect::<Vec<&str>>()
189        .join(sep);
190    let dim = Ix1::new([1]);
191    StringArray1::from_vec(dim, vec![joined])
192}
193
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    // ---- split ----------------------------------------------------------

    #[test]
    fn test_split() {
        let a = array(&["a-b", "c-d"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[2, 2]);
        let s = result.as_slice();
        assert_eq!(s, &["a", "b", "c", "d"]);
    }

    #[test]
    fn test_split_multiple_parts() {
        let a = array(&["a-b-c"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[1, 3]);
        assert_eq!(result.as_slice(), &["a", "b", "c"]);
    }

    #[test]
    fn test_split_no_separator_found() {
        let a = array(&["hello"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[1, 1]);
        assert_eq!(result.as_slice(), &["hello"]);
    }

    #[test]
    fn test_split_pads_short_rows_with_empty_strings() {
        // #277: rows shorter than max_parts must be padded with "".
        let a = array(&["a-b", "x-y-z"]).unwrap();
        let result = split(&a, "-").unwrap();
        assert_eq!(result.shape(), &[2, 3]);
        // Row 0: ["a", "b", ""] (padded), Row 1: ["x", "y", "z"]
        assert_eq!(result.as_slice(), &["a", "b", "", "x", "y", "z"]);
    }

    // ---- rsplit / splitlines (#515) -------------------------------------

    #[test]
    fn rsplit_basic_no_limit() {
        let a = array(&["a-b-c", "x-y"]).unwrap();
        let r = rsplit(&a, "-", None).unwrap();
        assert_eq!(r.shape(), &[2, 3]);
        let s = r.as_slice();
        // Trailing empty pads (matches `split`'s padding convention).
        // Row 0: ["a","b","c"], Row 1: ["x","y",""]
        assert_eq!(s, &["a", "b", "c", "x", "y", ""]);
    }

    #[test]
    fn rsplit_with_maxsplit_one() {
        // maxsplit=1: only the rightmost separator splits.
        let a = array(&["a-b-c-d"]).unwrap();
        let r = rsplit(&a, "-", Some(1)).unwrap();
        assert_eq!(r.shape(), &[1, 2]);
        let s = r.as_slice();
        assert_eq!(s, &["a-b-c", "d"]);
    }

    #[test]
    fn rsplit_with_maxsplit_zero_keeps_whole_string() {
        // maxsplit=0: no separator is allowed to split.
        let a = array(&["a-b-c"]).unwrap();
        let r = rsplit(&a, "-", Some(0)).unwrap();
        assert_eq!(r.shape(), &[1, 1]);
        assert_eq!(r.as_slice(), &["a-b-c"]);
    }

    #[test]
    fn rsplit_empty_separator_errs() {
        // #283 applies to rsplit as well: the separator is validated first.
        let a = array(&["abc"]).unwrap();
        assert!(rsplit(&a, "", None).is_err());
    }

    #[test]
    fn splitlines_with_lf_and_crlf() {
        let a = array(&["one\ntwo\r\nthree", "single"]).unwrap();
        let r = splitlines(&a, false).unwrap();
        // Row 0 has 3 lines, row 1 has 1 line. Padding to 3.
        assert_eq!(r.shape(), &[2, 3]);
        let s = r.as_slice();
        assert_eq!(s, &["one", "two", "three", "single", "", ""]);
    }

    #[test]
    fn splitlines_keepends_retains_terminator() {
        let a = array(&["x\ny\r\nz"]).unwrap();
        let r = splitlines(&a, true).unwrap();
        let s = r.as_slice();
        assert_eq!(s, &["x\n", "y\r\n", "z"]);
    }

    #[test]
    fn splitlines_handles_solo_carriage_return() {
        let a = array(&["a\rb"]).unwrap();
        let r = splitlines(&a, false).unwrap();
        let s = r.as_slice();
        assert_eq!(s, &["a", "b"]);
    }

    #[test]
    fn splitlines_trailing_terminator_adds_no_empty_line() {
        // A terminator at the very end closes the last line; it does not
        // open a new, empty one.
        let a = array(&["a\nb\n"]).unwrap();
        let r = splitlines(&a, false).unwrap();
        assert_eq!(r.shape(), &[1, 2]);
        assert_eq!(r.as_slice(), &["a", "b"]);
    }

    // ---- split_ragged ---------------------------------------------------

    #[test]
    fn test_split_ragged_returns_unpadded() {
        // #277: split_ragged keeps the per-element variable length.
        let a = array(&["a-b", "x-y-z"]).unwrap();
        let result = split_ragged(&a, "-").unwrap();
        assert_eq!(
            result,
            vec![
                vec!["a".to_string(), "b".to_string()],
                vec!["x".to_string(), "y".to_string(), "z".to_string()],
            ]
        );
    }

    #[test]
    fn test_split_ac4() {
        // AC-4: strings::split_ragged(&["a-b", "c-d"], "-") returns
        // [vec!["a","b"], vec!["c","d"]] — the ragged form preserves
        // the original AC behavior.
        let a = array(&["a-b", "c-d"]).unwrap();
        let result = split_ragged(&a, "-").unwrap();
        assert_eq!(
            result,
            vec![
                vec!["a".to_string(), "b".to_string()],
                vec!["c".to_string(), "d".to_string()],
            ]
        );
    }

    // ---- join / join_array ----------------------------------------------

    #[test]
    fn test_join() {
        let items = vec![
            vec!["a".to_string(), "b".to_string()],
            vec!["c".to_string(), "d".to_string()],
        ];
        let result = join("-", &items).unwrap();
        assert_eq!(result.as_slice(), &["a-b", "c-d"]);
    }

    #[test]
    fn test_join_array() {
        let a = array(&["hello", "world"]).unwrap();
        let result = join_array(" ", &a).unwrap();
        assert_eq!(result.as_slice(), &["hello world"]);
    }

    // ---- Empty separator rejection (#283) -------------------------------

    #[test]
    fn test_split_empty_separator_errs() {
        // #283: Rust's str::split("") returns the surprising
        // ["", "a", "b", "c", ""] pattern with empty tokens around
        // every char boundary. numpy's np.char.split raises ValueError
        // for an empty separator. Match numpy's strict path.
        let a = array(&["abc", "def"]).unwrap();
        let err = split(&a, "").unwrap_err();
        assert!(
            err.to_string().contains("separator must not be empty"),
            "expected empty-separator error, got: {err}"
        );
    }

    #[test]
    fn test_split_ragged_empty_separator_errs() {
        let a = array(&["abc"]).unwrap();
        assert!(split_ragged(&a, "").is_err());
    }

    #[test]
    fn test_split_single_char_separator_works() {
        // Sanity check: a single-char separator still splits correctly
        // — the validation gates only the empty-string case.
        let a = array(&["a,b,c"]).unwrap();
        let result = split_ragged(&a, ",").unwrap();
        assert_eq!(result[0], vec!["a", "b", "c"]);
    }

    #[test]
    fn test_split_multichar_separator_works() {
        // Multi-character separator: "::" should split exactly on the
        // 2-byte sequence, not on each byte.
        let a = array(&["a::b::c"]).unwrap();
        let result = split_ragged(&a, "::").unwrap();
        assert_eq!(result[0], vec!["a", "b", "c"]);
    }
}