1use ferray_core::dimension::{Dimension, Ix1, Ix2};
6use ferray_core::error::{FerrayError, FerrayResult};
7
8use crate::string_array::{StringArray, StringArray1, StringArray2};
9
10fn validate_separator(sep: &str) -> FerrayResult<()> {
15 if sep.is_empty() {
16 return Err(FerrayError::invalid_value(
17 "split separator must not be empty",
18 ));
19 }
20 Ok(())
21}
22
23pub fn split<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<StringArray2> {
33 validate_separator(sep)?;
34 let parts: Vec<Vec<String>> = a
35 .iter()
36 .map(|s| s.split(sep).map(String::from).collect())
37 .collect();
38 let n_inputs = parts.len();
39 let max_parts = parts.iter().map(Vec::len).max().unwrap_or(0);
40 let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
41 for row in &parts {
42 for j in 0..max_parts {
43 flat.push(row.get(j).cloned().unwrap_or_default());
44 }
45 }
46 StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
47}
48
49pub fn rsplit<D: Dimension>(
59 a: &StringArray<D>,
60 sep: &str,
61 maxsplit: Option<usize>,
62) -> FerrayResult<StringArray2> {
63 validate_separator(sep)?;
64 let parts: Vec<Vec<String>> = a
65 .iter()
66 .map(|s| match maxsplit {
67 None => s.rsplit(sep).map(String::from).collect::<Vec<_>>(),
68 Some(n) => s.rsplitn(n + 1, sep).map(String::from).collect::<Vec<_>>(),
69 })
70 .map(|mut v| {
71 v.reverse();
72 v
73 })
74 .collect();
75 let n_inputs = parts.len();
76 let max_parts = parts.iter().map(Vec::len).max().unwrap_or(0);
77 let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
78 for row in &parts {
79 for j in 0..max_parts {
80 flat.push(row.get(j).cloned().unwrap_or_default());
81 }
82 }
83 StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
84}
85
86pub fn splitlines<D: Dimension>(a: &StringArray<D>, keepends: bool) -> FerrayResult<StringArray2> {
97 let parts: Vec<Vec<String>> = a
98 .iter()
99 .map(|s| split_universal_newlines(s, keepends))
100 .collect();
101 let n_inputs = parts.len();
102 let max_lines = parts.iter().map(Vec::len).max().unwrap_or(0);
103 let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_lines);
104 for row in &parts {
105 for j in 0..max_lines {
106 flat.push(row.get(j).cloned().unwrap_or_default());
107 }
108 }
109 StringArray2::from_vec(Ix2::new([n_inputs, max_lines]), flat)
110}
111
/// Breaks `s` into lines at `\n`, `\r` and `\r\n` terminators.
///
/// A `\r` immediately followed by `\n` counts as a single two-byte terminator. When
/// `keepends` is true each returned line includes its terminator; otherwise the
/// terminator is dropped. Text after the final terminator is returned as a last line
/// only if it is non-empty, so a trailing terminator does not produce an extra
/// empty entry and an empty input yields no lines.
fn split_universal_newlines(s: &str, keepends: bool) -> Vec<String> {
    let raw = s.as_bytes();
    let mut lines = Vec::new();
    let mut line_start = 0;
    let mut pos = 0;
    while let Some(&byte) = raw.get(pos) {
        let terminator_len = match byte {
            b'\r' if raw.get(pos + 1) == Some(&b'\n') => 2,
            b'\r' | b'\n' => 1,
            _ => {
                pos += 1;
                continue;
            }
        };
        let after = pos + terminator_len;
        let cut = if keepends { after } else { pos };
        // Both terminator bytes are ASCII, so every offset used here lies on a
        // UTF-8 character boundary and slicing the `str` cannot panic.
        lines.push(s[line_start..cut].to_string());
        pos = after;
        line_start = after;
    }
    if line_start < raw.len() {
        lines.push(s[line_start..].to_string());
    }
    lines
}
148
149pub fn split_ragged<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<Vec<Vec<String>>> {
156 validate_separator(sep)?;
157 let result: Vec<Vec<String>> = a
158 .iter()
159 .map(|s| s.split(sep).map(String::from).collect())
160 .collect();
161 Ok(result)
162}
163
164pub fn join(sep: &str, items: &[Vec<String>]) -> FerrayResult<StringArray1> {
172 let data: Vec<String> = items.iter().map(|parts| parts.join(sep)).collect();
173 let dim = Ix1::new([data.len()]);
174 StringArray1::from_vec(dim, data)
175}
176
177pub fn join_array<D: Dimension>(sep: &str, a: &StringArray<D>) -> FerrayResult<StringArray1> {
185 let joined: String = a
186 .iter()
187 .map(std::string::String::as_str)
188 .collect::<Vec<&str>>()
189 .join(sep);
190 let dim = Ix1::new([1]);
191 StringArray1::from_vec(dim, vec![joined])
192}
193
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    /// Builds an owned row of strings for comparison against ragged results.
    fn owned(parts: &[&str]) -> Vec<String> {
        parts.iter().map(|p| (*p).to_string()).collect()
    }

    // ---- split ----

    #[test]
    fn test_split() {
        let input = array(&["a-b", "c-d"]).unwrap();
        let out = split(&input, "-").unwrap();
        assert_eq!(out.shape(), &[2, 2]);
        assert_eq!(out.as_slice(), &["a", "b", "c", "d"]);
    }

    // ---- rsplit ----

    #[test]
    fn rsplit_basic_no_limit() {
        let input = array(&["a-b-c", "x-y"]).unwrap();
        let out = rsplit(&input, "-", None).unwrap();
        assert_eq!(out.shape(), &[2, 3]);
        // The shorter second row is padded on the right.
        assert_eq!(out.as_slice(), &["a", "b", "c", "x", "y", ""]);
    }

    #[test]
    fn rsplit_with_maxsplit_one() {
        let input = array(&["a-b-c-d"]).unwrap();
        // A single split from the right leaves the remaining separators in the head.
        let out = rsplit(&input, "-", Some(1)).unwrap();
        assert_eq!(out.shape(), &[1, 2]);
        assert_eq!(out.as_slice(), &["a-b-c", "d"]);
    }

    // ---- splitlines ----

    #[test]
    fn splitlines_with_lf_and_crlf() {
        let input = array(&["one\ntwo\r\nthree", "single"]).unwrap();
        let out = splitlines(&input, false).unwrap();
        assert_eq!(out.shape(), &[2, 3]);
        assert_eq!(out.as_slice(), &["one", "two", "three", "single", "", ""]);
    }

    #[test]
    fn splitlines_keepends_retains_terminator() {
        let input = array(&["x\ny\r\nz"]).unwrap();
        let out = splitlines(&input, true).unwrap();
        assert_eq!(out.as_slice(), &["x\n", "y\r\n", "z"]);
    }

    #[test]
    fn splitlines_handles_solo_carriage_return() {
        let input = array(&["a\rb"]).unwrap();
        let out = splitlines(&input, false).unwrap();
        assert_eq!(out.as_slice(), &["a", "b"]);
    }

    // ---- split: shapes and padding ----

    #[test]
    fn test_split_multiple_parts() {
        let input = array(&["a-b-c"]).unwrap();
        let out = split(&input, "-").unwrap();
        assert_eq!(out.shape(), &[1, 3]);
        assert_eq!(out.as_slice(), &["a", "b", "c"]);
    }

    #[test]
    fn test_split_no_separator_found() {
        let input = array(&["hello"]).unwrap();
        let out = split(&input, "-").unwrap();
        assert_eq!(out.shape(), &[1, 1]);
        assert_eq!(out.as_slice(), &["hello"]);
    }

    #[test]
    fn test_split_pads_short_rows_with_empty_strings() {
        let input = array(&["a-b", "x-y-z"]).unwrap();
        let out = split(&input, "-").unwrap();
        assert_eq!(out.shape(), &[2, 3]);
        assert_eq!(out.as_slice(), &["a", "b", "", "x", "y", "z"]);
    }

    // ---- split_ragged ----

    #[test]
    fn test_split_ragged_returns_unpadded() {
        let input = array(&["a-b", "x-y-z"]).unwrap();
        let rows = split_ragged(&input, "-").unwrap();
        assert_eq!(rows, vec![owned(&["a", "b"]), owned(&["x", "y", "z"])]);
    }

    // ---- join / join_array ----

    #[test]
    fn test_join() {
        let items = vec![owned(&["a", "b"]), owned(&["c", "d"])];
        let out = join("-", &items).unwrap();
        assert_eq!(out.as_slice(), &["a-b", "c-d"]);
    }

    #[test]
    fn test_join_array() {
        let input = array(&["hello", "world"]).unwrap();
        let out = join_array(" ", &input).unwrap();
        assert_eq!(out.as_slice(), &["hello world"]);
    }

    #[test]
    fn test_split_ac4() {
        let input = array(&["a-b", "c-d"]).unwrap();
        let rows = split_ragged(&input, "-").unwrap();
        assert_eq!(rows, vec![owned(&["a", "b"]), owned(&["c", "d"])]);
    }

    // ---- separator validation ----

    #[test]
    fn test_split_empty_separator_errs() {
        let input = array(&["abc", "def"]).unwrap();
        let err = split(&input, "").unwrap_err();
        assert!(
            err.to_string().contains("separator must not be empty"),
            "expected empty-separator error, got: {err}"
        );
    }

    #[test]
    fn test_split_ragged_empty_separator_errs() {
        let input = array(&["abc"]).unwrap();
        assert!(split_ragged(&input, "").is_err());
    }

    #[test]
    fn test_split_single_char_separator_works() {
        let input = array(&["a,b,c"]).unwrap();
        let rows = split_ragged(&input, ",").unwrap();
        assert_eq!(rows[0], vec!["a", "b", "c"]);
    }

    #[test]
    fn test_split_multichar_separator_works() {
        let input = array(&["a::b::c"]).unwrap();
        let rows = split_ragged(&input, "::").unwrap();
        assert_eq!(rows[0], vec!["a", "b", "c"]);
    }
}