1use ferray_core::dimension::{Dimension, Ix1, Ix2};
25use ferray_core::error::{FerrayError, FerrayResult};
26
27use crate::string_array::{StringArray, StringArray1, StringArray2};
28
29fn validate_separator(sep: &str) -> FerrayResult<()> {
34 if sep.is_empty() {
35 return Err(FerrayError::invalid_value(
36 "split separator must not be empty",
37 ));
38 }
39 Ok(())
40}
41
42pub fn split<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<StringArray2> {
52 validate_separator(sep)?;
53 let parts: Vec<Vec<String>> = a
54 .iter()
55 .map(|s| s.split(sep).map(String::from).collect())
56 .collect();
57 let n_inputs = parts.len();
58 let max_parts = parts.iter().map(Vec::len).max().unwrap_or(0);
59 let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
60 for row in &parts {
61 for j in 0..max_parts {
62 flat.push(row.get(j).cloned().unwrap_or_default());
63 }
64 }
65 StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
66}
67
68pub fn rsplit<D: Dimension>(
78 a: &StringArray<D>,
79 sep: &str,
80 maxsplit: Option<usize>,
81) -> FerrayResult<StringArray2> {
82 validate_separator(sep)?;
83 let parts: Vec<Vec<String>> = a
84 .iter()
85 .map(|s| match maxsplit {
86 None => s.rsplit(sep).map(String::from).collect::<Vec<_>>(),
87 Some(n) => s.rsplitn(n + 1, sep).map(String::from).collect::<Vec<_>>(),
88 })
89 .map(|mut v| {
90 v.reverse();
91 v
92 })
93 .collect();
94 let n_inputs = parts.len();
95 let max_parts = parts.iter().map(Vec::len).max().unwrap_or(0);
96 let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
97 for row in &parts {
98 for j in 0..max_parts {
99 flat.push(row.get(j).cloned().unwrap_or_default());
100 }
101 }
102 StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
103}
104
105pub fn splitlines<D: Dimension>(a: &StringArray<D>, keepends: bool) -> FerrayResult<StringArray2> {
116 let parts: Vec<Vec<String>> = a
117 .iter()
118 .map(|s| split_universal_newlines(s, keepends))
119 .collect();
120 let n_inputs = parts.len();
121 let max_lines = parts.iter().map(Vec::len).max().unwrap_or(0);
122 let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_lines);
123 for row in &parts {
124 for j in 0..max_lines {
125 flat.push(row.get(j).cloned().unwrap_or_default());
126 }
127 }
128 StringArray2::from_vec(Ix2::new([n_inputs, max_lines]), flat)
129}
130
/// Reports whether `c` terminates a line.
///
/// Recognized terminators: LF, VT, FF, CR (U+000A..=U+000D), the C0
/// separators FS, GS, RS (U+001C..=U+001E), NEL (U+0085), LINE SEPARATOR
/// (U+2028) and PARAGRAPH SEPARATOR (U+2029). The unit separator U+001F is
/// deliberately not included.
fn is_line_boundary(c: char) -> bool {
    match c {
        // LF, VT, FF, CR are contiguous code points.
        '\u{0A}'..='\u{0D}' => true,
        // FS, GS, RS; the range stops before US (U+001F).
        '\u{1C}'..='\u{1E}' => true,
        '\u{85}' | '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
154
155fn split_universal_newlines(s: &str, keepends: bool) -> Vec<String> {
159 let mut out = Vec::new();
160 let mut line_start = 0usize;
161 let mut chars = s.char_indices().peekable();
162 while let Some((idx, c)) = chars.next() {
163 if !is_line_boundary(c) {
164 continue;
165 }
166 let mut eol_end = idx + c.len_utf8();
168 if c == '\r' {
169 if let Some(&(_, '\n')) = chars.peek() {
170 chars.next();
171 eol_end += '\n'.len_utf8();
172 }
173 }
174 let line_end = if keepends { eol_end } else { idx };
175 out.push(s[line_start..line_end].to_string());
176 line_start = eol_end;
177 }
178 if line_start < s.len() {
179 out.push(s[line_start..].to_string());
180 }
181 out
182}
183
184pub fn split_ragged<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<Vec<Vec<String>>> {
191 validate_separator(sep)?;
192 let result: Vec<Vec<String>> = a
193 .iter()
194 .map(|s| s.split(sep).map(String::from).collect())
195 .collect();
196 Ok(result)
197}
198
199pub fn join(sep: &str, items: &[Vec<String>]) -> FerrayResult<StringArray1> {
207 let data: Vec<String> = items.iter().map(|parts| parts.join(sep)).collect();
208 let dim = Ix1::new([data.len()]);
209 StringArray1::from_vec(dim, data)
210}
211
212pub fn join_array<D: Dimension>(sep: &str, a: &StringArray<D>) -> FerrayResult<StringArray1> {
220 let joined: String = a
221 .iter()
222 .map(std::string::String::as_str)
223 .collect::<Vec<&str>>()
224 .join(sep);
225 let dim = Ix1::new([1]);
226 StringArray1::from_vec(dim, vec![joined])
227}
228
#[cfg(test)]
mod tests {
    use super::*;
    use crate::string_array::array;

    /// Builds an owned `Vec<String>` from literals to keep expectations short.
    fn owned(items: &[&str]) -> Vec<String> {
        items.iter().map(|item| (*item).to_string()).collect()
    }

    // ---------------------------------------------------------------
    // split
    // ---------------------------------------------------------------

    #[test]
    fn test_split() {
        let input = array(&["a-b", "c-d"]).unwrap();
        let table = split(&input, "-").unwrap();
        assert_eq!(table.shape(), &[2, 2]);
        assert_eq!(table.as_slice(), &["a", "b", "c", "d"]);
    }

    #[test]
    fn test_split_multiple_parts() {
        let input = array(&["a-b-c"]).unwrap();
        let table = split(&input, "-").unwrap();
        assert_eq!(table.shape(), &[1, 3]);
        assert_eq!(table.as_slice(), &["a", "b", "c"]);
    }

    #[test]
    fn test_split_no_separator_found() {
        // Without any separator the whole string is the single piece.
        let input = array(&["hello"]).unwrap();
        let table = split(&input, "-").unwrap();
        assert_eq!(table.shape(), &[1, 1]);
        assert_eq!(table.as_slice(), &["hello"]);
    }

    #[test]
    fn test_split_pads_short_rows_with_empty_strings() {
        // Row 0 has two pieces, row 1 has three: row 0 gets one "" of padding.
        let input = array(&["a-b", "x-y-z"]).unwrap();
        let table = split(&input, "-").unwrap();
        assert_eq!(table.shape(), &[2, 3]);
        assert_eq!(table.as_slice(), &["a", "b", "", "x", "y", "z"]);
    }

    #[test]
    fn test_split_empty_separator_errs() {
        let input = array(&["abc", "def"]).unwrap();
        let err = split(&input, "").unwrap_err();
        assert!(
            err.to_string().contains("separator must not be empty"),
            "expected empty-separator error, got: {err}"
        );
    }

    // ---------------------------------------------------------------
    // rsplit
    // ---------------------------------------------------------------

    #[test]
    fn rsplit_basic_no_limit() {
        let input = array(&["a-b-c", "x-y"]).unwrap();
        let table = rsplit(&input, "-", None).unwrap();
        assert_eq!(table.shape(), &[2, 3]);
        // Pieces come back left-to-right; the short row is padded on the right.
        assert_eq!(table.as_slice(), &["a", "b", "c", "x", "y", ""]);
    }

    #[test]
    fn rsplit_with_maxsplit_one() {
        let input = array(&["a-b-c-d"]).unwrap();
        // A single split from the right leaves the unsplit prefix in column 0.
        let table = rsplit(&input, "-", Some(1)).unwrap();
        assert_eq!(table.shape(), &[1, 2]);
        assert_eq!(table.as_slice(), &["a-b-c", "d"]);
    }

    // ---------------------------------------------------------------
    // splitlines (array level)
    // ---------------------------------------------------------------

    #[test]
    fn splitlines_with_lf_and_crlf() {
        let input = array(&["one\ntwo\r\nthree", "single"]).unwrap();
        let table = splitlines(&input, false).unwrap();
        assert_eq!(table.shape(), &[2, 3]);
        // The one-line row is padded out to three columns.
        assert_eq!(
            table.as_slice(),
            &["one", "two", "three", "single", "", ""]
        );
    }

    #[test]
    fn splitlines_keepends_retains_terminator() {
        let input = array(&["x\ny\r\nz"]).unwrap();
        let table = splitlines(&input, true).unwrap();
        assert_eq!(table.as_slice(), &["x\n", "y\r\n", "z"]);
    }

    #[test]
    fn splitlines_handles_solo_carriage_return() {
        let input = array(&["a\rb"]).unwrap();
        let table = splitlines(&input, false).unwrap();
        assert_eq!(table.as_slice(), &["a", "b"]);
    }

    // ---------------------------------------------------------------
    // split_universal_newlines (helper level)
    // ---------------------------------------------------------------

    #[test]
    fn splitlines_breaks_on_vtab_and_formfeed() {
        let lines = split_universal_newlines("a\u{0B}b\u{0C}c", false);
        assert_eq!(lines, owned(&["a", "b", "c"]));
    }

    #[test]
    fn splitlines_breaks_on_c0_separators() {
        let lines = split_universal_newlines("a\u{1C}b\u{1D}c\u{1E}d", false);
        assert_eq!(lines, owned(&["a", "b", "c", "d"]));
    }

    #[test]
    fn splitlines_unit_separator_is_not_a_boundary() {
        // U+001F sits right after the recognized FS/GS/RS range and must not split.
        let lines = split_universal_newlines("a\u{1F}b", false);
        assert_eq!(lines, owned(&["a\u{1F}b"]));
    }

    #[test]
    fn splitlines_breaks_on_nel_and_unicode_separators() {
        let lines = split_universal_newlines("a\u{85}b\u{2028}c\u{2029}d", false);
        assert_eq!(lines, owned(&["a", "b", "c", "d"]));
    }

    #[test]
    fn splitlines_keepends_retains_unicode_terminators() {
        let lines = split_universal_newlines("a\u{85}b\u{2028}c", true);
        assert_eq!(lines, owned(&["a\u{85}", "b\u{2028}", "c"]));
    }

    // ---------------------------------------------------------------
    // split_ragged
    // ---------------------------------------------------------------

    #[test]
    fn test_split_ragged_returns_unpadded() {
        let input = array(&["a-b", "x-y-z"]).unwrap();
        let rows = split_ragged(&input, "-").unwrap();
        assert_eq!(rows, vec![owned(&["a", "b"]), owned(&["x", "y", "z"])]);
    }

    #[test]
    fn test_split_ac4() {
        let input = array(&["a-b", "c-d"]).unwrap();
        let rows = split_ragged(&input, "-").unwrap();
        assert_eq!(rows, vec![owned(&["a", "b"]), owned(&["c", "d"])]);
    }

    #[test]
    fn test_split_ragged_empty_separator_errs() {
        let input = array(&["abc"]).unwrap();
        assert!(split_ragged(&input, "").is_err());
    }

    #[test]
    fn test_split_single_char_separator_works() {
        let input = array(&["a,b,c"]).unwrap();
        let rows = split_ragged(&input, ",").unwrap();
        assert_eq!(rows[0], owned(&["a", "b", "c"]));
    }

    #[test]
    fn test_split_multichar_separator_works() {
        let input = array(&["a::b::c"]).unwrap();
        let rows = split_ragged(&input, "::").unwrap();
        assert_eq!(rows[0], owned(&["a", "b", "c"]));
    }

    // ---------------------------------------------------------------
    // join / join_array
    // ---------------------------------------------------------------

    #[test]
    fn test_join() {
        let groups = vec![owned(&["a", "b"]), owned(&["c", "d"])];
        let joined = join("-", &groups).unwrap();
        assert_eq!(joined.as_slice(), &["a-b", "c-d"]);
    }

    #[test]
    fn test_join_array() {
        let input = array(&["hello", "world"]).unwrap();
        let joined = join_array(" ", &input).unwrap();
        assert_eq!(joined.as_slice(), &["hello world"]);
    }
}