use ferray_core::dimension::{Dimension, Ix1, Ix2};
use ferray_core::error::{FerrayError, FerrayResult};
use crate::string_array::{StringArray, StringArray1, StringArray2};
/// Reject an empty split separator.
///
/// # Errors
///
/// Returns `FerrayError::invalid_value` when `sep` is the empty string;
/// any non-empty separator is accepted.
fn validate_separator(sep: &str) -> FerrayResult<()> {
    match sep {
        "" => Err(FerrayError::invalid_value(
            "split separator must not be empty",
        )),
        _ => Ok(()),
    }
}
/// Split every element of `a` on `sep` and return the pieces as a 2-D array.
///
/// Row `i` of the result holds the pieces of the `i`-th element yielded by
/// `a.iter()`. The number of columns is the largest piece count produced by
/// any element; shorter rows are right-padded with empty strings.
///
/// # Errors
///
/// Returns an error when `sep` is empty, or when `StringArray2::from_vec`
/// rejects the assembled data.
pub fn split<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<StringArray2> {
    validate_separator(sep)?;

    let rows: Vec<Vec<String>> = a
        .iter()
        .map(|s| s.split(sep).map(String::from).collect())
        .collect();

    let n_inputs = rows.len();
    let max_parts = rows.iter().map(Vec::len).max().unwrap_or(0);

    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
    for row in rows {
        // `max_parts` is the maximum row length, so this never underflows.
        let missing = max_parts - row.len();
        // Move the already-owned pieces rather than cloning each one.
        flat.extend(row);
        flat.extend(std::iter::repeat_with(String::new).take(missing));
    }

    StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
}
/// Split every element of `a` on `sep`, scanning from the right, and return
/// the pieces as a 2-D array.
///
/// With `maxsplit = None` every occurrence of `sep` is split. With
/// `maxsplit = Some(n)` at most `n` splits are performed, starting from the
/// right end, so each row has at most `n + 1` pieces. Pieces within a row are
/// stored in left-to-right order. The number of columns is the largest piece
/// count of any element; shorter rows are right-padded with empty strings.
///
/// # Errors
///
/// Returns an error when `sep` is empty, or when `StringArray2::from_vec`
/// rejects the assembled data.
pub fn rsplit<D: Dimension>(
    a: &StringArray<D>,
    sep: &str,
    maxsplit: Option<usize>,
) -> FerrayResult<StringArray2> {
    validate_separator(sep)?;

    let rows: Vec<Vec<String>> = a
        .iter()
        .map(|s| {
            let mut pieces: Vec<String> = match maxsplit {
                None => s.rsplit(sep).map(String::from).collect(),
                // `n` splits produce up to `n + 1` pieces. Saturate so that
                // `Some(usize::MAX)` behaves as "unlimited" instead of
                // overflowing (panic in debug, zero pieces in release).
                Some(n) => s
                    .rsplitn(n.saturating_add(1), sep)
                    .map(String::from)
                    .collect(),
            };
            // `rsplit`/`rsplitn` yield right-to-left; restore reading order.
            pieces.reverse();
            pieces
        })
        .collect();

    let n_inputs = rows.len();
    let max_parts = rows.iter().map(Vec::len).max().unwrap_or(0);

    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_parts);
    for row in rows {
        // `max_parts` is the maximum row length, so this never underflows.
        let missing = max_parts - row.len();
        // Move the already-owned pieces rather than cloning each one.
        flat.extend(row);
        flat.extend(std::iter::repeat_with(String::new).take(missing));
    }

    StringArray2::from_vec(Ix2::new([n_inputs, max_parts]), flat)
}
/// Split every element of `a` into lines and return them as a 2-D array.
///
/// Lines are produced by `split_universal_newlines`, which treats `\n`, `\r`
/// and `\r\n` as line terminators. When `keepends` is `true` each line keeps
/// its terminator. The number of columns is the largest line count of any
/// element; shorter rows are right-padded with empty strings.
///
/// # Errors
///
/// Returns an error when `StringArray2::from_vec` rejects the assembled data.
pub fn splitlines<D: Dimension>(a: &StringArray<D>, keepends: bool) -> FerrayResult<StringArray2> {
    let rows: Vec<Vec<String>> = a
        .iter()
        .map(|s| split_universal_newlines(s, keepends))
        .collect();

    let n_inputs = rows.len();
    let max_lines = rows.iter().map(Vec::len).max().unwrap_or(0);

    let mut flat: Vec<String> = Vec::with_capacity(n_inputs * max_lines);
    for row in rows {
        // `max_lines` is the maximum row length, so this never underflows.
        let missing = max_lines - row.len();
        // Move the already-owned lines rather than cloning each one.
        flat.extend(row);
        flat.extend(std::iter::repeat_with(String::new).take(missing));
    }

    StringArray2::from_vec(Ix2::new([n_inputs, max_lines]), flat)
}
/// Break `s` into lines, treating `\n`, `\r` and `\r\n` as terminators.
///
/// A `\r` immediately followed by `\n` counts as a single two-byte
/// terminator. When `keepends` is `true` each returned line includes its
/// terminator. Text after the last terminator is returned as a final line
/// only if it is non-empty, so an empty input yields an empty vector.
fn split_universal_newlines(s: &str, keepends: bool) -> Vec<String> {
    let mut lines = Vec::new();
    let mut rest = s;

    while let Some(pos) = rest.find(|c: char| c == '\n' || c == '\r') {
        let term_len = if rest[pos..].starts_with("\r\n") { 2 } else { 1 };
        let after = pos + term_len;
        // Both terminators are ASCII, so `pos` and `after` are always char
        // boundaries and slicing cannot panic.
        let line = if keepends { &rest[..after] } else { &rest[..pos] };
        lines.push(line.to_owned());
        rest = &rest[after..];
    }

    if !rest.is_empty() {
        lines.push(rest.to_owned());
    }
    lines
}
/// Split every element of `a` on `sep` without padding.
///
/// Returns one inner vector per element yielded by `a.iter()`; each inner
/// vector holds exactly the pieces of that element, so rows may differ in
/// length.
///
/// # Errors
///
/// Returns an error when `sep` is empty.
pub fn split_ragged<D: Dimension>(a: &StringArray<D>, sep: &str) -> FerrayResult<Vec<Vec<String>>> {
    validate_separator(sep)?;
    Ok(a
        .iter()
        .map(|element| element.split(sep).map(String::from).collect())
        .collect())
}
/// Join each group of strings in `items` with `sep`.
///
/// Produces a 1-D array with one element per entry of `items`, where element
/// `i` is `items[i]` concatenated with `sep` between consecutive parts.
///
/// # Errors
///
/// Returns an error when `StringArray1::from_vec` rejects the assembled data.
pub fn join(sep: &str, items: &[Vec<String>]) -> FerrayResult<StringArray1> {
    let mut joined: Vec<String> = Vec::with_capacity(items.len());
    for group in items {
        joined.push(group.join(sep));
    }
    let shape = Ix1::new([joined.len()]);
    StringArray1::from_vec(shape, joined)
}
/// Concatenate all elements of `a`, in `a.iter()` order, with `sep` between
/// consecutive elements.
///
/// The result is a 1-D array containing exactly one string.
///
/// # Errors
///
/// Returns an error when `StringArray1::from_vec` rejects the assembled data.
pub fn join_array<D: Dimension>(sep: &str, a: &StringArray<D>) -> FerrayResult<StringArray1> {
    let mut combined = String::new();
    for (index, element) in a.iter().enumerate() {
        // The separator goes between elements only, never before the first.
        if index > 0 {
            combined.push_str(sep);
        }
        combined.push_str(element.as_str());
    }
    StringArray1::from_vec(Ix1::new([1]), vec![combined])
}
// Unit tests for the split / rsplit / splitlines / join helpers defined above.
#[cfg(test)]
mod tests {
use super::*;
use crate::string_array::array;
// Two inputs with the same number of pieces produce a 2x2 result.
#[test]
fn test_split() {
let a = array(&["a-b", "c-d"]).unwrap();
let result = split(&a, "-").unwrap();
assert_eq!(result.shape(), &[2, 2]);
let s = result.as_slice();
assert_eq!(s, &["a", "b", "c", "d"]);
}
// Without a limit, rsplit returns pieces in left-to-right order and pads the
// shorter row with an empty string.
#[test]
fn rsplit_basic_no_limit() {
let a = array(&["a-b-c", "x-y"]).unwrap();
let r = rsplit(&a, "-", None).unwrap();
assert_eq!(r.shape(), &[2, 3]);
let s = r.as_slice();
assert_eq!(s, &["a", "b", "c", "x", "y", ""]);
}
// maxsplit = 1 splits only at the right-most separator.
#[test]
fn rsplit_with_maxsplit_one() {
let a = array(&["a-b-c-d"]).unwrap();
let r = rsplit(&a, "-", Some(1)).unwrap();
assert_eq!(r.shape(), &[1, 2]);
let s = r.as_slice();
assert_eq!(s, &["a-b-c", "d"]);
}
// Both "\n" and "\r\n" terminate lines; the single-line row is padded.
#[test]
fn splitlines_with_lf_and_crlf() {
let a = array(&["one\ntwo\r\nthree", "single"]).unwrap();
let r = splitlines(&a, false).unwrap();
assert_eq!(r.shape(), &[2, 3]);
let s = r.as_slice();
assert_eq!(s, &["one", "two", "three", "single", "", ""]);
}
// keepends = true leaves each terminator attached to its line.
#[test]
fn splitlines_keepends_retains_terminator() {
let a = array(&["x\ny\r\nz"]).unwrap();
let r = splitlines(&a, true).unwrap();
let s = r.as_slice();
assert_eq!(s, &["x\n", "y\r\n", "z"]);
}
// A lone "\r" (not followed by "\n") is also a line terminator.
#[test]
fn splitlines_handles_solo_carriage_return() {
let a = array(&["a\rb"]).unwrap();
let r = splitlines(&a, false).unwrap();
let s = r.as_slice();
assert_eq!(s, &["a", "b"]);
}
// A single input with two separators yields one row of three pieces.
#[test]
fn test_split_multiple_parts() {
let a = array(&["a-b-c"]).unwrap();
let result = split(&a, "-").unwrap();
assert_eq!(result.shape(), &[1, 3]);
assert_eq!(result.as_slice(), &["a", "b", "c"]);
}
// When the separator does not occur, the whole input is the only piece.
#[test]
fn test_split_no_separator_found() {
let a = array(&["hello"]).unwrap();
let result = split(&a, "-").unwrap();
assert_eq!(result.shape(), &[1, 1]);
assert_eq!(result.as_slice(), &["hello"]);
}
// Rows with fewer pieces than the widest row are padded with "".
#[test]
fn test_split_pads_short_rows_with_empty_strings() {
let a = array(&["a-b", "x-y-z"]).unwrap();
let result = split(&a, "-").unwrap();
assert_eq!(result.shape(), &[2, 3]);
assert_eq!(result.as_slice(), &["a", "b", "", "x", "y", "z"]);
}
// split_ragged keeps each row at its natural length (no padding).
#[test]
fn test_split_ragged_returns_unpadded() {
let a = array(&["a-b", "x-y-z"]).unwrap();
let result = split_ragged(&a, "-").unwrap();
assert_eq!(
result,
vec![
vec!["a".to_string(), "b".to_string()],
vec!["x".to_string(), "y".to_string(), "z".to_string()],
]
);
}
// join concatenates each inner vector with the separator.
#[test]
fn test_join() {
let items = vec![
vec!["a".to_string(), "b".to_string()],
vec!["c".to_string(), "d".to_string()],
];
let result = join("-", &items).unwrap();
assert_eq!(result.as_slice(), &["a-b", "c-d"]);
}
// join_array collapses the whole array into a single joined string.
#[test]
fn test_join_array() {
let a = array(&["hello", "world"]).unwrap();
let result = join_array(" ", &a).unwrap();
assert_eq!(result.as_slice(), &["hello world"]);
}
// split_ragged on equal-length rows returns the pieces row by row.
#[test]
fn test_split_ac4() {
let a = array(&["a-b", "c-d"]).unwrap();
let result = split_ragged(&a, "-").unwrap();
assert_eq!(
result,
vec![
vec!["a".to_string(), "b".to_string()],
vec!["c".to_string(), "d".to_string()],
]
);
}
// An empty separator is rejected by split with a descriptive error message.
#[test]
fn test_split_empty_separator_errs() {
let a = array(&["abc", "def"]).unwrap();
let err = split(&a, "").unwrap_err();
assert!(
err.to_string().contains("separator must not be empty"),
"expected empty-separator error, got: {err}"
);
}
// An empty separator is rejected by split_ragged as well.
#[test]
fn test_split_ragged_empty_separator_errs() {
let a = array(&["abc"]).unwrap();
assert!(split_ragged(&a, "").is_err());
}
// A one-character separator splits as expected.
#[test]
fn test_split_single_char_separator_works() {
let a = array(&["a,b,c"]).unwrap();
let result = split_ragged(&a, ",").unwrap();
assert_eq!(result[0], vec!["a", "b", "c"]);
}
// A multi-character separator is matched as a whole.
#[test]
fn test_split_multichar_separator_works() {
let a = array(&["a::b::c"]).unwrap();
let result = split_ragged(&a, "::").unwrap();
assert_eq!(result[0], vec!["a", "b", "c"]);
}
}