use ferray_core::error::{FerrayError, FerrayResult};
/// Options controlling how delimited text is split into a grid of cells.
///
/// Consumed by [`parse_text_grid`] and [`parse_text_grid_with_missing`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextParseOptions {
    /// Character that separates fields within a single line.
    pub delimiter: char,
    /// Number of leading lines to drop. The count is applied after blank
    /// lines and comment lines have already been removed.
    pub skiprows: usize,
    /// Lines whose trimmed text starts with this character are ignored.
    /// `None` disables comment handling.
    pub comments: Option<char>,
    /// Upper bound on the number of data rows kept (applied after
    /// `skiprows`). `None` keeps every remaining row.
    pub max_rows: Option<usize>,
}
impl Default for TextParseOptions {
fn default() -> Self {
Self {
delimiter: ',',
skiprows: 0,
comments: Some('#'),
max_rows: None,
}
}
}
/// Splits delimited text into a flat, row-major list of trimmed cell strings.
///
/// Lines that are blank after trimming, or whose trimmed text starts with
/// `opts.comments` (when set), are discarded first. `opts.skiprows` and then
/// `opts.max_rows` are applied to the lines that remain. The first surviving
/// line fixes the column count for the whole grid.
///
/// Returns `(cells, nrows, ncols)`, or `(vec![], 0, 0)` when no data lines
/// remain after filtering.
///
/// # Errors
///
/// Returns an error built with `FerrayError::io_error` when any row has a
/// different number of fields than the first row.
pub fn parse_text_grid(
    content: &str,
    opts: &TextParseOptions,
) -> FerrayResult<(Vec<String>, usize, usize)> {
    // Select the data rows lazily: drop blanks/comments, then skip, then cap.
    let rows: Vec<&str> = content
        .lines()
        .filter(|raw| {
            let text = raw.trim();
            match opts.comments {
                Some(marker) if text.starts_with(marker) => false,
                _ => !text.is_empty(),
            }
        })
        .skip(opts.skiprows)
        .take(opts.max_rows.unwrap_or(usize::MAX))
        .collect();

    let sep = opts.delimiter;
    let ncols = match rows.first() {
        Some(first) => first.split(sep).count(),
        None => return Ok((vec![], 0, 0)),
    };
    let nrows = rows.len();

    let mut cells = Vec::with_capacity(nrows * ncols);
    for (offset, row) in rows.iter().enumerate() {
        let width = row.split(sep).count();
        if width != ncols {
            // The reported row index counts non-blank, non-comment lines,
            // including the ones removed by `skiprows`.
            return Err(FerrayError::io_error(format!(
                "row {} has {} columns, expected {} (line: '{}')",
                offset + opts.skiprows,
                width,
                ncols,
                row,
            )));
        }
        cells.extend(row.split(sep).map(|field| field.trim().to_string()));
    }

    Ok((cells, nrows, ncols))
}
/// Splits delimited text into a flat, row-major grid where absent values are
/// represented as `None`.
///
/// Row selection works as follows: lines that are blank after trimming, or
/// whose trimmed text starts with `opts.comments` (when set), are dropped;
/// `opts.skiprows` and then `opts.max_rows` are applied to what remains. The
/// first surviving line fixes the column count.
///
/// A cell becomes `None` when its trimmed text is empty, when it equals one of
/// `missing_values`, or when the row has fewer fields than the first row.
///
/// Returns `(cells, nrows, ncols)`, or `(vec![], 0, 0)` when no data lines
/// remain after filtering.
///
/// # Errors
///
/// Returns an error built with `FerrayError::io_error` when a row has more
/// fields than the first row.
pub fn parse_text_grid_with_missing(
    content: &str,
    opts: &TextParseOptions,
    missing_values: &[&str],
) -> FerrayResult<(Vec<Option<String>>, usize, usize)> {
    // Select the data rows lazily: drop blanks/comments, then skip, then cap.
    let rows: Vec<&str> = content
        .lines()
        .filter(|raw| {
            let text = raw.trim();
            match opts.comments {
                Some(marker) if text.starts_with(marker) => false,
                _ => !text.is_empty(),
            }
        })
        .skip(opts.skiprows)
        .take(opts.max_rows.unwrap_or(usize::MAX))
        .collect();

    let sep = opts.delimiter;
    let ncols = match rows.first() {
        Some(first) => first.split(sep).count(),
        None => return Ok((vec![], 0, 0)),
    };
    let nrows = rows.len();

    let mut cells = Vec::with_capacity(nrows * ncols);
    for (offset, row) in rows.iter().enumerate() {
        let width = row.split(sep).count();
        if width > ncols {
            // The reported row index counts non-blank, non-comment lines,
            // including the ones removed by `skiprows`.
            return Err(FerrayError::io_error(format!(
                "row {} has {} columns, expected {} (line: '{}')",
                offset + opts.skiprows,
                width,
                ncols,
                row,
            )));
        }

        // Emit exactly `ncols` cells; once the row's fields run out the
        // iterator yields `None`, which pads short rows with missing values.
        let mut fields = row.split(sep);
        for _ in 0..ncols {
            let cell = fields
                .next()
                .map(str::trim)
                .filter(|value| !value.is_empty() && !missing_values.contains(value))
                .map(String::from);
            cells.push(cell);
        }
    }

    Ok((cells, nrows, ncols))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two comma-separated rows parse into a 2x3 row-major grid.
    #[test]
    fn parse_simple_csv() {
        let opts = TextParseOptions {
            delimiter: ',',
            ..Default::default()
        };
        let parsed = parse_text_grid("1,2,3\n4,5,6\n", &opts);
        let (cells, nrows, ncols) = parsed.unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(ncols, 3);
        assert_eq!(cells, vec!["1", "2", "3", "4", "5", "6"]);
    }

    /// `skiprows` is counted after comment lines are removed, so the
    /// `name,value` header is the row that gets skipped here.
    #[test]
    fn parse_with_skiprows() {
        let opts = TextParseOptions {
            delimiter: ',',
            skiprows: 1,
            comments: Some('#'),
            ..Default::default()
        };
        let text = "# header\nname,value\n1,10\n2,20\n";
        let (cells, nrows, ncols) = parse_text_grid(text, &opts).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(ncols, 2);
        assert_eq!(cells[0], "1");
    }

    /// A comment line between data rows is ignored.
    #[test]
    fn parse_with_comments() {
        let opts = TextParseOptions {
            delimiter: ',',
            comments: Some('#'),
            ..Default::default()
        };
        let text = "1,2\n# comment\n3,4\n";
        let (cells, nrows, _) = parse_text_grid(text, &opts).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(cells, vec!["1", "2", "3", "4"]);
    }

    /// A tab works as the field delimiter.
    #[test]
    fn parse_tab_delimited() {
        let opts = TextParseOptions {
            delimiter: '\t',
            ..Default::default()
        };
        let text = "1\t2\t3\n4\t5\t6\n";
        let (cells, nrows, ncols) = parse_text_grid(text, &opts).unwrap();
        assert_eq!(nrows, 2);
        assert_eq!(ncols, 3);
        assert_eq!(cells[0], "1");
    }

    /// A row with fewer fields than the first row is rejected.
    #[test]
    fn parse_inconsistent_columns_error() {
        let opts = TextParseOptions::default();
        let outcome = parse_text_grid("1,2,3\n4,5\n", &opts);
        assert!(outcome.is_err());
    }

    /// Empty fields are reported as `None`.
    #[test]
    fn parse_missing_values() {
        let opts = TextParseOptions::default();
        let text = "1,2,3\n4,,6\n7,8,\n";
        let (cells, nrows, ncols) = parse_text_grid_with_missing(text, &opts, &[]).unwrap();
        assert_eq!(nrows, 3);
        assert_eq!(ncols, 3);
        assert_eq!(cells[0], Some("1".to_string()));
        // Row 1, column 1: the empty field between the two commas.
        assert_eq!(cells[4], None);
        // Row 2, column 2: the empty field after the trailing comma.
        assert_eq!(cells[8], None);
    }

    /// Fields equal to a caller-supplied marker are reported as `None`.
    #[test]
    fn parse_custom_missing_marker() {
        let opts = TextParseOptions::default();
        let text = "1,NA,3\n4,5,NA\n";
        let (cells, _, _) = parse_text_grid_with_missing(text, &opts, &["NA"]).unwrap();
        assert_eq!(cells[1], None);
        assert_eq!(cells[5], None);
        assert_eq!(cells[0], Some("1".to_string()));
    }

    /// Empty input yields an empty 0x0 grid rather than an error.
    #[test]
    fn parse_empty_content() {
        let opts = TextParseOptions::default();
        let (cells, nrows, ncols) = parse_text_grid("", &opts).unwrap();
        assert_eq!(nrows, 0);
        assert_eq!(ncols, 0);
        assert!(cells.is_empty());
    }
}