matten_data/csv.rs
1//! CSV ingestion (RFC-035 §3–4). Behind the default-on `csv` feature.
2//!
3//! CSV input is external input: every constructor returns `Result` and malformed
4//! data never panics (RFC-035 §1). The first row is the header (required); rows
5//! must be rectangular; only empty cells are missing by default.
6//!
7//! Note: this module is named `csv`, so the external `csv` crate is referenced via
8//! the absolute path `::csv` to disambiguate it from the module.
9
10use std::path::Path;
11
12use crate::error::MattenDataError;
13use crate::table::{CellValue, Table};
14
15impl Table {
16 /// Parse a `Table` from a CSV string.
17 ///
18 /// The first row is the header. Empty input, empty or duplicate header names,
19 /// and ragged rows are reported as [`MattenDataError`] (never a panic).
20 ///
21 /// ```
22 /// use matten_data::Table;
23 /// let table = Table::from_csv_str("a,b\n1,2\n3,4").unwrap();
24 /// assert_eq!(table.row_count(), 2);
25 /// assert_eq!(table.column_names(), &["a".to_string(), "b".to_string()]);
26 /// ```
27 pub fn from_csv_str(input: &str) -> Result<Table, MattenDataError> {
28 if input.trim().is_empty() {
29 return Err(MattenDataError::EmptyInput);
30 }
31
32 let mut reader = ::csv::ReaderBuilder::new()
33 .has_headers(true)
34 // Allow varying record lengths so ragged rows can be reported with a
35 // precise RaggedRow error rather than the parser's generic message.
36 .flexible(true)
37 .from_reader(input.as_bytes());
38
39 let headers: Vec<String> = reader
40 .headers()
41 .map_err(|e| MattenDataError::Csv {
42 message: e.to_string(),
43 })?
44 .iter()
45 .map(|h| h.trim().to_string())
46 .collect();
47
48 if headers.is_empty() {
49 return Err(MattenDataError::EmptyInput);
50 }
51
52 for (i, name) in headers.iter().enumerate() {
53 if name.is_empty() {
54 return Err(MattenDataError::Csv {
55 message: format!("header column {} is empty", i + 1),
56 });
57 }
58 }
59
60 // Reject duplicate header names (named selection requires unambiguity).
61 for i in 0..headers.len() {
62 for j in (i + 1)..headers.len() {
63 if headers[i] == headers[j] {
64 return Err(MattenDataError::DuplicateColumn {
65 name: headers[i].clone(),
66 });
67 }
68 }
69 }
70
71 let n = headers.len();
72 let mut rows: Vec<Vec<CellValue>> = Vec::new();
73 for (idx, record) in reader.records().enumerate() {
74 let record = record.map_err(|e| MattenDataError::Csv {
75 message: e.to_string(),
76 })?;
77
78 // Skip a stray fully-empty record (e.g. a blank trailing line).
79 if record.is_empty() {
80 continue;
81 }
82
83 // Header is CSV line 1, so the first data record is line 2.
84 let line = idx + 2;
85 if record.len() != n {
86 return Err(MattenDataError::RaggedRow {
87 row: line,
88 expected: n,
89 actual: record.len(),
90 });
91 }
92
93 rows.push(record.iter().map(parse_cell).collect());
94 }
95
96 Ok(Table::from_parts(headers, rows))
97 }
98
99 /// Parse a `Table` from a CSV file at `path`.
100 ///
101 /// I/O failures (for example a missing file) are reported as
102 /// [`MattenDataError::Io`] with the path and underlying error preserved.
103 pub fn from_csv_path<P: AsRef<Path>>(path: P) -> Result<Table, MattenDataError> {
104 let path = path.as_ref();
105 let content = std::fs::read_to_string(path).map_err(|source| MattenDataError::Io {
106 path: path.to_path_buf(),
107 source,
108 })?;
109 Table::from_csv_str(&content)
110 }
111}
112
113/// Infer a [`CellValue`] from a raw CSV field (RFC-035 §4.1–4.2).
114///
115/// Surrounding whitespace is trimmed. An empty field is `Missing`; otherwise the
116/// value is inferred as `Int`, then `Float`, then `Bool` (`true`/`false`), and
117/// finally `Text`. Booleans are not numbers, and text is not parsed as numeric
118/// here — numeric conversion is strict and explicit in `try_numeric`.
119fn parse_cell(raw: &str) -> CellValue {
120 let s = raw.trim();
121 if s.is_empty() {
122 return CellValue::Missing;
123 }
124 if let Ok(i) = s.parse::<i64>() {
125 return CellValue::Int(i);
126 }
127 if let Ok(fl) = s.parse::<f64>() {
128 return CellValue::Float(fl);
129 }
130 match s {
131 "true" => CellValue::Bool(true),
132 "false" => CellValue::Bool(false),
133 _ => CellValue::Text(s.to_string()),
134 }
135}