Skip to main content

matten_data/
csv.rs

1//! CSV ingestion (RFC-035 §3–4). Behind the default-on `csv` feature.
2//!
3//! CSV input is external input: every constructor returns `Result` and malformed
4//! data never panics (RFC-035 §1). The first row is the header (required); rows
//! must be rectangular; only empty (or whitespace-only) cells are missing by default.
6//!
7//! Note: this module is named `csv`, so the external `csv` crate is referenced via
8//! the absolute path `::csv` to disambiguate it from the module.
9
10use std::path::Path;
11
12use crate::error::MattenDataError;
13use crate::table::{CellValue, Table};
14
15impl Table {
16    /// Parse a `Table` from a CSV string.
17    ///
18    /// The first row is the header. Empty input, empty or duplicate header names,
19    /// and ragged rows are reported as [`MattenDataError`] (never a panic).
20    ///
21    /// ```
22    /// use matten_data::Table;
23    /// let table = Table::from_csv_str("a,b\n1,2\n3,4").unwrap();
24    /// assert_eq!(table.row_count(), 2);
25    /// assert_eq!(table.column_names(), &["a".to_string(), "b".to_string()]);
26    /// ```
27    pub fn from_csv_str(input: &str) -> Result<Table, MattenDataError> {
28        if input.trim().is_empty() {
29            return Err(MattenDataError::EmptyInput);
30        }
31
32        let mut reader = ::csv::ReaderBuilder::new()
33            .has_headers(true)
34            // Allow varying record lengths so ragged rows can be reported with a
35            // precise RaggedRow error rather than the parser's generic message.
36            .flexible(true)
37            .from_reader(input.as_bytes());
38
39        let headers: Vec<String> = reader
40            .headers()
41            .map_err(|e| MattenDataError::Csv {
42                message: e.to_string(),
43            })?
44            .iter()
45            .map(|h| h.trim().to_string())
46            .collect();
47
48        if headers.is_empty() {
49            return Err(MattenDataError::EmptyInput);
50        }
51
52        for (i, name) in headers.iter().enumerate() {
53            if name.is_empty() {
54                return Err(MattenDataError::Csv {
55                    message: format!("header column {} is empty", i + 1),
56                });
57            }
58        }
59
60        // Reject duplicate header names (named selection requires unambiguity).
61        for i in 0..headers.len() {
62            for j in (i + 1)..headers.len() {
63                if headers[i] == headers[j] {
64                    return Err(MattenDataError::DuplicateColumn {
65                        name: headers[i].clone(),
66                    });
67                }
68            }
69        }
70
71        let n = headers.len();
72        let mut rows: Vec<Vec<CellValue>> = Vec::new();
73        for (idx, record) in reader.records().enumerate() {
74            let record = record.map_err(|e| MattenDataError::Csv {
75                message: e.to_string(),
76            })?;
77
78            // Skip a stray fully-empty record (e.g. a blank trailing line).
79            if record.is_empty() {
80                continue;
81            }
82
83            // Header is CSV line 1, so the first data record is line 2.
84            let line = idx + 2;
85            if record.len() != n {
86                return Err(MattenDataError::RaggedRow {
87                    row: line,
88                    expected: n,
89                    actual: record.len(),
90                });
91            }
92
93            rows.push(record.iter().map(parse_cell).collect());
94        }
95
96        Ok(Table::from_parts(headers, rows))
97    }
98
99    /// Parse a `Table` from a CSV file at `path`.
100    ///
101    /// I/O failures (for example a missing file) are reported as
102    /// [`MattenDataError::Io`] with the path and underlying error preserved.
103    pub fn from_csv_path<P: AsRef<Path>>(path: P) -> Result<Table, MattenDataError> {
104        let path = path.as_ref();
105        let content = std::fs::read_to_string(path).map_err(|source| MattenDataError::Io {
106            path: path.to_path_buf(),
107            source,
108        })?;
109        Table::from_csv_str(&content)
110    }
111}
112
113/// Infer a [`CellValue`] from a raw CSV field (RFC-035 §4.1–4.2).
114///
115/// Surrounding whitespace is trimmed. An empty field is `Missing`; otherwise the
116/// value is inferred as `Int`, then `Float`, then `Bool` (`true`/`false`), and
117/// finally `Text`. Booleans are not numbers, and text is not parsed as numeric
118/// here — numeric conversion is strict and explicit in `try_numeric`.
119fn parse_cell(raw: &str) -> CellValue {
120    let s = raw.trim();
121    if s.is_empty() {
122        return CellValue::Missing;
123    }
124    if let Ok(i) = s.parse::<i64>() {
125        return CellValue::Int(i);
126    }
127    if let Ok(fl) = s.parse::<f64>() {
128        return CellValue::Float(fl);
129    }
130    match s {
131        "true" => CellValue::Bool(true),
132        "false" => CellValue::Bool(false),
133        _ => CellValue::Text(s.to_string()),
134    }
135}