// matten_data/table.rs
//! The `Table` model and `CellValue` (RFC-034 §4–5).
//!
//! `Table` is a small, owned, rectangular table-like data set whose end goal is a
//! numeric [`matten::Tensor`]. It is **not** a dataframe (RFC-033, RFC-042): there
//! are no joins, group-by, pivot, query, lazy execution, or indexing APIs.
6
7use std::collections::HashSet;
8
9use crate::error::MattenDataError;
10use crate::schema::SchemaSummary;
11use crate::{numeric, schema};
12
/// One cell of a table (RFC-034 §4.2).
///
/// This type is deliberately **local to this crate** (architect ruling,
/// RFC-033–042 review Q4). It describes cells produced by table ingestion rather
/// than the dynamic values of a core `Tensor`, and it is a separate type from
/// `matten::Element`, not an alias of it. The `Text` variant owns a `String`
/// (core `Element::Text` uses `Arc<str>`); that storage choice is a local,
/// practical one and may change without affecting the public model.
#[derive(Debug, Clone, PartialEq)]
#[non_exhaustive]
pub enum CellValue {
    /// Text that parsed as neither a number nor a boolean.
    Text(String),
    /// A floating-point number.
    Float(f64),
    /// An integer.
    Int(i64),
    /// A boolean, `true` or `false`.
    Bool(bool),
    /// An empty cell, i.e. a missing value.
    Missing,
}
34
35impl From<f64> for CellValue {
36 fn from(v: f64) -> Self {
37 CellValue::Float(v)
38 }
39}
40impl From<i64> for CellValue {
41 fn from(v: i64) -> Self {
42 CellValue::Int(v)
43 }
44}
45impl From<bool> for CellValue {
46 fn from(v: bool) -> Self {
47 CellValue::Bool(v)
48 }
49}
50impl From<&str> for CellValue {
51 fn from(v: &str) -> Self {
52 CellValue::Text(v.to_string())
53 }
54}
55impl From<String> for CellValue {
56 fn from(v: String) -> Self {
57 CellValue::Text(v)
58 }
59}
60
61/// A small, owned, rectangular table-like data set.
62///
63/// External guarantees: row order is preserved; column order is preserved; column
64/// names are stable after loading. Operations return new owned `Table` values; no
65/// borrowed view lifetimes appear in normal use.
66#[derive(Debug, Clone)]
67pub struct Table {
68 headers: Vec<String>,
69 rows: Vec<Vec<CellValue>>,
70}
71
72impl Table {
73 /// Construct a `Table` from validated parts. Internal: callers (CSV ingestion,
74 /// selection, fill) guarantee that every row has `headers.len()` cells.
75 pub(crate) fn from_parts(headers: Vec<String>, rows: Vec<Vec<CellValue>>) -> Self {
76 Table { headers, rows }
77 }
78
79 /// Number of data rows.
80 pub fn row_count(&self) -> usize {
81 self.rows.len()
82 }
83
84 /// Number of columns.
85 pub fn column_count(&self) -> usize {
86 self.headers.len()
87 }
88
89 /// Column names, in column order.
90 pub fn column_names(&self) -> &[String] {
91 &self.headers
92 }
93
94 /// A small, displayable schema summary (row/column counts, per-column missing
95 /// counts and inferred kinds). Does not perform expensive analysis.
96 pub fn schema_summary(&self) -> SchemaSummary {
97 schema::summarize(self)
98 }
99
100 /// Select columns by name, returning a new `Table`.
101 ///
102 /// Behavior (RFC-034 §5.3): preserves the requested column order; errors with
103 /// [`MattenDataError::MissingColumn`] if a requested column does not exist;
104 /// rejects duplicate selections with [`MattenDataError::DuplicateSelection`];
105 /// an empty selection is [`MattenDataError::EmptySelection`].
106 pub fn select_columns<I, S>(&self, columns: I) -> Result<Table, MattenDataError>
107 where
108 I: IntoIterator<Item = S>,
109 S: AsRef<str>,
110 {
111 let requested: Vec<String> = columns
112 .into_iter()
113 .map(|s| s.as_ref().to_string())
114 .collect();
115
116 if requested.is_empty() {
117 return Err(MattenDataError::EmptySelection);
118 }
119
120 let mut seen = HashSet::with_capacity(requested.len());
121 for name in &requested {
122 if !seen.insert(name.as_str()) {
123 return Err(MattenDataError::DuplicateSelection { name: name.clone() });
124 }
125 }
126
127 let mut indices = Vec::with_capacity(requested.len());
128 for name in &requested {
129 match self.headers.iter().position(|h| h == name) {
130 Some(idx) => indices.push(idx),
131 None => return Err(MattenDataError::MissingColumn { name: name.clone() }),
132 }
133 }
134
135 let rows = self
136 .rows
137 .iter()
138 .map(|row| indices.iter().map(|&i| row[i].clone()).collect())
139 .collect();
140
141 Ok(Table::from_parts(requested, rows))
142 }
143
144 /// Fill every missing cell with `value`, returning a new `Table`.
145 ///
146 /// Missing values are never silently turned into zero; filling is always
147 /// explicit (RFC-035 §6). Non-missing cells and the shape are unchanged.
148 pub fn fill_missing(&self, value: impl Into<CellValue>) -> Result<Table, MattenDataError> {
149 let fill = value.into();
150 let rows = self
151 .rows
152 .iter()
153 .map(|row| {
154 row.iter()
155 .map(|cell| {
156 if matches!(cell, CellValue::Missing) {
157 fill.clone()
158 } else {
159 cell.clone()
160 }
161 })
162 .collect()
163 })
164 .collect();
165 Ok(Table::from_parts(self.headers.clone(), rows))
166 }
167
168 /// Convert the table to an explicit numeric table (RFC-035 §7).
169 ///
170 /// Strict conversion: `Int`/`Float` become `f64`; `Bool` and `Text` are
171 /// rejected ([`MattenDataError::NonNumericValue`]); a remaining `Missing` cell
172 /// is rejected ([`MattenDataError::MissingValue`]). Text is never parsed as a
173 /// number by default.
174 pub fn try_numeric(&self) -> Result<numeric::NumericTable, MattenDataError> {
175 numeric::try_numeric(self)
176 }
177
178 // --- crate-internal accessors used by sibling modules ---
179
180 pub(crate) fn headers(&self) -> &[String] {
181 &self.headers
182 }
183
184 pub(crate) fn rows(&self) -> &[Vec<CellValue>] {
185 &self.rows
186 }
187}