Skip to main content

virtual_frame/
dataframe.rs

//! DataFrame — columnar storage with named columns.
//!
//! The base immutable data container. All TidyView operations reference
//! a shared `Rc<DataFrame>` without copying the underlying data.
5
6use crate::column::Column;
7use std::fmt;
8
/// Error type for DataFrame construction.
///
/// Derives `PartialEq`/`Eq` so callers and tests can compare against a
/// specific variant (and its payload) instead of only checking `is_err()`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataError {
    /// A column's length differs from the length of the first column.
    ColumnLengthMismatch {
        /// Length of the first column, which every other column must match.
        expected: usize,
        /// Length of the offending column.
        got: usize,
        /// Name of the offending column.
        column: String,
    },
    /// The same column name was supplied more than once; carries that name.
    DuplicateColumn(String),
    /// Empty columns list.
    ///
    /// Note: `DataFrame::from_columns` accepts an empty list and returns an
    /// empty frame, so it does not produce this variant.
    Empty,
}
23
24impl fmt::Display for DataError {
25    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
26        match self {
27            DataError::ColumnLengthMismatch {
28                expected,
29                got,
30                column,
31            } => write!(
32                f,
33                "column `{}` has {} rows, expected {}",
34                column, got, expected
35            ),
36            DataError::DuplicateColumn(name) => write!(f, "duplicate column `{}`", name),
37            DataError::Empty => write!(f, "no columns provided"),
38        }
39    }
40}
41
42impl std::error::Error for DataError {}
43
44/// Columnar data storage. Each column is a contiguous typed buffer.
45///
46/// Immutable once constructed — mutations go through `TidyFrame`
47/// (copy-on-write) or produce new DataFrames.
48#[derive(Debug, Clone)]
49pub struct DataFrame {
50    pub columns: Vec<(String, Column)>,
51}
52
53impl DataFrame {
54    /// Create an empty DataFrame with no columns.
55    pub fn new() -> Self {
56        Self {
57            columns: Vec::new(),
58        }
59    }
60
61    /// Create a DataFrame from named columns.
62    ///
63    /// All columns must have the same length.
64    pub fn from_columns(columns: Vec<(String, Column)>) -> Result<Self, DataError> {
65        if columns.is_empty() {
66            return Ok(Self {
67                columns: Vec::new(),
68            });
69        }
70        // Check for duplicate names
71        let mut names = std::collections::BTreeSet::new();
72        for (name, _) in &columns {
73            if !names.insert(name.as_str()) {
74                return Err(DataError::DuplicateColumn(name.clone()));
75            }
76        }
77        // Check all columns have equal length
78        let len = columns[0].1.len();
79        for (name, col) in &columns {
80            if col.len() != len {
81                return Err(DataError::ColumnLengthMismatch {
82                    expected: len,
83                    got: col.len(),
84                    column: name.clone(),
85                });
86            }
87        }
88        Ok(Self { columns })
89    }
90
91    /// Number of rows (from first column, or 0 if empty).
92    pub fn nrows(&self) -> usize {
93        self.columns.first().map(|(_, c)| c.len()).unwrap_or(0)
94    }
95
96    /// Number of columns.
97    pub fn ncols(&self) -> usize {
98        self.columns.len()
99    }
100
101    /// Get a column by name.
102    pub fn get_column(&self, name: &str) -> Option<&Column> {
103        self.columns
104            .iter()
105            .find(|(n, _)| n == name)
106            .map(|(_, c)| c)
107    }
108
109    /// Get column index by name.
110    pub fn column_index(&self, name: &str) -> Option<usize> {
111        self.columns.iter().position(|(n, _)| n == name)
112    }
113
114    /// Get column names.
115    pub fn column_names(&self) -> Vec<&str> {
116        self.columns.iter().map(|(n, _)| n.as_str()).collect()
117    }
118}
119
120impl Default for DataFrame {
121    fn default() -> Self {
122        Self::new()
123    }
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Equal-length, uniquely named columns are accepted and reported back
    /// in input order.
    #[test]
    fn test_from_columns() {
        let df = DataFrame::from_columns(vec![
            ("id".into(), Column::Int(vec![1, 2, 3])),
            ("name".into(), Column::Str(vec!["a".into(), "b".into(), "c".into()])),
        ])
        .unwrap();
        assert_eq!(df.nrows(), 3);
        assert_eq!(df.ncols(), 2);
        assert_eq!(df.column_names(), vec!["id", "name"]);
    }

    /// An empty column list is not an error; it yields an empty frame.
    #[test]
    fn test_empty_columns() {
        let df = DataFrame::from_columns(Vec::new()).unwrap();
        assert_eq!(df.nrows(), 0);
        assert_eq!(df.ncols(), 0);
        assert!(df.column_names().is_empty());
    }

    /// `Default` and `new` both produce a frame with no columns.
    #[test]
    fn test_default_is_empty() {
        assert_eq!(DataFrame::default().ncols(), 0);
        assert_eq!(DataFrame::new().ncols(), 0);
    }

    /// A length mismatch names the offending column and both lengths.
    #[test]
    fn test_length_mismatch() {
        let result = DataFrame::from_columns(vec![
            ("a".into(), Column::Int(vec![1, 2])),
            ("b".into(), Column::Int(vec![1, 2, 3])),
        ]);
        // Match the variant rather than `is_err()`, so a different error
        // cannot satisfy the test.
        match result {
            Err(DataError::ColumnLengthMismatch {
                expected,
                got,
                column,
            }) => {
                assert_eq!(expected, 2);
                assert_eq!(got, 3);
                assert_eq!(column, "b");
            }
            other => panic!("expected ColumnLengthMismatch, got {:?}", other),
        }
    }

    /// A repeated name is rejected and the repeated name is reported.
    #[test]
    fn test_duplicate_column() {
        let result = DataFrame::from_columns(vec![
            ("a".into(), Column::Int(vec![1])),
            ("a".into(), Column::Int(vec![2])),
        ]);
        match result {
            Err(DataError::DuplicateColumn(name)) => assert_eq!(name, "a"),
            other => panic!("expected DuplicateColumn, got {:?}", other),
        }
    }

    /// Names are validated before lengths, so the duplicate wins when both
    /// problems are present.
    #[test]
    fn test_duplicate_checked_before_length() {
        let result = DataFrame::from_columns(vec![
            ("a".into(), Column::Int(vec![1])),
            ("b".into(), Column::Int(vec![1, 2])),
            ("a".into(), Column::Int(vec![3])),
        ]);
        match result {
            Err(DataError::DuplicateColumn(name)) => assert_eq!(name, "a"),
            other => panic!("expected DuplicateColumn, got {:?}", other),
        }
    }

    /// Lookups by name find existing columns and return `None` otherwise.
    #[test]
    fn test_get_column() {
        let df = DataFrame::from_columns(vec![
            ("x".into(), Column::Float(vec![1.0, 2.0])),
        ])
        .unwrap();
        assert!(df.get_column("x").is_some());
        assert!(df.get_column("y").is_none());
        assert_eq!(df.column_index("x"), Some(0));
        assert_eq!(df.column_index("y"), None);
    }
}