Skip to main content

nexcore_dataframe/
dataframe.rs

1//! DataFrame: the core columnar data structure.
2
3use crate::column::{Column, DataType};
4use crate::error::DataFrameError;
5use crate::scalar::Scalar;
6use crate::schema::Schema;
7
8/// A columnar data structure. Each column is a named, typed array.
9/// All columns must have the same length.
10#[derive(Debug, Clone)]
11pub struct DataFrame {
12    columns: Vec<Column>,
13}
14
15impl DataFrame {
16    /// Create a new DataFrame from columns.
17    /// All columns must have the same length. Empty vec produces an empty DataFrame.
18    pub fn new(columns: Vec<Column>) -> Result<Self, DataFrameError> {
19        if columns.is_empty() {
20            return Ok(Self { columns });
21        }
22        // columns is non-empty (checked above), so index 0 and slice [1..] are always valid
23        #[allow(
24            clippy::indexing_slicing,
25            reason = "columns is non-empty (checked by is_empty() guard above); index 0 and slice [1..] are always in bounds"
26        )]
27        let expected = columns[0].len();
28        #[allow(
29            clippy::indexing_slicing,
30            reason = "columns is non-empty; slice [1..] on a non-empty Vec is always valid (may produce empty slice)"
31        )]
32        for col in &columns[1..] {
33            if col.len() != expected {
34                return Err(DataFrameError::LengthMismatch {
35                    expected,
36                    actual: col.len(),
37                });
38            }
39        }
40        Ok(Self { columns })
41    }
42
43    /// Create an empty DataFrame (no columns, no rows).
44    #[must_use]
45    pub fn empty() -> Self {
46        Self {
47            columns: Vec::new(),
48        }
49    }
50
51    /// Number of rows.
52    #[must_use]
53    pub fn height(&self) -> usize {
54        self.columns.first().map_or(0, |c| c.len())
55    }
56
57    /// Number of columns.
58    #[must_use]
59    pub fn width(&self) -> usize {
60        self.columns.len()
61    }
62
63    /// Whether the DataFrame has no columns.
64    #[must_use]
65    pub fn is_empty(&self) -> bool {
66        self.columns.is_empty()
67    }
68
69    /// Get a column by name.
70    pub fn column(&self, name: &str) -> Result<&Column, DataFrameError> {
71        self.columns
72            .iter()
73            .find(|c| c.name() == name)
74            .ok_or_else(|| DataFrameError::ColumnNotFound(name.to_string()))
75    }
76
77    /// Get all column names.
78    pub fn column_names(&self) -> Vec<&str> {
79        self.columns.iter().map(|c| c.name()).collect()
80    }
81
82    /// Get all columns as a slice.
83    #[must_use]
84    pub fn columns(&self) -> &[Column] {
85        &self.columns
86    }
87
88    /// Get the schema (column names and types).
89    #[must_use]
90    pub fn schema(&self) -> Schema {
91        Schema::new(
92            self.columns
93                .iter()
94                .map(|c| (c.name().to_string(), c.dtype()))
95                .collect(),
96        )
97    }
98
99    /// Get a row as a vector of Scalars.
100    pub fn row(&self, index: usize) -> Option<Vec<Scalar>> {
101        if index >= self.height() {
102            return None;
103        }
104        Some(
105            self.columns
106                .iter()
107                .map(|c| c.get(index).unwrap_or(Scalar::Null))
108                .collect(),
109        )
110    }
111
112    /// Add or replace a column. If a column with the same name exists, it is replaced.
113    pub fn with_column(&self, col: Column) -> Result<Self, DataFrameError> {
114        if !self.is_empty() && col.len() != self.height() {
115            return Err(DataFrameError::LengthMismatch {
116                expected: self.height(),
117                actual: col.len(),
118            });
119        }
120        let mut columns: Vec<Column> = self
121            .columns
122            .iter()
123            .filter(|c| c.name() != col.name())
124            .cloned()
125            .collect();
126        columns.push(col);
127        Ok(Self { columns })
128    }
129
130    /// Internal: build DataFrame from pre-validated columns (same length guaranteed).
131    pub(crate) fn from_columns_unchecked(columns: Vec<Column>) -> Self {
132        Self { columns }
133    }
134}
135
136impl std::fmt::Display for DataFrame {
137    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
138        // Header
139        let names: Vec<&str> = self.columns.iter().map(|c| c.name()).collect();
140        writeln!(f, "{}", names.join("\t"))?;
141        // Rows (max 20)
142        let max_rows = self.height().min(20);
143        for i in 0..max_rows {
144            let vals: Vec<String> = self
145                .columns
146                .iter()
147                .map(|c| c.get(i).map_or("null".to_string(), |s| s.to_string()))
148                .collect();
149            writeln!(f, "{}", vals.join("\t"))?;
150        }
151        if self.height() > max_rows {
152            // max_rows = height.min(20) <= height, so subtraction cannot underflow
153            #[allow(
154                clippy::arithmetic_side_effects,
155                reason = "max_rows = self.height().min(20) so max_rows <= self.height(); subtraction cannot underflow"
156            )]
157            writeln!(f, "... ({} more rows)", self.height() - max_rows)?;
158        }
159        Ok(())
160    }
161}
162
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a DataFrame from columns the test knows to be equally long.
    fn frame(columns: Vec<Column>) -> DataFrame {
        DataFrame::new(columns).unwrap_or_else(|_| unreachable!())
    }

    /// Equal-length columns are accepted and reported with the right shape.
    #[test]
    fn new_valid() {
        let built = DataFrame::new(vec![
            Column::from_strs("name", &["a", "b"]),
            Column::from_i64s("val", vec![1, 2]),
        ]);
        assert!(built.is_ok());
        let df = built.unwrap_or_else(|_| unreachable!());
        assert_eq!(df.height(), 2);
        assert_eq!(df.width(), 2);
    }

    /// Columns of different lengths are rejected.
    #[test]
    fn new_length_mismatch() {
        let built = DataFrame::new(vec![
            Column::from_strs("a", &["x", "y"]),
            Column::from_i64s("b", vec![1]),
        ]);
        assert!(built.is_err());
    }

    /// `empty()` has neither rows nor columns.
    #[test]
    fn empty_dataframe() {
        let df = DataFrame::empty();
        assert_eq!(df.height(), 0);
        assert_eq!(df.width(), 0);
        assert!(df.is_empty());
    }

    /// Lookup succeeds for a present name and fails for an absent one.
    #[test]
    fn column_lookup() {
        let df = frame(vec![
            Column::from_strs("name", &["a"]),
            Column::from_i64s("val", vec![1]),
        ]);
        assert!(df.column("name").is_ok());
        assert!(df.column("missing").is_err());
    }

    /// The schema lists every column with its data type.
    #[test]
    fn schema_extraction() {
        let df = frame(vec![
            Column::from_strs("s", &["x"]),
            Column::from_f64s("f", vec![1.0]),
        ]);
        let schema = df.schema();
        assert_eq!(schema.len(), 2);
        assert_eq!(schema.dtype("s"), Some(DataType::Utf8));
        assert_eq!(schema.dtype("f"), Some(DataType::Float64));
    }

    /// A column with a new name widens the DataFrame.
    #[test]
    fn with_column_add() {
        let base = frame(vec![Column::from_i64s("a", vec![1, 2])]);
        let widened = base
            .with_column(Column::from_i64s("b", vec![3, 4]))
            .unwrap_or_else(|_| unreachable!());
        assert_eq!(widened.width(), 2);
    }

    /// A column with an existing name replaces the old values.
    #[test]
    fn with_column_replace() {
        let base = frame(vec![Column::from_i64s("a", vec![1, 2])]);
        let replaced = base
            .with_column(Column::from_i64s("a", vec![10, 20]))
            .unwrap_or_else(|_| unreachable!());
        assert_eq!(replaced.width(), 1);
        let first_value = replaced.column("a").ok().and_then(|c| c.get(0));
        assert_eq!(first_value, Some(Scalar::Int64(10)));
    }

    /// A column of the wrong length cannot be added.
    #[test]
    fn with_column_length_mismatch() {
        let base = frame(vec![Column::from_i64s("a", vec![1, 2])]);
        let outcome = base.with_column(Column::from_i64s("b", vec![1]));
        assert!(outcome.is_err());
    }

    /// Rows come back cell by cell; indices past the end yield `None`.
    #[test]
    fn row_access() {
        let df = frame(vec![
            Column::from_strs("name", &["alice"]),
            Column::from_i64s("age", vec![30]),
        ]);
        assert_eq!(
            df.row(0),
            Some(vec![Scalar::String("alice".to_string()), Scalar::Int64(30)])
        );
        assert_eq!(df.row(1), None);
    }
}