Skip to main content

nexcore_dataframe/
column.rs

1//! Column: a named, typed array — the fundamental storage unit.
2
3use crate::error::DataFrameError;
4use crate::scalar::Scalar;
5
/// Logical element type of a column.
///
/// This is a plain tag with no payload: it is `Copy`, comparable with `==`,
/// and hashable, so it can be used directly as a `HashMap`/`HashSet` key
/// (for example when grouping columns by type).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum DataType {
    /// Boolean values.
    Bool,
    /// Signed 64-bit integers.
    Int64,
    /// Unsigned 64-bit integers.
    UInt64,
    /// 64-bit floating-point numbers.
    Float64,
    /// Text values.
    Utf8,
}
16
/// Typed backing storage for a column.
///
/// Every variant wraps a `Vec<Option<T>>`; a `None` slot represents a null
/// value at that row.
#[non_exhaustive]
#[derive(Clone, Debug)]
pub enum ColumnData {
    /// Nullable booleans.
    Bool(Vec<Option<bool>>),
    /// Nullable signed 64-bit integers.
    Int64(Vec<Option<i64>>),
    /// Nullable unsigned 64-bit integers.
    UInt64(Vec<Option<u64>>),
    /// Nullable 64-bit floats.
    Float64(Vec<Option<f64>>),
    /// Nullable owned strings.
    String(Vec<Option<String>>),
}
27
28impl ColumnData {
29    /// Number of elements (including nulls).
30    #[must_use]
31    pub fn len(&self) -> usize {
32        match self {
33            Self::Bool(v) => v.len(),
34            Self::Int64(v) => v.len(),
35            Self::UInt64(v) => v.len(),
36            Self::Float64(v) => v.len(),
37            Self::String(v) => v.len(),
38        }
39    }
40
41    /// Whether the column has no elements.
42    #[must_use]
43    pub fn is_empty(&self) -> bool {
44        self.len() == 0
45    }
46
47    /// Get the data type.
48    #[must_use]
49    pub fn dtype(&self) -> DataType {
50        match self {
51            Self::Bool(_) => DataType::Bool,
52            Self::Int64(_) => DataType::Int64,
53            Self::UInt64(_) => DataType::UInt64,
54            Self::Float64(_) => DataType::Float64,
55            Self::String(_) => DataType::Utf8,
56        }
57    }
58
59    /// Get value at index as Scalar.
60    #[must_use]
61    pub fn get(&self, index: usize) -> Option<Scalar> {
62        match self {
63            Self::Bool(v) => v.get(index).map(|o| match o {
64                Some(b) => Scalar::Bool(*b),
65                None => Scalar::Null,
66            }),
67            Self::Int64(v) => v.get(index).map(|o| match o {
68                Some(n) => Scalar::Int64(*n),
69                None => Scalar::Null,
70            }),
71            Self::UInt64(v) => v.get(index).map(|o| match o {
72                Some(n) => Scalar::UInt64(*n),
73                None => Scalar::Null,
74            }),
75            Self::Float64(v) => v.get(index).map(|o| match o {
76                Some(n) => Scalar::Float64(*n),
77                None => Scalar::Null,
78            }),
79            Self::String(v) => v.get(index).map(|o| match o {
80                Some(s) => Scalar::String(s.clone()),
81                None => Scalar::Null,
82            }),
83        }
84    }
85
86    /// Count of non-null values.
87    #[must_use]
88    pub fn non_null_count(&self) -> usize {
89        match self {
90            Self::Bool(v) => v.iter().filter(|o| o.is_some()).count(),
91            Self::Int64(v) => v.iter().filter(|o| o.is_some()).count(),
92            Self::UInt64(v) => v.iter().filter(|o| o.is_some()).count(),
93            Self::Float64(v) => v.iter().filter(|o| o.is_some()).count(),
94            Self::String(v) => v.iter().filter(|o| o.is_some()).count(),
95        }
96    }
97
98    /// Collect rows at specified indices into a new ColumnData.
99    ///
100    /// # Panics
101    ///
102    /// Panics if any index in `indices` is out of bounds. Callers must ensure all
103    /// indices are valid (e.g., from `filter` or `sort` which derive indices from
104    /// `0..self.len()`).
105    pub fn take(&self, indices: &[usize]) -> Self {
106        #[allow(
107            clippy::indexing_slicing,
108            reason = "indices are always derived from 0..len() in sort/filter/group_by — bounds are structurally guaranteed by callers"
109        )]
110        match self {
111            Self::Bool(v) => Self::Bool(indices.iter().map(|&i| v[i]).collect()),
112            Self::Int64(v) => Self::Int64(indices.iter().map(|&i| v[i]).collect()),
113            Self::UInt64(v) => Self::UInt64(indices.iter().map(|&i| v[i]).collect()),
114            Self::Float64(v) => Self::Float64(indices.iter().map(|&i| v[i]).collect()),
115            Self::String(v) => Self::String(indices.iter().map(|&i| v[i].clone()).collect()),
116        }
117    }
118
119    /// Collect rows at specified optional indices into a new ColumnData.
120    ///
121    /// `Some(i)` takes the value at index `i`; `None` produces a null value.
122    /// Used by join operations where one side may have no matching row.
123    pub fn take_optional(&self, indices: &[Option<usize>]) -> Self {
124        #[allow(
125            clippy::indexing_slicing,
126            reason = "Some(i) indices are derived from 0..len() in join probe — bounds are structurally guaranteed by callers"
127        )]
128        match self {
129            Self::Bool(v) => Self::Bool(indices.iter().map(|opt| opt.and_then(|i| v[i])).collect()),
130            Self::Int64(v) => {
131                Self::Int64(indices.iter().map(|opt| opt.and_then(|i| v[i])).collect())
132            }
133            Self::UInt64(v) => {
134                Self::UInt64(indices.iter().map(|opt| opt.and_then(|i| v[i])).collect())
135            }
136            Self::Float64(v) => {
137                Self::Float64(indices.iter().map(|opt| opt.and_then(|i| v[i])).collect())
138            }
139            Self::String(v) => Self::String(
140                indices
141                    .iter()
142                    .map(|opt| opt.and_then(|i| v[i].clone()))
143                    .collect(),
144            ),
145        }
146    }
147}
148
149/// A single named column with homogeneous type.
150#[derive(Debug, Clone)]
151pub struct Column {
152    name: String,
153    data: ColumnData,
154}
155
156impl Column {
157    // =========================================================================
158    // Nullable constructors
159    // =========================================================================
160
161    /// Create a boolean column with nullable values.
162    pub fn new_bool(name: impl Into<String>, data: Vec<Option<bool>>) -> Self {
163        Self {
164            name: name.into(),
165            data: ColumnData::Bool(data),
166        }
167    }
168
169    /// Create an i64 column with nullable values.
170    pub fn new_i64(name: impl Into<String>, data: Vec<Option<i64>>) -> Self {
171        Self {
172            name: name.into(),
173            data: ColumnData::Int64(data),
174        }
175    }
176
177    /// Create a u64 column with nullable values.
178    pub fn new_u64(name: impl Into<String>, data: Vec<Option<u64>>) -> Self {
179        Self {
180            name: name.into(),
181            data: ColumnData::UInt64(data),
182        }
183    }
184
185    /// Create an f64 column with nullable values.
186    pub fn new_f64(name: impl Into<String>, data: Vec<Option<f64>>) -> Self {
187        Self {
188            name: name.into(),
189            data: ColumnData::Float64(data),
190        }
191    }
192
193    /// Create a string column with nullable values.
194    pub fn new_string(name: impl Into<String>, data: Vec<Option<String>>) -> Self {
195        Self {
196            name: name.into(),
197            data: ColumnData::String(data),
198        }
199    }
200
201    // =========================================================================
202    // Convenience constructors (non-nullable)
203    // =========================================================================
204
205    /// Create a boolean column from non-nullable values.
206    pub fn from_bools(name: impl Into<String>, data: Vec<bool>) -> Self {
207        Self::new_bool(name, data.into_iter().map(Some).collect())
208    }
209
210    /// Create an i64 column from non-nullable values.
211    pub fn from_i64s(name: impl Into<String>, data: Vec<i64>) -> Self {
212        Self::new_i64(name, data.into_iter().map(Some).collect())
213    }
214
215    /// Create a u64 column from non-nullable values.
216    pub fn from_u64s(name: impl Into<String>, data: Vec<u64>) -> Self {
217        Self::new_u64(name, data.into_iter().map(Some).collect())
218    }
219
220    /// Create an f64 column from non-nullable values.
221    pub fn from_f64s(name: impl Into<String>, data: Vec<f64>) -> Self {
222        Self::new_f64(name, data.into_iter().map(Some).collect())
223    }
224
225    /// Create a string column from owned strings (non-nullable).
226    pub fn from_strings(name: impl Into<String>, data: Vec<String>) -> Self {
227        Self::new_string(name, data.into_iter().map(Some).collect())
228    }
229
230    /// Create a string column from string slices (non-nullable).
231    pub fn from_strs(name: impl Into<String>, data: &[&str]) -> Self {
232        Self::new_string(name, data.iter().map(|s| Some((*s).to_string())).collect())
233    }
234
235    // =========================================================================
236    // Accessors
237    // =========================================================================
238
239    /// Column name.
240    #[must_use]
241    pub fn name(&self) -> &str {
242        &self.name
243    }
244
245    /// Data type of this column.
246    #[must_use]
247    pub fn dtype(&self) -> DataType {
248        self.data.dtype()
249    }
250
251    /// Number of elements (including nulls).
252    #[must_use]
253    pub fn len(&self) -> usize {
254        self.data.len()
255    }
256
257    /// Whether the column is empty.
258    #[must_use]
259    pub fn is_empty(&self) -> bool {
260        self.data.is_empty()
261    }
262
263    /// Count of non-null values.
264    #[must_use]
265    pub fn non_null_count(&self) -> usize {
266        self.data.non_null_count()
267    }
268
269    /// Count of null values.
270    #[must_use]
271    pub fn null_count(&self) -> usize {
272        // non_null_count() <= len() is an invariant: every non-null is also in len()
273        #[allow(
274            clippy::arithmetic_side_effects,
275            reason = "non_null_count() is always <= len() by construction — both count the same Vec elements"
276        )]
277        {
278            self.len() - self.non_null_count()
279        }
280    }
281
282    /// Get the underlying data reference.
283    #[must_use]
284    pub fn data(&self) -> &ColumnData {
285        &self.data
286    }
287
288    /// Get value at index as Scalar. Returns None if index out of bounds.
289    #[must_use]
290    pub fn get(&self, index: usize) -> Option<Scalar> {
291        self.data.get(index)
292    }
293
294    /// Rename this column (returns a new column).
295    #[must_use]
296    pub fn rename(&self, name: impl Into<String>) -> Self {
297        Self {
298            name: name.into(),
299            data: self.data.clone(),
300        }
301    }
302
303    /// Take rows at specified indices.
304    pub fn take(&self, indices: &[usize]) -> Self {
305        Self {
306            name: self.name.clone(),
307            data: self.data.take(indices),
308        }
309    }
310
311    /// Take rows at optional indices. `None` produces null values.
312    /// Used by join operations where one side may have no matching row.
313    pub fn take_optional(&self, indices: &[Option<usize>]) -> Self {
314        Self {
315            name: self.name.clone(),
316            data: self.data.take_optional(indices),
317        }
318    }
319
320    // =========================================================================
321    // Typed iterators
322    // =========================================================================
323
324    /// Iterate as string references. Returns Err if column is not Utf8.
325    pub fn as_str_iter(&self) -> Result<impl Iterator<Item = Option<&str>>, DataFrameError> {
326        match &self.data {
327            ColumnData::String(v) => Ok(v.iter().map(|o| o.as_deref())),
328            ColumnData::Bool(_)
329            | ColumnData::Int64(_)
330            | ColumnData::UInt64(_)
331            | ColumnData::Float64(_) => Err(DataFrameError::TypeMismatch {
332                column: self.name.clone(),
333                expected: DataType::Utf8,
334                actual: self.dtype(),
335            }),
336        }
337    }
338
339    /// Iterate as i64 values. Returns Err if column is not Int64.
340    pub fn as_i64_iter(&self) -> Result<impl Iterator<Item = Option<i64>> + '_, DataFrameError> {
341        match &self.data {
342            ColumnData::Int64(v) => Ok(v.iter().copied()),
343            ColumnData::Bool(_)
344            | ColumnData::UInt64(_)
345            | ColumnData::Float64(_)
346            | ColumnData::String(_) => Err(DataFrameError::TypeMismatch {
347                column: self.name.clone(),
348                expected: DataType::Int64,
349                actual: self.dtype(),
350            }),
351        }
352    }
353
354    /// Iterate as u64 values. Returns Err if column is not UInt64.
355    pub fn as_u64_iter(&self) -> Result<impl Iterator<Item = Option<u64>> + '_, DataFrameError> {
356        match &self.data {
357            ColumnData::UInt64(v) => Ok(v.iter().copied()),
358            ColumnData::Bool(_)
359            | ColumnData::Int64(_)
360            | ColumnData::Float64(_)
361            | ColumnData::String(_) => Err(DataFrameError::TypeMismatch {
362                column: self.name.clone(),
363                expected: DataType::UInt64,
364                actual: self.dtype(),
365            }),
366        }
367    }
368
369    /// Iterate as f64 values. Returns Err if column is not Float64.
370    pub fn as_f64_iter(&self) -> Result<impl Iterator<Item = Option<f64>> + '_, DataFrameError> {
371        match &self.data {
372            ColumnData::Float64(v) => Ok(v.iter().copied()),
373            ColumnData::Bool(_)
374            | ColumnData::Int64(_)
375            | ColumnData::UInt64(_)
376            | ColumnData::String(_) => Err(DataFrameError::TypeMismatch {
377                column: self.name.clone(),
378                expected: DataType::Float64,
379                actual: self.dtype(),
380            }),
381        }
382    }
383
384    /// Iterate as bool values. Returns Err if column is not Bool.
385    pub fn as_bool_iter(&self) -> Result<impl Iterator<Item = Option<bool>> + '_, DataFrameError> {
386        match &self.data {
387            ColumnData::Bool(v) => Ok(v.iter().copied()),
388            ColumnData::Int64(_)
389            | ColumnData::UInt64(_)
390            | ColumnData::Float64(_)
391            | ColumnData::String(_) => Err(DataFrameError::TypeMismatch {
392                column: self.name.clone(),
393                expected: DataType::Bool,
394                actual: self.dtype(),
395            }),
396        }
397    }
398
399    /// Get a string value at index. Returns Err if not Utf8 column.
400    pub fn get_str(&self, index: usize) -> Result<Option<&str>, DataFrameError> {
401        match &self.data {
402            ColumnData::String(v) => match v.get(index) {
403                Some(o) => Ok(o.as_deref()),
404                None => Err(DataFrameError::IndexOutOfBounds {
405                    index,
406                    length: v.len(),
407                }),
408            },
409            ColumnData::Bool(_)
410            | ColumnData::Int64(_)
411            | ColumnData::UInt64(_)
412            | ColumnData::Float64(_) => Err(DataFrameError::TypeMismatch {
413                column: self.name.clone(),
414                expected: DataType::Utf8,
415                actual: self.dtype(),
416            }),
417        }
418    }
419}
420
#[cfg(test)]
mod tests {
    use super::*;

    /// Non-nullable string construction reports the right name, type and counts.
    #[test]
    fn from_strs_construction() {
        let c = Column::from_strs("names", &["alice", "bob", "carol"]);
        assert_eq!(c.name(), "names");
        assert_eq!(c.dtype(), DataType::Utf8);
        assert_eq!(c.len(), 3);
        assert!(!c.is_empty());
        assert_eq!(c.non_null_count(), 3);
        assert_eq!(c.null_count(), 0);
    }

    /// Null slots are counted separately and surface as `Scalar::Null`;
    /// out-of-range lookups return `None`.
    #[test]
    fn nullable_column() {
        let c = Column::new_i64("x", vec![Some(1), None, Some(3)]);
        assert_eq!(c.len(), 3);
        assert_eq!(c.non_null_count(), 2);
        assert_eq!(c.null_count(), 1);
        assert_eq!(c.get(0), Some(Scalar::Int64(1)));
        assert_eq!(c.get(1), Some(Scalar::Null));
        assert_eq!(c.get(3), None);
    }

    /// The matching typed iterator succeeds; a mismatched one errors.
    #[test]
    fn typed_iterators() {
        let c = Column::from_i64s("nums", vec![10, 20, 30]);
        // `.ok()` keeps a failed lookup visible as `None` instead of hiding it.
        let vals: Option<Vec<Option<i64>>> = c.as_i64_iter().ok().map(|it| it.collect());
        assert_eq!(vals, Some(vec![Some(10), Some(20), Some(30)]));

        // Type mismatch
        assert!(c.as_str_iter().is_err());
    }

    /// `take` gathers rows in the order given by the index slice.
    #[test]
    fn take_indices() {
        let c = Column::from_strs("x", &["a", "b", "c", "d"]);
        let taken = c.take(&[0, 2, 3]);
        assert_eq!(taken.name(), "x");
        assert_eq!(taken.len(), 3);
        // Compare against `Some(..)` so an `Err` cannot masquerade as a null.
        assert_eq!(taken.get_str(0).ok(), Some(Some("a")));
        assert_eq!(taken.get_str(1).ok(), Some(Some("c")));
        assert_eq!(taken.get_str(2).ok(), Some(Some("d")));
    }

    /// `rename` changes only the name.
    #[test]
    fn rename_column() {
        let c = Column::from_i64s("old", vec![1, 2]);
        let c2 = c.rename("new");
        assert_eq!(c2.name(), "new");
        assert_eq!(c2.len(), 2);
        assert_eq!(c.name(), "old");
    }

    /// `take_optional` maps `None` indices to null slots for every storage type.
    #[test]
    fn take_optional_indices() {
        let c = Column::from_strs("x", &["a", "b", "c"]);
        let taken = c.take_optional(&[Some(0), None, Some(2)]);
        assert_eq!(taken.len(), 3);
        assert_eq!(taken.get_str(0).ok(), Some(Some("a")));
        // `Some(None)` = lookup succeeded and the slot is null.
        assert_eq!(taken.get_str(1).ok(), Some(None));
        assert_eq!(taken.get_str(2).ok(), Some(Some("c")));
        assert_eq!(taken.null_count(), 1);

        // Numeric types
        let n = Column::from_i64s("n", vec![10, 20, 30]);
        let taken = n.take_optional(&[None, Some(1), Some(2)]);
        assert_eq!(taken.get(0), Some(Scalar::Null));
        assert_eq!(taken.get(1), Some(Scalar::Int64(20)));
        assert_eq!(taken.get(2), Some(Scalar::Int64(30)));
    }

    /// `get_str` reports both of its failure modes as errors.
    #[test]
    fn get_str_error_paths() {
        let s = Column::from_strs("s", &["a"]);
        assert!(s.get_str(1).is_err());

        let n = Column::from_i64s("n", vec![1]);
        assert!(n.get_str(0).is_err());
    }

    /// Every convenience constructor yields the expected data type.
    #[test]
    fn all_data_types_construct() {
        let b = Column::from_bools("b", vec![true, false]);
        assert_eq!(b.dtype(), DataType::Bool);
        let i = Column::from_i64s("i", vec![1, 2]);
        assert_eq!(i.dtype(), DataType::Int64);
        let u = Column::from_u64s("u", vec![1, 2]);
        assert_eq!(u.dtype(), DataType::UInt64);
        let f = Column::from_f64s("f", vec![1.0, 2.0]);
        assert_eq!(f.dtype(), DataType::Float64);
        let s = Column::from_strings("s", vec!["a".into(), "b".into()]);
        assert_eq!(s.dtype(), DataType::Utf8);
    }
}