// virtual-frame 0.1.1
//
// Deterministic data pipeline toolkit for LLM training — bitmask-filtered virtual views, NFA regex, Kahan summation, full audit trail. Python bindings included.
// Documentation
//! DataFrame — columnar storage with named columns.
//!
//! The base immutable data container. All TidyView operations reference
//! a shared `Rc<DataFrame>` without copying the underlying data.

use crate::column::Column;
use std::fmt;

/// Errors reported while validating the columns of a `DataFrame`.
///
/// Derives `PartialEq`/`Eq` so callers and tests can compare errors directly
/// (for example with `assert_eq!`) instead of only checking `is_err()`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataError {
    /// A column's length differs from the length of the first column.
    ColumnLengthMismatch {
        /// Row count of the first column, which every column must match.
        expected: usize,
        /// Row count of the offending column.
        got: usize,
        /// Name of the offending column.
        column: String,
    },
    /// The same column name was supplied more than once; carries that name.
    DuplicateColumn(String),
    /// No columns were provided.
    ///
    /// `DataFrame::from_columns` accepts an empty list and does not produce
    /// this variant.
    Empty,
}

impl fmt::Display for DataError {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            DataError::ColumnLengthMismatch {
                expected,
                got,
                column,
            } => write!(
                f,
                "column `{}` has {} rows, expected {}",
                column, got, expected
            ),
            DataError::DuplicateColumn(name) => write!(f, "duplicate column `{}`", name),
            DataError::Empty => write!(f, "no columns provided"),
        }
    }
}

// Opts `DataError` into the standard error trait. The body is empty, so every
// trait method keeps its default implementation.
impl std::error::Error for DataError {}

/// Columnar data storage. Each column is a contiguous typed buffer.
///
/// Immutable once constructed — mutations go through `TidyFrame`
/// (copy-on-write) or produce new DataFrames.
#[derive(Debug, Clone)]
pub struct DataFrame {
    /// Columns as `(name, data)` pairs, in the order they were supplied.
    ///
    /// `DataFrame::from_columns` checks that names are unique and that all
    /// columns have the same length. The field is public, so a value built
    /// or modified directly does not go through those checks.
    pub columns: Vec<(String, Column)>,
}

impl DataFrame {
    /// Build a DataFrame that holds no columns.
    pub fn new() -> Self {
        let columns = Vec::new();
        Self { columns }
    }

    /// Build a DataFrame from `(name, column)` pairs, preserving their order.
    ///
    /// An empty list is accepted and yields an empty DataFrame.
    ///
    /// # Errors
    ///
    /// * `DataError::DuplicateColumn` — a name appears more than once; the
    ///   first repeated name (in input order) is reported.
    /// * `DataError::ColumnLengthMismatch` — a column's length differs from
    ///   the first column's; the first such column is reported.
    ///
    /// Names are checked before lengths, so an input with both problems
    /// reports the duplicate.
    pub fn from_columns(columns: Vec<(String, Column)>) -> Result<Self, DataError> {
        use std::collections::BTreeSet;

        if columns.is_empty() {
            return Ok(Self::new());
        }

        // Name uniqueness: `insert` returns false for a name already seen.
        let mut seen = BTreeSet::new();
        let repeated = columns.iter().find_map(|(label, _)| {
            if seen.insert(label.as_str()) {
                None
            } else {
                Some(label)
            }
        });
        if let Some(label) = repeated {
            return Err(DataError::DuplicateColumn(label.clone()));
        }

        // Length agreement: the first column sets the expected row count.
        let expected = columns[0].1.len();
        if let Some((label, data)) = columns.iter().find(|(_, data)| data.len() != expected) {
            return Err(DataError::ColumnLengthMismatch {
                expected,
                got: data.len(),
                column: label.clone(),
            });
        }

        Ok(Self { columns })
    }

    /// Row count, taken from the first column; 0 when there are no columns.
    pub fn nrows(&self) -> usize {
        match self.columns.first() {
            Some((_, data)) => data.len(),
            None => 0,
        }
    }

    /// How many columns the frame holds.
    pub fn ncols(&self) -> usize {
        let Self { columns } = self;
        columns.len()
    }

    /// Look up a column's data by name; `None` if no column has that name.
    pub fn get_column(&self, name: &str) -> Option<&Column> {
        let idx = self.column_index(name)?;
        Some(&self.columns[idx].1)
    }

    /// Position of the first column called `name`, if any.
    pub fn column_index(&self, name: &str) -> Option<usize> {
        self.columns
            .iter()
            .position(|(label, _)| label.as_str() == name)
    }

    /// Column names in storage order, borrowed from the frame.
    pub fn column_names(&self) -> Vec<&str> {
        let mut names = Vec::with_capacity(self.columns.len());
        names.extend(self.columns.iter().map(|(label, _)| label.as_str()));
        names
    }
}

impl Default for DataFrame {
    fn default() -> Self {
        Self::new()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Equal-length, uniquely named columns are accepted and visible through
    /// the accessors in their original order.
    #[test]
    fn test_from_columns() {
        let df = DataFrame::from_columns(vec![
            ("id".into(), Column::Int(vec![1, 2, 3])),
            ("name".into(), Column::Str(vec!["a".into(), "b".into(), "c".into()])),
        ])
        .unwrap();
        assert_eq!(df.nrows(), 3);
        assert_eq!(df.ncols(), 2);
        assert_eq!(df.column_names(), vec!["id", "name"]);
        assert_eq!(df.column_index("id"), Some(0));
        assert_eq!(df.column_index("name"), Some(1));
    }

    /// An empty column list is accepted and produces an empty frame.
    #[test]
    fn test_empty_columns() {
        let df = DataFrame::from_columns(Vec::new()).unwrap();
        assert_eq!(df.nrows(), 0);
        assert_eq!(df.ncols(), 0);
        assert!(df.column_names().is_empty());
    }

    /// A column whose length differs from the first one is rejected, and the
    /// error names that column together with both row counts.
    #[test]
    fn test_length_mismatch() {
        let err = DataFrame::from_columns(vec![
            ("a".into(), Column::Int(vec![1, 2])),
            ("b".into(), Column::Int(vec![1, 2, 3])),
        ])
        .unwrap_err();
        // Checking the message pins the exact error, not just "some error".
        assert_eq!(err.to_string(), "column `b` has 2 rows, expected 3");
    }

    /// A repeated column name is rejected and reported by name.
    #[test]
    fn test_duplicate_column() {
        let err = DataFrame::from_columns(vec![
            ("a".into(), Column::Int(vec![1])),
            ("a".into(), Column::Int(vec![2])),
        ])
        .unwrap_err();
        assert_eq!(err.to_string(), "duplicate column `a`");
    }

    /// Lookups succeed for existing names and return `None` otherwise.
    #[test]
    fn test_get_column() {
        let df = DataFrame::from_columns(vec![
            ("x".into(), Column::Float(vec![1.0, 2.0])),
        ])
        .unwrap();
        assert!(df.get_column("x").is_some());
        assert!(df.get_column("y").is_none());
        assert_eq!(df.column_index("y"), None);
    }
}