virtual-frame 0.1.1

Deterministic data pipeline toolkit for LLM training — bitmask-filtered virtual views, NFA regex, Kahan summation, full audit trail. Python bindings included.
Documentation
//! Columnar storage — typed vectors, one per column.
//!
//! Each column is a contiguous typed buffer. Aggregation scans a single
//! `Vec<f64>` — no pointer chasing, no skipping over unrelated fields.
//! This is the same storage model used by Apache Arrow and DuckDB.

use std::cmp::Ordering;

/// A single typed column of data.
///
/// Each variant owns one `Vec` holding the values of every row in the column.
#[derive(Debug, Clone)]
pub enum Column {
    /// 64-bit signed integer column.
    Int(Vec<i64>),
    /// 64-bit floating-point column.
    Float(Vec<f64>),
    /// UTF-8 string column.
    Str(Vec<String>),
    /// Boolean column.
    Bool(Vec<bool>),
}

impl Column {
    /// Number of rows.
    pub fn len(&self) -> usize {
        match self {
            Column::Int(v) => v.len(),
            Column::Float(v) => v.len(),
            Column::Str(v) => v.len(),
            Column::Bool(v) => v.len(),
        }
    }

    /// Whether the column is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Human-readable type name (`"Int"`, `"Float"`, `"Str"` or `"Bool"`).
    pub fn type_name(&self) -> &'static str {
        match self {
            Column::Int(_) => "Int",
            Column::Float(_) => "Float",
            Column::Str(_) => "Str",
            Column::Bool(_) => "Bool",
        }
    }

    /// Get a display-friendly string value at index.
    ///
    /// Values are rendered through their `Display` implementation; strings
    /// are returned as an owned copy.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is out of bounds.
    pub fn get_display(&self, idx: usize) -> String {
        match self {
            Column::Int(v) => v[idx].to_string(),
            Column::Float(v) => v[idx].to_string(),
            Column::Str(v) => v[idx].clone(),
            Column::Bool(v) => v[idx].to_string(),
        }
    }

    /// Get value as f64 (for numeric aggregation). Returns None for non-numeric.
    ///
    /// Integers are converted with an `as` cast, so magnitudes above 2^53 are
    /// rounded to the nearest representable `f64`. Booleans map to `1.0`
    /// (`true`) and `0.0` (`false`). String columns always yield `None`,
    /// without inspecting `idx`.
    ///
    /// # Panics
    ///
    /// Panics if `idx` is out of bounds for an `Int`, `Float` or `Bool` column.
    pub fn get_f64(&self, idx: usize) -> Option<f64> {
        match self {
            Column::Int(v) => Some(v[idx] as f64),
            Column::Float(v) => Some(v[idx]),
            Column::Bool(v) => Some(if v[idx] { 1.0 } else { 0.0 }),
            Column::Str(_) => None,
        }
    }

    /// Compare two rows within this column. Used by sort.
    ///
    /// Float columns are compared with [`f64::total_cmp`], which is a total
    /// order: `-0.0` sorts before `+0.0`, positive NaNs sort after
    /// `+inf` and negative NaNs before `-inf`. Treating NaN as equal to
    /// everything (the result of `partial_cmp(..).unwrap_or(Equal)`) is not
    /// transitive, so a sort driven by it could produce an input-dependent
    /// order or panic.
    ///
    /// # Panics
    ///
    /// Panics if `a` or `b` is out of bounds.
    pub fn compare_rows(&self, a: usize, b: usize) -> Ordering {
        match self {
            Column::Int(v) => v[a].cmp(&v[b]),
            Column::Float(v) => v[a].total_cmp(&v[b]),
            Column::Str(v) => v[a].cmp(&v[b]),
            Column::Bool(v) => v[a].cmp(&v[b]),
        }
    }

    /// Gather rows by index — create a new column from selected rows.
    ///
    /// The result has the same variant as `self` and one row per entry of
    /// `indices`, in the order given; an index may appear more than once.
    ///
    /// # Panics
    ///
    /// Panics if any index is out of bounds.
    pub fn gather(&self, indices: &[usize]) -> Column {
        match self {
            Column::Int(v) => Column::Int(indices.iter().map(|&i| v[i]).collect()),
            Column::Float(v) => Column::Float(indices.iter().map(|&i| v[i]).collect()),
            Column::Str(v) => Column::Str(indices.iter().map(|&i| v[i].clone()).collect()),
            Column::Bool(v) => Column::Bool(indices.iter().map(|&i| v[i]).collect()),
        }
    }
}

/// Newtype around `f64` with a total order, so floats can be used as
/// ordered-map keys.
///
/// Equality is bit-pattern equality and ordering is [`f64::total_cmp`]
/// (IEEE 754 totalOrder). Consequently `-0.0` and `+0.0` are distinct keys
/// with `-0.0` first, positive NaNs sort after `+inf`, and negative NaNs
/// sort before `-inf`.
#[derive(Debug, Clone, Copy)]
pub struct FloatKey(pub f64);

impl PartialEq for FloatKey {
    /// Two keys are equal exactly when their raw bit patterns match.
    fn eq(&self, other: &Self) -> bool {
        let (FloatKey(lhs), FloatKey(rhs)) = (*self, *other);
        lhs.to_bits() == rhs.to_bits()
    }
}

impl Eq for FloatKey {}

impl PartialOrd for FloatKey {
    /// Always `Some`: delegates to the total order defined by `Ord`.
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(Ord::cmp(self, other))
    }
}

impl Ord for FloatKey {
    /// Total order over the wrapped floats via `f64::total_cmp`.
    fn cmp(&self, other: &Self) -> Ordering {
        let (FloatKey(lhs), FloatKey(rhs)) = (self, other);
        f64::total_cmp(lhs, rhs)
    }
}

/// Owned group key — a single cell value detached from its column, used for
/// storing unique group values.
///
/// The derived ordering compares variants in declaration order first
/// (`Int`, `Float`, `Str`, `Bool`) and then by the contained value.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord)]
pub enum GroupKey {
    /// Key taken from an integer column.
    Int(i64),
    /// Key taken from a float column, wrapped for total ordering.
    Float(FloatKey),
    /// Key taken from a string column (owned copy).
    Str(String),
    /// Key taken from a boolean column.
    Bool(bool),
}

impl GroupKey {
    /// Build an owned key from the value stored in `col` at `row`.
    ///
    /// String values are cloned; every other type is copied.
    ///
    /// # Panics
    ///
    /// Panics if `row` is out of bounds for `col`.
    pub fn from_column(col: &Column, row: usize) -> Self {
        match col {
            Column::Int(values) => Self::Int(values[row]),
            Column::Float(values) => Self::Float(FloatKey(values[row])),
            Column::Str(values) => Self::Str(values[row].to_owned()),
            Column::Bool(values) => Self::Bool(values[row]),
        }
    }

    /// Render the key as a `String` using the value's `Display` form.
    pub fn to_display(&self) -> String {
        match self {
            Self::Int(number) => number.to_string(),
            Self::Float(FloatKey(number)) => number.to_string(),
            Self::Str(text) => text.to_owned(),
            Self::Bool(flag) => flag.to_string(),
        }
    }
}

/// Borrowed group key — the non-allocating counterpart of `GroupKey`.
///
/// The `Str` variant holds a `&str` pointing into the column's own storage
/// instead of an owned `String`; the other variants carry the value by copy.
/// Variants are declared in the order `Int`, `Float`, `Str`, `Bool`, which
/// is what the derived ordering compares first.
#[derive(Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum ColumnKeyRef<'a> {
    /// Value from an integer column.
    Int(i64),
    /// Value from a float column, wrapped for total ordering.
    Float(FloatKey),
    /// String slice borrowed from a string column.
    Str(&'a str),
    /// Value from a boolean column.
    Bool(bool),
}

impl<'a> ColumnKeyRef<'a> {
    /// Borrow the key stored in `col` at `row` without allocating.
    ///
    /// # Panics
    ///
    /// Panics if `row` is out of bounds for `col`.
    #[inline]
    pub fn from_column(col: &'a Column, row: usize) -> Self {
        match col {
            Column::Int(values) => Self::Int(values[row]),
            Column::Float(values) => Self::Float(FloatKey(values[row])),
            Column::Str(values) => Self::Str(values[row].as_str()),
            Column::Bool(values) => Self::Bool(values[row]),
        }
    }

    /// Produce the owned `GroupKey` equivalent of this key.
    ///
    /// Only the `Str` variant allocates (it copies the borrowed text).
    pub fn to_owned_key(&self) -> GroupKey {
        match *self {
            ColumnKeyRef::Int(number) => GroupKey::Int(number),
            ColumnKeyRef::Float(key) => GroupKey::Float(key),
            ColumnKeyRef::Str(text) => GroupKey::Str(text.to_owned()),
            ColumnKeyRef::Bool(flag) => GroupKey::Bool(flag),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `len` and `is_empty` report the row count of the column.
    #[test]
    fn test_column_len() {
        let col = Column::Int(vec![1, 2, 3]);
        assert_eq!(col.len(), 3);
        assert!(!col.is_empty());
        assert!(Column::Float(Vec::new()).is_empty());
    }

    /// `gather` keeps the variant and returns rows in the requested order.
    #[test]
    fn test_column_gather() {
        let col = Column::Str(vec!["a".into(), "b".into(), "c".into(), "d".into()]);
        let gathered = col.gather(&[0, 2, 3]);
        assert_eq!(gathered.len(), 3);
        if let Column::Str(v) = gathered {
            assert_eq!(v, vec!["a", "c", "d"]);
        } else {
            panic!("wrong type");
        }
    }

    /// NaN keys must take part in a consistent total order.
    #[test]
    fn test_float_key_nan_ordering() {
        let a = FloatKey(f64::NAN);
        let b = FloatKey(1.0);
        // Reflexive: NaN equals itself under bitwise equality / total order.
        assert_eq!(a, a);
        assert_eq!(a.cmp(&a), std::cmp::Ordering::Equal);
        // Strict and antisymmetric against an ordinary value. The direction
        // is not pinned because the sign bit of `f64::NAN` is not guaranteed.
        assert_ne!(a.cmp(&b), std::cmp::Ordering::Equal);
        assert_eq!(a.cmp(&b), b.cmp(&a).reverse());
        assert_eq!(a.partial_cmp(&b), Some(a.cmp(&b)));
    }

    /// A borrowed key compares equal to the expected slice and converts to
    /// the matching owned key.
    #[test]
    fn test_column_key_ref_zero_copy() {
        let col = Column::Str(vec!["hello".into(), "world".into()]);
        let key = ColumnKeyRef::from_column(&col, 0);
        // key borrows from col — no allocation
        assert_eq!(key, ColumnKeyRef::Str("hello"));
        assert_eq!(key.to_owned_key(), GroupKey::Str("hello".to_string()));
        assert_eq!(key.to_owned_key(), GroupKey::from_column(&col, 0));
    }
}