//! CJC Data DSL â€" Typed expression trees, logical plans, plan optimizer, and
//! column-buffer kernel execution.
//!
//! This implements the tidyverse-inspired data pipeline:
//! ```text
//! df |> filter(col("age") > 18) |> group_by("dept") |> summarize(avg_salary = mean(col("salary")))
//! ```
use cjc_repro::kahan_sum_f64;
use std::cell::RefCell;
use std::collections::{BTreeMap, BTreeSet, VecDeque};
use std::fmt;
use std::rc::Rc;
mod csv;
pub use csv::{CsvConfig, CsvReader, StreamingCsvProcessor};
pub mod adaptive_selection;
pub mod agg_kernels;
pub mod byte_dict;
pub mod column_meta;
pub mod dataset_plan;
pub mod detcoll;
pub mod dict_encoding;
pub mod lazy;
pub mod predicate_bytecode;
pub mod tidy_dispatch;
pub use adaptive_selection::{AdaptiveSelection, SelectionIndices};
pub use dataset_plan::{
BatchIterator, BatchSpec, DatasetError, DatasetPlan, EncodingSpec, MaterializedBatch,
Split, SplitSpec,
};
// â"€â"€ Column Storage â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A single column in a DataFrame.
#[derive(Debug, Clone)]
pub enum Column {
/// 64-bit signed integer column.
Int(Vec<i64>),
/// 64-bit floating-point column.
Float(Vec<f64>),
/// UTF-8 string column.
Str(Vec<String>),
/// Boolean column.
Bool(Vec<bool>),
/// Categorical column: sorted unique level names + per-row index into levels.
Categorical {
levels: Vec<String>,
codes: Vec<u32>,
},
/// Adaptive-width categorical column wrapping Phase 1's
/// `byte_dict::CategoricalColumn`. Backed by `AdaptiveCodes`
/// (U8/U16/U32/U64 auto-promoting at 256 / 65 536 / 2³² thresholds)
/// and a `ByteDictionary` with optional shared/frozen state.
///
/// Coexists with `Column::Categorical` rather than replacing it —
/// existing column readers continue to use the simpler
/// `(Vec<String>, Vec<u32>)` storage; new code that needs adaptive
/// widths or shared dictionaries opts into this variant via
/// `Column::categorical_adaptive(...)`.
CategoricalAdaptive(Box<crate::byte_dict::CategoricalColumn>),
/// DateTime column: epoch milliseconds.
DateTime(Vec<i64>),
}
impl Column {
/// Returns the number of rows in this column.
pub fn len(&self) -> usize {
match self {
Column::Int(v) => v.len(),
Column::Float(v) => v.len(),
Column::Str(v) => v.len(),
Column::Bool(v) => v.len(),
Column::Categorical { codes, .. } => codes.len(),
Column::CategoricalAdaptive(cc) => cc.len(),
Column::DateTime(v) => v.len(),
}
}
/// Returns `true` if the column has zero rows.
pub fn is_empty(&self) -> bool {
self.len() == 0
}
/// Returns the human-readable type name of this column variant.
pub fn type_name(&self) -> &'static str {
match self {
Column::Int(_) => "Int",
Column::Float(_) => "Float",
Column::Str(_) => "Str",
Column::Bool(_) => "Bool",
Column::Categorical { .. } => "Categorical",
Column::CategoricalAdaptive(_) => "CategoricalAdaptive",
Column::DateTime(_) => "DateTime",
}
}
/// Get a display-friendly value at index.
pub fn get_display(&self, idx: usize) -> String {
match self {
Column::Int(v) => format!("{}", v[idx]),
Column::Float(v) => format!("{}", v[idx]),
Column::Str(v) => v[idx].clone(),
Column::Bool(v) => format!("{}", v[idx]),
Column::Categorical { levels, codes } => levels[codes[idx] as usize].clone(),
Column::CategoricalAdaptive(cc) => match cc.get(idx) {
None => String::new(),
Some(bytes) => String::from_utf8_lossy(bytes).into_owned(),
},
Column::DateTime(v) => format!("{}ms", v[idx]),
}
}
/// Construct a `Column::CategoricalAdaptive` from a `CategoricalColumn`.
/// New-style categorical column with adaptive code widths.
pub fn categorical_adaptive(cc: crate::byte_dict::CategoricalColumn) -> Self {
Column::CategoricalAdaptive(Box::new(cc))
}
/// Materialize any `Column::CategoricalAdaptive` into a
/// `Column::Categorical` for consumption by legacy code paths. For
/// non-adaptive variants this returns `self.clone()`. For adaptive
/// variants with non-UTF-8 levels or null values this returns
/// `Column::Str` (display-equivalent) — preserves the consumer's
/// per-row read semantics while avoiding silent data loss.
///
/// This is the universal back-compat shim for the 19 column-reader
/// match sites added before the adaptive variant existed. New code
/// should switch on the variant directly.
pub fn to_legacy_categorical(&self) -> Column {
match self {
Column::CategoricalAdaptive(cc) => {
// Try lossless conversion to Column::Categorical.
if let Some(legacy) = Column::from_categorical_column(cc) {
return legacy;
}
// Fallback: render as Str (handles nulls and non-UTF-8 by
// using lossy display).
let n = cc.len();
let mut out: Vec<String> = Vec::with_capacity(n);
for i in 0..n {
out.push(match cc.get(i) {
None => String::new(),
Some(b) => String::from_utf8_lossy(b).into_owned(),
});
}
Column::Str(out)
}
_ => self.clone(),
}
}
// ── v3 Phase 4: CategoricalColumn (byte_dict.rs) interop ────────────
//
// Limited-scope wiring of Phase 1's adaptive-width categorical engine
// into the DataFrame surface. The full replacement of
// `Column::Categorical` with `byte_dict::CategoricalColumn` is
// deferred to a future phase (every column reader would migrate). For
// now we expose lossless conversions in both directions so callers
// that need adaptive code widths or shared/frozen dictionaries can
// round-trip through the new type.
//
// Round-trip is byte-equal: `from_categorical_column(to_categorical_column(c))`
// produces the same `levels`/`codes` for any `Column::Categorical c`.
/// Convert a `Column::Categorical` to a `byte_dict::CategoricalColumn`.
///
/// Uses `CategoryOrdering::Explicit` to pin the level→code mapping
/// exactly as it stands in the source column, so round-tripping back
/// via `from_categorical_column` yields byte-identical levels and
/// codes. Returns `None` for non-categorical variants.
pub fn to_categorical_column(&self) -> Option<crate::byte_dict::CategoricalColumn> {
use crate::byte_dict::{ByteDictionary, CategoricalColumn};
match self {
Column::Categorical { levels, codes } => {
let explicit: Vec<Vec<u8>> =
levels.iter().map(|s| s.as_bytes().to_vec()).collect();
// `from_explicit` errors only on duplicate levels; the
// invariant for `Column::Categorical` is unique levels,
// so any error here indicates upstream corruption.
let dict = ByteDictionary::from_explicit(explicit).ok()?;
let mut col = CategoricalColumn::with_dictionary(dict);
// Push codes via the public surface. `levels` is the
// authoritative ordering, so we push by `levels[code]`
// bytes; `Explicit` ordering will assign back the same
// code value, keeping the codes byte-equal.
for &c in codes {
let bytes = levels[c as usize].as_bytes();
// `intern` on an unfrozen Explicit dictionary returns
// the existing code for known values.
col.push(bytes).ok()?;
}
Some(col)
}
_ => None,
}
}
/// Build a `Column::Categorical` from a `byte_dict::CategoricalColumn`.
///
/// Iterates the dictionary in code order to reconstruct `levels`, and
/// the codes via `AdaptiveCodes::iter()` cast to `u32`. Returns
/// `None` if any level is not valid UTF-8 (the byte dictionary is
/// byte-keyed; `Column::Categorical` is `String`-keyed) or if
/// cardinality exceeds `u32::MAX` (would not fit in `Vec<u32>` codes).
/// Null-bearing categorical columns also return `None` for now —
/// `Column::Categorical` does not carry a null bitmap.
pub fn from_categorical_column(
cat: &crate::byte_dict::CategoricalColumn,
) -> Option<Self> {
if cat.nulls().is_some() {
return None;
}
let dict = cat.dictionary();
let mut levels: Vec<String> = Vec::with_capacity(dict.len());
for (_, bytes) in dict.iter() {
match std::str::from_utf8(bytes) {
Ok(s) => levels.push(s.to_string()),
Err(_) => return None,
}
}
let mut codes: Vec<u32> = Vec::with_capacity(cat.len());
for c in cat.codes().iter() {
if c > u32::MAX as u64 {
return None;
}
codes.push(c as u32);
}
Some(Column::Categorical { levels, codes })
}
}
// â"€â"€ DataFrame â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A columnar DataFrame.
#[derive(Debug, Clone)]
pub struct DataFrame {
pub columns: Vec<(String, Column)>,
}
impl DataFrame {
/// Create an empty DataFrame with no columns.
pub fn new() -> Self {
Self {
columns: Vec::new(),
}
}
/// Create a DataFrame from a list of named columns.
///
/// Returns an error if column lengths are not all equal.
pub fn from_columns(columns: Vec<(String, Column)>) -> Result<Self, DataError> {
if columns.is_empty() {
return Ok(Self { columns });
}
let len = columns[0].1.len();
for (name, col) in &columns {
if col.len() != len {
return Err(DataError::ColumnLengthMismatch {
expected: len,
got: col.len(),
column: name.clone(),
});
}
}
Ok(Self { columns })
}
/// Returns the number of rows (determined from the first column, or 0 if empty).
pub fn nrows(&self) -> usize {
self.columns.first().map(|(_, c)| c.len()).unwrap_or(0)
}
/// Returns the number of columns.
pub fn ncols(&self) -> usize {
self.columns.len()
}
/// Returns the column names in order.
pub fn column_names(&self) -> Vec<&str> {
self.columns.iter().map(|(n, _)| n.as_str()).collect()
}
/// Look up a column by name, returning a reference if found.
pub fn get_column(&self, name: &str) -> Option<&Column> {
self.columns
.iter()
.find(|(n, _)| n == name)
.map(|(_, c)| c)
}
/// Convert selected columns to a flat Vec<f64> (for tensor bridge).
pub fn to_tensor_data(&self, col_names: &[&str]) -> Result<(Vec<f64>, Vec<usize>), DataError> {
let nrows = self.nrows();
let ncols = col_names.len();
let mut data = Vec::with_capacity(nrows * ncols);
for row in 0..nrows {
for &col_name in col_names {
let col = self
.get_column(col_name)
.ok_or_else(|| DataError::ColumnNotFound(col_name.to_string()))?;
let val = match col {
Column::Float(v) => v[row],
Column::Int(v) => v[row] as f64,
_ => {
return Err(DataError::InvalidOperation(format!(
"column `{}` is not numeric",
col_name
)))
}
};
data.push(val);
}
}
Ok((data, vec![nrows, ncols]))
}
}
impl Default for DataFrame {
fn default() -> Self {
Self::new()
}
}
impl fmt::Display for DataFrame {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.columns.is_empty() {
return write!(f, "(empty DataFrame)");
}
// Header
let names: Vec<&str> = self.columns.iter().map(|(n, _)| n.as_str()).collect();
let mut col_widths: Vec<usize> = names.iter().map(|n| n.len()).collect();
// Compute column widths
let nrows = self.nrows();
for (col_idx, (_, col)) in self.columns.iter().enumerate() {
for row in 0..nrows {
let s = col.get_display(row);
col_widths[col_idx] = col_widths[col_idx].max(s.len());
}
}
// Print header
for (i, name) in names.iter().enumerate() {
if i > 0 {
write!(f, " | ")?;
}
write!(f, "{:>width$}", name, width = col_widths[i])?;
}
writeln!(f)?;
// Separator
for (i, &w) in col_widths.iter().enumerate() {
if i > 0 {
write!(f, "-+-")?;
}
write!(f, "{}", "-".repeat(w))?;
}
writeln!(f)?;
// Rows
for row in 0..nrows {
for (col_idx, (_, col)) in self.columns.iter().enumerate() {
if col_idx > 0 {
write!(f, " | ")?;
}
let s = col.get_display(row);
write!(f, "{:>width$}", s, width = col_widths[col_idx])?;
}
writeln!(f)?;
}
Ok(())
}
}
// â"€â"€ Data DSL Expression Trees â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// An expression in the Data DSL.
#[derive(Debug, Clone)]
pub enum DExpr {
/// Column reference: col("name")
Col(String),
/// Literal integer
LitInt(i64),
/// Literal float
LitFloat(f64),
/// Literal bool
LitBool(bool),
/// Literal string
LitStr(String),
/// Binary operation
BinOp {
op: DBinOp,
left: Box<DExpr>,
right: Box<DExpr>,
},
/// Aggregation function
Agg(AggFunc, Box<DExpr>),
/// Count (no argument)
Count,
/// Named function call: FnCall("log", vec![Col("x")])
FnCall(String, Vec<DExpr>),
/// Cumulative sum (window)
CumSum(Box<DExpr>),
/// Cumulative product (window)
CumProd(Box<DExpr>),
/// Cumulative max (window)
CumMax(Box<DExpr>),
/// Cumulative min (window)
CumMin(Box<DExpr>),
/// Lag(expr, k): value at row i-k, or NaN if i < k
Lag(Box<DExpr>, usize),
/// Lead(expr, k): value at row i+k, or NaN if i+k >= n
Lead(Box<DExpr>, usize),
/// Rank of values (1-based, average ties)
Rank(Box<DExpr>),
/// Dense rank (1-based, no gaps)
DenseRank(Box<DExpr>),
/// Row number (1-indexed sequential)
RowNumber,
/// Rolling sum over a fixed-size window (Kahan-compensated removable accumulation)
RollingSum(String, usize),
/// Rolling mean over a fixed-size window
RollingMean(String, usize),
/// Rolling minimum over a fixed-size window (monotonic deque, O(n) amortized)
RollingMin(String, usize),
/// Rolling maximum over a fixed-size window (monotonic deque, O(n) amortized)
RollingMax(String, usize),
/// Rolling variance over a fixed-size window (Welford's online with removal)
RollingVar(String, usize),
/// Rolling standard deviation over a fixed-size window
RollingSd(String, usize),
}
/// Binary operator for Data DSL expressions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DBinOp {
/// Addition (`+`).
Add,
/// Subtraction (`-`).
Sub,
/// Multiplication (`*`).
Mul,
/// Division (`/`).
Div,
/// Greater than (`>`).
Gt,
/// Less than (`<`).
Lt,
/// Greater than or equal (`>=`).
Ge,
/// Less than or equal (`<=`).
Le,
/// Equality (`==`).
Eq,
/// Not equal (`!=`).
Ne,
/// Logical AND (`&&`).
And,
/// Logical OR (`||`).
Or,
}
/// Aggregation function for use in `summarize` expressions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AggFunc {
/// Kahan-compensated sum.
Sum,
/// Arithmetic mean.
Mean,
/// Minimum value.
Min,
/// Maximum value.
Max,
/// Row count.
Count,
}
impl fmt::Display for DExpr {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
DExpr::Col(name) => write!(f, "col(\"{}\")", name),
DExpr::LitInt(v) => write!(f, "{}", v),
DExpr::LitFloat(v) => write!(f, "{}", v),
DExpr::LitBool(b) => write!(f, "{}", b),
DExpr::LitStr(s) => write!(f, "\"{}\"", s),
DExpr::BinOp { op, left, right } => {
let op_str = match op {
DBinOp::Add => "+",
DBinOp::Sub => "-",
DBinOp::Mul => "*",
DBinOp::Div => "/",
DBinOp::Gt => ">",
DBinOp::Lt => "<",
DBinOp::Ge => ">=",
DBinOp::Le => "<=",
DBinOp::Eq => "==",
DBinOp::Ne => "!=",
DBinOp::And => "&&",
DBinOp::Or => "||",
};
write!(f, "({} {} {})", left, op_str, right)
}
DExpr::Agg(func, expr) => {
let name = match func {
AggFunc::Sum => "sum",
AggFunc::Mean => "mean",
AggFunc::Min => "min",
AggFunc::Max => "max",
AggFunc::Count => "count",
};
write!(f, "{}({})", name, expr)
}
DExpr::Count => write!(f, "count()"),
DExpr::FnCall(name, args) => {
let args_str: Vec<String> = args.iter().map(|a| format!("{}", a)).collect();
write!(f, "{}({})", name, args_str.join(", "))
}
DExpr::CumSum(e) => write!(f, "cumsum({})", e),
DExpr::CumProd(e) => write!(f, "cumprod({})", e),
DExpr::CumMax(e) => write!(f, "cummax({})", e),
DExpr::CumMin(e) => write!(f, "cummin({})", e),
DExpr::Lag(e, k) => write!(f, "lag({}, {})", e, k),
DExpr::Lead(e, k) => write!(f, "lead({}, {})", e, k),
DExpr::Rank(e) => write!(f, "rank({})", e),
DExpr::DenseRank(e) => write!(f, "dense_rank({})", e),
DExpr::RowNumber => write!(f, "row_number()"),
DExpr::RollingSum(col, w) => write!(f, "rolling_sum(\"{}\", {})", col, w),
DExpr::RollingMean(col, w) => write!(f, "rolling_mean(\"{}\", {})", col, w),
DExpr::RollingMin(col, w) => write!(f, "rolling_min(\"{}\", {})", col, w),
DExpr::RollingMax(col, w) => write!(f, "rolling_max(\"{}\", {})", col, w),
DExpr::RollingVar(col, w) => write!(f, "rolling_var(\"{}\", {})", col, w),
DExpr::RollingSd(col, w) => write!(f, "rolling_sd(\"{}\", {})", col, w),
}
}
}
// â"€â"€ Logical Plan â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A logical query plan node.
#[derive(Debug, Clone)]
pub enum LogicalPlan {
/// Scan a source DataFrame.
Scan {
source: DataFrame,
},
/// Filter rows by predicate.
Filter {
input: Box<LogicalPlan>,
predicate: DExpr,
},
/// Group by one or more columns.
GroupBy {
input: Box<LogicalPlan>,
keys: Vec<String>,
},
/// Aggregate with named expressions.
Aggregate {
input: Box<LogicalPlan>,
keys: Vec<String>,
aggs: Vec<(String, DExpr)>,
},
/// Select/project specific columns.
Project {
input: Box<LogicalPlan>,
columns: Vec<String>,
},
/// Inner join: rows matching on both sides.
InnerJoin {
left: Box<LogicalPlan>,
right: Box<LogicalPlan>,
left_on: String,
right_on: String,
},
/// Left join: all left rows, matching right rows or null.
LeftJoin {
left: Box<LogicalPlan>,
right: Box<LogicalPlan>,
left_on: String,
right_on: String,
},
/// Cross join: cartesian product.
CrossJoin {
left: Box<LogicalPlan>,
right: Box<LogicalPlan>,
},
}
impl LogicalPlan {
/// Collect the column names referenced by this plan (for pruning).
pub fn referenced_columns(&self) -> Vec<String> {
let mut cols = Vec::new();
self.collect_columns(&mut cols);
cols.sort();
cols.dedup();
cols
}
fn collect_columns(&self, cols: &mut Vec<String>) {
match self {
LogicalPlan::Scan { .. } => {}
LogicalPlan::Filter { input, predicate } => {
input.collect_columns(cols);
collect_expr_columns(predicate, cols);
}
LogicalPlan::GroupBy { input, keys } => {
input.collect_columns(cols);
cols.extend(keys.clone());
}
LogicalPlan::Aggregate {
input, keys, aggs, ..
} => {
input.collect_columns(cols);
cols.extend(keys.clone());
for (_, expr) in aggs {
collect_expr_columns(expr, cols);
}
}
LogicalPlan::Project { input, columns } => {
input.collect_columns(cols);
cols.extend(columns.clone());
}
LogicalPlan::InnerJoin {
left,
right,
left_on,
right_on,
}
| LogicalPlan::LeftJoin {
left,
right,
left_on,
right_on,
} => {
left.collect_columns(cols);
right.collect_columns(cols);
cols.push(left_on.clone());
cols.push(right_on.clone());
}
LogicalPlan::CrossJoin { left, right } => {
left.collect_columns(cols);
right.collect_columns(cols);
}
}
}
}
fn collect_expr_columns(expr: &DExpr, cols: &mut Vec<String>) {
match expr {
DExpr::Col(name) => cols.push(name.clone()),
DExpr::BinOp { left, right, .. } => {
collect_expr_columns(left, cols);
collect_expr_columns(right, cols);
}
DExpr::Agg(_, inner) => collect_expr_columns(inner, cols),
DExpr::FnCall(_, args) => {
for arg in args {
collect_expr_columns(arg, cols);
}
}
DExpr::CumSum(e) | DExpr::CumProd(e) | DExpr::CumMax(e) | DExpr::CumMin(e)
| DExpr::Lag(e, _) | DExpr::Lead(e, _) | DExpr::Rank(e) | DExpr::DenseRank(e) => {
collect_expr_columns(e, cols);
}
DExpr::RollingSum(col, _) | DExpr::RollingMean(col, _)
| DExpr::RollingMin(col, _) | DExpr::RollingMax(col, _)
| DExpr::RollingVar(col, _) | DExpr::RollingSd(col, _) => {
cols.push(col.clone());
}
_ => {}
}
}
// â"€â"€ Plan Optimizer â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Optimize a logical plan.
pub fn optimize(plan: LogicalPlan) -> LogicalPlan {
let plan = push_down_predicates(plan);
let plan = prune_columns(plan);
plan
}
/// Predicate pushdown: move Filter below GroupBy/Aggregate when possible.
fn push_down_predicates(plan: LogicalPlan) -> LogicalPlan {
match plan {
LogicalPlan::Filter {
input,
predicate,
} => {
let optimized_input = push_down_predicates(*input);
match optimized_input {
// Push filter below GroupBy if predicate only references keys
LogicalPlan::GroupBy {
input: inner,
keys,
} => {
let pred_cols = {
let mut c = Vec::new();
collect_expr_columns(&predicate, &mut c);
c
};
let can_push = pred_cols.iter().all(|c| !keys.contains(c))
|| pred_cols.iter().all(|c| {
// Check if column exists in the input (not an aggregation)
!keys.contains(c) || keys.contains(c)
});
// Conservative: only push if predicate refs columns available before groupby
if can_push && pred_cols.iter().all(|c| !keys.contains(c)) {
LogicalPlan::GroupBy {
input: Box::new(LogicalPlan::Filter {
input: inner,
predicate,
}),
keys,
}
} else {
LogicalPlan::Filter {
input: Box::new(LogicalPlan::GroupBy {
input: inner,
keys,
}),
predicate,
}
}
}
other => LogicalPlan::Filter {
input: Box::new(other),
predicate,
},
}
}
LogicalPlan::GroupBy { input, keys } => LogicalPlan::GroupBy {
input: Box::new(push_down_predicates(*input)),
keys,
},
LogicalPlan::Aggregate {
input,
keys,
aggs,
} => LogicalPlan::Aggregate {
input: Box::new(push_down_predicates(*input)),
keys,
aggs,
},
LogicalPlan::Project { input, columns } => LogicalPlan::Project {
input: Box::new(push_down_predicates(*input)),
columns,
},
LogicalPlan::InnerJoin {
left,
right,
left_on,
right_on,
} => LogicalPlan::InnerJoin {
left: Box::new(push_down_predicates(*left)),
right: Box::new(push_down_predicates(*right)),
left_on,
right_on,
},
LogicalPlan::LeftJoin {
left,
right,
left_on,
right_on,
} => LogicalPlan::LeftJoin {
left: Box::new(push_down_predicates(*left)),
right: Box::new(push_down_predicates(*right)),
left_on,
right_on,
},
LogicalPlan::CrossJoin { left, right } => LogicalPlan::CrossJoin {
left: Box::new(push_down_predicates(*left)),
right: Box::new(push_down_predicates(*right)),
},
other => other,
}
}
/// Column pruning: add Project nodes to avoid materializing unused columns.
fn prune_columns(plan: LogicalPlan) -> LogicalPlan {
// For v1, this is a no-op structural pass. Full implementation tracks
// which columns are actually needed downstream and inserts Project nodes.
plan
}
// â"€â"€ Plan Executor â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Execute a logical plan against in-memory data.
pub fn execute(plan: &LogicalPlan) -> Result<DataFrame, DataError> {
match plan {
LogicalPlan::Scan { source } => Ok(source.clone()),
LogicalPlan::Filter { input, predicate } => {
let df = execute(input)?;
execute_filter(&df, predicate)
}
LogicalPlan::GroupBy { input, keys: _ } => {
// GroupBy alone just passes through; it's Aggregate that does the work
let df = execute(input)?;
// Return the data with a hint that it's grouped
Ok(df)
}
LogicalPlan::Aggregate { input, keys, aggs } => {
let df = execute(input)?;
execute_aggregate(&df, keys, aggs)
}
LogicalPlan::Project { input, columns } => {
let df = execute(input)?;
let projected = df
.columns
.into_iter()
.filter(|(name, _)| columns.contains(name))
.collect();
Ok(DataFrame { columns: projected })
}
LogicalPlan::InnerJoin {
left,
right,
left_on,
right_on,
} => {
let left_df = execute(left)?;
let right_df = execute(right)?;
execute_inner_join(&left_df, &right_df, left_on, right_on)
}
LogicalPlan::LeftJoin {
left,
right,
left_on,
right_on,
} => {
let left_df = execute(left)?;
let right_df = execute(right)?;
execute_left_join(&left_df, &right_df, left_on, right_on)
}
LogicalPlan::CrossJoin { left, right } => {
let left_df = execute(left)?;
let right_df = execute(right)?;
execute_cross_join(&left_df, &right_df)
}
}
}
fn execute_filter(df: &DataFrame, predicate: &DExpr) -> Result<DataFrame, DataError> {
let nrows = df.nrows();
let mut mask = vec![false; nrows];
for row in 0..nrows {
let val = eval_expr_row(df, predicate, row)?;
mask[row] = match val {
ExprValue::Bool(b) => b,
_ => return Err(DataError::InvalidOperation("filter predicate must be boolean".into())),
};
}
let mut new_columns = Vec::new();
for (name, col) in &df.columns {
let filtered = filter_column(col, &mask);
new_columns.push((name.clone(), filtered));
}
Ok(DataFrame {
columns: new_columns,
})
}
fn filter_column(col: &Column, mask: &[bool]) -> Column {
if matches!(col, Column::CategoricalAdaptive(_)) {
return filter_column(&col.to_legacy_categorical(), mask);
}
match col {
Column::Int(v) => Column::Int(
v.iter()
.zip(mask)
.filter(|(_, &m)| m)
.map(|(v, _)| *v)
.collect(),
),
Column::Float(v) => Column::Float(
v.iter()
.zip(mask)
.filter(|(_, &m)| m)
.map(|(v, _)| *v)
.collect(),
),
Column::Str(v) => Column::Str(
v.iter()
.zip(mask)
.filter(|(_, &m)| m)
.map(|(v, _)| v.clone())
.collect(),
),
Column::Bool(v) => Column::Bool(
v.iter()
.zip(mask)
.filter(|(_, &m)| m)
.map(|(v, _)| *v)
.collect(),
),
Column::Categorical { levels, codes } => Column::Categorical {
levels: levels.clone(),
codes: codes
.iter()
.zip(mask)
.filter(|(_, &m)| m)
.map(|(v, _)| *v)
.collect(),
},
Column::DateTime(v) => Column::DateTime(
v.iter()
.zip(mask)
.filter(|(_, &m)| m)
.map(|(v, _)| *v)
.collect(),
),
Column::CategoricalAdaptive(_) => unreachable!("handled by early return"),
}
}
fn execute_aggregate(
df: &DataFrame,
keys: &[String],
aggs: &[(String, DExpr)],
) -> Result<DataFrame, DataError> {
// Build groups
let nrows = df.nrows();
let mut groups: BTreeMap<Vec<String>, Vec<usize>> = BTreeMap::new();
for row in 0..nrows {
let key: Vec<String> = keys
.iter()
.map(|k| {
df.get_column(k)
.map(|col| col.get_display(row))
.ok_or_else(|| DataError::ColumnNotFound(k.to_string()))
})
.collect::<Result<Vec<String>, DataError>>()?;
groups.entry(key).or_default().push(row);
}
// Sort groups for deterministic output
let mut sorted_groups: Vec<(Vec<String>, Vec<usize>)> = groups.into_iter().collect();
sorted_groups.sort_by(|a, b| a.0.cmp(&b.0));
// Build result columns
let mut result_columns: Vec<(String, Column)> = Vec::new();
// Key columns
for (key_idx, key_name) in keys.iter().enumerate() {
let values: Vec<String> = sorted_groups
.iter()
.map(|(key, _)| key[key_idx].clone())
.collect();
// Determine type from source
let source_col = df.get_column(key_name).ok_or_else(|| {
DataError::ColumnNotFound(key_name.clone())
})?;
match source_col {
Column::Int(_) => {
let int_vals: Vec<i64> = values.iter().map(|s| s.parse().unwrap_or(0)).collect();
result_columns.push((key_name.clone(), Column::Int(int_vals)));
}
Column::Str(_) => {
result_columns.push((key_name.clone(), Column::Str(values)));
}
_ => {
result_columns.push((key_name.clone(), Column::Str(values)));
}
}
}
// Aggregation columns
for (agg_name, agg_expr) in aggs {
let mut values = Vec::new();
for (_, row_indices) in &sorted_groups {
let val = eval_agg_expr(df, agg_expr, row_indices)?;
values.push(val);
}
result_columns.push((agg_name.clone(), Column::Float(values)));
}
Ok(DataFrame {
columns: result_columns,
})
}
// â"€â"€ Expression Evaluation â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
#[derive(Debug, Clone)]
enum ExprValue {
Int(i64),
Float(f64),
Str(String),
Bool(bool),
}
fn eval_expr_row(df: &DataFrame, expr: &DExpr, row: usize) -> Result<ExprValue, DataError> {
match expr {
DExpr::Col(name) => {
let col = df
.get_column(name)
.ok_or_else(|| DataError::ColumnNotFound(name.clone()))?;
match col {
Column::Int(v) => Ok(ExprValue::Int(v[row])),
Column::Float(v) => Ok(ExprValue::Float(v[row])),
Column::Str(v) => Ok(ExprValue::Str(v[row].clone())),
Column::Bool(v) => Ok(ExprValue::Bool(v[row])),
Column::Categorical { levels, codes } => {
Ok(ExprValue::Str(levels[codes[row] as usize].clone()))
}
Column::CategoricalAdaptive(cc) => Ok(ExprValue::Str(match cc.get(row) {
None => String::new(),
Some(b) => String::from_utf8_lossy(b).into_owned(),
})),
Column::DateTime(v) => Ok(ExprValue::Int(v[row])),
}
}
DExpr::LitInt(v) => Ok(ExprValue::Int(*v)),
DExpr::LitFloat(v) => Ok(ExprValue::Float(*v)),
DExpr::LitBool(b) => Ok(ExprValue::Bool(*b)),
DExpr::LitStr(s) => Ok(ExprValue::Str(s.clone())),
DExpr::BinOp { op, left, right } => {
let lv = eval_expr_row(df, left, row)?;
let rv = eval_expr_row(df, right, row)?;
eval_binop(*op, lv, rv)
}
DExpr::Agg(_, _) | DExpr::Count => Err(DataError::InvalidOperation(
"aggregation not allowed in row context".into(),
)),
DExpr::FnCall(name, args) => {
if args.len() != 1 {
return Err(DataError::InvalidOperation(
format!("FnCall '{}' requires exactly 1 argument, got {}", name, args.len()),
));
}
let val = eval_expr_row(df, &args[0], row)?;
let x = match val {
ExprValue::Float(f) => f,
ExprValue::Int(i) => i as f64,
_ => return Err(DataError::InvalidOperation(
format!("FnCall '{}' requires numeric argument", name),
)),
};
let result = match name.as_str() {
"log" => x.ln(),
"exp" => x.exp(),
"sqrt" => x.sqrt(),
"abs" => x.abs(),
"ceil" => x.ceil(),
"floor" => x.floor(),
"round" => x.round(),
"sin" => x.sin(),
"cos" => x.cos(),
"tan" => x.tan(),
other => return Err(DataError::InvalidOperation(
format!("unknown DExpr function: {}", other),
)),
};
Ok(ExprValue::Float(result))
}
DExpr::CumSum(_) | DExpr::CumProd(_) | DExpr::CumMax(_) | DExpr::CumMin(_)
| DExpr::Lag(_, _) | DExpr::Lead(_, _) | DExpr::Rank(_) | DExpr::DenseRank(_)
| DExpr::RowNumber
| DExpr::RollingSum(..) | DExpr::RollingMean(..) | DExpr::RollingMin(..)
| DExpr::RollingMax(..) | DExpr::RollingVar(..) | DExpr::RollingSd(..) => {
Err(DataError::InvalidOperation(
"window function not allowed in row context; use eval_expr_column".into(),
))
}
}
}
fn eval_binop(op: DBinOp, left: ExprValue, right: ExprValue) -> Result<ExprValue, DataError> {
match (left, right) {
(ExprValue::Int(a), ExprValue::Int(b)) => match op {
DBinOp::Add => Ok(ExprValue::Int(a + b)),
DBinOp::Sub => Ok(ExprValue::Int(a - b)),
DBinOp::Mul => Ok(ExprValue::Int(a * b)),
DBinOp::Div => Ok(ExprValue::Int(a / b)),
DBinOp::Gt => Ok(ExprValue::Bool(a > b)),
DBinOp::Lt => Ok(ExprValue::Bool(a < b)),
DBinOp::Ge => Ok(ExprValue::Bool(a >= b)),
DBinOp::Le => Ok(ExprValue::Bool(a <= b)),
DBinOp::Eq => Ok(ExprValue::Bool(a == b)),
DBinOp::Ne => Ok(ExprValue::Bool(a != b)),
_ => Err(DataError::InvalidOperation(format!(
"unsupported op {:?} on Int",
op
))),
},
(ExprValue::Float(a), ExprValue::Float(b)) => match op {
DBinOp::Add => Ok(ExprValue::Float(a + b)),
DBinOp::Sub => Ok(ExprValue::Float(a - b)),
DBinOp::Mul => Ok(ExprValue::Float(a * b)),
DBinOp::Div => Ok(ExprValue::Float(a / b)),
DBinOp::Gt => Ok(ExprValue::Bool(a > b)),
DBinOp::Lt => Ok(ExprValue::Bool(a < b)),
DBinOp::Ge => Ok(ExprValue::Bool(a >= b)),
DBinOp::Le => Ok(ExprValue::Bool(a <= b)),
DBinOp::Eq => Ok(ExprValue::Bool(a == b)),
DBinOp::Ne => Ok(ExprValue::Bool(a != b)),
_ => Err(DataError::InvalidOperation(format!(
"unsupported op {:?} on Float",
op
))),
},
// Int promoted to Float
(ExprValue::Int(a), ExprValue::Float(b)) => {
eval_binop(op, ExprValue::Float(a as f64), ExprValue::Float(b))
}
(ExprValue::Float(a), ExprValue::Int(b)) => {
eval_binop(op, ExprValue::Float(a), ExprValue::Float(b as f64))
}
(ExprValue::Bool(a), ExprValue::Bool(b)) => match op {
DBinOp::And => Ok(ExprValue::Bool(a && b)),
DBinOp::Or => Ok(ExprValue::Bool(a || b)),
DBinOp::Eq => Ok(ExprValue::Bool(a == b)),
DBinOp::Ne => Ok(ExprValue::Bool(a != b)),
_ => Err(DataError::InvalidOperation(format!(
"unsupported op {:?} on Bool",
op
))),
},
(ExprValue::Str(a), ExprValue::Str(b)) => match op {
DBinOp::Eq => Ok(ExprValue::Bool(a == b)),
DBinOp::Ne => Ok(ExprValue::Bool(a != b)),
_ => Err(DataError::InvalidOperation(format!(
"unsupported op {:?} on String",
op
))),
},
_ => Err(DataError::InvalidOperation(
"type mismatch in binary operation".into(),
)),
}
}
fn eval_agg_expr(
df: &DataFrame,
expr: &DExpr,
rows: &[usize],
) -> Result<f64, DataError> {
match expr {
DExpr::Agg(func, inner) => {
let values = extract_float_values(df, inner, rows)?;
match func {
AggFunc::Sum => Ok(kahan_sum_f64(&values)),
AggFunc::Mean => {
if values.is_empty() {
Ok(0.0)
} else {
Ok(kahan_sum_f64(&values) / values.len() as f64)
}
}
AggFunc::Min => Ok(values
.iter()
.cloned()
.fold(f64::INFINITY, f64::min)),
AggFunc::Max => Ok(values
.iter()
.cloned()
.fold(f64::NEG_INFINITY, f64::max)),
AggFunc::Count => Ok(values.len() as f64),
}
}
DExpr::Count => Ok(rows.len() as f64),
_ => Err(DataError::InvalidOperation(
"expected aggregation expression".into(),
)),
}
}
fn extract_float_values(
df: &DataFrame,
expr: &DExpr,
rows: &[usize],
) -> Result<Vec<f64>, DataError> {
match expr {
DExpr::Col(name) => {
let col = df
.get_column(name)
.ok_or_else(|| DataError::ColumnNotFound(name.clone()))?;
let vals: Vec<f64> = match col {
Column::Float(v) => rows.iter().map(|&r| v[r]).collect(),
Column::Int(v) => rows.iter().map(|&r| v[r] as f64).collect(),
_ => {
return Err(DataError::InvalidOperation(format!(
"cannot aggregate non-numeric column `{}`",
name
)))
}
};
Ok(vals)
}
_ => Err(DataError::InvalidOperation(
"expected column reference in aggregation".into(),
)),
}
}
// â"€â"€ Pipeline Builder â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Fluent builder for data pipelines.
pub struct Pipeline {
plan: LogicalPlan,
}
impl Pipeline {
/// Start a pipeline by scanning a source DataFrame.
pub fn scan(df: DataFrame) -> Self {
Self {
plan: LogicalPlan::Scan { source: df },
}
}
/// Add a filter step to the pipeline.
pub fn filter(self, predicate: DExpr) -> Self {
Self {
plan: LogicalPlan::Filter {
input: Box::new(self.plan),
predicate,
},
}
}
/// Add a group-by step to the pipeline.
pub fn group_by(self, keys: Vec<String>) -> Self {
Self {
plan: LogicalPlan::GroupBy {
input: Box::new(self.plan),
keys,
},
}
}
/// Add a summarize (aggregate) step to the pipeline.
pub fn summarize(self, keys: Vec<String>, aggs: Vec<(String, DExpr)>) -> Self {
Self {
plan: LogicalPlan::Aggregate {
input: Box::new(self.plan),
keys,
aggs,
},
}
}
/// Add a column projection step to the pipeline.
pub fn select(self, columns: Vec<String>) -> Self {
Self {
plan: LogicalPlan::Project {
input: Box::new(self.plan),
columns,
},
}
}
/// Add an inner join step to the pipeline.
pub fn inner_join(self, right: DataFrame, left_on: &str, right_on: &str) -> Self {
Self {
plan: LogicalPlan::InnerJoin {
left: Box::new(self.plan),
right: Box::new(LogicalPlan::Scan { source: right }),
left_on: left_on.to_string(),
right_on: right_on.to_string(),
},
}
}
/// Add a left join step to the pipeline.
pub fn left_join(self, right: DataFrame, left_on: &str, right_on: &str) -> Self {
Self {
plan: LogicalPlan::LeftJoin {
left: Box::new(self.plan),
right: Box::new(LogicalPlan::Scan { source: right }),
left_on: left_on.to_string(),
right_on: right_on.to_string(),
},
}
}
/// Add a cross (cartesian) join step to the pipeline.
pub fn cross_join(self, right: DataFrame) -> Self {
Self {
plan: LogicalPlan::CrossJoin {
left: Box::new(self.plan),
right: Box::new(LogicalPlan::Scan { source: right }),
},
}
}
/// Optimize and execute the pipeline.
pub fn collect(self) -> Result<DataFrame, DataError> {
let optimized = optimize(self.plan);
execute(&optimized)
}
/// Get the logical plan (for inspection/testing).
pub fn plan(&self) -> &LogicalPlan {
&self.plan
}
}
// â"€â"€ Errors â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Errors from DataFrame operations (plan execution, joins, tensor conversion).
#[derive(Debug, Clone)]
pub enum DataError {
/// A referenced column name does not exist in the DataFrame.
ColumnNotFound(String),
/// A column has a different row count than expected.
ColumnLengthMismatch {
/// Expected row count.
expected: usize,
/// Actual row count.
got: usize,
/// Name of the mismatched column.
column: String,
},
/// A generic invalid-operation error with a descriptive message.
InvalidOperation(String),
}
impl fmt::Display for DataError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
DataError::ColumnNotFound(name) => write!(f, "column `{}` not found", name),
DataError::ColumnLengthMismatch {
expected,
got,
column,
} => write!(
f,
"column `{}` has {} rows, expected {}",
column, got, expected
),
DataError::InvalidOperation(msg) => write!(f, "invalid operation: {}", msg),
}
}
}
impl std::error::Error for DataError {}
// â"€â"€ Join Execution â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Get a column value as a string for join key comparison.
fn column_value_str(col: &Column, row: usize) -> String {
match col {
Column::Int(v) => v[row].to_string(),
Column::Float(v) => v[row].to_string(),
Column::Str(v) => v[row].clone(),
Column::Bool(v) => v[row].to_string(),
Column::Categorical { levels, codes } => levels[codes[row] as usize].clone(),
Column::CategoricalAdaptive(cc) => match cc.get(row) {
None => String::new(),
Some(b) => String::from_utf8_lossy(b).into_owned(),
},
Column::DateTime(v) => v[row].to_string(),
}
}
fn execute_inner_join(
left: &DataFrame,
right: &DataFrame,
left_on: &str,
right_on: &str,
) -> Result<DataFrame, DataError> {
let left_col = left.get_column(left_on)
.ok_or_else(|| DataError::InvalidOperation(format!("join key `{}` not found in left", left_on)))?;
let right_col = right.get_column(right_on)
.ok_or_else(|| DataError::InvalidOperation(format!("join key `{}` not found in right", right_on)))?;
// Build hash index on right table
let right_nrows = right.nrows();
let mut index: std::collections::BTreeMap<String, Vec<usize>> = std::collections::BTreeMap::new();
for i in 0..right_nrows {
let key = column_value_str(right_col, i);
index.entry(key).or_default().push(i);
}
let left_nrows = left.nrows();
let mut left_indices = Vec::new();
let mut right_indices = Vec::new();
for i in 0..left_nrows {
let key = column_value_str(left_col, i);
if let Some(matches) = index.get(&key) {
for &j in matches {
left_indices.push(i);
right_indices.push(j);
}
}
}
build_join_result(left, right, &left_indices, &right_indices, right_on)
}
fn execute_left_join(
left: &DataFrame,
right: &DataFrame,
left_on: &str,
right_on: &str,
) -> Result<DataFrame, DataError> {
let left_col = left.get_column(left_on)
.ok_or_else(|| DataError::InvalidOperation(format!("join key `{}` not found in left", left_on)))?;
let right_col = right.get_column(right_on)
.ok_or_else(|| DataError::InvalidOperation(format!("join key `{}` not found in right", right_on)))?;
let right_nrows = right.nrows();
let mut index: std::collections::BTreeMap<String, Vec<usize>> = std::collections::BTreeMap::new();
for i in 0..right_nrows {
let key = column_value_str(right_col, i);
index.entry(key).or_default().push(i);
}
let left_nrows = left.nrows();
let mut left_indices = Vec::new();
let mut right_indices: Vec<Option<usize>> = Vec::new();
for i in 0..left_nrows {
let key = column_value_str(left_col, i);
if let Some(matches) = index.get(&key) {
for &j in matches {
left_indices.push(i);
right_indices.push(Some(j));
}
} else {
left_indices.push(i);
right_indices.push(None);
}
}
build_left_join_result(left, right, &left_indices, &right_indices, right_on)
}
fn execute_cross_join(left: &DataFrame, right: &DataFrame) -> Result<DataFrame, DataError> {
let left_nrows = left.nrows();
let right_nrows = right.nrows();
let mut left_indices = Vec::with_capacity(left_nrows * right_nrows);
let mut right_indices = Vec::with_capacity(left_nrows * right_nrows);
for i in 0..left_nrows {
for j in 0..right_nrows {
left_indices.push(i);
right_indices.push(j);
}
}
build_join_result(left, right, &left_indices, &right_indices, "")
}
fn build_join_result(
left: &DataFrame,
right: &DataFrame,
left_indices: &[usize],
right_indices: &[usize],
right_on: &str,
) -> Result<DataFrame, DataError> {
let mut columns = Vec::new();
// Add all left columns
for (name, col) in &left.columns {
columns.push((name.clone(), gather_column(col, left_indices)));
}
// Add right columns (skip the join key to avoid duplication)
for (name, col) in &right.columns {
if name == right_on {
continue;
}
let out_name = if left.get_column(name).is_some() {
format!("{}_right", name)
} else {
name.clone()
};
columns.push((out_name, gather_column(col, right_indices)));
}
Ok(DataFrame { columns })
}
fn build_left_join_result(
left: &DataFrame,
right: &DataFrame,
left_indices: &[usize],
right_indices: &[Option<usize>],
right_on: &str,
) -> Result<DataFrame, DataError> {
let mut columns = Vec::new();
for (name, col) in &left.columns {
columns.push((name.clone(), gather_column(col, left_indices)));
}
for (name, col) in &right.columns {
if name == right_on {
continue;
}
let out_name = if left.get_column(name).is_some() {
format!("{}_right", name)
} else {
name.clone()
};
columns.push((out_name, gather_column_nullable(col, right_indices)));
}
Ok(DataFrame { columns })
}
fn gather_column(col: &Column, indices: &[usize]) -> Column {
if matches!(col, Column::CategoricalAdaptive(_)) {
return gather_column(&col.to_legacy_categorical(), indices);
}
match col {
Column::Int(v) => Column::Int(indices.iter().map(|&i| v[i]).collect()),
Column::Float(v) => Column::Float(indices.iter().map(|&i| v[i]).collect()),
Column::Str(v) => Column::Str(indices.iter().map(|&i| v[i].clone()).collect()),
Column::Bool(v) => Column::Bool(indices.iter().map(|&i| v[i]).collect()),
Column::Categorical { levels, codes } => Column::Categorical {
levels: levels.clone(),
codes: indices.iter().map(|&i| codes[i]).collect(),
},
Column::DateTime(v) => Column::DateTime(indices.iter().map(|&i| v[i]).collect()),
Column::CategoricalAdaptive(_) => unreachable!("handled by early return"),
}
}
fn gather_column_nullable(col: &Column, indices: &[Option<usize>]) -> Column {
if matches!(col, Column::CategoricalAdaptive(_)) {
return gather_column_nullable(&col.to_legacy_categorical(), indices);
}
match col {
Column::Int(v) => Column::Int(indices.iter().map(|opt| opt.map_or(0, |i| v[i])).collect()),
Column::Float(v) => Column::Float(indices.iter().map(|opt| opt.map_or(f64::NAN, |i| v[i])).collect()),
Column::Str(v) => Column::Str(indices.iter().map(|opt| opt.map_or_else(String::new, |i| v[i].clone())).collect()),
Column::Bool(v) => Column::Bool(indices.iter().map(|opt| opt.map_or(false, |i| v[i])).collect()),
Column::Categorical { levels, codes } => Column::Categorical {
levels: levels.clone(),
codes: indices.iter().map(|opt| opt.map_or(0, |i| codes[i])).collect(),
},
Column::DateTime(v) => Column::DateTime(indices.iter().map(|opt| opt.map_or(0, |i| v[i])).collect()),
Column::CategoricalAdaptive(_) => unreachable!("handled by early return"),
}
}
// â"€â"€ Tests â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
#[cfg(test)]
mod tests {
use super::*;
fn sample_df() -> DataFrame {
DataFrame::from_columns(vec![
(
"name".into(),
Column::Str(vec![
"Alice".into(),
"Bob".into(),
"Carol".into(),
"Dave".into(),
"Eve".into(),
"Frank".into(),
]),
),
(
"dept".into(),
Column::Str(vec![
"eng".into(),
"eng".into(),
"sales".into(),
"eng".into(),
"sales".into(),
"eng".into(),
]),
),
(
"salary".into(),
Column::Float(vec![95000.0, 102000.0, 78000.0, 110000.0, 82000.0, 98000.0]),
),
(
"tenure".into(),
Column::Int(vec![3, 7, 2, 10, 1, 5]),
),
])
.unwrap()
}
#[test]
fn test_dataframe_creation() {
let df = sample_df();
assert_eq!(df.nrows(), 6);
assert_eq!(df.ncols(), 4);
assert_eq!(
df.column_names(),
vec!["name", "dept", "salary", "tenure"]
);
}
#[test]
fn test_filter() {
let df = sample_df();
// Filter tenure > 2
let result = Pipeline::scan(df)
.filter(DExpr::BinOp {
op: DBinOp::Gt,
left: Box::new(DExpr::Col("tenure".into())),
right: Box::new(DExpr::LitInt(2)),
})
.collect()
.unwrap();
assert_eq!(result.nrows(), 4); // Alice(3), Bob(7), Dave(10), Frank(5)
}
#[test]
fn test_group_by_summarize() {
let df = sample_df();
let result = Pipeline::scan(df)
.summarize(
vec!["dept".into()],
vec![
(
"avg_salary".into(),
DExpr::Agg(AggFunc::Mean, Box::new(DExpr::Col("salary".into()))),
),
("headcount".into(), DExpr::Count),
],
)
.collect()
.unwrap();
assert_eq!(result.nrows(), 2); // eng, sales
// Find eng row
let dept_col = result.get_column("dept").unwrap();
let avg_col = result.get_column("avg_salary").unwrap();
let count_col = result.get_column("headcount").unwrap();
if let (Column::Str(depts), Column::Float(avgs), Column::Float(counts)) =
(dept_col, avg_col, count_col)
{
let eng_idx = depts.iter().position(|d| d == "eng").unwrap();
let sales_idx = depts.iter().position(|d| d == "sales").unwrap();
// eng: (95000 + 102000 + 110000 + 98000) / 4 = 101250
assert!((avgs[eng_idx] - 101250.0).abs() < 0.01);
assert!((counts[eng_idx] - 4.0).abs() < 0.01);
// sales: (78000 + 82000) / 2 = 80000
assert!((avgs[sales_idx] - 80000.0).abs() < 0.01);
assert!((counts[sales_idx] - 2.0).abs() < 0.01);
} else {
panic!("unexpected column types");
}
}
#[test]
fn test_filter_then_aggregate() {
let df = sample_df();
// Filter tenure > 2, then aggregate by dept
let result = Pipeline::scan(df)
.filter(DExpr::BinOp {
op: DBinOp::Gt,
left: Box::new(DExpr::Col("tenure".into())),
right: Box::new(DExpr::LitInt(2)),
})
.summarize(
vec!["dept".into()],
vec![
(
"avg_salary".into(),
DExpr::Agg(AggFunc::Mean, Box::new(DExpr::Col("salary".into()))),
),
(
"max_tenure".into(),
DExpr::Agg(AggFunc::Max, Box::new(DExpr::Col("tenure".into()))),
),
("headcount".into(), DExpr::Count),
],
)
.collect()
.unwrap();
// After filter: Alice(3,eng), Bob(7,eng), Dave(10,eng), Frank(5,eng)
// Only eng remains
assert_eq!(result.nrows(), 1);
if let Column::Float(avgs) = result.get_column("avg_salary").unwrap() {
// (95000 + 102000 + 110000 + 98000) / 4 = 101250
assert!((avgs[0] - 101250.0).abs() < 0.01);
}
if let Column::Float(maxes) = result.get_column("max_tenure").unwrap() {
assert!((maxes[0] - 10.0).abs() < 0.01);
}
}
#[test]
fn test_to_tensor_data() {
let df = DataFrame::from_columns(vec![
("x".into(), Column::Float(vec![1.0, 2.0, 3.0])),
("y".into(), Column::Float(vec![4.0, 5.0, 6.0])),
])
.unwrap();
let (data, shape) = df.to_tensor_data(&["x", "y"]).unwrap();
assert_eq!(shape, vec![3, 2]);
assert_eq!(data, vec![1.0, 4.0, 2.0, 5.0, 3.0, 6.0]);
}
#[test]
fn test_display() {
let df = DataFrame::from_columns(vec![
("x".into(), Column::Int(vec![1, 2, 3])),
("y".into(), Column::Float(vec![4.5, 5.5, 6.5])),
])
.unwrap();
let output = format!("{}", df);
assert!(output.contains("x"));
assert!(output.contains("y"));
assert!(output.contains("4.5"));
}
#[test]
fn test_column_not_found() {
let df = sample_df();
let result = Pipeline::scan(df)
.filter(DExpr::BinOp {
op: DBinOp::Gt,
left: Box::new(DExpr::Col("nonexistent".into())),
right: Box::new(DExpr::LitInt(0)),
})
.collect();
assert!(result.is_err());
}
#[test]
fn test_aggregation_functions() {
let df = DataFrame::from_columns(vec![
("group".into(), Column::Str(vec!["a".into(), "a".into(), "a".into()])),
("val".into(), Column::Float(vec![10.0, 20.0, 30.0])),
])
.unwrap();
let result = Pipeline::scan(df)
.summarize(
vec!["group".into()],
vec![
("total".into(), DExpr::Agg(AggFunc::Sum, Box::new(DExpr::Col("val".into())))),
("avg".into(), DExpr::Agg(AggFunc::Mean, Box::new(DExpr::Col("val".into())))),
("lo".into(), DExpr::Agg(AggFunc::Min, Box::new(DExpr::Col("val".into())))),
("hi".into(), DExpr::Agg(AggFunc::Max, Box::new(DExpr::Col("val".into())))),
("n".into(), DExpr::Count),
],
)
.collect()
.unwrap();
if let Column::Float(totals) = result.get_column("total").unwrap() {
assert!((totals[0] - 60.0).abs() < 0.01);
}
if let Column::Float(avgs) = result.get_column("avg").unwrap() {
assert!((avgs[0] - 20.0).abs() < 0.01);
}
if let Column::Float(mins) = result.get_column("lo").unwrap() {
assert!((mins[0] - 10.0).abs() < 0.01);
}
if let Column::Float(maxs) = result.get_column("hi").unwrap() {
assert!((maxs[0] - 30.0).abs() < 0.01);
}
if let Column::Float(counts) = result.get_column("n").unwrap() {
assert!((counts[0] - 3.0).abs() < 0.01);
}
}
#[test]
fn test_empty_dataframe() {
let df = DataFrame::new();
assert_eq!(df.nrows(), 0);
assert_eq!(df.ncols(), 0);
}
#[test]
fn test_expr_display() {
let expr = DExpr::BinOp {
op: DBinOp::Gt,
left: Box::new(DExpr::Col("age".into())),
right: Box::new(DExpr::LitInt(18)),
};
assert_eq!(format!("{}", expr), "(col(\"age\") > 18)");
}
// ── Categorical Column and Encoding Tests ──────────────────────────────
#[test]
fn test_categorical_column_basics() {
let col = Column::Categorical {
levels: vec!["bird".into(), "cat".into(), "dog".into()],
codes: vec![1, 2, 1, 0],
};
assert_eq!(col.len(), 4);
assert_eq!(col.type_name(), "Categorical");
assert_eq!(col.get_display(0), "cat");
assert_eq!(col.get_display(1), "dog");
assert_eq!(col.get_display(2), "cat");
assert_eq!(col.get_display(3), "bird");
}
#[test]
fn test_datetime_column_basics() {
let col = Column::DateTime(vec![1000, 2000, 3000]);
assert_eq!(col.len(), 3);
assert_eq!(col.type_name(), "DateTime");
assert_eq!(col.get_display(0), "1000ms");
assert_eq!(col.get_display(1), "2000ms");
}
#[test]
fn test_label_encode() {
let data: Vec<String> = vec!["cat".into(), "dog".into(), "cat".into(), "bird".into()];
let (levels, codes) = label_encode(&data);
assert_eq!(levels, vec!["bird", "cat", "dog"]);
assert_eq!(codes, vec![1, 2, 1, 0]);
}
#[test]
fn test_label_encode_empty() {
let data: Vec<String> = vec![];
let (levels, codes) = label_encode(&data);
assert!(levels.is_empty());
assert!(codes.is_empty());
}
#[test]
fn test_label_encode_single_level() {
let data: Vec<String> = vec!["x".into(), "x".into(), "x".into()];
let (levels, codes) = label_encode(&data);
assert_eq!(levels, vec!["x"]);
assert_eq!(codes, vec![0, 0, 0]);
}
#[test]
fn test_label_encode_deterministic() {
// Run twice, must produce identical results (determinism)
let data: Vec<String> = vec!["z".into(), "a".into(), "m".into(), "a".into(), "z".into()];
let (levels1, codes1) = label_encode(&data);
let (levels2, codes2) = label_encode(&data);
assert_eq!(levels1, levels2);
assert_eq!(codes1, codes2);
// Sorted order
assert_eq!(levels1, vec!["a", "m", "z"]);
}
#[test]
fn test_ordinal_encode() {
let data: Vec<String> = vec!["low".into(), "high".into(), "medium".into(), "low".into()];
let order: Vec<String> = vec!["low".into(), "medium".into(), "high".into()];
let (levels, codes) = ordinal_encode(&data, &order).unwrap();
assert_eq!(levels, vec!["low", "medium", "high"]);
assert_eq!(codes, vec![0, 2, 1, 0]);
}
#[test]
fn test_ordinal_encode_missing_value() {
let data: Vec<String> = vec!["low".into(), "unknown".into()];
let order: Vec<String> = vec!["low".into(), "medium".into(), "high".into()];
let result = ordinal_encode(&data, &order);
assert!(result.is_err());
assert!(result.unwrap_err().contains("unknown"));
}
#[test]
fn test_one_hot_encode() {
let levels = vec!["bird".to_string(), "cat".to_string(), "dog".to_string()];
let codes = vec![1u32, 2, 1, 0];
let (names, cols) = one_hot_encode(&levels, &codes);
assert_eq!(names, vec!["bird", "cat", "dog"]);
assert_eq!(cols.len(), 3);
// bird column: [false, false, false, true]
assert_eq!(cols[0], vec![false, false, false, true]);
// cat column: [true, false, true, false]
assert_eq!(cols[1], vec![true, false, true, false]);
// dog column: [false, true, false, false]
assert_eq!(cols[2], vec![false, true, false, false]);
// Each row has exactly one true
for row in 0..4 {
let count: usize = cols.iter().map(|c| if c[row] { 1 } else { 0 }).sum();
assert_eq!(count, 1, "row {} should have exactly one true", row);
}
}
#[test]
fn test_one_hot_encode_empty() {
let levels = vec!["a".to_string(), "b".to_string()];
let codes: Vec<u32> = vec![];
let (names, cols) = one_hot_encode(&levels, &codes);
assert_eq!(names.len(), 2);
assert!(cols[0].is_empty());
assert!(cols[1].is_empty());
}
#[test]
fn test_categorical_column_in_dataframe() {
let data: Vec<String> = vec!["cat".into(), "dog".into(), "cat".into()];
let (levels, codes) = label_encode(&data);
let df = DataFrame::from_columns(vec![
("animal".into(), Column::Categorical { levels, codes }),
("score".into(), Column::Float(vec![1.0, 2.0, 3.0])),
])
.unwrap();
assert_eq!(df.nrows(), 3);
assert_eq!(df.ncols(), 2);
assert_eq!(df.get_column("animal").unwrap().type_name(), "Categorical");
}
#[test]
fn test_datetime_column_in_dataframe() {
let df = DataFrame::from_columns(vec![
("ts".into(), Column::DateTime(vec![1000, 2000, 3000])),
("val".into(), Column::Float(vec![1.0, 2.0, 3.0])),
])
.unwrap();
assert_eq!(df.nrows(), 3);
assert_eq!(df.get_column("ts").unwrap().type_name(), "DateTime");
}
#[test]
fn test_label_encode_to_column_roundtrip() {
let data: Vec<String> = vec!["cat".into(), "dog".into(), "cat".into(), "bird".into()];
let (levels, codes) = label_encode(&data);
let col = Column::Categorical { levels: levels.clone(), codes: codes.clone() };
// Verify roundtrip: display values match originals
for (i, original) in data.iter().enumerate() {
assert_eq!(col.get_display(i), *original);
}
}
}
// â"€â"€ Phase 8: CSV Ingestion & Tensor Bridge â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
// â"€â"€ DataFrame â†" Tensor bridge â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl DataFrame {
/// Convert selected numeric columns to a `cjc_runtime::Tensor` with shape
/// `[nrows, len(col_names)]` (row-major).
///
/// All selected columns must be `Float` or `Int`; `Str` and `Bool` columns
/// will return `DataError::InvalidOperation`.
/// Convert selected numeric columns into a `Tensor` with shape `[nrows, ncols]`.
pub fn to_tensor(
&self,
col_names: &[&str],
) -> Result<cjc_runtime::Tensor, DataError> {
let (data, shape) = self.to_tensor_data(col_names)?;
cjc_runtime::Tensor::from_vec(data, &shape)
.map_err(|e| DataError::InvalidOperation(format!("tensor conversion: {}", e)))
}
/// Append a single row of string values (parsed to the column type).
///
/// `values` must match `self.ncols()` in length.
/// Each string is parsed according to the existing column type:
/// - `Float`: parsed as f64, falls back to 0.0 on parse error
/// - `Int`: parsed as i64, falls back to 0 on parse error
/// - `Str`: stored as-is
/// - `Bool`: `"true"` / `"1"` → true, anything else → false
pub fn push_row(&mut self, values: &[&str]) -> Result<(), DataError> {
if values.len() != self.ncols() {
return Err(DataError::ColumnLengthMismatch {
expected: self.ncols(),
got: values.len(),
column: "row".to_string(),
});
}
for (i, (_, col)) in self.columns.iter_mut().enumerate() {
let s = values[i];
match col {
Column::Float(v) => v.push(s.trim().parse::<f64>().unwrap_or(0.0)),
Column::Int(v) => v.push(s.trim().parse::<i64>().unwrap_or(0)),
Column::Str(v) => v.push(s.to_string()),
Column::Bool(v) => v.push(matches!(s.trim(), "true" | "1")),
Column::Categorical { .. } => {
// Categorical columns are not populated via push_row
}
Column::CategoricalAdaptive(_) => {
// CategoricalAdaptive columns are not populated via push_row
}
Column::DateTime(v) => v.push(s.trim().parse::<i64>().unwrap_or(0)),
}
}
Ok(())
}
}
// -- Phase 10: Tidy Primitives ------------------------------------------------
//
// Design goals:
// - filter() -> zero-allocation view (bitmask), O(N) time, O(N/8) extra mem
// - select() -> zero-allocation view (projection map), O(ncols) time/mem
// - mutate() -> new column buffers only, copy-on-write on alias, @nogc-safe
// - All operations are bit-deterministic with stable iteration/column order.
// â"€â"€ BitMask â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A compact, word-aligned bitmask over `nrows` rows.
///
/// Words are `u64`, stored LSB-first within each word. Bit `i` is set in
/// `words[i / 64]` at position `i % 64`. Tail bits (above `nrows`) are
/// guaranteed to be zero so that iteration never yields phantom rows.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BitMask {
words: Vec<u64>,
nrows: usize,
}
impl BitMask {
/// Construct a mask covering all `nrows` rows (all-true).
pub fn all_true(nrows: usize) -> Self {
let nwords = nwords_for(nrows);
let mut words = vec![u64::MAX; nwords];
// Zero tail bits for determinism
if nrows % 64 != 0 && nwords > 0 {
let tail = nrows % 64;
words[nwords - 1] = (1u64 << tail) - 1;
}
BitMask { words, nrows }
}
/// Construct a mask where no rows are set (all-false).
pub fn all_false(nrows: usize) -> Self {
let nwords = nwords_for(nrows);
BitMask {
words: vec![0u64; nwords],
nrows,
}
}
/// Construct from a `Vec<bool>`, one entry per row.
pub fn from_bools(bools: &[bool]) -> Self {
let nrows = bools.len();
let nwords = nwords_for(nrows);
let mut words = vec![0u64; nwords];
for (i, &b) in bools.iter().enumerate() {
if b {
words[i / 64] |= 1u64 << (i % 64);
}
}
BitMask { words, nrows }
}
/// Get bit at row `i`.
#[inline]
pub fn get(&self, i: usize) -> bool {
debug_assert!(i < self.nrows);
(self.words[i / 64] >> (i % 64)) & 1 == 1
}
/// Number of set bits (masked-in rows).
pub fn count_ones(&self) -> usize {
self.words.iter().map(|w| w.count_ones() as usize).sum()
}
/// Merge two masks with AND semantics (chain of filter().filter()).
///
/// Panics if `nrows` differs â€" this is a programming error (same base df).
pub fn and(&self, other: &BitMask) -> BitMask {
assert_eq!(
self.nrows, other.nrows,
"BitMask::and: nrows mismatch ({} vs {})",
self.nrows, other.nrows
);
let words = self
.words
.iter()
.zip(other.words.iter())
.map(|(a, b)| a & b)
.collect();
BitMask {
words,
nrows: self.nrows,
}
}
/// Iterate over set row indices in ascending order (deterministic).
pub fn iter_set(&self) -> impl Iterator<Item = usize> + '_ {
(0..self.nrows).filter(move |&i| self.get(i))
}
/// Returns the total number of rows this mask covers.
pub fn nrows(&self) -> usize {
self.nrows
}
/// Number of backing u64 words (= ceil(nrows / 64)).
pub fn nwords(&self) -> usize {
self.words.len()
}
/// Read-only access to the backing words. Used by `AdaptiveSelection`
/// to perform AND/OR over raw words without re-iterating bit-by-bit.
pub fn words_slice(&self) -> &[u64] {
&self.words
}
/// Construct a `BitMask` directly from owned words and a row count.
///
/// The caller must ensure tail bits past `nrows` are zero. Used by
/// `AdaptiveSelection::intersect`/`union` after the AND/OR step.
pub fn from_words_for_test(words: Vec<u64>, nrows: usize) -> Self {
debug_assert_eq!(words.len(), nwords_for(nrows));
BitMask { words, nrows }
}
}
#[inline]
pub(crate) fn nwords_for(nrows: usize) -> usize {
(nrows + 63) / 64
}
// â"€â"€ ProjectionMap â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A stable ordered list of column indices into the base DataFrame.
///
/// Selecting 0 columns yields an empty projection (valid empty view).
/// Duplicate names are rejected at construction time â€" callers must deduplicate.
/// Column ordering is exactly the order supplied by the caller.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ProjectionMap {
/// Column indices into the base DataFrame's `columns` vec.
indices: Vec<usize>,
}
impl ProjectionMap {
/// Identity projection (all columns, in original order).
pub fn identity(ncols: usize) -> Self {
ProjectionMap {
indices: (0..ncols).collect(),
}
}
/// Construct from explicit column indices.
pub fn from_indices(indices: Vec<usize>) -> Self {
ProjectionMap { indices }
}
/// Returns the number of projected columns.
pub fn len(&self) -> usize {
self.indices.len()
}
/// Returns `true` if no columns are projected.
pub fn is_empty(&self) -> bool {
self.indices.is_empty()
}
/// Returns the underlying column-index slice.
pub fn indices(&self) -> &[usize] {
&self.indices
}
}
// â"€â"€ TidyView â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A lazy, zero-allocation view over a base `DataFrame`.
///
/// Holds:
/// • `base` â€" shared reference to the underlying columnar data
/// • `mask` â€" bitmask of which rows are visible
/// • `proj` â€" ordered list of visible column indices
///
/// No column buffers are copied until `materialize()` / `to_tensor()` is called.
#[derive(Debug, Clone)]
pub struct TidyView {
base: Rc<DataFrame>,
mask: AdaptiveSelection,
proj: ProjectionMap,
}
// ── O3: columnar predicate evaluation ────────────────────────────────────────
/// Try to evaluate a filter predicate column-at-a-time for simple `Col op Literal`
/// patterns. Returns `Some(new_mask)` on success, `None` if the predicate shape
/// is unsupported (caller falls back to row-wise evaluation).
///
/// Supported patterns:
/// - `Col op LitFloat` / `LitFloat op Col` (Float columns)
/// - `Col op LitInt` / `LitInt op Col` (Int columns, or Float columns with i64→f64 promotion)
/// - `pred And pred` / `pred Or pred` (compound, recursively tries both sides)
///
/// The returned mask is the AND of the predicate result with `existing_mask`.
/// NaN comparisons follow IEEE 754 (deterministic).
fn try_eval_predicate_columnar(
base: &DataFrame,
predicate: &DExpr,
existing_mask: &BitMask,
) -> Option<BitMask> {
match predicate {
// Compound: And — try columnar on both sides, AND the results
DExpr::BinOp {
op: DBinOp::And,
left,
right,
} => {
let lmask = try_eval_predicate_columnar(base, left, existing_mask)?;
let rmask = try_eval_predicate_columnar(base, right, &lmask)?;
Some(rmask)
}
// Compound: Or — try columnar on both sides, OR the predicate bits
// then AND with existing mask
DExpr::BinOp {
op: DBinOp::Or,
left,
right,
} => {
// Evaluate each side against a full mask to get raw predicate results,
// then OR them, then AND with existing mask.
let all_mask = BitMask::all_true(existing_mask.nrows);
let lmask = try_eval_predicate_columnar(base, left, &all_mask)?;
let rmask = try_eval_predicate_columnar(base, right, &all_mask)?;
// OR the two predicate masks
let nrows = existing_mask.nrows;
let or_words: Vec<u64> = lmask
.words
.iter()
.zip(rmask.words.iter())
.map(|(a, b)| a | b)
.collect();
// AND with existing mask
let final_words: Vec<u64> = or_words
.iter()
.zip(existing_mask.words.iter())
.map(|(a, b)| a & b)
.collect();
Some(BitMask {
words: final_words,
nrows,
})
}
// Simple comparison: Col op Literal (or Literal op Col)
DExpr::BinOp { op, left, right } => {
// Only handle comparison operators
if !matches!(
op,
DBinOp::Gt | DBinOp::Lt | DBinOp::Ge | DBinOp::Le | DBinOp::Eq | DBinOp::Ne
) {
return None;
}
// Extract (column_name, literal_value_as_f64_or_i64, is_reversed)
// "reversed" means Literal op Col, so we flip the comparison direction
enum LitVal {
F(f64),
I(i64),
}
let (col_name, lit, reversed) = match (left.as_ref(), right.as_ref()) {
(DExpr::Col(name), DExpr::LitFloat(v)) => (name.as_str(), LitVal::F(*v), false),
(DExpr::LitFloat(v), DExpr::Col(name)) => (name.as_str(), LitVal::F(*v), true),
(DExpr::Col(name), DExpr::LitInt(v)) => (name.as_str(), LitVal::I(*v), false),
(DExpr::LitInt(v), DExpr::Col(name)) => (name.as_str(), LitVal::I(*v), true),
_ => return None,
};
let column = base.get_column(col_name)?;
// Flip operator when literal is on the left: `5 > col` becomes `col < 5`
let effective_op = if reversed {
match op {
DBinOp::Gt => DBinOp::Lt,
DBinOp::Lt => DBinOp::Gt,
DBinOp::Ge => DBinOp::Le,
DBinOp::Le => DBinOp::Ge,
other => *other, // Eq, Ne are symmetric
}
} else {
*op
};
let nrows = existing_mask.nrows;
let nwords = nwords_for(nrows);
let mut words = vec![0u64; nwords];
match (column, &lit) {
// Float column, float literal
(Column::Float(data), LitVal::F(v)) => {
columnar_cmp_f64(data, *v, effective_op, &mut words);
}
// Float column, int literal (promote i64 → f64)
(Column::Float(data), LitVal::I(v)) => {
columnar_cmp_f64(data, *v as f64, effective_op, &mut words);
}
// Int column, int literal
(Column::Int(data), LitVal::I(v)) => {
columnar_cmp_i64(data, *v, effective_op, &mut words);
}
// Int column, float literal (promote each i64 → f64)
(Column::Int(data), LitVal::F(v)) => {
// Compare as f64 to match row-wise semantics
let floats: Vec<f64> = data.iter().map(|&x| x as f64).collect();
columnar_cmp_f64(&floats, *v, effective_op, &mut words);
}
_ => return None,
}
// AND with existing mask
for (w, ew) in words.iter_mut().zip(existing_mask.words.iter()) {
*w &= *ew;
}
Some(BitMask { words, nrows })
}
_ => None,
}
}
/// Columnar comparison of f64 slice against a scalar.
/// Sets bits in `out_words` for rows where the comparison is true.
/// NaN follows IEEE 754: NaN != NaN, NaN < x is false, NaN > x is false, etc.
#[inline]
pub(crate) fn columnar_cmp_f64(data: &[f64], lit: f64, op: DBinOp, out_words: &mut [u64]) {
for (i, &val) in data.iter().enumerate() {
let pass = match op {
DBinOp::Gt => val > lit,
DBinOp::Lt => val < lit,
DBinOp::Ge => val >= lit,
DBinOp::Le => val <= lit,
DBinOp::Eq => val == lit,
DBinOp::Ne => val != lit,
_ => false,
};
if pass {
out_words[i / 64] |= 1u64 << (i % 64);
}
}
}
/// Columnar comparison of i64 slice against a scalar.
/// Sets bits in `out_words` for rows where the comparison is true.
#[inline]
pub(crate) fn columnar_cmp_i64(data: &[i64], lit: i64, op: DBinOp, out_words: &mut [u64]) {
for (i, &val) in data.iter().enumerate() {
let pass = match op {
DBinOp::Gt => val > lit,
DBinOp::Lt => val < lit,
DBinOp::Ge => val >= lit,
DBinOp::Le => val <= lit,
DBinOp::Eq => val == lit,
DBinOp::Ne => val != lit,
_ => false,
};
if pass {
out_words[i / 64] |= 1u64 << (i % 64);
}
}
}
impl TidyView {
// ── constructors ────────────────────────────────────────────────────
/// Wrap a `DataFrame` as a full view (all rows, all columns).
pub fn from_df(df: DataFrame) -> Self {
let nrows = df.nrows();
let ncols = df.ncols();
TidyView {
base: Rc::new(df),
mask: AdaptiveSelection::all(nrows),
proj: ProjectionMap::identity(ncols),
}
}
/// Wrap a shared `Rc<DataFrame>` as a full view.
pub fn from_rc(df: Rc<DataFrame>) -> Self {
let nrows = df.nrows();
let ncols = df.ncols();
TidyView {
base: df,
mask: AdaptiveSelection::all(nrows),
proj: ProjectionMap::identity(ncols),
}
}
/// Stable identifier of the current selection's adaptive mode. Useful
/// for tests, instrumentation, and the user-visible `glimpse` output.
pub fn explain_selection_mode(&self) -> &'static str {
self.mask.explain_selection_mode()
}
// â"€â"€ shape â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Number of visible rows (set bits in mask).
pub fn nrows(&self) -> usize {
self.mask.count()
}
/// Number of visible columns (length of projection).
pub fn ncols(&self) -> usize {
self.proj.len()
}
/// Names of projected columns in stable projection order.
pub fn column_names(&self) -> Vec<&str> {
self.proj
.indices()
.iter()
.map(|&ci| self.base.columns[ci].0.as_str())
.collect()
}
// â"€â"€ filter â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Filter rows by a `DExpr` predicate.
///
/// Returns a new `TidyView` with a tighter bitmask (AND with existing mask).
/// Does NOT copy any column buffers.
///
/// Edge cases:
/// • 0-row base → empty mask returned, no panic.
/// • Non-bool predicate → `TidyError::PredicateNotBool`.
/// • Float NaN comparisons → deterministic: `NaN != NaN` (IEEE 754).
/// • Chained filters compose masks with AND without materializing.
pub fn filter(&self, predicate: &DExpr) -> Result<TidyView, TidyError> {
// Validate predicate type references known projected columns
validate_expr_columns_proj(predicate, &self.base, &self.proj)?;
let nrows_base = self.base.nrows();
// v2.1: lower the predicate to bytecode and interpret. Bit-identical
// output to the legacy AST-walk path on every shape; falls through
// (None) on unsupported shapes, exactly like try_eval_predicate_columnar.
if let Some(bc) = predicate_bytecode::PredicateBytecode::lower(predicate, &self.base) {
let count = self.mask.count();
// v2.2: when the existing selection has already narrowed the row
// set substantially, the sparse-aware path beats the full column
// scan — random-access gather over `iter_indices()` is bounded
// by O(count) per leaf instead of O(nrows). The dense path needs
// a full materialized BitMask; the sparse path doesn't, so we
// defer that allocation until we know it's needed.
// v3 Phase 5: when the existing selection is already Hybrid
// (mid-band density, large nrows), evaluate the predicate to
// a fresh AdaptiveSelection and route through Phase 3's
// per-chunk dispatch via `existing.intersect(fresh)`. Avoids
// the O(nrows/64) full-bitmap allocation that the dense
// BitMask path pays even when the predicate result itself is
// chunk-sparse.
//
// Hybrid activation needs nrows ≥ 8192 and mid-band density
// — the AdaptiveSelection classifier handles that
// automatically, so we only need to gate by "existing is
// Hybrid" to avoid building a fresh AdaptiveSelection
// pointlessly.
if matches!(self.mask, AdaptiveSelection::Hybrid { .. })
&& !predicate_bytecode::should_use_sparse_path(count, nrows_base)
{
let fresh = bc.evaluate_to_selection(&self.base, nrows_base);
let intersected = self.mask.intersect(&fresh);
return Ok(TidyView {
base: Rc::clone(&self.base),
mask: intersected,
proj: self.proj.clone(),
});
}
let new_mask = if predicate_bytecode::should_use_sparse_path(count, nrows_base) {
let existing_indices: Vec<usize> = self.mask.iter_indices().collect();
bc.interpret_sparse(&self.base, &existing_indices, nrows_base)
} else {
let current_mask = self.mask.materialize_mask();
bc.interpret(&self.base, ¤t_mask)
};
let words: Vec<u64> = new_mask.words_slice().to_vec();
return Ok(TidyView {
base: Rc::clone(&self.base),
mask: AdaptiveSelection::from_predicate_result(words, nrows_base),
proj: self.proj.clone(),
});
}
// Materialize once for the legacy columnar path and the row-wise
// fallback below. (Bytecode handles every shape these support, so
// we only reach here on unusual predicate trees.)
let current_mask = self.mask.materialize_mask();
// Legacy O3 columnar path retained as a safety net (e.g. for the
// parity oracle in tests/tidy_tests/test_v2_1_bytecode_parity.rs).
// In production, bytecode handles every shape this path handles, so
// execution rarely reaches here.
if let Some(new_mask) = try_eval_predicate_columnar(&self.base, predicate, ¤t_mask) {
let words: Vec<u64> = new_mask.words_slice().to_vec();
return Ok(TidyView {
base: Rc::clone(&self.base),
mask: AdaptiveSelection::from_predicate_result(words, nrows_base),
proj: self.proj.clone(),
});
}
// Fallback: row-wise evaluation
let mut new_words: Vec<u64> = current_mask.words_slice().to_vec();
// Evaluate predicate over every currently-masked-in row.
// Rows masked out remain 0 (no change needed, AND semantics).
for row in self.mask.iter_indices() {
let b = eval_expr_row_proj(&self.base, predicate, row, &self.proj)?;
let pass = match b {
ExprValue::Bool(v) => v,
_ => {
return Err(TidyError::PredicateNotBool {
got: b.type_name().to_string(),
})
}
};
if !pass {
new_words[row / 64] &= !(1u64 << (row % 64));
}
}
Ok(TidyView {
base: Rc::clone(&self.base),
mask: AdaptiveSelection::from_predicate_result(new_words, nrows_base),
proj: self.proj.clone(),
})
}
// â"€â"€ select â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Project to a subset of named columns (in the given order).
///
/// Returns a new `TidyView` with an updated `ProjectionMap`.
/// No column buffers are copied.
///
/// Edge cases:
/// • 0 columns selected → valid empty-column view (no error).
/// • Unknown column → `TidyError::ColumnNotFound`.
/// • Duplicate column name in `cols` → `TidyError::DuplicateColumn`.
/// • Column ordering is exactly as supplied.
pub fn select(&self, cols: &[&str]) -> Result<TidyView, TidyError> {
// Check for duplicates in the requested list
{
let mut seen = std::collections::BTreeSet::new();
for &name in cols {
if !seen.insert(name) {
return Err(TidyError::DuplicateColumn(name.to_string()));
}
}
}
// Resolve each name to an index in `self.base`
let mut new_indices = Vec::with_capacity(cols.len());
for &name in cols {
let idx = self
.base
.columns
.iter()
.position(|(n, _)| n == name)
.ok_or_else(|| TidyError::ColumnNotFound(name.to_string()))?;
new_indices.push(idx);
}
Ok(TidyView {
base: Rc::clone(&self.base),
mask: self.mask.clone(),
proj: ProjectionMap::from_indices(new_indices),
})
}
// â"€â"€ mutate â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Apply column-wise assignments and return a materialized `TidyFrame`.
///
/// `assignments` is an ordered list of `(col_name, expr)` pairs evaluated
/// left-to-right. Each assignment sees the *snapshot* of columns at entry
/// to the mutate call (snapshot semantics â€" new columns created in earlier
/// assignments are NOT visible to later assignments within the same call).
///
/// Semantics decisions:
/// • Existing column → overwritten (copy-on-write safe).
/// • New column → appended after existing projected columns.
/// • Scalar broadcasting → a scalar expr is broadcast to all visible rows.
/// • Mask-awareness: only masked-in rows are computed; masked-out rows in
/// the materialized output retain the base value (or zero for new cols).
/// • Type promotion: Int + Float → Float; Int overflow → wrapping.
/// • Multiple assignments with the same target name in one call → error.
/// • Mutate on masked view produces a *materialized* `TidyFrame` where
/// only visible rows are present (mask applied during materialization).
pub fn mutate(&self, assignments: &[(&str, DExpr)]) -> Result<TidyFrame, TidyError> {
// Check for duplicate targets within this call
{
let mut seen = std::collections::BTreeSet::new();
for &(name, _) in assignments {
if !seen.insert(name) {
return Err(TidyError::DuplicateColumn(name.to_string()));
}
}
}
// Materialize the view into a fresh DataFrame (mask applied, cols projected)
let mut df = self.materialize()?;
// Snapshot: take column names present before any assignment
let snapshot_names: Vec<String> = df.columns.iter().map(|(n, _)| n.clone()).collect();
for &(col_name, ref expr) in assignments {
// Validate that all column refs in expr exist in the snapshot
validate_expr_columns_snapshot(expr, &snapshot_names)?;
let nrows = df.nrows();
// Evaluate expr for each row to build new column buffer
let new_col = eval_expr_column(&df, expr, nrows)?;
// Find or append column
if let Some(pos) = df.columns.iter().position(|(n, _)| n == col_name) {
df.columns[pos].1 = new_col;
} else {
df.columns.push((col_name.to_string(), new_col));
}
}
Ok(TidyFrame {
inner: Rc::new(RefCell::new(df)),
})
}
// â"€â"€ materialize â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Materialize the view into a new `DataFrame` (applies mask + projection).
///
/// Triggers exactly one allocation per visible column buffer.
/// Rows are emitted in ascending index order (stable/deterministic).
///
/// Edge cases:
/// • Empty rows → 0-row DataFrame.
/// • Empty cols → 0-column DataFrame.
/// • Row-major iteration is stable.
pub fn materialize(&self) -> Result<DataFrame, TidyError> {
let row_indices: Vec<usize> = self.mask.iter_indices().collect();
let mut columns = Vec::with_capacity(self.proj.len());
for &ci in self.proj.indices() {
let (name, col) = &self.base.columns[ci];
let new_col = gather_column(col, &row_indices);
columns.push((name.clone(), new_col));
}
DataFrame::from_columns(columns)
.map_err(|e| TidyError::Internal(e.to_string()))
}
/// Convert visible numeric columns to a tensor (row-major).
///
/// Only `Float` and `Int` columns are supported.
pub fn to_tensor(&self, col_names: &[&str]) -> Result<cjc_runtime::Tensor, TidyError> {
let df = self.materialize()?;
df.to_tensor(col_names)
.map_err(|e| TidyError::Internal(e.to_string()))
}
/// Access a materialized `BitMask` view of the current selection (for
/// testing/inspection). Always returns an owned `BitMask` regardless of
/// the underlying adaptive arm — this preserves the pre-v2 inspection
/// surface without coupling test code to the adaptive enum.
pub fn mask(&self) -> BitMask {
self.mask.materialize_mask()
}
/// Access the underlying adaptive selection (for testing/inspection
/// of the chosen mode). Use `mask()` if you want a materialized
/// `BitMask` view.
pub fn selection(&self) -> &AdaptiveSelection {
&self.mask
}
/// Access the underlying projection (for testing/inspection).
pub fn proj(&self) -> &ProjectionMap {
&self.proj
}
/// Access a column from the underlying base DataFrame by name.
///
/// Returns the raw `Column` (full length, unmasked) â€" callers must apply
/// the mask themselves if needed. Used by `fct_summary_means` and similar.
pub fn base_column(&self, name: &str) -> Option<&Column> {
self.base.columns.iter()
.find(|(n, _)| n == name)
.map(|(_, c)| c)
}
}
// â"€â"€ TidyFrame â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A materialized, mutable DataFrame with copy-on-write alias safety.
///
/// Wraps `Rc<RefCell<DataFrame>>`. Cloning a `TidyFrame` shares the buffer;
/// writing through `mutate()` triggers a deep copy if the refcount > 1,
/// ensuring other views are not corrupted.
#[derive(Debug, Clone)]
pub struct TidyFrame {
inner: Rc<RefCell<DataFrame>>,
}
impl TidyFrame {
/// Wrap an existing `DataFrame`.
pub fn from_df(df: DataFrame) -> Self {
TidyFrame {
inner: Rc::new(RefCell::new(df)),
}
}
/// Get a shared view of the inner DataFrame.
pub fn borrow(&self) -> std::cell::Ref<'_, DataFrame> {
self.inner.borrow()
}
/// Apply further tidy operations on this frame.
pub fn view(&self) -> TidyView {
let df = self.inner.borrow().clone();
TidyView::from_df(df)
}
/// Alias-safe mutate: if this `TidyFrame` is shared, clones first.
pub fn mutate(&mut self, assignments: &[(&str, DExpr)]) -> Result<(), TidyError> {
// Copy-on-write: if refcount > 1, deep-clone the inner DataFrame
if Rc::strong_count(&self.inner) > 1 {
let cloned = self.inner.borrow().clone();
self.inner = Rc::new(RefCell::new(cloned));
}
// Check for duplicate targets
{
let mut seen = std::collections::BTreeSet::new();
for &(name, _) in assignments {
if !seen.insert(name) {
return Err(TidyError::DuplicateColumn(name.to_string()));
}
}
}
let mut df = self.inner.borrow_mut();
// Snapshot column names before mutation
let snapshot_names: Vec<String> = df.columns.iter().map(|(n, _)| n.clone()).collect();
for &(col_name, ref expr) in assignments {
validate_expr_columns_snapshot(expr, &snapshot_names)?;
let nrows = df.nrows();
let new_col = eval_expr_column(&df, expr, nrows)?;
if let Some(pos) = df.columns.iter().position(|(n, _)| n == col_name) {
df.columns[pos].1 = new_col;
} else {
df.columns.push((col_name.to_string(), new_col));
}
}
Ok(())
}
}
// â"€â"€ TidyError â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Errors produced by Phase 10 tidy operations.
#[derive(Debug, Clone, PartialEq)]
pub enum TidyError {
/// A column referenced in the expression was not found.
ColumnNotFound(String),
/// A duplicate column name was supplied in select/mutate.
DuplicateColumn(String),
/// The filter predicate evaluated to a non-boolean value.
PredicateNotBool { got: String },
/// A mutate expression produced a type mismatch.
TypeMismatch { expected: String, got: String },
/// Scalar broadcast to a vector of non-matching length.
LengthMismatch { expected: usize, got: usize },
/// An internal/unexpected error (wraps DataError strings).
Internal(String),
/// `first()` or `last()` called on an empty group.
EmptyGroup,
/// Phase 17: too many distinct levels for a u16 FctColumn.
CapacityExceeded { limit: usize, got: usize },
}
impl fmt::Display for TidyError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
TidyError::ColumnNotFound(n) => write!(f, "column `{}` not found", n),
TidyError::DuplicateColumn(n) => write!(f, "duplicate column `{}`", n),
TidyError::PredicateNotBool { got } => {
write!(f, "filter predicate must be Bool, got {}", got)
}
TidyError::TypeMismatch { expected, got } => {
write!(f, "type mismatch: expected {}, got {}", expected, got)
}
TidyError::LengthMismatch { expected, got } => {
write!(
f,
"length mismatch: expected {} rows, got {}",
expected, got
)
}
TidyError::Internal(msg) => write!(f, "internal error: {}", msg),
TidyError::EmptyGroup => write!(f, "aggregation on empty group"),
TidyError::CapacityExceeded { limit, got } => {
write!(f, "factor capacity exceeded: limit {} distinct levels, got {}", limit, got)
}
}
}
}
impl std::error::Error for TidyError {}
// â"€â"€ Internal helpers â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
// Note: tidy uses the existing `gather_column(col, indices)` defined earlier in
// this file (line ~1160). No duplicate needed.
/// Evaluate a `DExpr` for all `nrows` rows, returning a typed `Column`.
///
/// Int + Float → Float promotion.
/// Int overflow → wrapping (i64 wrapping_add/mul etc.).
/// Scalar expression → broadcast to all rows.
/// Extract f64 values for all rows from a sub-expression.
fn extract_f64_column(df: &DataFrame, expr: &DExpr, nrows: usize) -> Result<Vec<f64>, TidyError> {
let col = eval_expr_column(df, expr, nrows)?;
match col {
Column::Float(v) => Ok(v),
Column::Int(v) => Ok(v.into_iter().map(|i| i as f64).collect()),
_ => Err(TidyError::TypeMismatch {
expected: "numeric".into(),
got: "non-numeric".into(),
}),
}
}
/// Evaluate window DExpr variants that need full-column context.
/// Returns `Ok(Some(column))` if expr is a window function, `Ok(None)` otherwise.
fn eval_window_column(
df: &DataFrame,
expr: &DExpr,
nrows: usize,
) -> Result<Option<Column>, TidyError> {
match expr {
DExpr::RowNumber => {
let vals: Vec<i64> = (1..=nrows as i64).collect();
Ok(Some(Column::Int(vals)))
}
DExpr::CumSum(inner) => {
let src = extract_f64_column(df, inner, nrows)?;
let mut result = Vec::with_capacity(nrows);
let mut sum = 0.0_f64;
let mut comp = 0.0_f64; // Kahan compensation
for &v in &src {
let y = v - comp;
let t = sum + y;
comp = (t - sum) - y;
sum = t;
result.push(sum);
}
Ok(Some(Column::Float(result)))
}
DExpr::CumProd(inner) => {
let src = extract_f64_column(df, inner, nrows)?;
let mut result = Vec::with_capacity(nrows);
let mut prod = 1.0_f64;
for &v in &src {
prod *= v;
result.push(prod);
}
Ok(Some(Column::Float(result)))
}
DExpr::CumMax(inner) => {
let src = extract_f64_column(df, inner, nrows)?;
let mut result = Vec::with_capacity(nrows);
let mut max = f64::NEG_INFINITY;
for &v in &src {
if v > max { max = v; }
result.push(max);
}
Ok(Some(Column::Float(result)))
}
DExpr::CumMin(inner) => {
let src = extract_f64_column(df, inner, nrows)?;
let mut result = Vec::with_capacity(nrows);
let mut min = f64::INFINITY;
for &v in &src {
if v < min { min = v; }
result.push(min);
}
Ok(Some(Column::Float(result)))
}
DExpr::Lag(inner, k) => {
let src = extract_f64_column(df, inner, nrows)?;
let mut result = Vec::with_capacity(nrows);
for i in 0..nrows {
if i < *k {
result.push(f64::NAN);
} else {
result.push(src[i - k]);
}
}
Ok(Some(Column::Float(result)))
}
DExpr::Lead(inner, k) => {
let src = extract_f64_column(df, inner, nrows)?;
let mut result = Vec::with_capacity(nrows);
for i in 0..nrows {
if i + k >= nrows {
result.push(f64::NAN);
} else {
result.push(src[i + k]);
}
}
Ok(Some(Column::Float(result)))
}
DExpr::Rank(inner) => {
let src = extract_f64_column(df, inner, nrows)?;
// Average rank (1-based): sort indices, assign ranks, average ties
let mut indexed: Vec<(usize, f64)> = src.iter().cloned().enumerate().collect();
indexed.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let mut ranks = vec![0.0_f64; nrows];
let mut i = 0;
while i < nrows {
let mut j = i;
while j < nrows && indexed[j].1 == indexed[i].1 {
j += 1;
}
let avg_rank = (i + 1 + j) as f64 / 2.0; // 1-based average
for idx in i..j {
ranks[indexed[idx].0] = avg_rank;
}
i = j;
}
Ok(Some(Column::Float(ranks)))
}
DExpr::DenseRank(inner) => {
let src = extract_f64_column(df, inner, nrows)?;
let mut indexed: Vec<(usize, f64)> = src.iter().cloned().enumerate().collect();
indexed.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
let mut ranks = vec![0_i64; nrows];
let mut rank = 0_i64;
let mut prev: Option<f64> = None;
for &(orig_idx, val) in &indexed {
if prev.is_none() || prev.unwrap() != val {
rank += 1;
}
ranks[orig_idx] = rank;
prev = Some(val);
}
Ok(Some(Column::Int(ranks)))
}
DExpr::RollingSum(col_name, window) => {
let vals = rolling_get_floats(df, col_name)?;
let n = vals.len();
let w = *window;
let mut result = Vec::with_capacity(n);
let mut sum = 0.0_f64;
let mut comp = 0.0_f64;
for i in 0..n {
// Kahan add entering element
let y = vals[i] - comp;
let t = sum + y;
comp = (t - sum) - y;
sum = t;
// Remove leaving element if window is full
if i >= w {
let y2 = -vals[i - w] - comp;
let t2 = sum + y2;
comp = (t2 - sum) - y2;
sum = t2;
}
result.push(sum);
}
Ok(Some(Column::Float(result)))
}
DExpr::RollingMean(col_name, window) => {
let vals = rolling_get_floats(df, col_name)?;
let n = vals.len();
let w = *window;
let mut result = Vec::with_capacity(n);
let mut sum = 0.0_f64;
let mut comp = 0.0_f64;
for i in 0..n {
let y = vals[i] - comp;
let t = sum + y;
comp = (t - sum) - y;
sum = t;
if i >= w {
let y2 = -vals[i - w] - comp;
let t2 = sum + y2;
comp = (t2 - sum) - y2;
sum = t2;
}
let count = if i < w { i + 1 } else { w };
result.push(sum / count as f64);
}
Ok(Some(Column::Float(result)))
}
DExpr::RollingMin(col_name, window) => {
let vals = rolling_get_floats(df, col_name)?;
let n = vals.len();
let w = *window;
let mut result = Vec::with_capacity(n);
let mut deque: VecDeque<usize> = VecDeque::new();
for i in 0..n {
// Remove elements outside window
while !deque.is_empty() && *deque.front().unwrap() + w <= i {
deque.pop_front();
}
// Remove elements >= current (maintain increasing monotonic deque)
while !deque.is_empty() && vals[*deque.back().unwrap()] >= vals[i] {
deque.pop_back();
}
deque.push_back(i);
result.push(vals[*deque.front().unwrap()]);
}
Ok(Some(Column::Float(result)))
}
DExpr::RollingMax(col_name, window) => {
let vals = rolling_get_floats(df, col_name)?;
let n = vals.len();
let w = *window;
let mut result = Vec::with_capacity(n);
let mut deque: VecDeque<usize> = VecDeque::new();
for i in 0..n {
while !deque.is_empty() && *deque.front().unwrap() + w <= i {
deque.pop_front();
}
// Remove elements <= current (maintain decreasing monotonic deque)
while !deque.is_empty() && vals[*deque.back().unwrap()] <= vals[i] {
deque.pop_back();
}
deque.push_back(i);
result.push(vals[*deque.front().unwrap()]);
}
Ok(Some(Column::Float(result)))
}
DExpr::RollingVar(col_name, window) => {
let vals = rolling_get_floats(df, col_name)?;
let n = vals.len();
let w = *window;
let mut result = Vec::with_capacity(n);
// Welford's online algorithm with removal
let mut count = 0_usize;
let mut mean = 0.0_f64;
let mut m2 = 0.0_f64;
for i in 0..n {
// Add entering element
count += 1;
let delta = vals[i] - mean;
mean += delta / count as f64;
let delta2 = vals[i] - mean;
m2 += delta * delta2;
// Remove leaving element if window is full
if i >= w {
let old = vals[i - w];
count -= 1;
if count == 0 {
mean = 0.0;
m2 = 0.0;
} else {
let delta_old = old - mean;
mean -= delta_old / count as f64;
let delta_old2 = old - mean;
m2 -= delta_old * delta_old2;
}
}
if count < 2 {
result.push(0.0);
} else {
// Population variance (not sample): m2 / count
// Use sample variance (Bessel's correction): m2 / (count - 1)
result.push(m2 / (count - 1) as f64);
}
}
Ok(Some(Column::Float(result)))
}
DExpr::RollingSd(col_name, window) => {
let vals = rolling_get_floats(df, col_name)?;
let n = vals.len();
let w = *window;
let mut result = Vec::with_capacity(n);
let mut count = 0_usize;
let mut mean = 0.0_f64;
let mut m2 = 0.0_f64;
for i in 0..n {
count += 1;
let delta = vals[i] - mean;
mean += delta / count as f64;
let delta2 = vals[i] - mean;
m2 += delta * delta2;
if i >= w {
let old = vals[i - w];
count -= 1;
if count == 0 {
mean = 0.0;
m2 = 0.0;
} else {
let delta_old = old - mean;
mean -= delta_old / count as f64;
let delta_old2 = old - mean;
m2 -= delta_old * delta_old2;
}
}
if count < 2 {
result.push(0.0);
} else {
result.push((m2 / (count - 1) as f64).sqrt());
}
}
Ok(Some(Column::Float(result)))
}
_ => Ok(None),
}
}
/// Extract a float column from a DataFrame by name (for rolling window functions).
fn rolling_get_floats(df: &DataFrame, col_name: &str) -> Result<Vec<f64>, TidyError> {
let col = df
.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.to_string()))?;
match col {
Column::Float(v) => Ok(v.clone()),
Column::Int(v) => Ok(v.iter().map(|&i| i as f64).collect()),
_ => Err(TidyError::TypeMismatch {
expected: "numeric".into(),
got: "non-numeric".into(),
}),
}
}
// -- O7: Vectorized column-level DExpr evaluation --------------------------------
/// Apply a binary operation element-wise on two columns.
/// Mirrors the semantics of `eval_binop` exactly for bit-identical results.
fn vectorized_binop(op: DBinOp, left: &Column, right: &Column) -> Result<Column, TidyError> {
match (left, right) {
(Column::Int(a), Column::Int(b)) => {
let n = a.len();
match op {
DBinOp::Add => { let mut r = vec![0i64; n]; for i in 0..n { r[i] = a[i] + b[i]; } Ok(Column::Int(r)) }
DBinOp::Sub => { let mut r = vec![0i64; n]; for i in 0..n { r[i] = a[i] - b[i]; } Ok(Column::Int(r)) }
DBinOp::Mul => { let mut r = vec![0i64; n]; for i in 0..n { r[i] = a[i] * b[i]; } Ok(Column::Int(r)) }
DBinOp::Div => { let mut r = vec![0i64; n]; for i in 0..n { r[i] = a[i] / b[i]; } Ok(Column::Int(r)) }
DBinOp::Gt => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] > b[i]; } Ok(Column::Bool(r)) }
DBinOp::Lt => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] < b[i]; } Ok(Column::Bool(r)) }
DBinOp::Ge => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] >= b[i]; } Ok(Column::Bool(r)) }
DBinOp::Le => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] <= b[i]; } Ok(Column::Bool(r)) }
DBinOp::Eq => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] == b[i]; } Ok(Column::Bool(r)) }
DBinOp::Ne => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] != b[i]; } Ok(Column::Bool(r)) }
_ => Err(TidyError::Internal(format!("unsupported op {:?} on Int", op))),
}
}
(Column::Float(a), Column::Float(b)) => {
let n = a.len();
match op {
DBinOp::Add => { let mut r = vec![0.0f64; n]; for i in 0..n { r[i] = a[i] + b[i]; } Ok(Column::Float(r)) }
DBinOp::Sub => { let mut r = vec![0.0f64; n]; for i in 0..n { r[i] = a[i] - b[i]; } Ok(Column::Float(r)) }
DBinOp::Mul => { let mut r = vec![0.0f64; n]; for i in 0..n { r[i] = a[i] * b[i]; } Ok(Column::Float(r)) }
DBinOp::Div => { let mut r = vec![0.0f64; n]; for i in 0..n { r[i] = a[i] / b[i]; } Ok(Column::Float(r)) }
DBinOp::Gt => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] > b[i]; } Ok(Column::Bool(r)) }
DBinOp::Lt => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] < b[i]; } Ok(Column::Bool(r)) }
DBinOp::Ge => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] >= b[i]; } Ok(Column::Bool(r)) }
DBinOp::Le => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] <= b[i]; } Ok(Column::Bool(r)) }
DBinOp::Eq => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] == b[i]; } Ok(Column::Bool(r)) }
DBinOp::Ne => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] != b[i]; } Ok(Column::Bool(r)) }
_ => Err(TidyError::Internal(format!("unsupported op {:?} on Float", op))),
}
}
(Column::Int(a), Column::Float(_b)) => {
let promoted: Vec<f64> = a.iter().map(|&v| v as f64).collect();
vectorized_binop(op, &Column::Float(promoted), right)
}
(Column::Float(_a), Column::Int(b)) => {
let promoted: Vec<f64> = b.iter().map(|&v| v as f64).collect();
vectorized_binop(op, left, &Column::Float(promoted))
}
(Column::Bool(a), Column::Bool(b)) => {
let n = a.len();
match op {
DBinOp::And => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] && b[i]; } Ok(Column::Bool(r)) }
DBinOp::Or => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] || b[i]; } Ok(Column::Bool(r)) }
DBinOp::Eq => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] == b[i]; } Ok(Column::Bool(r)) }
DBinOp::Ne => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] != b[i]; } Ok(Column::Bool(r)) }
_ => Err(TidyError::Internal(format!("unsupported op {:?} on Bool", op))),
}
}
(Column::Str(a), Column::Str(b)) => {
let n = a.len();
match op {
DBinOp::Eq => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] == b[i]; } Ok(Column::Bool(r)) }
DBinOp::Ne => { let mut r = vec![false; n]; for i in 0..n { r[i] = a[i] != b[i]; } Ok(Column::Bool(r)) }
_ => Err(TidyError::Internal(format!("unsupported op {:?} on String", op))),
}
}
_ => Err(TidyError::Internal("type mismatch in binary operation".into())),
}
}
/// Apply a unary math function element-wise to a column.
/// Mirrors the semantics of the FnCall arm in `eval_expr_row` exactly.
fn vectorized_fn_call(name: &str, arg: &Column) -> Result<Column, TidyError> {
let floats: Vec<f64> = match arg {
Column::Float(v) => v.clone(),
Column::Int(v) => v.iter().map(|&i| i as f64).collect(),
_ => return Err(TidyError::Internal(format!(
"FnCall '{}' requires numeric argument", name
))),
};
let f: fn(f64) -> f64 = match name {
"log" => f64::ln,
"exp" => f64::exp,
"sqrt" => f64::sqrt,
"abs" => f64::abs,
"ceil" => f64::ceil,
"floor" => f64::floor,
"round" => f64::round,
"sin" => f64::sin,
"cos" => f64::cos,
"tan" => f64::tan,
_ => return Err(TidyError::Internal(format!(
"unknown DExpr function: {}", name
))),
};
let mut result = vec![0.0f64; floats.len()];
for i in 0..floats.len() {
result[i] = f(floats[i]);
}
Ok(Column::Float(result))
}
/// O7: Try vectorized column-level evaluation of a DExpr.
/// Returns `None` if the expression is too complex for the fast path,
/// causing the caller to fall back to the row-by-row evaluator.
fn try_eval_expr_column_vectorized(
df: &DataFrame,
expr: &DExpr,
nrows: usize,
) -> Option<Result<Column, TidyError>> {
match expr {
DExpr::Col(name) => {
let col = df.get_column(name)?;
let result = match col {
Column::Int(v) => Column::Int(v[..nrows].to_vec()),
Column::Float(v) => Column::Float(v[..nrows].to_vec()),
Column::Str(v) => Column::Str(v[..nrows].to_vec()),
Column::Bool(v) => Column::Bool(v[..nrows].to_vec()),
Column::Categorical { levels, codes } => {
let strs: Vec<String> = codes[..nrows]
.iter()
.map(|&c| levels[c as usize].clone())
.collect();
Column::Str(strs)
}
Column::CategoricalAdaptive(cc) => {
let strs: Vec<String> = (0..nrows)
.map(|i| match cc.get(i) {
None => String::new(),
Some(b) => String::from_utf8_lossy(b).into_owned(),
})
.collect();
Column::Str(strs)
}
Column::DateTime(v) => Column::Int(v[..nrows].to_vec()),
};
Some(Ok(result))
}
DExpr::LitFloat(v) => Some(Ok(Column::Float(vec![*v; nrows]))),
DExpr::LitInt(v) => Some(Ok(Column::Int(vec![*v; nrows]))),
DExpr::LitBool(b) => Some(Ok(Column::Bool(vec![*b; nrows]))),
DExpr::LitStr(s) => Some(Ok(Column::Str(vec![s.clone(); nrows]))),
DExpr::BinOp { op, left, right } => {
let left_col = try_eval_expr_column_vectorized(df, left, nrows)?.ok()?;
let right_col = try_eval_expr_column_vectorized(df, right, nrows)?.ok()?;
Some(vectorized_binop(*op, &left_col, &right_col))
}
DExpr::FnCall(name, args) if args.len() == 1 => {
let arg_col = try_eval_expr_column_vectorized(df, &args[0], nrows)?.ok()?;
Some(vectorized_fn_call(name, &arg_col))
}
_ => None,
}
}
fn eval_expr_column(df: &DataFrame, expr: &DExpr, nrows: usize) -> Result<Column, TidyError> {
if nrows == 0 {
// Infer column type from a dry-run on nothing; default to Float for empty
return Ok(Column::Float(vec![]));
}
// v3 Phase 6: cat-aware mutate. When the expression is a bare `Col(name)`
// referring to a Categorical or CategoricalAdaptive column, return that
// column verbatim. Pre-Phase-6 the row-by-row fallback materialized a
// `Vec<String>` (one per row) and built a `Column::Str` — this loses
// the level table and forces a downstream re-categorization to recover
// it. Pass-through preserves both type and (for Adaptive) the
// dictionary's frozen / shared / sealed state.
if let DExpr::Col(name) = expr {
if let Some(src) = df.get_column(name) {
match src {
Column::Categorical { .. } | Column::CategoricalAdaptive(_) => {
return Ok(src.clone());
}
_ => {}
}
}
}
// Handle window functions at column level
if let Some(col) = eval_window_column(df, expr, nrows)? {
return Ok(col);
}
// O7: try vectorized fast path before falling back to row-by-row
if let Some(result) = try_eval_expr_column_vectorized(df, expr, nrows) {
return result;
}
// Evaluate row 0 to determine result type
let sample = eval_dexpr_row(df, expr, 0)?;
match sample {
ExprValue::Int(_) => {
let vals: Result<Vec<i64>, TidyError> = (0..nrows)
.map(|r| {
eval_dexpr_row(df, expr, r).and_then(|v| match v {
ExprValue::Int(i) => Ok(i),
ExprValue::Float(f) => Ok(f as i64),
other => Err(TidyError::TypeMismatch {
expected: "Int".into(),
got: other.type_name().into(),
}),
})
})
.collect();
Ok(Column::Int(vals?))
}
ExprValue::Float(_) => {
let vals: Result<Vec<f64>, TidyError> = (0..nrows)
.map(|r| {
eval_dexpr_row(df, expr, r).and_then(|v| match v {
ExprValue::Float(f) => Ok(f),
ExprValue::Int(i) => Ok(i as f64),
other => Err(TidyError::TypeMismatch {
expected: "Float".into(),
got: other.type_name().into(),
}),
})
})
.collect();
Ok(Column::Float(vals?))
}
ExprValue::Bool(_) => {
let vals: Result<Vec<bool>, TidyError> = (0..nrows)
.map(|r| {
eval_dexpr_row(df, expr, r).and_then(|v| match v {
ExprValue::Bool(b) => Ok(b),
other => Err(TidyError::TypeMismatch {
expected: "Bool".into(),
got: other.type_name().into(),
}),
})
})
.collect();
Ok(Column::Bool(vals?))
}
ExprValue::Str(_) => {
let vals: Result<Vec<String>, TidyError> = (0..nrows)
.map(|r| {
eval_dexpr_row(df, expr, r).and_then(|v| match v {
ExprValue::Str(s) => Ok(s),
other => Err(TidyError::TypeMismatch {
expected: "Str".into(),
got: other.type_name().into(),
}),
})
})
.collect();
Ok(Column::Str(vals?))
}
}
}
/// Evaluate a `DExpr` at a single row (returns `ExprValue`).
fn eval_dexpr_row(df: &DataFrame, expr: &DExpr, row: usize) -> Result<ExprValue, TidyError> {
eval_expr_row(df, expr, row).map_err(|e| TidyError::Internal(e.to_string()))
}
/// Evaluate a `DExpr` at a single row using the projection-aware base.
fn eval_expr_row_proj(
base: &DataFrame,
expr: &DExpr,
row: usize,
_proj: &ProjectionMap,
) -> Result<ExprValue, TidyError> {
// We always evaluate against the full base (all columns accessible).
// Projection only restricts what filter/select *expose*, not what predicates
// can reference in the base frame.
eval_expr_row(base, expr, row).map_err(|e| TidyError::Internal(e.to_string()))
}
/// Validate that all column references in `expr` exist in `base` (via projection
/// columns â€" filter predicates may reference any base column visible in proj).
///
/// For simplicity: filter predicates may reference ANY column in `base` because
/// the view is a window over the same base DataFrame. This is analogous to SQL
/// WHERE clauses that can reference any column, not just SELECT-listed ones.
fn validate_expr_columns_proj(
expr: &DExpr,
base: &DataFrame,
_proj: &ProjectionMap,
) -> Result<(), TidyError> {
let mut refs = Vec::new();
collect_expr_columns(expr, &mut refs);
for col_name in refs {
if base.get_column(&col_name).is_none() {
return Err(TidyError::ColumnNotFound(col_name));
}
}
Ok(())
}
/// Validate that all column references in `expr` exist in `snapshot_names`.
fn validate_expr_columns_snapshot(
expr: &DExpr,
snapshot_names: &[String],
) -> Result<(), TidyError> {
let mut refs = Vec::new();
collect_expr_columns(expr, &mut refs);
for col_name in refs {
if !snapshot_names.iter().any(|n| n == &col_name) {
return Err(TidyError::ColumnNotFound(col_name));
}
}
Ok(())
}
impl ExprValue {
fn type_name(&self) -> &'static str {
match self {
ExprValue::Int(_) => "Int",
ExprValue::Float(_) => "Float",
ExprValue::Str(_) => "Str",
ExprValue::Bool(_) => "Bool",
}
}
}
// â"€â"€ DataFrame::tidy() convenience entry point â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl DataFrame {
/// Wrap this DataFrame as a `TidyView` for Phase 10 tidy operations.
///
/// Consumes `self` (zero-copy â€" moves into an `Rc`).
pub fn tidy(self) -> TidyView {
TidyView::from_df(self)
}
/// Wrap this DataFrame as a `TidyFrame` for mutable tidy operations.
pub fn tidy_mut(self) -> TidyFrame {
TidyFrame::from_df(self)
}
}
// â"€â"€ NoGC annotation gate â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
//
// The @nogc verifier in cjc-mir/src/nogc_verify.rs tracks "safe builtins" that
// are known not to trigger GC. Phase 10 tidy operations on the Rust side are
// outside the CJC language runtime (they are library calls), so @nogc
// annotation at the CJC language level means: the CJC function body does not
// call gc_alloc. The Rust implementation of filter/select produces a TidyView
// that holds Rc references â€" no GC heap involvement.
//
// For the NoGC verifier to accept tidy calls inside @nogc CJC functions, the
// builtins "tidy_filter", "tidy_select", "tidy_materialize" must be added to
// the safe-builtins list in cjc-mir/src/nogc_verify.rs. See that file for
// the `is_safe_builtin` function.
//
// Allocation budget per operation (no GC heap, only Rust stack/heap via alloc):
// filter : O(N/64) u64 words for new mask (≈ 8 bytes / 64 rows)
// select : O(K) usize indices (K = ncols selected)
// mutate : O(N) per new column buffer (allowed â€" one allocation per column)
// materialize: O(N * K) total for visible data
// â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•
// Phase 11â€"12: Grouping, Summarise, Arrange, Slice, Distinct, Joins
// â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•
//
// Spec-Lock Table
// â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
// | Decision point | Choice |
// |---------------------------|--------------------------------------------------|
// | Group ordering | First-occurrence order of keys in visible rows |
// | Null/missing in keys | Not applicable (CJC Column has no null type); |
// | | NaN in Float keys: each NaN is its own group key |
// | | (NaN != NaN → separate groups per NaN position) |
// | summarise output order | Stable: same order as group creation |
// | Empty group agg behavior | count→0, sum→0.0, mean→NaN, min→NaN, max→NaN, |
// | | first/last → TidyError::EmptyGroup |
// | arrange tie-breaking | Stable sort (Rust's slice::sort_by is stable); |
// | | equal-key rows preserve original row order |
// | NaN ordering in arrange | NaN sorts LAST (greater than any finite value) |
// | null ordering | N/A â€" no null type in CJC |
// | slice_sample seed | Deterministic LCG with caller-supplied u64 seed |
// | slice_sample n > nrows | Clamp to nrows (no error) |
// | distinct ordering | First-occurrence order of distinct key combos |
// | Join left order | Preserved â€" output rows follow left row order |
// | Join right match order | Stable: sorted by right-side row index ascending |
// | Null matching in joins | N/A â€" no null type in CJC |
// | Join duplicate keys | All matches included, deterministic order |
// | many-many explosion order | Left outer loop, right inner loop (stable) |
// â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
// â"€â"€ RowIndexMap â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A permutation / selection vector over base-frame row indices.
///
/// Used by `arrange` (sort) and `slice` to represent a reordering of the
/// rows visible through a `TidyView`'s bitmask, without copying column data.
/// The indices stored here are indices into the BASE `DataFrame`, not relative
/// to the mask.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RowIndexMap {
/// Ordered list of base-frame row indices that are visible after this op.
/// Length == number of visible rows.
pub(crate) indices: Vec<usize>,
}
impl RowIndexMap {
/// Create a new row index map from explicit indices.
pub fn new(indices: Vec<usize>) -> Self {
RowIndexMap { indices }
}
/// Returns the number of visible rows.
pub fn len(&self) -> usize {
self.indices.len()
}
/// Returns `true` if no rows are selected.
pub fn is_empty(&self) -> bool {
self.indices.is_empty()
}
/// Borrow the underlying indices as a slice.
pub fn as_slice(&self) -> &[usize] {
&self.indices
}
}
// â"€â"€ GroupMeta â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Metadata for one group in a `GroupIndex`.
#[derive(Debug, Clone)]
pub struct GroupMeta {
/// The rendered key strings (one per grouping column), in key order.
pub key_values: Vec<String>,
/// Base-frame row indices belonging to this group, in first-occurrence order.
pub row_indices: Vec<usize>,
}
// â"€â"€ GroupIndex â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A deterministic group index built from a `TidyView`.
///
/// Groups are created in **first-occurrence order**: the first time a key
/// combination appears (scanning visible rows in ascending base-row order),
/// a new group entry is appended. This guarantees a stable, reproducible output
/// ordering regardless of hash-map iteration order.
///
/// No column buffers are copied during group construction.
#[derive(Debug, Clone)]
pub struct GroupIndex {
/// Groups in first-occurrence order.
pub groups: Vec<GroupMeta>,
/// The column names used as group keys (projected names).
pub key_names: Vec<String>,
}
impl GroupIndex {
/// Build a `GroupIndex` from a materialized set of visible rows.
///
/// `key_col_indices` are indices into `base.columns`.
/// `visible_rows` are base-frame row indices in ascending order.
pub fn build(
base: &DataFrame,
key_col_indices: &[usize],
visible_rows: &[usize],
key_names: Vec<String>,
) -> Self {
// Use a Vec<(key_tuple, group_slot)> with sequential scan to preserve
// first-occurrence ordering without hash nondeterminism.
let mut group_order: Vec<Vec<String>> = Vec::new(); // unique keys, in order seen
let mut group_map: Vec<(Vec<String>, usize)> = Vec::new(); // (key → slot index)
for &row in visible_rows {
let key: Vec<String> = key_col_indices
.iter()
.map(|&ci| base.columns[ci].1.get_display(row))
.collect();
// Linear scan for existing key â€" preserves insertion order, no hash
let slot = group_map
.iter()
.position(|(k, _)| k == &key)
.unwrap_or_else(|| {
let s = group_map.len();
group_map.push((key.clone(), s));
group_order.push(key);
s
});
let _ = slot; // we'll rebuild properly below
}
// Build groups vector in insertion order
let mut groups: Vec<GroupMeta> = group_order
.iter()
.map(|k| GroupMeta {
key_values: k.clone(),
row_indices: Vec::new(),
})
.collect();
// Second pass: assign rows to groups
let key_to_slot: Vec<(Vec<String>, usize)> = group_order
.iter()
.enumerate()
.map(|(i, k)| (k.clone(), i))
.collect();
for &row in visible_rows {
let key: Vec<String> = key_col_indices
.iter()
.map(|&ci| base.columns[ci].1.get_display(row))
.collect();
if let Some((_, slot)) = key_to_slot.iter().find(|(k, _)| k == &key) {
groups[*slot].row_indices.push(row);
}
}
GroupIndex { groups, key_names }
}
}
// â"€â"€ GroupedTidyView â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A grouped view produced by `TidyView::group_by(...)`.
///
/// Holds the original `TidyView` (unchanged) plus a `GroupIndex` over its
/// visible rows. No column data is copied.
///
/// After grouping, `ungroup()` restores the plain `TidyView`.
/// `summarise()` collapses each group into one summary row.
#[derive(Debug, Clone)]
/// A TidyView that has been grouped by one or more columns.
///
/// Created by [`TidyView::group_by`]. Holds the original view plus a
/// [`GroupIndex`] that maps rows to groups. Call [`summarise`](Self::summarise)
/// to aggregate or [`ungroup`](Self::ungroup) to return to a flat view.
pub struct GroupedTidyView {
view: TidyView,
index: GroupIndex,
}
impl GroupedTidyView {
/// Return the number of groups.
pub fn ngroups(&self) -> usize {
self.index.groups.len()
}
/// Discard grouping, returning the original `TidyView` unchanged.
pub fn ungroup(self) -> TidyView {
self.view
}
/// Access the group index (for testing/inspection).
pub fn group_index(&self) -> &GroupIndex {
&self.index
}
// â"€â"€ summarise â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Collapse each group to one summary row using named aggregators.
///
/// `assignments` is an ordered list of `(output_name, aggregator)`.
/// Output rows are in first-occurrence group order (deterministic).
///
/// The result is a `TidyFrame` containing:
/// • One column per group key (in key order)
/// • One column per aggregator (in assignment order)
///
/// Aggregator semantics (empty group):
/// • count → 0
/// • sum → 0.0
/// • mean → f64::NAN
/// • min → f64::NAN
/// • max → f64::NAN
/// • first / last → TidyError::EmptyGroup
pub fn summarise(
&self,
assignments: &[(&str, TidyAgg)],
) -> Result<TidyFrame, TidyError> {
// Validate: no duplicate output names
{
let mut seen = std::collections::BTreeSet::new();
for &(name, _) in assignments {
if !seen.insert(name) {
return Err(TidyError::DuplicateColumn(name.to_string()));
}
}
}
let base = &self.view.base;
let n_groups = self.index.groups.len();
// Build key columns first (one value per group, repeated in type-matched form)
let mut result_columns: Vec<(String, Column)> = Vec::new();
for key_name in &self.index.key_names {
let base_col = base
.get_column(key_name)
.ok_or_else(|| TidyError::ColumnNotFound(key_name.clone()))?;
let col = match base_col {
Column::Int(_) => {
let vals: Vec<i64> = self
.index
.groups
.iter()
.map(|g| {
g.key_values[self
.index
.key_names
.iter()
.position(|k| k == key_name)
.unwrap()]
.parse::<i64>()
.unwrap_or(0)
})
.collect();
Column::Int(vals)
}
Column::Bool(_) => {
let vals: Vec<bool> = self
.index
.groups
.iter()
.map(|g| {
let s = &g.key_values[self
.index
.key_names
.iter()
.position(|k| k == key_name)
.unwrap()];
matches!(s.as_str(), "true" | "1")
})
.collect();
Column::Bool(vals)
}
_ => {
// Float and Str: store key as Str column for the summary
let vals: Vec<String> = self
.index
.groups
.iter()
.map(|g| {
g.key_values[self
.index
.key_names
.iter()
.position(|k| k == key_name)
.unwrap()]
.clone()
})
.collect();
Column::Str(vals)
}
};
result_columns.push((key_name.clone(), col));
}
// Build aggregator columns (O5+O9: use fast path)
for &(out_name, ref agg) in assignments {
let col_vals = self.eval_agg_over_groups_fast(agg, n_groups, base)?;
result_columns.push((out_name.to_string(), col_vals));
}
let df = DataFrame::from_columns(result_columns)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
/// Evaluate an aggregator over all groups, return a typed `Column`.
#[allow(dead_code)]
fn eval_agg_over_groups(
&self,
agg: &TidyAgg,
n_groups: usize,
base: &DataFrame,
) -> Result<Column, TidyError> {
match agg {
TidyAgg::Count => {
let counts: Vec<i64> = self
.index
.groups
.iter()
.map(|g| g.row_indices.len() as i64)
.collect();
Ok(Column::Int(counts))
}
TidyAgg::Sum(col_name) | TidyAgg::Mean(col_name)
| TidyAgg::Min(col_name) | TidyAgg::Max(col_name)
| TidyAgg::First(col_name) | TidyAgg::Last(col_name)
| TidyAgg::Median(col_name) | TidyAgg::Sd(col_name)
| TidyAgg::Var(col_name) | TidyAgg::Quantile(col_name, _)
| TidyAgg::NDistinct(col_name) | TidyAgg::Iqr(col_name) => {
let src = base
.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
let mut vals = Vec::with_capacity(n_groups);
for group in &self.index.groups {
let v = agg_reduce(agg, src, &group.row_indices)?;
vals.push(v);
}
Ok(Column::Float(vals))
}
}
}
/// O5+O9: Fast aggregation using direct index iteration and arena buffer.
/// Produces bit-identical results to `eval_agg_over_groups`.
fn eval_agg_over_groups_fast(
&self,
agg: &TidyAgg,
n_groups: usize,
base: &DataFrame,
) -> Result<Column, TidyError> {
match agg {
TidyAgg::Count => {
let counts: Vec<i64> = self
.index
.groups
.iter()
.map(|g| g.row_indices.len() as i64)
.collect();
Ok(Column::Int(counts))
}
TidyAgg::Sum(col_name) => {
let src = base.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
Ok(Column::Float(fast_agg_sum(&self.index.groups, src)?))
}
TidyAgg::Mean(col_name) => {
let src = base.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
Ok(Column::Float(fast_agg_mean(&self.index.groups, src)?))
}
TidyAgg::Min(col_name) => {
let src = base.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
Ok(Column::Float(fast_agg_min(&self.index.groups, src)?))
}
TidyAgg::Max(col_name) => {
let src = base.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
Ok(Column::Float(fast_agg_max(&self.index.groups, src)?))
}
TidyAgg::First(col_name) => {
let src = base.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
Ok(Column::Float(fast_agg_first(&self.index.groups, src)?))
}
TidyAgg::Last(col_name) => {
let src = base.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
Ok(Column::Float(fast_agg_last(&self.index.groups, src)?))
}
TidyAgg::Var(col_name)
| TidyAgg::Sd(col_name)
| TidyAgg::Median(col_name)
| TidyAgg::Quantile(col_name, _)
| TidyAgg::NDistinct(col_name)
| TidyAgg::Iqr(col_name) => {
let src = base.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
Ok(Column::Float(fast_agg_arena(
agg, &self.index.groups, src, n_groups,
)?))
}
}
}
}
// -- O5: Direct index-based aggregation (no per-group Vec) --------------------
enum ColRef<'a> {
Float(&'a [f64]),
Int(&'a [i64]),
}
fn col_to_ref(col: &Column) -> Result<ColRef<'_>, TidyError> {
match col {
Column::Float(v) => Ok(ColRef::Float(v)),
Column::Int(v) => Ok(ColRef::Int(v)),
_ => Err(TidyError::TypeMismatch {
expected: "numeric (Int or Float)".into(),
got: col.type_name().into(),
}),
}
}
fn fast_agg_sum(groups: &[GroupMeta], col: &Column) -> Result<Vec<f64>, TidyError> {
use cjc_repro::kahan::KahanAccumulatorF64;
let cr = col_to_ref(col)?;
Ok(groups.iter().map(|g| {
let mut acc = KahanAccumulatorF64::new();
match cr {
ColRef::Float(d) => { for &i in &g.row_indices { acc.add(d[i]); } }
ColRef::Int(d) => { for &i in &g.row_indices { acc.add(d[i] as f64); } }
}
acc.finalize()
}).collect())
}
fn fast_agg_mean(groups: &[GroupMeta], col: &Column) -> Result<Vec<f64>, TidyError> {
use cjc_repro::kahan::KahanAccumulatorF64;
let cr = col_to_ref(col)?;
Ok(groups.iter().map(|g| {
if g.row_indices.is_empty() { return f64::NAN; }
let mut acc = KahanAccumulatorF64::new();
match cr {
ColRef::Float(d) => { for &i in &g.row_indices { acc.add(d[i]); } }
ColRef::Int(d) => { for &i in &g.row_indices { acc.add(d[i] as f64); } }
}
acc.finalize() / g.row_indices.len() as f64
}).collect())
}
fn fast_agg_min(groups: &[GroupMeta], col: &Column) -> Result<Vec<f64>, TidyError> {
let cr = col_to_ref(col)?;
Ok(groups.iter().map(|g| {
if g.row_indices.is_empty() { return f64::NAN; }
match cr {
ColRef::Float(d) => g.row_indices.iter().fold(f64::INFINITY, |a, &i| {
let b = d[i]; if b.is_nan() || b < a { b } else { a }
}),
ColRef::Int(d) => g.row_indices.iter().fold(f64::INFINITY, |a, &i| {
let b = d[i] as f64; if b.is_nan() || b < a { b } else { a }
}),
}
}).collect())
}
fn fast_agg_max(groups: &[GroupMeta], col: &Column) -> Result<Vec<f64>, TidyError> {
let cr = col_to_ref(col)?;
Ok(groups.iter().map(|g| {
if g.row_indices.is_empty() { return f64::NAN; }
match cr {
ColRef::Float(d) => g.row_indices.iter().fold(f64::NEG_INFINITY, |a, &i| {
let b = d[i]; if b.is_nan() || b > a { b } else { a }
}),
ColRef::Int(d) => g.row_indices.iter().fold(f64::NEG_INFINITY, |a, &i| {
let b = d[i] as f64; if b.is_nan() || b > a { b } else { a }
}),
}
}).collect())
}
fn fast_agg_first(groups: &[GroupMeta], col: &Column) -> Result<Vec<f64>, TidyError> {
let cr = col_to_ref(col)?;
let mut vals = Vec::with_capacity(groups.len());
for g in groups {
if g.row_indices.is_empty() { return Err(TidyError::EmptyGroup); }
match cr {
ColRef::Float(d) => vals.push(d[g.row_indices[0]]),
ColRef::Int(d) => vals.push(d[g.row_indices[0]] as f64),
}
}
Ok(vals)
}
fn fast_agg_last(groups: &[GroupMeta], col: &Column) -> Result<Vec<f64>, TidyError> {
let cr = col_to_ref(col)?;
let mut vals = Vec::with_capacity(groups.len());
for g in groups {
if g.row_indices.is_empty() { return Err(TidyError::EmptyGroup); }
let last = *g.row_indices.last().unwrap();
match cr {
ColRef::Float(d) => vals.push(d[last]),
ColRef::Int(d) => vals.push(d[last] as f64),
}
}
Ok(vals)
}
// -- O9: Arena-based aggregation for sort-dependent ops -----------------------
fn fast_agg_arena(
agg: &TidyAgg,
groups: &[GroupMeta],
col: &Column,
n_groups: usize,
) -> Result<Vec<f64>, TidyError> {
let cr = col_to_ref(col)?;
let max_size = groups.iter().map(|g| g.row_indices.len()).max().unwrap_or(0);
let mut arena: Vec<f64> = Vec::with_capacity(max_size);
let mut results = Vec::with_capacity(n_groups);
for group in groups {
arena.clear();
match cr {
ColRef::Float(d) => { for &i in &group.row_indices { arena.push(d[i]); } }
ColRef::Int(d) => { for &i in &group.row_indices { arena.push(d[i] as f64); } }
}
let val = agg_reduce_slice(agg, &mut arena)?;
results.push(val);
}
Ok(results)
}
/// Reduce a pre-gathered f64 slice for sort-dependent aggregators (O9).
/// Bit-identical to `agg_reduce`.
fn agg_reduce_slice(agg: &TidyAgg, values: &mut [f64]) -> Result<f64, TidyError> {
match agg {
TidyAgg::Var(_) => {
if values.len() < 2 {
Ok(f64::NAN)
} else {
let n = values.len() as f64;
let mean = kahan_sum_f64(values) / n;
let sq_diffs: Vec<f64> = values.iter().map(|v| (v - mean) * (v - mean)).collect();
Ok(kahan_sum_f64(&sq_diffs) / (n - 1.0))
}
}
TidyAgg::Sd(_) => {
if values.len() < 2 {
Ok(f64::NAN)
} else {
let n = values.len() as f64;
let mean = kahan_sum_f64(values) / n;
let sq_diffs: Vec<f64> = values.iter().map(|v| (v - mean) * (v - mean)).collect();
Ok((kahan_sum_f64(&sq_diffs) / (n - 1.0)).sqrt())
}
}
TidyAgg::Median(_) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let n = values.len();
if n % 2 == 1 { Ok(values[n / 2]) }
else { Ok((values[n / 2 - 1] + values[n / 2]) / 2.0) }
}
}
TidyAgg::Quantile(_, p) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let n = values.len();
if n == 1 { return Ok(values[0]); }
let pos = p * (n as f64 - 1.0);
let lo = pos.floor() as usize;
let hi = pos.ceil() as usize;
let frac = pos - lo as f64;
if lo == hi || hi >= n { Ok(values[lo.min(n - 1)]) }
else { Ok(values[lo] + frac * (values[hi] - values[lo])) }
}
}
TidyAgg::NDistinct(_) => {
let distinct: std::collections::BTreeSet<u64> = values.iter().map(|v| v.to_bits()).collect();
Ok(distinct.len() as f64)
}
TidyAgg::Iqr(_) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let n = values.len();
if n == 1 { return Ok(0.0); }
let q1 = {
let pos = 0.25 * (n as f64 - 1.0);
let lo = pos.floor() as usize;
let hi = pos.ceil() as usize;
let frac = pos - lo as f64;
if lo == hi || hi >= n { values[lo.min(n - 1)] }
else { values[lo] + frac * (values[hi] - values[lo]) }
};
let q3 = {
let pos = 0.75 * (n as f64 - 1.0);
let lo = pos.floor() as usize;
let hi = pos.ceil() as usize;
let frac = pos - lo as f64;
if lo == hi || hi >= n { values[lo.min(n - 1)] }
else { values[lo] + frac * (values[hi] - values[lo]) }
};
Ok(q3 - q1)
}
}
_ => unreachable!("agg_reduce_slice called for non-arena aggregator"),
}
}
/// Reduce one group's rows for a numeric aggregator. Returns f64.
#[allow(dead_code)]
fn agg_reduce(
agg: &TidyAgg,
col: &Column,
rows: &[usize],
) -> Result<f64, TidyError> {
// Extract f64 values for the group rows
let values: Vec<f64> = match col {
Column::Int(v) => rows.iter().map(|&r| v[r] as f64).collect(),
Column::Float(v) => rows.iter().map(|&r| v[r]).collect(),
_ => {
return Err(TidyError::TypeMismatch {
expected: "numeric (Int or Float)".into(),
got: col.type_name().into(),
})
}
};
match agg {
TidyAgg::Sum(_) => Ok(kahan_sum_f64(&values)),
TidyAgg::Mean(_) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
Ok(kahan_sum_f64(&values) / values.len() as f64)
}
}
TidyAgg::Min(_) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
Ok(values.iter().cloned().fold(f64::INFINITY, |a, b| {
if b.is_nan() || b < a { b } else { a }
}))
}
}
TidyAgg::Max(_) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
Ok(values.iter().cloned().fold(f64::NEG_INFINITY, |a, b| {
if b.is_nan() || b > a { b } else { a }
}))
}
}
TidyAgg::First(_) => {
if values.is_empty() {
Err(TidyError::EmptyGroup)
} else {
Ok(values[0])
}
}
TidyAgg::Last(_) => {
if values.is_empty() {
Err(TidyError::EmptyGroup)
} else {
Ok(*values.last().unwrap())
}
}
TidyAgg::Count => Ok(values.len() as f64),
TidyAgg::Median(_) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
let mut sorted = values.clone();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let n = sorted.len();
if n % 2 == 1 {
Ok(sorted[n / 2])
} else {
Ok((sorted[n / 2 - 1] + sorted[n / 2]) / 2.0)
}
}
}
TidyAgg::Var(_) => {
if values.len() < 2 {
Ok(f64::NAN)
} else {
let n = values.len() as f64;
let mean = kahan_sum_f64(&values) / n;
let sq_diffs: Vec<f64> = values.iter().map(|v| (v - mean) * (v - mean)).collect();
Ok(kahan_sum_f64(&sq_diffs) / (n - 1.0))
}
}
TidyAgg::Sd(_) => {
if values.len() < 2 {
Ok(f64::NAN)
} else {
let n = values.len() as f64;
let mean = kahan_sum_f64(&values) / n;
let sq_diffs: Vec<f64> = values.iter().map(|v| (v - mean) * (v - mean)).collect();
Ok((kahan_sum_f64(&sq_diffs) / (n - 1.0)).sqrt())
}
}
TidyAgg::Quantile(_, p) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
let mut sorted = values.clone();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let n = sorted.len();
if n == 1 {
return Ok(sorted[0]);
}
let pos = p * (n as f64 - 1.0);
let lo = pos.floor() as usize;
let hi = pos.ceil() as usize;
let frac = pos - lo as f64;
if lo == hi || hi >= n {
Ok(sorted[lo.min(n - 1)])
} else {
Ok(sorted[lo] + frac * (sorted[hi] - sorted[lo]))
}
}
}
TidyAgg::NDistinct(_) => {
use std::collections::BTreeSet;
let distinct: BTreeSet<u64> = values.iter().map(|v| v.to_bits()).collect();
Ok(distinct.len() as f64)
}
TidyAgg::Iqr(_) => {
if values.is_empty() {
Ok(f64::NAN)
} else {
let mut sorted = values.clone();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let n = sorted.len();
if n == 1 {
return Ok(0.0);
}
let q1 = {
let pos = 0.25 * (n as f64 - 1.0);
let lo = pos.floor() as usize;
let hi = pos.ceil() as usize;
let frac = pos - lo as f64;
if lo == hi || hi >= n { sorted[lo.min(n - 1)] }
else { sorted[lo] + frac * (sorted[hi] - sorted[lo]) }
};
let q3 = {
let pos = 0.75 * (n as f64 - 1.0);
let lo = pos.floor() as usize;
let hi = pos.ceil() as usize;
let frac = pos - lo as f64;
if lo == hi || hi >= n { sorted[lo.min(n - 1)] }
else { sorted[lo] + frac * (sorted[hi] - sorted[lo]) }
};
Ok(q3 - q1)
}
}
}
}
// â"€â"€ TidyAgg â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// An aggregator expression for use in `summarise`.
#[derive(Debug, Clone)]
pub enum TidyAgg {
/// Row count for the group. No column argument.
Count,
/// Kahan-sum of a numeric column.
Sum(String),
/// Arithmetic mean (NaN for empty groups).
Mean(String),
/// Minimum value (NaN for empty groups). NaN inputs sort last.
Min(String),
/// Maximum value (NaN for empty groups). NaN inputs sort last.
Max(String),
/// First row's value (error for empty groups).
First(String),
/// Last row's value (error for empty groups).
Last(String),
/// Median of a numeric column.
Median(String),
/// Sample standard deviation (Kahan-based).
Sd(String),
/// Sample variance (Kahan-based).
Var(String),
/// Quantile at probability p ∈ [0, 1], using linear interpolation.
Quantile(String, f64),
/// Count of distinct values (uses BTreeSet).
NDistinct(String),
/// Interquartile range (Q3 − Q1).
Iqr(String),
}
// â"€â"€ ArrangeKey â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// One sorting key for `arrange`.
#[derive(Debug, Clone)]
pub struct ArrangeKey {
/// Column name to sort by.
pub col_name: String,
/// `true` = descending order.
pub descending: bool,
}
impl ArrangeKey {
/// Create an ascending sort key for the given column.
pub fn asc(col_name: &str) -> Self {
ArrangeKey { col_name: col_name.to_string(), descending: false }
}
/// Create a descending sort key for the given column.
pub fn desc(col_name: &str) -> Self {
ArrangeKey { col_name: col_name.to_string(), descending: true }
}
}
// â"€â"€ TidyView: group_by, arrange, slice, distinct, joins â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl TidyView {
// â"€â"€ group_by â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Group the view by one or more column names.
///
/// Returns a `GroupedTidyView`. No column buffers are copied.
/// Group order = first-occurrence order of (key_col1, key_col2, ...) tuples
/// among the currently visible rows (ascending base-row scan).
///
/// Edge cases:
/// • 0 rows → 0 groups, no error.
/// • 0 keys → every visible row becomes one group (equivalent to a
/// global aggregate).
/// • Unknown key column → `TidyError::ColumnNotFound`.
pub fn group_by(&self, keys: &[&str]) -> Result<GroupedTidyView, TidyError> {
// Validate key columns exist in base
let mut key_col_indices = Vec::with_capacity(keys.len());
for &key in keys {
let idx = self
.base
.columns
.iter()
.position(|(n, _)| n == key)
.ok_or_else(|| TidyError::ColumnNotFound(key.to_string()))?;
key_col_indices.push(idx);
}
let key_names: Vec<String> = keys.iter().map(|s| s.to_string()).collect();
// O1 optimization: use BTree-accelerated build_fast for O(N log G) instead of O(N × G).
// v2.2: pass the selection iterator directly — avoids a Vec<usize> allocation that
// can run to millions of entries on large frames.
let index = GroupIndex::build_fast(&self.base, &key_col_indices, self.mask.iter_indices(), key_names);
Ok(GroupedTidyView {
view: self.clone(),
index,
})
}
// â"€â"€ arrange â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Sort visible rows by one or more `ArrangeKey`s.
///
/// Returns a new `TidyView` backed by the same base DataFrame but with
/// a new mask that encodes the sorted row order.
///
/// Design: arrange materialises a `RowIndexMap` (sorted permutation of
/// visible row indices), then re-encodes it into a new base DataFrame
/// containing only those rows in the sorted order. This allows all
/// subsequent mask-based operations to work correctly.
///
/// Semantics:
/// • Stable sort: equal-key rows keep their original relative order.
/// • NaN sorting: NaN values sort LAST (greater than any finite value).
/// • Multi-key: sort by key[0] first, then key[1], ... (left-to-right).
/// • Unknown column → `TidyError::ColumnNotFound`.
/// • Non-numeric sort of Float col: allowed (NaN last).
/// • Mixed-type sort across columns is column-by-column (each col has one type).
pub fn arrange(&self, keys: &[ArrangeKey]) -> Result<TidyView, TidyError> {
// Validate all sort key columns exist in base
for key in keys {
if self.base.get_column(&key.col_name).is_none() {
return Err(TidyError::ColumnNotFound(key.col_name.clone()));
}
}
// Collect visible row indices in current mask order
let mut row_indices: Vec<usize> = self.mask.iter_indices().collect();
// v3 Phase 5: cat-aware arrange. For each key column that is
// `Column::Categorical` with **lex-sorted levels** (the Phase 17
// `forcats` invariant), `levels[code].cmp(&levels[other_code])`
// is byte-equal to `code.cmp(&other_code)` — so we can sort by
// u32 codes directly, skipping the per-call string lookup and
// bytewise comparison. Mixed-type or unsorted-levels key cols
// fall back to the string comparator.
//
// Pre-resolve each key once to either ("sorted-cat code slice",
// descending) or ("legacy compare via column", descending).
enum ArrangeKeyResolved<'a> {
CatCodes { codes: &'a [u32], descending: bool },
Legacy { col: &'a Column, descending: bool },
}
fn levels_are_sorted(levels: &[String]) -> bool {
levels.windows(2).all(|w| w[0] <= w[1])
}
let resolved: Vec<ArrangeKeyResolved> = keys
.iter()
.map(|key| {
let col = self.base.get_column(&key.col_name).unwrap();
match col {
Column::Categorical { levels, codes } if levels_are_sorted(levels) => {
ArrangeKeyResolved::CatCodes {
codes: codes.as_slice(),
descending: key.descending,
}
}
_ => ArrangeKeyResolved::Legacy { col, descending: key.descending },
}
})
.collect();
// Stable sort by keys left-to-right.
row_indices.sort_by(|&a, &b| {
for resolved_key in &resolved {
let ord = match resolved_key {
ArrangeKeyResolved::CatCodes { codes, descending } => {
let raw = codes[a].cmp(&codes[b]);
if *descending { raw.reverse() } else { raw }
}
ArrangeKeyResolved::Legacy { col, descending } => {
let raw = compare_column_rows(col, a, b);
if *descending { raw.reverse() } else { raw }
}
};
if ord != std::cmp::Ordering::Equal {
return ord;
}
}
std::cmp::Ordering::Equal
});
// Re-materialise into a new DataFrame (sorted), wrap as a fresh TidyView
let mut new_columns = Vec::with_capacity(self.proj.len());
for &ci in self.proj.indices() {
let (name, col) = &self.base.columns[ci];
let new_col = gather_column(col, &row_indices);
new_columns.push((name.clone(), new_col));
}
// Also include any non-projected base columns needed for future ops
// Strategy: build new base from ALL original columns in sorted order
let mut sorted_all_cols = Vec::with_capacity(self.base.ncols());
for (name, col) in &self.base.columns {
sorted_all_cols.push((name.clone(), gather_column(col, &row_indices)));
}
let new_base = DataFrame::from_columns(sorted_all_cols)
.map_err(|e| TidyError::Internal(e.to_string()))?;
let nrows = new_base.nrows();
let new_proj = self.proj.clone();
Ok(TidyView {
base: Rc::new(new_base),
mask: AdaptiveSelection::all(nrows),
proj: new_proj,
})
}
// â"€â"€ slice â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Select rows by a half-open range `[start, end)` of visible-row positions.
///
/// Positions are relative to the current visible rows (0-based).
/// Out-of-bounds: clamped to `[0, nrows]`.
pub fn slice(&self, start: usize, end: usize) -> TidyView {
let visible: Vec<usize> = self.mask.iter_indices().collect();
let n = visible.len();
let s = start.min(n);
let e = end.min(n);
let selected = if s >= e { vec![] } else { visible[s..e].to_vec() };
self.view_from_row_indices(selected)
}
/// Select the first `n` visible rows (clamped to nrows).
pub fn slice_head(&self, n: usize) -> TidyView {
self.slice(0, n)
}
/// Select the last `n` visible rows (clamped to nrows).
pub fn slice_tail(&self, n: usize) -> TidyView {
let total = self.mask.count();
let start = total.saturating_sub(n);
self.slice(start, total)
}
/// Deterministic random sample of `n` visible rows using an LCG with `seed`.
///
/// If `n >= nrows`, returns all visible rows in their original order (no error).
/// Sampling uses a Knuth shuffle variant seeded by `seed` (deterministic LCG).
pub fn slice_sample(&self, n: usize, seed: u64) -> TidyView {
let mut visible: Vec<usize> = self.mask.iter_indices().collect();
let total = visible.len();
if n >= total {
return self.view_from_row_indices(visible);
}
// Partial Fisher-Yates using LCG: deterministic with fixed seed
let mut rng = seed;
let selected_count = n;
for i in 0..selected_count {
// LCG step: multiplier and increment from Knuth
rng = rng.wrapping_mul(6364136223846793005).wrapping_add(1442695040888963407);
let j = i + (rng as usize % (total - i));
visible.swap(i, j);
}
visible.truncate(selected_count);
// Sort selected indices to restore ascending order (stable/deterministic)
visible.sort_unstable();
self.view_from_row_indices(visible)
}
// â"€â"€ distinct â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Return rows with unique combinations of the specified columns.
///
/// Output ordering: first-occurrence order (the first row with each distinct
/// key combination is kept).
///
/// Edge cases:
/// • 0 key columns → keeps first row only (all rows equal on zero keys).
/// • Unknown column → `TidyError::ColumnNotFound`.
/// • After projection/mask: only visible columns/rows are considered.
pub fn distinct(&self, cols: &[&str]) -> Result<TidyView, TidyError> {
// Validate columns exist in base
let mut col_indices = Vec::with_capacity(cols.len());
for &name in cols {
let idx = self
.base
.columns
.iter()
.position(|(n, _)| n == name)
.ok_or_else(|| TidyError::ColumnNotFound(name.to_string()))?;
col_indices.push(idx);
}
// Phase 2 cat-aware fast path: when every dedup column is
// Column::Categorical, dedup on Vec<u32> of codes instead of
// Vec<String> of display values. Bit-identical output: codes
// are in 1:1 correspondence with display strings within a single
// DataFrame.
if let Some(cat_keys) = collect_categorical_keys(&self.base, &col_indices) {
let mut seen_codes: BTreeSet<Vec<u32>> = BTreeSet::new();
let mut selected_rows: Vec<usize> = Vec::new();
let mut key_buf: Vec<u32> = Vec::with_capacity(cat_keys.codes.len());
for row in self.mask.iter_indices() {
key_buf.clear();
for c in &cat_keys.codes {
key_buf.push(c[row]);
}
if seen_codes.insert(key_buf.clone()) {
selected_rows.push(row);
}
}
return Ok(self.view_from_row_indices(selected_rows));
}
// O8 optimization: BTreeSet gives O(N log D) instead of O(N × D) linear scan
let mut seen_keys: BTreeSet<Vec<String>> = BTreeSet::new();
let mut selected_rows: Vec<usize> = Vec::new();
for row in self.mask.iter_indices() {
let key: Vec<String> = if col_indices.is_empty() {
vec!["__all__".into()]
} else {
col_indices
.iter()
.map(|&ci| self.base.columns[ci].1.get_display(row))
.collect()
};
if seen_keys.insert(key) {
selected_rows.push(row);
}
}
Ok(self.view_from_row_indices(selected_rows))
}
// â"€â"€ joins â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Inner join: rows where all `on` key columns match.
///
/// Output: left columns then right columns (excluding duplicate key cols).
/// Row order: left outer loop (preserves left order), right inner ascending.
/// Produces a materialized `TidyFrame` (joins always materialize).
///
/// Edge cases:
/// • Unknown join key → `TidyError::ColumnNotFound`.
/// • `on` empty → cross join semantics (every left × every right).
/// • Duplicate keys on left or right → all matching pairs included.
pub fn inner_join(
&self,
right: &TidyView,
on: &[(&str, &str)],
) -> Result<TidyFrame, TidyError> {
let (left_rows, right_rows) = join_match_rows(self, right, on, JoinKind::Inner)?;
build_join_frame(self, right, &left_rows, &right_rows, on, false)
}
/// Left join: all left rows; matched right rows or nulls (0/0.0/""/false).
///
/// Row order: left outer loop order preserved, right matches ascending.
pub fn left_join(
&self,
right: &TidyView,
on: &[(&str, &str)],
) -> Result<TidyFrame, TidyError> {
let (left_rows, right_rows_opt) =
join_match_rows_optional(self, right, on, JoinKind::Left)?;
build_left_join_frame(self, right, &left_rows, &right_rows_opt, on)
}
/// Semi-join: rows in `self` that have at least one match in `right`.
///
/// Returns a `TidyView` (no right columns). Row order: stable left order.
pub fn semi_join(
&self,
right: &TidyView,
on: &[(&str, &str)],
) -> Result<TidyView, TidyError> {
let included = semi_anti_match_rows(self, right, on, /*semi=*/ true)?;
Ok(self.view_from_row_indices(included))
}
/// Anti-join: rows in `self` that have NO match in `right`.
///
/// Returns a `TidyView` (no right columns). Row order: stable left order.
pub fn anti_join(
&self,
right: &TidyView,
on: &[(&str, &str)],
) -> Result<TidyView, TidyError> {
let included = semi_anti_match_rows(self, right, on, /*semi=*/ false)?;
Ok(self.view_from_row_indices(included))
}
// â"€â"€ internal helpers â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Build a new `TidyView` over the same base using explicit row indices.
/// The row indices must be valid base-frame indices.
fn view_from_row_indices(&self, row_indices: Vec<usize>) -> TidyView {
let nrows_base = self.base.nrows();
let mut words = vec![0u64; nwords_for(nrows_base)];
for &r in &row_indices {
words[r / 64] |= 1u64 << (r % 64);
}
TidyView {
base: Rc::clone(&self.base),
mask: AdaptiveSelection::from_predicate_result(words, nrows_base),
proj: self.proj.clone(),
}
}
}
// â"€â"€ Join internals â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
#[derive(Clone, Copy)]
enum JoinKind { Inner, Left }
/// Resolve join key columns from both sides. Returns (left_indices, right_indices).
fn resolve_join_keys(
left: &TidyView,
right: &TidyView,
on: &[(&str, &str)],
) -> Result<(Vec<usize>, Vec<usize>), TidyError> {
let mut li = Vec::new();
let mut ri = Vec::new();
for &(lk, rk) in on {
let l = left.base.columns.iter().position(|(n, _)| n == lk)
.ok_or_else(|| TidyError::ColumnNotFound(lk.to_string()))?;
let r = right.base.columns.iter().position(|(n, _)| n == rk)
.ok_or_else(|| TidyError::ColumnNotFound(rk.to_string()))?;
li.push(l);
ri.push(r);
}
Ok((li, ri))
}
/// Get a join key tuple for a row (as Vec<String> for deterministic comparison).
fn row_key(base: &DataFrame, col_indices: &[usize], row: usize) -> Vec<String> {
col_indices
.iter()
.map(|&ci| base.columns[ci].1.get_display(row))
.collect()
}
/// Build a deterministic right-side lookup: sorted Vec of (key_tuple, right_row_idx).
/// Sorted by key tuple first, then by row index â€" guarantees determinism.
fn build_right_lookup(
right: &TidyView,
right_key_cols: &[usize],
) -> Vec<(Vec<String>, usize)> {
let mut lookup: Vec<(Vec<String>, usize)> = right
.mask
.iter_indices()
.map(|r| (row_key(&right.base, right_key_cols, r), r))
.collect();
// Sort by key then by row index for deterministic ordering
lookup.sort_by(|a, b| a.0.cmp(&b.0).then(a.1.cmp(&b.1)));
lookup
}
/// Find matching right rows for a given left key (from sorted lookup).
fn find_matches(lookup: &[(Vec<String>, usize)], key: &[String]) -> Vec<usize> {
// Binary search for first match, then collect contiguous matches
let key_vec = key.to_vec();
let start = lookup.partition_point(|(k, _)| k.as_slice() < key_vec.as_slice());
let mut matches = Vec::new();
for (k, r) in &lookup[start..] {
if k == &key_vec {
matches.push(*r);
} else {
break;
}
}
matches
}
/// O6: BTreeMap-accelerated right-side lookup for joins.
/// Groups right rows by their key tuple, enabling O(log K) lookup per left row
/// (where K = unique keys) instead of O(log N) binary search on a flat sorted list.
/// Right rows within each key are in ascending order (iter_set() guarantee).
fn build_right_lookup_btree(
right: &TidyView,
right_key_cols: &[usize],
) -> BTreeMap<Vec<String>, Vec<usize>> {
let mut lookup: BTreeMap<Vec<String>, Vec<usize>> = BTreeMap::new();
for r in right.mask.iter_indices() {
let key = row_key(&right.base, right_key_cols, r);
lookup.entry(key).or_default().push(r);
}
lookup
}
// ── v3 Phase 4: cat-aware join key path ─────────────────────────────────
//
// When every join-key column is `Column::Categorical` on BOTH sides, we
// can probe on `Vec<u32>` codes instead of `Vec<String>` displays. Each
// DataFrame owns its own dictionary, so left-side code 3 ≠ right-side
// code 3 in general — we build a per-key-column remap
// `right_to_left[ki][right_code] -> Option<u32>`
// that translates every right code to the matching left code (or `None`
// if the level doesn't exist on the left, in which case that right row
// can never join).
//
// Bit-identity: codes ↔ levels are 1:1 within one DataFrame, so the
// BTreeMap slot assignment, output row order, and join multiplicity are
// byte-equal to the string-key path. Pinned by parity tests in
// `tests/tidy_tests/test_v3_phase4_categorical_joins.rs`.
/// Borrowed cat-aware join key metadata. Built only when every key column
/// on BOTH frames is `Column::Categorical`.
pub(crate) struct CategoricalJoinKeys<'a> {
/// `left_codes[ki]` = code array for left-side key column ki.
pub(crate) left_codes: Vec<&'a [u32]>,
/// `right_codes[ki]` = code array for right-side key column ki.
pub(crate) right_codes: Vec<&'a [u32]>,
/// `right_to_left[ki][right_code] = Some(left_code)` if the level
/// exists on the left, `None` otherwise (row cannot join).
pub(crate) right_to_left: Vec<Vec<Option<u32>>>,
}
/// Returns `Some(CategoricalJoinKeys)` when every column index in both
/// `left_cols` and `right_cols` is `Column::Categorical`. Mixed-type
/// keys, length mismatches, or empty key lists → `None` (caller falls
/// back to the string path).
pub(crate) fn collect_categorical_join_keys<'a>(
left_base: &'a DataFrame,
left_cols: &[usize],
right_base: &'a DataFrame,
right_cols: &[usize],
) -> Option<CategoricalJoinKeys<'a>> {
if left_cols.is_empty() || left_cols.len() != right_cols.len() {
return None;
}
let mut left_codes = Vec::with_capacity(left_cols.len());
let mut right_codes = Vec::with_capacity(left_cols.len());
let mut right_to_left = Vec::with_capacity(left_cols.len());
for (li, ri) in left_cols.iter().zip(right_cols.iter()) {
match (&left_base.columns[*li].1, &right_base.columns[*ri].1) {
(
Column::Categorical { levels: ll, codes: lc },
Column::Categorical { levels: rl, codes: rc },
) => {
// Build deterministic left-level→left-code lookup.
// BTreeMap not HashMap to keep build order stable across
// runs (matches the determinism contract).
let mut left_lookup: BTreeMap<&str, u32> = BTreeMap::new();
for (i, lv) in ll.iter().enumerate() {
left_lookup.insert(lv.as_str(), i as u32);
}
// remap[right_code] = Some(left_code) | None
let remap: Vec<Option<u32>> = rl
.iter()
.map(|rv| left_lookup.get(rv.as_str()).copied())
.collect();
left_codes.push(lc.as_slice());
right_codes.push(rc.as_slice());
right_to_left.push(remap);
}
_ => return None,
}
}
Some(CategoricalJoinKeys {
left_codes,
right_codes,
right_to_left,
})
}
/// Cat-aware right-side BTreeMap lookup. Keyed on `Vec<u32>` left-code
/// tuples (right-side codes are remapped to left-side codes via
/// `right_to_left`). Right rows whose remap returns `None` for any key
/// column are skipped — they cannot join.
fn build_right_lookup_btree_categorical<'a>(
cat: &CategoricalJoinKeys<'a>,
right_visible: impl Iterator<Item = usize>,
) -> BTreeMap<Vec<u32>, Vec<usize>> {
let nkeys = cat.right_codes.len();
let mut lookup: BTreeMap<Vec<u32>, Vec<usize>> = BTreeMap::new();
let mut key_buf: Vec<u32> = Vec::with_capacity(nkeys);
for r in right_visible {
key_buf.clear();
let mut all_mappable = true;
for i in 0..nkeys {
let rc = cat.right_codes[i][r];
match cat.right_to_left[i][rc as usize] {
Some(lc) => key_buf.push(lc),
None => {
all_mappable = false;
break;
}
}
}
if all_mappable {
lookup.entry(key_buf.clone()).or_default().push(r);
}
}
lookup
}
/// Build the left-side join key (in left-code space) for a row.
#[inline]
fn left_join_key_codes(cat: &CategoricalJoinKeys<'_>, row: usize, buf: &mut Vec<u32>) {
buf.clear();
for codes in &cat.left_codes {
buf.push(codes[row]);
}
}
/// Inner join: collect (left_row, right_row) pairs.
fn join_match_rows(
left: &TidyView,
right: &TidyView,
on: &[(&str, &str)],
_kind: JoinKind,
) -> Result<(Vec<usize>, Vec<usize>), TidyError> {
let (left_key_cols, right_key_cols) = resolve_join_keys(left, right, on)?;
// v3 Phase 4: cat-aware fast path when every key column is
// Column::Categorical on both frames. Bit-identical output to the
// string path; falls back automatically on mixed-type keys.
if let Some(cat) =
collect_categorical_join_keys(&left.base, &left_key_cols, &right.base, &right_key_cols)
{
let lookup = build_right_lookup_btree_categorical(&cat, right.mask.iter_indices());
let mut out_left = Vec::new();
let mut out_right = Vec::new();
let mut key_buf: Vec<u32> = Vec::with_capacity(cat.left_codes.len());
for l_row in left.mask.iter_indices() {
left_join_key_codes(&cat, l_row, &mut key_buf);
if let Some(matches) = lookup.get(&key_buf) {
for &r_row in matches {
out_left.push(l_row);
out_right.push(r_row);
}
}
}
return Ok((out_left, out_right));
}
// O6: use BTreeMap for O(log K) lookup instead of sorted-Vec binary search
let lookup = build_right_lookup_btree(right, &right_key_cols);
let mut out_left = Vec::new();
let mut out_right = Vec::new();
for l_row in left.mask.iter_indices() {
let key = row_key(&left.base, &left_key_cols, l_row);
if let Some(matches) = lookup.get(&key) {
for &r_row in matches {
out_left.push(l_row);
out_right.push(r_row);
}
}
}
Ok((out_left, out_right))
}
/// Left join: collect (left_row, Option<right_row>) pairs.
fn join_match_rows_optional(
left: &TidyView,
right: &TidyView,
on: &[(&str, &str)],
_kind: JoinKind,
) -> Result<(Vec<usize>, Vec<Option<usize>>), TidyError> {
let (left_key_cols, right_key_cols) = resolve_join_keys(left, right, on)?;
// v3 Phase 4: cat-aware fast path. See `join_match_rows`.
if let Some(cat) =
collect_categorical_join_keys(&left.base, &left_key_cols, &right.base, &right_key_cols)
{
let lookup = build_right_lookup_btree_categorical(&cat, right.mask.iter_indices());
let mut out_left = Vec::new();
let mut out_right: Vec<Option<usize>> = Vec::new();
let mut key_buf: Vec<u32> = Vec::with_capacity(cat.left_codes.len());
for l_row in left.mask.iter_indices() {
left_join_key_codes(&cat, l_row, &mut key_buf);
match lookup.get(&key_buf) {
Some(matches) if !matches.is_empty() => {
for &r_row in matches {
out_left.push(l_row);
out_right.push(Some(r_row));
}
}
_ => {
out_left.push(l_row);
out_right.push(None);
}
}
}
return Ok((out_left, out_right));
}
// O6: use BTreeMap for O(log K) lookup
let lookup = build_right_lookup_btree(right, &right_key_cols);
let mut out_left = Vec::new();
let mut out_right: Vec<Option<usize>> = Vec::new();
for l_row in left.mask.iter_indices() {
let key = row_key(&left.base, &left_key_cols, l_row);
match lookup.get(&key) {
Some(matches) if !matches.is_empty() => {
for &r_row in matches {
out_left.push(l_row);
out_right.push(Some(r_row));
}
}
_ => {
out_left.push(l_row);
out_right.push(None);
}
}
}
Ok((out_left, out_right))
}
/// Semi/anti join: return left row indices (no right columns).
fn semi_anti_match_rows(
left: &TidyView,
right: &TidyView,
on: &[(&str, &str)],
semi: bool,
) -> Result<Vec<usize>, TidyError> {
let (left_key_cols, right_key_cols) = resolve_join_keys(left, right, on)?;
// v3 Phase 4: cat-aware fast path. See `join_match_rows`.
if let Some(cat) =
collect_categorical_join_keys(&left.base, &left_key_cols, &right.base, &right_key_cols)
{
let lookup = build_right_lookup_btree_categorical(&cat, right.mask.iter_indices());
let mut out = Vec::new();
let mut key_buf: Vec<u32> = Vec::with_capacity(cat.left_codes.len());
for l_row in left.mask.iter_indices() {
left_join_key_codes(&cat, l_row, &mut key_buf);
let has_match = lookup.contains_key(&key_buf);
if has_match == semi {
out.push(l_row);
}
}
return Ok(out);
}
// O6: use BTreeMap for O(log K) lookup
let lookup = build_right_lookup_btree(right, &right_key_cols);
let mut out = Vec::new();
for l_row in left.mask.iter_indices() {
let key = row_key(&left.base, &left_key_cols, l_row);
let has_match = lookup.contains_key(&key);
if has_match == semi {
out.push(l_row);
}
}
Ok(out)
}
/// Build an inner-join result `TidyFrame`.
/// Output cols: all left projected cols, then right projected cols (excluding join-key cols).
fn build_join_frame(
left: &TidyView,
right: &TidyView,
left_rows: &[usize],
right_rows: &[usize],
on: &[(&str, &str)],
_include_unmatched: bool,
) -> Result<TidyFrame, TidyError> {
let right_key_names: std::collections::BTreeSet<&str> =
on.iter().map(|(_, rk)| *rk).collect();
let n = left_rows.len();
let mut columns: Vec<(String, Column)> = Vec::new();
// Left projected columns
for &ci in left.proj.indices() {
let (name, col) = &left.base.columns[ci];
columns.push((name.clone(), gather_column(col, left_rows)));
}
// Right projected columns (skip join keys to avoid duplication)
for &ci in right.proj.indices() {
let (name, col) = &right.base.columns[ci];
if right_key_names.contains(name.as_str()) {
continue;
}
columns.push((name.clone(), gather_column(col, right_rows)));
}
assert_eq!(n, left_rows.len());
let df = DataFrame::from_columns(columns)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
/// Build a left-join result `TidyFrame` (right side may have None = unmatched).
fn build_left_join_frame(
left: &TidyView,
right: &TidyView,
left_rows: &[usize],
right_rows_opt: &[Option<usize>],
on: &[(&str, &str)],
) -> Result<TidyFrame, TidyError> {
let right_key_names: std::collections::BTreeSet<&str> =
on.iter().map(|(_, rk)| *rk).collect();
let mut columns: Vec<(String, Column)> = Vec::new();
// Left projected columns
for &ci in left.proj.indices() {
let (name, col) = &left.base.columns[ci];
columns.push((name.clone(), gather_column(col, left_rows)));
}
// Right projected columns (None rows get null-equivalents)
for &ci in right.proj.indices() {
let (name, col) = &right.base.columns[ci];
if right_key_names.contains(name.as_str()) {
continue;
}
let new_col = gather_column_nullable(col, right_rows_opt);
columns.push((name.clone(), new_col));
}
let df = DataFrame::from_columns(columns)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
// â"€â"€ Column comparison for arrange â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Compare two rows of a `Column` for use in `arrange`.
///
/// NaN rules (for Float): NaN sorts LAST (treated as greater than any finite).
/// Tie-breaking: returns Equal (caller's stable sort handles relative order).
fn compare_column_rows(col: &Column, a: usize, b: usize) -> std::cmp::Ordering {
match col {
Column::Int(v) => v[a].cmp(&v[b]),
Column::Float(v) => {
let va = v[a];
let vb = v[b];
match (va.is_nan(), vb.is_nan()) {
(true, true) => std::cmp::Ordering::Equal,
(true, false) => std::cmp::Ordering::Greater, // NaN last
(false, true) => std::cmp::Ordering::Less,
(false, false) => va.partial_cmp(&vb).unwrap_or(std::cmp::Ordering::Equal),
}
}
Column::Bool(v) => v[a].cmp(&v[b]),
Column::Str(v) => v[a].cmp(&v[b]),
Column::Categorical { levels, codes } => {
// Compare by the level string, not the code
levels[codes[a] as usize].cmp(&levels[codes[b] as usize])
}
Column::CategoricalAdaptive(cc) => {
// Byte-lex comparison via dictionary lookup. Bytes are the
// determinism contract anchor for the byte_dict engine.
cc.get(a).cmp(&cc.get(b))
}
Column::DateTime(v) => v[a].cmp(&v[b]),
}
}
// (TidyError::EmptyGroup is defined in the TidyError enum above.)
// â"€â"€ NoGC safe-builtin registrations (Phase 11â€"12) â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
//
// Ops that only update metadata (no column buffer alloc):
// tidy_group_by : builds GroupIndex (Vec of Vec<usize>) â€" no column alloc
// tidy_ungroup : drops GroupIndex â€" no alloc
// tidy_arrange : materialises sorted base (ALLOCATES) → NOT @nogc safe
// tidy_slice : updates RowIndexMap â€" O(N) usize alloc, safe
// tidy_distinct : builds RowIndexMap â€" O(N) usize alloc, safe
// tidy_semi_join : builds RowIndexMap â€" O(N) usize alloc, safe
// tidy_anti_join : builds RowIndexMap â€" O(N) usize alloc, safe
// tidy_inner_join : materialises result â€" ALLOCATES → NOT @nogc safe
// tidy_left_join : materialises result â€" ALLOCATES → NOT @nogc safe
// tidy_summarise : materialises result â€" ALLOCATES → NOT @nogc safe
//
// Safe (registered in nogc_verify.rs):
// tidy_group_by, tidy_ungroup, tidy_slice, tidy_distinct,
// tidy_semi_join, tidy_anti_join, tidy_ngroups
#[cfg(test)]
mod phase10_unit_tests {
use super::*;
fn make_df() -> DataFrame {
DataFrame::from_columns(vec![
("x".into(), Column::Int(vec![1, 2, 3, 4, 5])),
("y".into(), Column::Float(vec![1.0, 2.0, 3.0, 4.0, 5.0])),
("flag".into(), Column::Bool(vec![true, false, true, false, true])),
])
.unwrap()
}
#[test]
fn bitmask_all_true_count() {
let m = BitMask::all_true(5);
assert_eq!(m.count_ones(), 5);
}
#[test]
fn bitmask_all_false_count() {
let m = BitMask::all_false(5);
assert_eq!(m.count_ones(), 0);
}
#[test]
fn bitmask_tail_bits_clean() {
// 65 rows â€" two words; tail must not bleed into unset bits
let m = BitMask::all_true(65);
assert_eq!(m.count_ones(), 65);
assert_eq!(m.words.len(), 2);
assert_eq!(m.words[1], 1u64); // only bit 0 of second word set
}
#[test]
fn bitmask_and_semantics() {
let a = BitMask::from_bools(&[true, true, false, true]);
let b = BitMask::from_bools(&[true, false, true, true]);
let c = a.and(&b);
assert!(c.get(0));
assert!(!c.get(1));
assert!(!c.get(2));
assert!(c.get(3));
}
#[test]
fn tidy_view_nrows_ncols() {
let df = make_df();
let v = df.tidy();
assert_eq!(v.nrows(), 5);
assert_eq!(v.ncols(), 3);
}
#[test]
fn filter_basic() {
let df = make_df();
let v = df.tidy();
let filtered = v
.filter(&DExpr::BinOp {
op: DBinOp::Gt,
left: Box::new(DExpr::Col("x".into())),
right: Box::new(DExpr::LitInt(2)),
})
.unwrap();
assert_eq!(filtered.nrows(), 3);
}
#[test]
fn filter_empty_df() {
let df = DataFrame::from_columns(vec![
("x".into(), Column::Int(vec![])),
])
.unwrap();
let v = df.tidy();
let filtered = v
.filter(&DExpr::BinOp {
op: DBinOp::Gt,
left: Box::new(DExpr::Col("x".into())),
right: Box::new(DExpr::LitInt(0)),
})
.unwrap();
assert_eq!(filtered.nrows(), 0);
}
#[test]
fn select_reorder() {
let df = make_df();
let v = df.tidy();
let s = v.select(&["y", "x"]).unwrap();
assert_eq!(s.column_names(), vec!["y", "x"]);
}
#[test]
fn select_zero_cols() {
let df = make_df();
let v = df.tidy();
let s = v.select(&[]).unwrap();
assert_eq!(s.ncols(), 0);
assert_eq!(s.nrows(), 5);
}
#[test]
fn select_unknown_col() {
let df = make_df();
let v = df.tidy();
let err = v.select(&["nonexistent"]).unwrap_err();
assert!(matches!(err, TidyError::ColumnNotFound(_)));
}
#[test]
fn select_duplicate_col() {
let df = make_df();
let v = df.tidy();
let err = v.select(&["x", "x"]).unwrap_err();
assert!(matches!(err, TidyError::DuplicateColumn(_)));
}
#[test]
fn mutate_new_col() {
let df = make_df();
let v = df.tidy();
let frame = v
.mutate(&[("z", DExpr::BinOp {
op: DBinOp::Add,
left: Box::new(DExpr::Col("x".into())),
right: Box::new(DExpr::LitInt(10)),
})])
.unwrap();
let b = frame.borrow();
let z = b.get_column("z").unwrap();
assert_eq!(z.len(), 5);
if let Column::Int(v) = z {
assert_eq!(v[0], 11);
assert_eq!(v[4], 15);
} else {
panic!("expected Int column");
}
}
#[test]
fn mutate_type_promotion() {
let df = make_df();
let v = df.tidy();
// x (Int) + y (Float) → Float column
let frame = v
.mutate(&[("promoted", DExpr::BinOp {
op: DBinOp::Add,
left: Box::new(DExpr::Col("x".into())),
right: Box::new(DExpr::Col("y".into())),
})])
.unwrap();
let b = frame.borrow();
let col = b.get_column("promoted").unwrap();
assert!(matches!(col, Column::Float(_)));
}
}
// â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•
// Phase 13â€"16: Tidy Completion
// â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•â•
//
// Spec-Lock Table (invariants that tests must not regress):
//
// | Property | Rule |
// |-------------------------------|------|
// | pivot_longer row order | Original row order preserved; within each
// | | original row, columns appear in the order
// | | supplied in `value_cols`. |
// | pivot_longer col order | id_cols first, then "name" col, then
// | | "value" col. |
// | pivot_wider col order | id_cols first, then each unique key value
// | | in first-occurrence order from the source
// | | column. |
// | pivot_wider duplicate keys | Strict error: TidyError::DuplicateKey. |
// | pivot_wider null key | Null key string treated as literal "null". |
// | pivot_longer zero value_cols | TidyError::EmptySelection. |
// | pivot_longer mixed types | Strict: all value_cols must have same type;
// | | TidyError::TypeMismatch otherwise. |
// | NullCol semantics | is_null(x) is always well-defined. |
// | NullCol in group_by | Null key forms its own group (treated as
// | | equal to other nulls). |
// | NullCol in join | Null key does NOT match null key (SQL NULL
// | | semantics). |
// | NullCol in aggregation | Nulls skipped; all-null → null result. |
// | NullCol comparison | NULL op x → null result (three-valued). |
// | rename collision | TidyError::DuplicateColumn. |
// | rename unknown col | TidyError::ColumnNotFound. |
// | relocate ordering | Stable; relative order of non-moved cols
// | | preserved. |
// | select(-col) drop | TidyError::ColumnNotFound for unknown. |
// | bind_rows schema mismatch | Strict: TidyError::SchemaMismatch. |
// | bind_rows col order | Left frame column order. |
// | bind_rows row order | Left rows then right rows. |
// | bind_cols row mismatch | TidyError::LengthMismatch. |
// | bind_cols name collision | TidyError::DuplicateColumn. |
// | across expansion order | Stable column iteration (projection order).
// | across generated names | "{col}_{fn_name}" or user template. |
// | across name collision | TidyError::DuplicateColumn. |
// | join type validation | Comparable types only (Intâ†"Int, Floatâ†"Float,
// | | Strâ†"Str, Boolâ†"Bool, Intâ†"Float widened).
// | | TidyError::TypeMismatch otherwise. |
// | join suffix handling | Default ".x"/".y"; user may override. |
// | right_join / full_join | Defined; row order: see semantics section. |
// | group perf upgrade | First-occurrence order preserved; identical
// | | output to Phase 11 implementation. |
// â"€â"€ New TidyError variants â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl TidyError {
/// Construct a schema mismatch error.
pub fn schema_mismatch(msg: impl Into<String>) -> Self {
TidyError::Internal(format!("schema mismatch: {}", msg.into()))
}
/// Construct a type mismatch for join validation.
pub fn join_type_mismatch(col: &str, lt: &str, rt: &str) -> Self {
TidyError::TypeMismatch {
expected: format!("{} (from left key `{}`)", lt, col),
got: rt.to_string(),
}
}
/// Duplicate join/pivot key error.
pub fn duplicate_key(key: impl Into<String>) -> Self {
TidyError::DuplicateColumn(format!("duplicate key: {}", key.into()))
}
/// Empty selection (e.g. pivot_longer with zero value_cols).
pub fn empty_selection(msg: impl Into<String>) -> Self {
TidyError::Internal(format!("empty selection: {}", msg.into()))
}
}
// â"€â"€ Nullable column layer â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A nullable column: values buffer + validity bitmap.
///
/// The validity bitmap uses the same bit layout as `BitMask` (LSB-first,
/// tail bits zeroed). `validity.get(i) == true` means `values[i]` is valid
/// (not null). When `validity.get(i) == false`, `values[i]` holds a
/// type-appropriate zero/empty value but MUST NOT be read as valid data.
#[derive(Debug, Clone)]
pub struct NullableColumn<T: Clone> {
pub values: Vec<T>,
pub validity: BitMask,
}
impl<T: Clone + Default> NullableColumn<T> {
/// Create a fully valid (non-null) nullable column from a slice.
pub fn from_values(values: Vec<T>) -> Self {
let n = values.len();
Self {
values,
validity: BitMask::all_true(n),
}
}
/// Create a nullable column with explicit validity.
/// Panics if `values.len() != validity.nrows()`.
pub fn new(values: Vec<T>, validity: BitMask) -> Self {
assert_eq!(values.len(), validity.nrows(), "NullableColumn: length mismatch");
Self { values, validity }
}
/// Number of rows.
pub fn len(&self) -> usize {
self.values.len()
}
/// Is the value at row `i` null?
pub fn is_null(&self, i: usize) -> bool {
!self.validity.get(i)
}
/// Get the value at `i` if non-null.
pub fn get(&self, i: usize) -> Option<&T> {
if self.validity.get(i) { Some(&self.values[i]) } else { None }
}
/// Count non-null rows.
pub fn count_valid(&self) -> usize {
self.validity.count_ones()
}
/// Gather rows by index (for join/arrange materialisation).
pub fn gather(&self, indices: &[usize]) -> Self {
let mut vals = Vec::with_capacity(indices.len());
let mut bools = Vec::with_capacity(indices.len());
for &i in indices {
vals.push(self.values[i].clone());
bools.push(self.validity.get(i));
}
let validity = BitMask::from_bools(&bools);
Self { values: vals, validity }
}
}
/// A typed nullable column variant stored in a DataFrame column slot.
///
/// Phase 13-16 does not replace `Column` (which has no nulls) with `NullCol`
/// everywhere â€" that would be a breaking change across the whole codebase.
/// Instead, `NullCol` is used as the result type of operations that can
/// introduce nulls (left_join fills, pivot_wider missing combinations,
/// bind_rows on mismatched schemas, full_join unmatched rows).
///
/// Conversion: `NullCol::from_column(col)` wraps an existing `Column` as
/// fully-valid nullable. `NullCol::to_column(nc)` unwraps if fully valid,
/// else returns `TidyError::Internal` (caller must handle null columns
/// explicitly).
#[derive(Debug, Clone)]
pub enum NullCol {
/// Nullable 64-bit signed integer column.
Int(NullableColumn<i64>),
/// Nullable 64-bit floating-point column.
Float(NullableColumn<f64>),
/// Nullable UTF-8 string column.
Str(NullableColumn<String>),
/// Nullable boolean column.
Bool(NullableColumn<bool>),
}
impl NullCol {
/// Returns the number of rows (including nulls).
pub fn len(&self) -> usize {
match self {
NullCol::Int(c) => c.len(),
NullCol::Float(c) => c.len(),
NullCol::Str(c) => c.len(),
NullCol::Bool(c) => c.len(),
}
}
/// Returns `true` if row `i` is null.
pub fn is_null(&self, i: usize) -> bool {
match self {
NullCol::Int(c) => c.is_null(i),
NullCol::Float(c) => c.is_null(i),
NullCol::Str(c) => c.is_null(i),
NullCol::Bool(c) => c.is_null(i),
}
}
/// Returns the human-readable type name of this nullable column variant.
pub fn type_name(&self) -> &'static str {
match self {
NullCol::Int(_) => "Int",
NullCol::Float(_) => "Float",
NullCol::Str(_) => "Str",
NullCol::Bool(_) => "Bool",
}
}
/// Wrap a non-nullable `Column` as fully valid.
pub fn from_column(col: &Column) -> Self {
match col {
Column::Int(v) => NullCol::Int(NullableColumn::from_values(v.clone())),
Column::Float(v) => NullCol::Float(NullableColumn::from_values(v.clone())),
Column::Str(v) => NullCol::Str(NullableColumn::from_values(v.clone())),
Column::Bool(v) => NullCol::Bool(NullableColumn::from_values(v.clone())),
// Categorical is stored as its string representation for nullable contexts
Column::Categorical { levels, codes } => {
let strings: Vec<String> = codes.iter().map(|&c| levels[c as usize].clone()).collect();
NullCol::Str(NullableColumn::from_values(strings))
}
Column::CategoricalAdaptive(cc) => {
let n = cc.len();
let strings: Vec<String> = (0..n)
.map(|i| match cc.get(i) {
None => String::new(),
Some(b) => String::from_utf8_lossy(b).into_owned(),
})
.collect();
NullCol::Str(NullableColumn::from_values(strings))
}
Column::DateTime(v) => NullCol::Int(NullableColumn::from_values(v.clone())),
}
}
/// Unwrap to non-nullable `Column` only if all rows are valid (not null).
/// If any null exists, returns `Err(TidyError::Internal)`.
pub fn to_column_strict(&self) -> Result<Column, TidyError> {
match self {
NullCol::Int(nc) => {
if nc.count_valid() == nc.len() {
Ok(Column::Int(nc.values.clone()))
} else {
Err(TidyError::Internal("null values in non-nullable context".into()))
}
}
NullCol::Float(nc) => {
if nc.count_valid() == nc.len() {
Ok(Column::Float(nc.values.clone()))
} else {
Err(TidyError::Internal("null values in non-nullable context".into()))
}
}
NullCol::Str(nc) => {
if nc.count_valid() == nc.len() {
Ok(Column::Str(nc.values.clone()))
} else {
Err(TidyError::Internal("null values in non-nullable context".into()))
}
}
NullCol::Bool(nc) => {
if nc.count_valid() == nc.len() {
Ok(Column::Bool(nc.values.clone()))
} else {
Err(TidyError::Internal("null values in non-nullable context".into()))
}
}
}
}
/// Convert to `Column`, filling nulls with type-appropriate zero-value.
/// Null Int → 0, Null Float → NaN, Null Str → "", Null Bool → false.
pub fn to_column_filled(&self) -> Column {
match self {
NullCol::Int(nc) => Column::Int(nc.values.clone()),
NullCol::Float(nc) => {
let v: Vec<f64> = (0..nc.len())
.map(|i| if nc.is_null(i) { f64::NAN } else { nc.values[i] })
.collect();
Column::Float(v)
}
NullCol::Str(nc) => Column::Str(nc.values.clone()),
NullCol::Bool(nc) => Column::Bool(nc.values.clone()),
}
}
/// Get display string for a row (null → "null").
pub fn get_display(&self, i: usize) -> String {
if self.is_null(i) {
return "null".to_string();
}
match self {
NullCol::Int(nc) => format!("{}", nc.values[i]),
NullCol::Float(nc) => format!("{}", nc.values[i]),
NullCol::Str(nc) => nc.values[i].clone(),
NullCol::Bool(nc) => format!("{}", nc.values[i]),
}
}
/// Create a null-fill column of given type and length.
pub fn null_of_type(type_name: &str, len: usize) -> Self {
match type_name {
"Int" => NullCol::Int(NullableColumn {
values: vec![0i64; len],
validity: BitMask::all_false(len),
}),
"Float" => NullCol::Float(NullableColumn {
values: vec![0.0f64; len],
validity: BitMask::all_false(len),
}),
"Bool" => NullCol::Bool(NullableColumn {
values: vec![false; len],
validity: BitMask::all_false(len),
}),
_ => NullCol::Str(NullableColumn {
values: vec![String::new(); len],
validity: BitMask::all_false(len),
}),
}
}
/// Gather rows by index.
pub fn gather(&self, indices: &[usize]) -> Self {
match self {
NullCol::Int(nc) => NullCol::Int(nc.gather(indices)),
NullCol::Float(nc) => NullCol::Float(nc.gather(indices)),
NullCol::Str(nc) => NullCol::Str(nc.gather(indices)),
NullCol::Bool(nc) => NullCol::Bool(nc.gather(indices)),
}
}
}
/// A DataFrame-like frame that can hold nullable columns.
/// Used as output of joins, pivots, and bind operations that may introduce nulls.
#[derive(Debug, Clone)]
pub struct NullableFrame {
pub columns: Vec<(String, NullCol)>,
}
impl NullableFrame {
/// Create an empty NullableFrame with no columns.
pub fn new() -> Self {
Self { columns: Vec::new() }
}
/// Returns the number of rows.
pub fn nrows(&self) -> usize {
self.columns.first().map(|(_, c)| c.len()).unwrap_or(0)
}
/// Returns the number of columns.
pub fn ncols(&self) -> usize {
self.columns.len()
}
/// Returns column names in order.
pub fn column_names(&self) -> Vec<&str> {
self.columns.iter().map(|(n, _)| n.as_str()).collect()
}
/// Look up a nullable column by name.
pub fn get_column(&self, name: &str) -> Option<&NullCol> {
self.columns.iter().find(|(n, _)| n == name).map(|(_, c)| c)
}
/// Convert to a regular `DataFrame`, filling nulls with type-appropriate values.
pub fn to_dataframe_filled(&self) -> DataFrame {
let cols: Vec<(String, Column)> = self.columns.iter()
.map(|(n, c)| (n.clone(), c.to_column_filled()))
.collect();
// Safety: all columns should have same length if built correctly
DataFrame { columns: cols }
}
/// Convert to `TidyFrame` (filled), discarding null metadata.
pub fn to_tidy_frame_filled(&self) -> TidyFrame {
TidyFrame::from_df(self.to_dataframe_filled())
}
/// Convert to `TidyView` (filled), discarding null metadata.
pub fn to_tidy_view_filled(&self) -> TidyView {
TidyView::from_df(self.to_dataframe_filled())
}
}
impl Default for NullableFrame {
fn default() -> Self { Self::new() }
}
// â"€â"€ Helpers for nullable-aware gather â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Gather column rows with optional indices (None → null).
/// Used in left/right/full join output where some rows have no match.
fn gather_column_nullable_null(col: &Column, indices: &[Option<usize>]) -> NullCol {
if matches!(col, Column::CategoricalAdaptive(_)) {
return gather_column_nullable_null(&col.to_legacy_categorical(), indices);
}
match col {
Column::Int(v) => {
let mut vals = Vec::with_capacity(indices.len());
let mut valid = Vec::with_capacity(indices.len());
for &idx in indices {
match idx {
Some(i) => { vals.push(v[i]); valid.push(true); }
None => { vals.push(0); valid.push(false); }
}
}
NullCol::Int(NullableColumn::new(vals, BitMask::from_bools(&valid)))
}
Column::Float(v) => {
let mut vals = Vec::with_capacity(indices.len());
let mut valid = Vec::with_capacity(indices.len());
for &idx in indices {
match idx {
Some(i) => { vals.push(v[i]); valid.push(true); }
None => { vals.push(0.0); valid.push(false); }
}
}
NullCol::Float(NullableColumn::new(vals, BitMask::from_bools(&valid)))
}
Column::Str(v) => {
let mut vals = Vec::with_capacity(indices.len());
let mut valid = Vec::with_capacity(indices.len());
for &idx in indices {
match idx {
Some(i) => { vals.push(v[i].clone()); valid.push(true); }
None => { vals.push(String::new()); valid.push(false); }
}
}
NullCol::Str(NullableColumn::new(vals, BitMask::from_bools(&valid)))
}
Column::Bool(v) => {
let mut vals = Vec::with_capacity(indices.len());
let mut valid = Vec::with_capacity(indices.len());
for &idx in indices {
match idx {
Some(i) => { vals.push(v[i]); valid.push(true); }
None => { vals.push(false); valid.push(false); }
}
}
NullCol::Bool(NullableColumn::new(vals, BitMask::from_bools(&valid)))
}
Column::Categorical { levels, codes } => {
let mut vals = Vec::with_capacity(indices.len());
let mut valid = Vec::with_capacity(indices.len());
for &idx in indices {
match idx {
Some(i) => { vals.push(levels[codes[i] as usize].clone()); valid.push(true); }
None => { vals.push(String::new()); valid.push(false); }
}
}
NullCol::Str(NullableColumn::new(vals, BitMask::from_bools(&valid)))
}
Column::DateTime(v) => {
let mut vals = Vec::with_capacity(indices.len());
let mut valid = Vec::with_capacity(indices.len());
for &idx in indices {
match idx {
Some(i) => { vals.push(v[i]); valid.push(true); }
None => { vals.push(0); valid.push(false); }
}
}
NullCol::Int(NullableColumn::new(vals, BitMask::from_bools(&valid)))
}
Column::CategoricalAdaptive(_) => unreachable!("handled by early return"),
}
}
// â"€â"€ Across support types â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A scalar transformation function for `across()`.
///
/// Takes a column reference name (for error messages) and a `Column`, returns
/// a new `Column` of the same length or `TidyError`.
pub type AcrossFn = Box<dyn Fn(&str, &Column) -> Result<Column, TidyError>>;
/// A named across transformation.
pub struct AcrossTransform {
/// Function name (used in generated column name `{col}_{fn_name}`).
pub fn_name: String,
/// The transformation to apply.
pub func: AcrossFn,
}
impl AcrossTransform {
/// Create a new across transformation with the given name and column-mapping function.
pub fn new(fn_name: impl Into<String>, func: impl Fn(&str, &Column) -> Result<Column, TidyError> + 'static) -> Self {
Self {
fn_name: fn_name.into(),
func: Box::new(func),
}
}
}
/// An across() specification: select columns and apply one function.
pub struct AcrossSpec {
/// Columns to transform (by name).
pub cols: Vec<String>,
/// Transform to apply.
pub transform: AcrossTransform,
/// Optional output name template. None → "{col}_{fn}".
/// Use `{col}` and `{fn}` as placeholders.
pub name_template: Option<String>,
}
impl AcrossSpec {
/// Create a new across specification targeting the given columns with one transform.
pub fn new(cols: impl IntoIterator<Item = impl Into<String>>, transform: AcrossTransform) -> Self {
Self {
cols: cols.into_iter().map(|c| c.into()).collect(),
transform,
name_template: None,
}
}
/// Set a custom output-name template (use `{col}` and `{fn}` as placeholders).
pub fn with_template(mut self, tmpl: impl Into<String>) -> Self {
self.name_template = Some(tmpl.into());
self
}
/// Generate the output column name for a given input column.
pub fn output_name(&self, col_name: &str) -> String {
match &self.name_template {
Some(tmpl) => tmpl
.replace("{col}", col_name)
.replace("{fn}", &self.transform.fn_name),
None => format!("{}_{}", col_name, self.transform.fn_name),
}
}
}
// â"€â"€ Join maturity types â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Join suffix options for handling column name collisions in inner/left/right/full joins.
#[derive(Debug, Clone)]
pub struct JoinSuffix {
pub left: String,
pub right: String,
}
impl Default for JoinSuffix {
fn default() -> Self {
Self { left: ".x".into(), right: ".y".into() }
}
}
impl JoinSuffix {
/// Create custom suffixes for left and right table columns on name collision.
pub fn new(left: impl Into<String>, right: impl Into<String>) -> Self {
Self { left: left.into(), right: right.into() }
}
}
// â"€â"€ Column type comparison for join validation â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Check whether two Column types are join-compatible.
/// Int and Float are mutually compatible (numeric widening).
fn join_types_compatible(left: &Column, right: &Column) -> bool {
match (left, right) {
(Column::Int(_), Column::Int(_)) => true,
(Column::Float(_), Column::Float(_)) => true,
(Column::Int(_), Column::Float(_)) => true,
(Column::Float(_), Column::Int(_)) => true,
(Column::Str(_), Column::Str(_)) => true,
(Column::Bool(_), Column::Bool(_)) => true,
_ => false,
}
}
// â"€â"€ Phase 13-16 TidyView extensions â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl TidyView {
// â"€â"€ pivot_longer â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Pivot selected columns from wide to long format.
///
/// `value_cols`: columns to gather (must all have the same type).
/// `names_to`: name of the output "variable name" column.
/// `values_to`: name of the output "value" column.
///
/// Output schema: [id_cols..., names_to, values_to]
/// Row order: for each source row (in visible order), one output row per
/// value column (in the order they appear in `value_cols`).
///
/// Edge cases:
/// • `value_cols` empty → `TidyError::EmptySelection`.
/// • Unknown column → `TidyError::ColumnNotFound`.
/// • Duplicate in `value_cols` → `TidyError::DuplicateColumn`.
/// • Mixed types in `value_cols` → `TidyError::TypeMismatch`.
pub fn pivot_longer(
&self,
value_cols: &[&str],
names_to: &str,
values_to: &str,
) -> Result<TidyFrame, TidyError> {
if value_cols.is_empty() {
return Err(TidyError::empty_selection("pivot_longer requires at least one value_col"));
}
// Validate and resolve value column indices
let mut seen_vc: Vec<&str> = Vec::new();
let mut vc_indices: Vec<usize> = Vec::new();
for &name in value_cols {
if seen_vc.contains(&name) {
return Err(TidyError::DuplicateColumn(name.to_string()));
}
seen_vc.push(name);
let idx = self.base.columns.iter().position(|(n, _)| n == name)
.ok_or_else(|| TidyError::ColumnNotFound(name.to_string()))?;
vc_indices.push(idx);
}
// Type consistency check: all value columns must have the same type
let first_type = self.base.columns[vc_indices[0]].1.type_name();
for &idx in &vc_indices[1..] {
let t = self.base.columns[idx].1.type_name();
if t != first_type {
return Err(TidyError::TypeMismatch {
expected: first_type.to_string(),
got: t.to_string(),
});
}
}
// id_cols = projected columns excluding value_cols
let vc_set: std::collections::BTreeSet<usize> = vc_indices.iter().copied().collect();
let id_col_indices: Vec<usize> = self.proj.indices().iter()
.copied()
.filter(|i| !vc_set.contains(i))
.collect();
let visible_rows: Vec<usize> = self.mask.iter_indices().collect();
let n_out = visible_rows.len() * value_cols.len();
// Build id columns (repeated value_cols.len() times per source row)
let mut out_cols: Vec<(String, Column)> = Vec::new();
for &id_idx in &id_col_indices {
let (name, col_orig) = &self.base.columns[id_idx];
let legacy_owned;
let col: &Column = if matches!(col_orig, Column::CategoricalAdaptive(_)) {
legacy_owned = col_orig.to_legacy_categorical();
&legacy_owned
} else {
col_orig
};
let new_col = match col {
Column::Int(v) => {
let mut out = Vec::with_capacity(n_out);
for &r in &visible_rows {
for _ in 0..value_cols.len() { out.push(v[r]); }
}
Column::Int(out)
}
Column::Float(v) => {
let mut out = Vec::with_capacity(n_out);
for &r in &visible_rows {
for _ in 0..value_cols.len() { out.push(v[r]); }
}
Column::Float(out)
}
Column::Str(v) => {
let mut out = Vec::with_capacity(n_out);
for &r in &visible_rows {
for _ in 0..value_cols.len() { out.push(v[r].clone()); }
}
Column::Str(out)
}
Column::Bool(v) => {
let mut out = Vec::with_capacity(n_out);
for &r in &visible_rows {
for _ in 0..value_cols.len() { out.push(v[r]); }
}
Column::Bool(out)
}
Column::Categorical { levels, codes } => {
let mut out = Vec::with_capacity(n_out);
for &r in &visible_rows {
for _ in 0..value_cols.len() { out.push(codes[r]); }
}
Column::Categorical { levels: levels.clone(), codes: out }
}
Column::DateTime(v) => {
let mut out = Vec::with_capacity(n_out);
for &r in &visible_rows {
for _ in 0..value_cols.len() { out.push(v[r]); }
}
Column::DateTime(out)
}
Column::CategoricalAdaptive(_) => unreachable!("converted via legacy_owned"),
};
out_cols.push((name.clone(), new_col));
}
// Build "names" column (the variable name, repeated per source row)
let names_col: Vec<String> = visible_rows.iter()
.flat_map(|_| value_cols.iter().map(|s| s.to_string()))
.collect();
out_cols.push((names_to.to_string(), Column::Str(names_col)));
// Build "values" column (all types already checked equal)
match &self.base.columns[vc_indices[0]].1 {
Column::Int(_) => {
let mut vals: Vec<i64> = Vec::with_capacity(n_out);
for &r in &visible_rows {
for &vci in &vc_indices {
if let Column::Int(v) = &self.base.columns[vci].1 {
vals.push(v[r]);
}
}
}
out_cols.push((values_to.to_string(), Column::Int(vals)));
}
Column::Float(_) => {
let mut vals: Vec<f64> = Vec::with_capacity(n_out);
for &r in &visible_rows {
for &vci in &vc_indices {
if let Column::Float(v) = &self.base.columns[vci].1 {
vals.push(v[r]);
}
}
}
out_cols.push((values_to.to_string(), Column::Float(vals)));
}
Column::Str(_) => {
let mut vals: Vec<String> = Vec::with_capacity(n_out);
for &r in &visible_rows {
for &vci in &vc_indices {
if let Column::Str(v) = &self.base.columns[vci].1 {
vals.push(v[r].clone());
}
}
}
out_cols.push((values_to.to_string(), Column::Str(vals)));
}
Column::Bool(_) => {
let mut vals: Vec<bool> = Vec::with_capacity(n_out);
for &r in &visible_rows {
for &vci in &vc_indices {
if let Column::Bool(v) = &self.base.columns[vci].1 {
vals.push(v[r]);
}
}
}
out_cols.push((values_to.to_string(), Column::Bool(vals)));
}
Column::Categorical { .. } | Column::CategoricalAdaptive(_) | Column::DateTime(_) => {
// For pivot_longer, fall back to string representation
let mut vals: Vec<String> = Vec::with_capacity(n_out);
for &r in &visible_rows {
for &vci in &vc_indices {
vals.push(self.base.columns[vci].1.get_display(r));
}
}
out_cols.push((values_to.to_string(), Column::Str(vals)));
}
}
let df = DataFrame::from_columns(out_cols)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
// â"€â"€ pivot_wider â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Pivot long-format data to wide format.
///
/// `names_from`: the column whose values become new column headers.
/// `values_from`: the column whose values fill the new columns.
/// `id_cols`: columns that identify each output row.
///
/// Output schema: [id_cols..., unique_key_values... (first-occurrence order)]
/// Row order: one row per unique combination of id_col values
/// (first-occurrence order).
///
/// Edge cases:
/// • Duplicate (id_key, name_key) combo → `TidyError::DuplicateKey`.
/// • Missing combo → null fill via NullableFrame.
/// • Unknown column → `TidyError::ColumnNotFound`.
pub fn pivot_wider(
&self,
id_cols: &[&str],
names_from: &str,
values_from: &str,
) -> Result<NullableFrame, TidyError> {
// Validate columns
let _names_col_idx = self.base.columns.iter().position(|(n, _)| n == names_from)
.ok_or_else(|| TidyError::ColumnNotFound(names_from.to_string()))?;
let _values_col_idx = self.base.columns.iter().position(|(n, _)| n == values_from)
.ok_or_else(|| TidyError::ColumnNotFound(values_from.to_string()))?;
for &id in id_cols {
let _ = self.base.columns.iter().position(|(n, _)| n == id)
.ok_or_else(|| TidyError::ColumnNotFound(id.to_string()))?;
}
let visible_rows: Vec<usize> = self.mask.iter_indices().collect();
// Collect unique key values in first-occurrence order
let mut key_values: Vec<String> = Vec::new();
for &r in &visible_rows {
let kv = self.base.get_column(names_from).unwrap().get_display(r);
if !key_values.contains(&kv) {
key_values.push(kv);
}
}
// Collect unique id combinations in first-occurrence order
// Map: id_tuple → output_row_slot
let id_col_refs: Vec<&Column> = id_cols.iter()
.map(|&name| self.base.get_column(name).unwrap())
.collect();
let mut id_order: Vec<Vec<String>> = Vec::new(); // first-occurrence
let mut id_to_slot: Vec<(Vec<String>, usize)> = Vec::new(); // linear scan map
for &r in &visible_rows {
let id_key: Vec<String> = id_col_refs.iter()
.map(|col| col.get_display(r))
.collect();
if !id_to_slot.iter().any(|(k, _)| k == &id_key) {
let slot = id_order.len();
id_order.push(id_key.clone());
id_to_slot.push((id_key, slot));
}
}
let n_rows = id_order.len();
let n_keys = key_values.len();
// Cell lookup: (id_slot, key_slot) → source row index
// Detect duplicate (id, key) combinations
let mut cell_map: Vec<Vec<Option<usize>>> = vec![vec![None; n_keys]; n_rows];
for &r in &visible_rows {
let id_key: Vec<String> = id_col_refs.iter()
.map(|col| col.get_display(r))
.collect();
let id_slot = id_to_slot.iter().find(|(k, _)| k == &id_key).unwrap().1;
let kv = self.base.get_column(names_from).unwrap().get_display(r);
let key_slot = key_values.iter().position(|v| v == &kv).unwrap();
if cell_map[id_slot][key_slot].is_some() {
return Err(TidyError::duplicate_key(
format!("({}, {})", id_key.join(", "), kv)
));
}
cell_map[id_slot][key_slot] = Some(r);
}
// Build output NullableFrame
let mut out_cols: Vec<(String, NullCol)> = Vec::new();
// Id columns
for (id_idx, &id_name) in id_cols.iter().enumerate() {
let id_col = self.base.get_column(id_name).unwrap();
let id_row_indices: Vec<usize> = id_order.iter()
.map(|id_tup| {
// Find the first visible row that has this id tuple
*visible_rows.iter().find(|&&r| {
id_col_refs.iter().enumerate().all(|(i, col)| {
col.get_display(r) == id_tup[i]
})
}).unwrap()
})
.collect();
let gathered = gather_column(id_col, &id_row_indices);
out_cols.push((id_name.to_string(), NullCol::from_column(&gathered)));
let _ = id_idx;
}
// Value columns (one per unique key value)
let values_col = self.base.get_column(values_from).unwrap();
let val_type = values_col.type_name();
for (key_slot, key_val) in key_values.iter().enumerate() {
let row_opts: Vec<Option<usize>> = (0..n_rows)
.map(|id_slot| cell_map[id_slot][key_slot])
.collect();
let null_col = gather_column_nullable_null(values_col, &row_opts);
out_cols.push((key_val.clone(), null_col));
let _ = val_type;
}
Ok(NullableFrame { columns: out_cols })
}
// â"€â"€ rename â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Rename columns: `renames` is a slice of `(old_name, new_name)`.
///
/// Returns a new `TidyView` over a new base DataFrame with renamed columns.
///
/// Edge cases:
/// • Unknown `old_name` → `TidyError::ColumnNotFound`.
/// • `new_name` already exists (collision) → `TidyError::DuplicateColumn`.
/// • `old_name == new_name` → no-op for that pair.
pub fn rename(&self, renames: &[(&str, &str)]) -> Result<TidyView, TidyError> {
// Build rename map
let mut rename_map: Vec<(usize, String)> = Vec::new();
let col_names: Vec<&str> = self.base.columns.iter().map(|(n, _)| n.as_str()).collect();
for &(old, new) in renames {
let idx = col_names.iter().position(|&n| n == old)
.ok_or_else(|| TidyError::ColumnNotFound(old.to_string()))?;
// Check new name doesn't already exist (unless it's the old name itself)
if old != new {
let new_name_exists = col_names.iter().any(|&n| n == new)
|| rename_map.iter().any(|(_, n)| n == new);
if new_name_exists {
return Err(TidyError::DuplicateColumn(new.to_string()));
}
}
rename_map.push((idx, new.to_string()));
}
// Build new base with renamed columns
let mut new_cols: Vec<(String, Column)> = Vec::new();
for (i, (name, col)) in self.base.columns.iter().enumerate() {
let new_name = rename_map.iter()
.find(|(idx, _)| *idx == i)
.map(|(_, n)| n.clone())
.unwrap_or_else(|| name.clone());
new_cols.push((new_name, col.clone()));
}
let new_base = DataFrame { columns: new_cols };
Ok(TidyView {
base: Rc::new(new_base),
mask: self.mask.clone(),
proj: self.proj.clone(),
})
}
// â"€â"€ relocate â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Reorder columns so that `cols` appear at position `before` or `after`
/// another column, or at the front/back.
///
/// `cols`: columns to move.
/// `position`: `RelocatePos::Front`, `Back`, `Before(name)`, `After(name)`.
///
/// Non-moved columns keep their relative order.
/// Returns a new `TidyView` with updated projection.
///
/// Edge cases:
/// • Unknown column in `cols` → `TidyError::ColumnNotFound`.
/// • Unknown anchor column → `TidyError::ColumnNotFound`.
pub fn relocate(&self, cols: &[&str], position: RelocatePos<'_>) -> Result<TidyView, TidyError> {
// Validate cols exist in projection
let proj_names: Vec<&str> = self.column_names();
for &name in cols {
if !proj_names.contains(&name) {
return Err(TidyError::ColumnNotFound(name.to_string()));
}
}
// Build new column order in the projection
let moved_set: std::collections::BTreeSet<&str> = cols.iter().copied().collect();
let remaining: Vec<&str> = proj_names.iter()
.copied()
.filter(|n| !moved_set.contains(n))
.collect();
let new_order: Vec<&str> = match &position {
RelocatePos::Front => {
let mut v: Vec<&str> = cols.to_vec();
v.extend_from_slice(&remaining);
v
}
RelocatePos::Back => {
let mut v = remaining.clone();
v.extend_from_slice(cols);
v
}
RelocatePos::Before(anchor) => {
if !proj_names.contains(anchor) {
return Err(TidyError::ColumnNotFound(anchor.to_string()));
}
let mut v = Vec::new();
for &n in &remaining {
if n == *anchor {
v.extend_from_slice(cols);
}
v.push(n);
}
v
}
RelocatePos::After(anchor) => {
if !proj_names.contains(anchor) {
return Err(TidyError::ColumnNotFound(anchor.to_string()));
}
let mut v = Vec::new();
for &n in &remaining {
v.push(n);
if n == *anchor {
v.extend_from_slice(cols);
}
}
v
}
};
// Map new_order back to base column indices
let new_indices: Vec<usize> = new_order.iter()
.map(|&name| {
self.base.columns.iter().position(|(n, _)| n == name).unwrap()
})
.collect();
Ok(TidyView {
base: Rc::clone(&self.base),
mask: self.mask.clone(),
proj: ProjectionMap::from_indices(new_indices),
})
}
// â"€â"€ drop_cols â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Drop specified columns from the view (select-minus semantics).
///
/// Returns a new `TidyView` with those columns removed from the projection.
///
/// Edge cases:
/// • Unknown column → `TidyError::ColumnNotFound`.
/// • Dropping all columns → valid (0-col view).
pub fn drop_cols(&self, cols: &[&str]) -> Result<TidyView, TidyError> {
let proj_names = self.column_names();
for &name in cols {
if !proj_names.contains(&name) {
return Err(TidyError::ColumnNotFound(name.to_string()));
}
}
let drop_set: std::collections::BTreeSet<&str> = cols.iter().copied().collect();
let keep: Vec<&str> = proj_names.iter()
.copied()
.filter(|n| !drop_set.contains(n))
.collect();
self.select(&keep)
}
// â"€â"€ bind_rows â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Concatenate rows from `other` onto `self` (strict schema match).
///
/// Both frames must have the same column names in the same order.
/// Row order: `self` rows first, then `other` rows.
///
/// Edge cases:
/// • Column names differ → `TidyError::Internal("schema mismatch: ...")`.
/// • `other` has zero rows → returns self's rows (valid, no error).
pub fn bind_rows(&self, other: &TidyView) -> Result<TidyFrame, TidyError> {
let self_names = self.column_names();
let other_names = other.column_names();
if self_names != other_names {
return Err(TidyError::schema_mismatch(format!(
"left has {:?}, right has {:?}",
self_names, other_names
)));
}
let self_rows: Vec<usize> = self.mask.iter_indices().collect();
let other_rows: Vec<usize> = other.mask.iter_indices().collect();
let mut out_cols: Vec<(String, Column)> = Vec::new();
for &ci in self.proj.indices() {
let (name, self_col) = &self.base.columns[ci];
// Find matching column in other's projection
let other_ci = other.proj.indices().iter().copied()
.find(|&i| other.base.columns[i].0 == *name)
.ok_or_else(|| TidyError::ColumnNotFound(name.clone()))?;
let other_col = &other.base.columns[other_ci].1;
let col = concat_columns(self_col, &self_rows, other_col, &other_rows)?;
out_cols.push((name.clone(), col));
}
let df = DataFrame::from_columns(out_cols)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
// â"€â"€ bind_cols â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Concatenate columns from `other` onto `self` (strict row count match).
///
/// Both frames must have the same number of visible rows.
/// Column order: `self` columns first, then `other` columns.
///
/// Edge cases:
/// • Row count mismatch → `TidyError::LengthMismatch`.
/// • Column name collision → `TidyError::DuplicateColumn`.
pub fn bind_cols(&self, other: &TidyView) -> Result<TidyFrame, TidyError> {
let self_nrows = self.nrows();
let other_nrows = other.nrows();
if self_nrows != other_nrows {
return Err(TidyError::LengthMismatch {
expected: self_nrows,
got: other_nrows,
});
}
let self_names = self.column_names();
let other_names = other.column_names();
for name in &other_names {
if self_names.contains(name) {
return Err(TidyError::DuplicateColumn(name.to_string()));
}
}
let self_rows: Vec<usize> = self.mask.iter_indices().collect();
let other_rows: Vec<usize> = other.mask.iter_indices().collect();
let mut out_cols: Vec<(String, Column)> = Vec::new();
for &ci in self.proj.indices() {
let (name, col) = &self.base.columns[ci];
out_cols.push((name.clone(), gather_column(col, &self_rows)));
}
for &ci in other.proj.indices() {
let (name, col) = &other.base.columns[ci];
out_cols.push((name.clone(), gather_column(col, &other_rows)));
}
let df = DataFrame::from_columns(out_cols)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
// â"€â"€ mutate_across â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Apply a transformation across multiple columns, adding/replacing each
/// with a generated name `{col}_{fn}` (or a user-specified template).
///
/// Edge cases:
/// • Unknown column → `TidyError::ColumnNotFound`.
/// • Generated name collision → `TidyError::DuplicateColumn`.
/// • Empty cols list → no-op (returns materialized frame unchanged).
pub fn mutate_across(&self, specs: &[AcrossSpec]) -> Result<TidyFrame, TidyError> {
// Materialize self first
let base_df = self.materialize()?;
// Collect assignments, checking for name collisions
let mut output_names: Vec<String> = base_df.column_names()
.into_iter().map(|s| s.to_string()).collect();
let mut extra_cols: Vec<(String, Column)> = Vec::new();
for spec in specs {
for col_name in &spec.cols {
let out_name = spec.output_name(col_name);
// Check for duplicate in output
if output_names.contains(&out_name) && !base_df.column_names().contains(&out_name.as_str()) {
return Err(TidyError::DuplicateColumn(out_name));
}
let col = base_df.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
let new_col = (spec.transform.func)(col_name, col)?;
// Duplicate in output_names is overwrite for existing cols, error for new
if !base_df.column_names().contains(&out_name.as_str()) {
output_names.push(out_name.clone());
}
extra_cols.push((out_name, new_col));
}
}
// Merge: start from base_df columns, then add/overwrite extras
let mut col_map: indexmap_simple::IndexMap = indexmap_simple::IndexMap::from_df(&base_df);
for (name, col) in extra_cols {
col_map.insert(name, col);
}
let df = col_map.into_df()
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
// â"€â"€ right_join â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Right join: all rows from `right`, matched rows from `self` (left).
///
/// Output: left cols (nullable) + right cols.
/// Row order: right outer loop order preserved.
/// Unmatched right rows: left columns null-filled.
pub fn right_join(
&self,
right: &TidyView,
on: &[(&str, &str)],
suffix: &JoinSuffix,
) -> Result<NullableFrame, TidyError> {
// Validate key type compatibility
validate_join_key_types(self, right, on)?;
// Swap sides: right becomes "left" of a left join, then re-order columns
let swapped_on: Vec<(&str, &str)> = on.iter().map(|&(l, r)| (r, l)).collect();
let (right_rows, left_rows_opt) =
join_match_rows_optional(right, self, &swapped_on, JoinKind::Left)?;
build_right_join_frame(self, right, &left_rows_opt, &right_rows, on, suffix)
}
// â"€â"€ full_join â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Full outer join: all rows from both sides; null-fill for unmatched.
///
/// Row order: left rows first (matched and unmatched), then unmatched right rows.
pub fn full_join(
&self,
right: &TidyView,
on: &[(&str, &str)],
suffix: &JoinSuffix,
) -> Result<NullableFrame, TidyError> {
validate_join_key_types(self, right, on)?;
build_full_join_frame(self, right, on, suffix)
}
// â"€â"€ inner_join_typed (join maturity upgrade) â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Inner join with type validation and collision suffix support.
///
/// Same semantics as `inner_join` but:
/// • validates join key types are compatible (Int/Float widened, others exact).
/// • handles non-key column name collisions using `suffix`.
pub fn inner_join_typed(
&self,
right: &TidyView,
on: &[(&str, &str)],
suffix: &JoinSuffix,
) -> Result<TidyFrame, TidyError> {
validate_join_key_types(self, right, on)?;
let (left_rows, right_rows) = join_match_rows(self, right, on, JoinKind::Inner)?;
build_join_frame_with_suffix(self, right, &left_rows, &right_rows, on, suffix, false)
}
// â"€â"€ left_join_typed (join maturity upgrade) â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Left join with type validation and collision suffix support.
pub fn left_join_typed(
&self,
right: &TidyView,
on: &[(&str, &str)],
suffix: &JoinSuffix,
) -> Result<TidyFrame, TidyError> {
validate_join_key_types(self, right, on)?;
let (left_rows, right_rows_opt) =
join_match_rows_optional(self, right, on, JoinKind::Left)?;
build_left_join_frame_with_suffix(self, right, &left_rows, &right_rows_opt, on, suffix)
}
}
// â"€â"€ Position enum for relocate â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Position specifier for `TidyView::relocate`.
pub enum RelocatePos<'a> {
/// Move selected columns to the front.
Front,
/// Move selected columns to the back.
Back,
/// Insert selected columns immediately before the named column.
Before(&'a str),
/// Insert selected columns immediately after the named column.
After(&'a str),
}
// â"€â"€ Column concatenation helper â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
fn concat_columns(
left: &Column,
left_rows: &[usize],
right: &Column,
right_rows: &[usize],
) -> Result<Column, TidyError> {
match (left, right) {
(Column::Int(lv), Column::Int(rv)) => {
let mut out: Vec<i64> = left_rows.iter().map(|&i| lv[i]).collect();
out.extend(right_rows.iter().map(|&i| rv[i]));
Ok(Column::Int(out))
}
(Column::Float(lv), Column::Float(rv)) => {
let mut out: Vec<f64> = left_rows.iter().map(|&i| lv[i]).collect();
out.extend(right_rows.iter().map(|&i| rv[i]));
Ok(Column::Float(out))
}
(Column::Int(lv), Column::Float(rv)) => {
let mut out: Vec<f64> = left_rows.iter().map(|&i| lv[i] as f64).collect();
out.extend(right_rows.iter().map(|&i| rv[i]));
Ok(Column::Float(out))
}
(Column::Float(lv), Column::Int(rv)) => {
let mut out: Vec<f64> = left_rows.iter().map(|&i| lv[i]).collect();
out.extend(right_rows.iter().map(|&i| rv[i] as f64));
Ok(Column::Float(out))
}
(Column::Str(lv), Column::Str(rv)) => {
let mut out: Vec<String> = left_rows.iter().map(|&i| lv[i].clone()).collect();
out.extend(right_rows.iter().map(|&i| rv[i].clone()));
Ok(Column::Str(out))
}
(Column::Bool(lv), Column::Bool(rv)) => {
let mut out: Vec<bool> = left_rows.iter().map(|&i| lv[i]).collect();
out.extend(right_rows.iter().map(|&i| rv[i]));
Ok(Column::Bool(out))
}
_ => Err(TidyError::schema_mismatch(format!(
"type mismatch in bind_rows: {} vs {}",
left.type_name(), right.type_name()
))),
}
}
// â"€â"€ Join key type validation â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
fn validate_join_key_types(
left: &TidyView,
right: &TidyView,
on: &[(&str, &str)],
) -> Result<(), TidyError> {
for &(lk, rk) in on {
let l_col = left.base.get_column(lk)
.ok_or_else(|| TidyError::ColumnNotFound(lk.to_string()))?;
let r_col = right.base.get_column(rk)
.ok_or_else(|| TidyError::ColumnNotFound(rk.to_string()))?;
if !join_types_compatible(l_col, r_col) {
return Err(TidyError::join_type_mismatch(lk, l_col.type_name(), r_col.type_name()));
}
}
Ok(())
}
// â"€â"€ Join frames with suffix collision handling â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
fn build_join_frame_with_suffix(
left: &TidyView,
right: &TidyView,
left_rows: &[usize],
right_rows: &[usize],
on: &[(&str, &str)],
suffix: &JoinSuffix,
_include_unmatched: bool,
) -> Result<TidyFrame, TidyError> {
let right_key_names: std::collections::BTreeSet<&str> =
on.iter().map(|(_, rk)| *rk).collect();
// Collect left projected column names
let left_col_names: Vec<String> = left.proj.indices().iter()
.map(|&ci| left.base.columns[ci].0.clone())
.collect();
let mut columns: Vec<(String, Column)> = Vec::new();
// Left projected columns
for &ci in left.proj.indices() {
let (name, col) = &left.base.columns[ci];
columns.push((name.clone(), gather_column(col, left_rows)));
}
// Right projected columns with suffix on collision
for &ci in right.proj.indices() {
let (name, col) = &right.base.columns[ci];
if right_key_names.contains(name.as_str()) {
continue; // skip join key duplication
}
let out_name = if left_col_names.contains(name) {
format!("{}{}", name, suffix.right)
} else {
name.clone()
};
// Rename left side if it also collides
if left_col_names.contains(name) {
// Rename the left column already added
let left_pos = columns.iter().position(|(n, _)| n == name);
if let Some(pos) = left_pos {
let entry = &mut columns[pos];
entry.0 = format!("{}{}", entry.0, suffix.left);
}
}
columns.push((out_name, gather_column(col, right_rows)));
}
let df = DataFrame::from_columns(columns)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
fn build_left_join_frame_with_suffix(
left: &TidyView,
right: &TidyView,
left_rows: &[usize],
right_rows_opt: &[Option<usize>],
on: &[(&str, &str)],
suffix: &JoinSuffix,
) -> Result<TidyFrame, TidyError> {
let right_key_names: std::collections::BTreeSet<&str> =
on.iter().map(|(_, rk)| *rk).collect();
let left_col_names: Vec<String> = left.proj.indices().iter()
.map(|&ci| left.base.columns[ci].0.clone())
.collect();
let mut columns: Vec<(String, Column)> = Vec::new();
// Left projected columns
for &ci in left.proj.indices() {
let (name, col) = &left.base.columns[ci];
columns.push((name.clone(), gather_column(col, left_rows)));
}
// Right projected columns (nullable fill for unmatched)
for &ci in right.proj.indices() {
let (name, col) = &right.base.columns[ci];
if right_key_names.contains(name.as_str()) { continue; }
let out_name = if left_col_names.contains(name) {
// rename left column
let left_pos = columns.iter().position(|(n, _)| n == name);
if let Some(pos) = left_pos {
columns[pos].0 = format!("{}{}", name, suffix.left);
}
format!("{}{}", name, suffix.right)
} else {
name.clone()
};
let new_col = gather_column_nullable(col, right_rows_opt);
columns.push((out_name, new_col));
}
let df = DataFrame::from_columns(columns)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
fn build_right_join_frame(
left: &TidyView,
right: &TidyView,
left_rows_opt: &[Option<usize>],
right_rows: &[usize],
on: &[(&str, &str)],
suffix: &JoinSuffix,
) -> Result<NullableFrame, TidyError> {
let right_key_names: std::collections::BTreeSet<&str> =
on.iter().map(|(_, rk)| *rk).collect();
let left_key_names: std::collections::BTreeSet<&str> =
on.iter().map(|(lk, _)| *lk).collect();
let right_col_names: Vec<String> = right.proj.indices().iter()
.map(|&ci| right.base.columns[ci].0.clone())
.collect();
let mut columns: Vec<(String, NullCol)> = Vec::new();
// Left projected columns (nullable â€" unmatched = null)
for &ci in left.proj.indices() {
let (name, col) = &left.base.columns[ci];
if left_key_names.contains(name.as_str()) { continue; }
let out_name = if right_col_names.contains(name) {
format!("{}{}", name, suffix.left)
} else {
name.clone()
};
let null_col = gather_column_nullable_null(col, left_rows_opt);
columns.push((out_name, null_col));
}
// Right projected columns (always present)
for &ci in right.proj.indices() {
let (name, col) = &right.base.columns[ci];
let out_name = if !right_key_names.contains(name.as_str())
&& left.proj.indices().iter().any(|&lci| left.base.columns[lci].0 == *name)
&& !left_key_names.contains(name.as_str())
{
format!("{}{}", name, suffix.right)
} else {
name.clone()
};
columns.push((out_name, NullCol::from_column(&gather_column(col, right_rows))));
}
Ok(NullableFrame { columns })
}
fn build_full_join_frame(
left: &TidyView,
right: &TidyView,
on: &[(&str, &str)],
suffix: &JoinSuffix,
) -> Result<NullableFrame, TidyError> {
let (left_key_cols, right_key_cols) = resolve_join_keys(left, right, on)?;
let lookup = build_right_lookup(right, &right_key_cols);
// Phase 1: left outer loop (all left rows, with or without right match)
let mut out_left_rows: Vec<usize> = Vec::new();
let mut out_right_rows: Vec<Option<usize>> = Vec::new();
let mut right_matched: Vec<bool> = vec![false; right.base.nrows()];
for l_row in left.mask.iter_indices() {
let key = row_key(&left.base, &left_key_cols, l_row);
let matches = find_matches(&lookup, &key);
if matches.is_empty() {
out_left_rows.push(l_row);
out_right_rows.push(None);
} else {
for r_row in &matches {
out_left_rows.push(l_row);
out_right_rows.push(Some(*r_row));
if *r_row < right_matched.len() {
right_matched[*r_row] = true;
}
}
}
}
// Phase 2: unmatched right rows
let mut unmatched_right: Vec<usize> = Vec::new();
for r_row in right.mask.iter_indices() {
if r_row < right_matched.len() && !right_matched[r_row] {
unmatched_right.push(r_row);
}
}
let right_key_names: std::collections::BTreeSet<&str> =
on.iter().map(|(_, rk)| *rk).collect();
let left_key_names: std::collections::BTreeSet<&str> =
on.iter().map(|(lk, _)| *lk).collect();
let right_col_names: Vec<String> = right.proj.indices().iter()
.map(|&ci| right.base.columns[ci].0.clone())
.collect();
let n_matched = out_left_rows.len();
let n_unmatched_r = unmatched_right.len();
let total = n_matched + n_unmatched_r;
let mut columns: Vec<(String, NullCol)> = Vec::new();
// Left projected columns
for &ci in left.proj.indices() {
let (name, col) = &left.base.columns[ci];
let out_name = if right_col_names.contains(name) && !left_key_names.contains(name.as_str()) {
format!("{}{}", name, suffix.left)
} else {
name.clone()
};
let mut matched_vals: Vec<Option<usize>> = out_left_rows.iter()
.map(|&r| Some(r))
.collect();
// Extend with None for unmatched right rows
matched_vals.extend(std::iter::repeat(None).take(n_unmatched_r));
assert_eq!(matched_vals.len(), total);
columns.push((out_name, gather_column_nullable_null(col, &matched_vals)));
}
// Right projected columns (skip key cols to avoid duplication from left)
for &ci in right.proj.indices() {
let (name, col) = &right.base.columns[ci];
if right_key_names.contains(name.as_str()) { continue; }
let out_name = if left.proj.indices().iter().any(|&lci| left.base.columns[lci].0 == *name)
&& !left_key_names.contains(name.as_str())
{
format!("{}{}", name, suffix.right)
} else {
name.clone()
};
let mut row_opts: Vec<Option<usize>> = out_right_rows.clone();
// Extend with Some(r) for unmatched right rows
row_opts.extend(unmatched_right.iter().map(|&r| Some(r)));
assert_eq!(row_opts.len(), total);
columns.push((out_name, gather_column_nullable_null(col, &row_opts)));
}
// Key columns (from left where available, else from right)
// Emit left key columns first (already in left section above if not filtered)
// Actually key cols are already included via left.proj if they are projected.
// The key cols from right are skipped above. Done.
Ok(NullableFrame { columns })
}
// â"€â"€ GroupedTidyView: mutate_across + summarise_across â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl GroupedTidyView {
/// Apply across transformations then mutate each group in-place.
///
/// For each group, applies each `AcrossSpec` transform to the specified columns,
/// producing output named `{col}_{fn}` (or per-template).
/// Returns a materialized `TidyFrame` with all groups combined.
pub fn mutate_across(&self, specs: &[AcrossSpec]) -> Result<TidyFrame, TidyError> {
// Apply mutate_across to the ungrouped view (same result: group structure
// is not needed for row-wise transforms).
self.view.mutate_across(specs)
}
/// Summarise with across transforms: apply each transform to each column,
/// collecting one aggregate value per group.
///
/// The `specs` use `AcrossSpec` where the transform function must return a
/// single-element column (scalar reduction). If it returns more than one row,
/// `TidyError::LengthMismatch` is returned.
pub fn summarise_across(&self, specs: &[AcrossSpec]) -> Result<TidyFrame, TidyError> {
let n_groups = self.ngroups();
// Build output: key cols first, then across outputs
let key_names = &self.index.key_names;
let mut out_cols: Vec<(String, Column)> = Vec::new();
// Key columns (String typed â€" group key values)
for ki in 0..key_names.len() {
let col_vals: Vec<String> = self.index.groups.iter()
.map(|g| g.key_values[ki].clone())
.collect();
out_cols.push((key_names[ki].clone(), Column::Str(col_vals)));
}
// For each spec column × transform
for spec in specs {
for col_name in &spec.cols {
let out_name = spec.output_name(col_name);
// Check for duplicate output name
if out_cols.iter().any(|(n, _)| n == &out_name) {
return Err(TidyError::DuplicateColumn(out_name));
}
let base_col = self.view.base.get_column(col_name)
.ok_or_else(|| TidyError::ColumnNotFound(col_name.clone()))?;
// Apply transform per group, collecting scalar result
let mut agg_floats: Vec<f64> = Vec::with_capacity(n_groups);
for group in &self.index.groups {
let group_col = gather_column(base_col, &group.row_indices);
let result_col = (spec.transform.func)(col_name, &group_col)?;
if result_col.len() != 1 {
return Err(TidyError::LengthMismatch {
expected: 1,
got: result_col.len(),
});
}
let v = match &result_col {
Column::Float(v) => v[0],
Column::Int(v) => v[0] as f64,
_ => return Err(TidyError::TypeMismatch {
expected: "Float or Int".into(),
got: result_col.type_name().into(),
}),
};
agg_floats.push(v);
}
out_cols.push((out_name, Column::Float(agg_floats)));
}
}
let df = DataFrame::from_columns(out_cols)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
}
// â"€â"€ Simple IndexMap for mutate_across column merging â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
// We need ordered insertion with overwrite semantics. Rather than pulling in
// a dependency, implement a minimal ordered map over (String, Column).
mod indexmap_simple {
use super::{Column, DataFrame, DataError};
pub struct IndexMap {
entries: Vec<(String, Column)>,
}
impl IndexMap {
pub fn from_df(df: &DataFrame) -> Self {
Self {
entries: df.columns.iter()
.map(|(n, c)| (n.clone(), c.clone()))
.collect(),
}
}
/// Insert or overwrite a column by name.
pub fn insert(&mut self, name: String, col: Column) {
if let Some(pos) = self.entries.iter().position(|(n, _)| n == &name) {
self.entries[pos] = (name, col);
} else {
self.entries.push((name, col));
}
}
pub fn into_df(self) -> Result<DataFrame, DataError> {
DataFrame::from_columns(self.entries)
}
}
}
// â"€â"€ Group perf upgrade (deterministic hash accelerator) â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
//
// The Phase 11 GroupIndex::build() uses a linear scan (Vec<(key, slot)>)
// which is O(N × G). For large N with small G, we add a hash-accelerated
// variant that preserves first-occurrence ordering by using insertion order
// in a deterministic way.
//
// Strategy: use a BTreeMap<key_tuple, slot_index> for O(log G) lookup,
// which is fully deterministic (BTree is ordered by key, not hash seed).
// First-occurrence order is tracked via a separate Vec<slot> (unchanged).
// The BTreeMap is ONLY used for fast lookup; the output order is still
// driven by first-occurrence of groups as they appear in the scan.
//
// This is a pure internal change â€" the external API and output semantics
// are identical to Phase 11.
impl GroupIndex {
/// Build a GroupIndex using a BTree-accelerated lookup.
///
/// Semantics: identical to `GroupIndex::build()`. First-occurrence group
/// ordering is preserved. The only difference is O(N log G) vs O(N × G).
///
/// **Phase 2 (v3) cat-aware fast path**: when every key column is
/// `Column::Categorical`, the lookup BTreeMap keys are
/// `Vec<u32>` of category codes instead of `Vec<String>` of display
/// values. This eliminates `levels[code].clone()` per row per key
/// column. The fast path is bit-identical to the string path:
/// - Group slots are still assigned in first-occurrence row order.
/// - `GroupMeta::key_values` is still `Vec<String>` of display
/// values, computed once per group (not once per row).
/// - Mixed-type keys (e.g., categorical + int) fall back to the
/// string path automatically.
pub fn build_fast<I: IntoIterator<Item = usize>>(
base: &DataFrame,
key_col_indices: &[usize],
visible_rows: I,
key_names: Vec<String>,
) -> Self {
use std::collections::BTreeMap;
// Phase 2 cat-aware fast path: try categorical-only key encoding
// first. Returns Some(...) when every key col is categorical.
if let Some(cat_keys) = collect_categorical_keys(base, key_col_indices) {
return build_groupindex_categorical(cat_keys, visible_rows, key_names);
}
let mut groups: Vec<GroupMeta> = Vec::new();
let mut key_to_slot: BTreeMap<Vec<String>, usize> = BTreeMap::new();
for row in visible_rows {
let key: Vec<String> = key_col_indices.iter()
.map(|&ci| base.columns[ci].1.get_display(row))
.collect();
if let Some(&slot) = key_to_slot.get(&key) {
groups[slot].row_indices.push(row);
} else {
let slot = groups.len();
let key_values = key.clone();
key_to_slot.insert(key, slot);
groups.push(GroupMeta { key_values, row_indices: vec![row] });
}
}
GroupIndex { groups, key_names }
}
}
// Phase 2 cat-aware key encoding.
//
// When every key column is `Column::Categorical`, group_by + distinct
// build their lookup BTrees over `Vec<u32>` of codes instead of
// `Vec<String>` of display values. This eliminates `String::clone()` per
// row per key column on a hot path.
//
// Bit-identical to the string path:
// - First-occurrence group ordering is preserved (driven by row scan
// order, independent of key encoding).
// - `GroupMeta::key_values` / dedup output uses `levels[code]` lookup
// once per *group* (or once per *unique row*, in distinct), not per
// row. The display values are byte-for-byte identical.
//
// Mixed-type keys (e.g., one Categorical + one Int) cause
// `collect_categorical_keys` to return `None` so callers fall back.
/// Borrowed view onto the per-key categorical metadata.
pub(crate) struct CategoricalKeys<'a> {
/// `levels[i]` = level table for key column i.
pub(crate) levels: Vec<&'a [String]>,
/// `codes[i]` = code array for key column i.
pub(crate) codes: Vec<&'a [u32]>,
}
/// Returns `Some(CategoricalKeys)` when every column index in
/// `key_col_indices` is `Column::Categorical`. Returns `None` if any key
/// column is a non-categorical variant (caller must fall back).
pub(crate) fn collect_categorical_keys<'a>(
base: &'a DataFrame,
key_col_indices: &[usize],
) -> Option<CategoricalKeys<'a>> {
if key_col_indices.is_empty() {
return None;
}
let mut levels: Vec<&[String]> = Vec::with_capacity(key_col_indices.len());
let mut codes: Vec<&[u32]> = Vec::with_capacity(key_col_indices.len());
for &ci in key_col_indices {
match &base.columns[ci].1 {
Column::Categorical { levels: l, codes: c } => {
levels.push(l.as_slice());
codes.push(c.as_slice());
}
_ => return None,
}
}
Some(CategoricalKeys { levels, codes })
}
/// Cat-aware GroupIndex builder. Bit-identical output to the string-key
/// path: same slot assignment, same `key_values`, same `row_indices`.
fn build_groupindex_categorical<I: IntoIterator<Item = usize>>(
cat: CategoricalKeys<'_>,
visible_rows: I,
key_names: Vec<String>,
) -> GroupIndex {
use std::collections::BTreeMap;
let nkeys = cat.codes.len();
let mut groups: Vec<GroupMeta> = Vec::new();
let mut key_to_slot: BTreeMap<Vec<u32>, usize> = BTreeMap::new();
let mut key_buf: Vec<u32> = Vec::with_capacity(nkeys);
for row in visible_rows {
key_buf.clear();
for c in &cat.codes {
key_buf.push(c[row]);
}
if let Some(&slot) = key_to_slot.get(&key_buf) {
groups[slot].row_indices.push(row);
} else {
// Materialise display strings exactly once per group
let key_values: Vec<String> = (0..nkeys)
.map(|i| cat.levels[i][key_buf[i] as usize].clone())
.collect();
let slot = groups.len();
key_to_slot.insert(key_buf.clone(), slot);
groups.push(GroupMeta { key_values, row_indices: vec![row] });
}
}
GroupIndex { groups, key_names }
}
// â"€â"€ TidyView: group_by_fast (uses BTree-accelerated GroupIndex) â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl TidyView {
/// Like `group_by` but uses the BTree-accelerated `GroupIndex::build_fast`.
///
/// Semantics and output are IDENTICAL to `group_by`; this is purely an
/// internal performance upgrade. Tests should confirm identical output.
pub fn group_by_fast(&self, keys: &[&str]) -> Result<GroupedTidyView, TidyError> {
let mut key_col_indices = Vec::with_capacity(keys.len());
for &key in keys {
let idx = self.base.columns.iter().position(|(n, _)| n == key)
.ok_or_else(|| TidyError::ColumnNotFound(key.to_string()))?;
key_col_indices.push(idx);
}
let key_names: Vec<String> = keys.iter().map(|s| s.to_string()).collect();
let index = GroupIndex::build_fast(&self.base, &key_col_indices, self.mask.iter_indices(), key_names);
Ok(GroupedTidyView { view: self.clone(), index })
}
}
// ── v3 Phase 6: Streaming aggregations ─────────────────────────────────────
//
// `summarise_streaming` is a sibling of `summarise` that skips the
// `GroupIndex` materialization step entirely. The legacy path builds
// `Vec<usize>` row indices per group (8 bytes × N rows ≈ 800 MB for
// 100M rows), then walks each per-group vector once per aggregation.
// The streaming path walks visible rows ONCE, maintaining a
// `BTreeMap<key, Vec<AccState>>` where each accumulator holds
// running-state O(constant) memory: Kahan sum carry, count, min, max,
// or Welford running mean+M2 for variance/stddev.
//
// Memory: O(K · acc_size) instead of O(N · usize). For 100M rows /
// 1000 groups / 32-byte accumulator, that's ~32 KB vs ~800 MB —
// roughly 25 000× less memory.
//
// Determinism: BTreeMap (not HashMap), Kahan / Welford (not naive
// floating-point sum). Output row order is the BTreeMap iteration
// order — byte-equal to the legacy path's first-occurrence ordering
// when keys are integers/strings/sorted-Categorical (see Phase 6
// integration test `phase6_streaming_summarise_matches_legacy`).
//
// Cat-aware: when every key column is `Column::Categorical`, the key
// tuple is `Vec<u32>` codes (bit-equal to Phase 2's group_by fast
// path). Falls back to `Vec<String>` displays on mixed-type keys.
//
// Aggregation surface: Count / Sum / Mean / Min / Max / Var / Sd.
// Median / Quantile / NDistinct require the full row index list and
// are not streaming — callers fall back to the legacy `summarise`.
/// Streamable aggregation operations. Subset of `TidyAgg` that admits
/// a constant-state per-group accumulator.
#[derive(Debug, Clone)]
pub enum StreamingAgg {
/// Row count per group.
Count,
/// Kahan-sum of a numeric column.
Sum(String),
/// Arithmetic mean of a numeric column.
Mean(String),
/// Minimum value (NaN-aware: NaN never wins).
Min(String),
/// Maximum value (NaN-aware: NaN never wins).
Max(String),
/// Sample variance via Welford's algorithm (numerically stable).
Var(String),
/// Sample standard deviation (sqrt of `Var`).
Sd(String),
}
/// Per-group running state. One per `(group, agg)` pair.
#[derive(Debug, Clone)]
enum AccState {
Count {
n: u64,
},
Sum {
// Kahan running sum.
sum: f64,
c: f64,
},
Mean {
// Kahan-stable accumulator + count.
sum: f64,
c: f64,
n: u64,
},
Min {
cur: f64,
any: bool,
},
Max {
cur: f64,
any: bool,
},
/// Welford running mean + sum of squared deviations from mean.
Welford {
n: u64,
mean: f64,
m2: f64,
},
}
impl AccState {
fn from_agg(agg: &StreamingAgg) -> Self {
match agg {
StreamingAgg::Count => AccState::Count { n: 0 },
StreamingAgg::Sum(_) => AccState::Sum { sum: 0.0, c: 0.0 },
StreamingAgg::Mean(_) => AccState::Mean { sum: 0.0, c: 0.0, n: 0 },
StreamingAgg::Min(_) => AccState::Min {
cur: f64::INFINITY,
any: false,
},
StreamingAgg::Max(_) => AccState::Max {
cur: f64::NEG_INFINITY,
any: false,
},
StreamingAgg::Var(_) | StreamingAgg::Sd(_) => AccState::Welford {
n: 0,
mean: 0.0,
m2: 0.0,
},
}
}
fn update(&mut self, x: f64) {
match self {
AccState::Count { n } => *n += 1,
AccState::Sum { sum, c } => {
// Kahan summation.
let y = x - *c;
let t = *sum + y;
*c = (t - *sum) - y;
*sum = t;
}
AccState::Mean { sum, c, n } => {
let y = x - *c;
let t = *sum + y;
*c = (t - *sum) - y;
*sum = t;
*n += 1;
}
AccState::Min { cur, any } => {
if !x.is_nan() {
if !*any || x < *cur {
*cur = x;
*any = true;
}
}
}
AccState::Max { cur, any } => {
if !x.is_nan() {
if !*any || x > *cur {
*cur = x;
*any = true;
}
}
}
AccState::Welford { n, mean, m2 } => {
// Welford's online variance.
*n += 1;
let delta = x - *mean;
*mean += delta / (*n as f64);
let delta2 = x - *mean;
*m2 += delta * delta2;
}
}
}
fn finalize(&self, agg: &StreamingAgg) -> f64 {
match (self, agg) {
(AccState::Count { n }, StreamingAgg::Count) => *n as f64,
(AccState::Sum { sum, .. }, StreamingAgg::Sum(_)) => *sum,
(AccState::Mean { sum, n, .. }, StreamingAgg::Mean(_)) => {
if *n == 0 {
f64::NAN
} else {
*sum / (*n as f64)
}
}
(AccState::Min { cur, any }, StreamingAgg::Min(_)) => {
if *any {
*cur
} else {
f64::NAN
}
}
(AccState::Max { cur, any }, StreamingAgg::Max(_)) => {
if *any {
*cur
} else {
f64::NAN
}
}
(AccState::Welford { n, m2, .. }, StreamingAgg::Var(_)) => {
if *n < 2 {
f64::NAN
} else {
*m2 / ((*n - 1) as f64)
}
}
(AccState::Welford { n, m2, .. }, StreamingAgg::Sd(_)) => {
if *n < 2 {
f64::NAN
} else {
(*m2 / ((*n - 1) as f64)).sqrt()
}
}
_ => f64::NAN,
}
}
}
/// Pull a row's value from a numeric column as f64 (NaN for non-numeric).
fn row_as_f64(col: &Column, row: usize) -> f64 {
match col {
Column::Float(v) => v[row],
Column::Int(v) => v[row] as f64,
_ => f64::NAN,
}
}
impl TidyView {
/// v3 Phase 6: streaming summarise. Single-pass aggregation that
/// avoids materializing per-group row index vectors.
///
/// Returns a `TidyFrame` with key columns followed by aggregate
/// columns (one per assignment). Output row order is BTreeMap
/// iteration over key tuples — byte-equal to the legacy path when
/// keys are integers / strings / sorted-Categorical.
pub fn summarise_streaming(
&self,
keys: &[&str],
aggs: &[(&str, StreamingAgg)],
) -> Result<TidyFrame, TidyError> {
// Validate: no duplicate output names; no key/agg name collision.
{
let mut seen = std::collections::BTreeSet::new();
for &(name, _) in aggs {
if !seen.insert(name) {
return Err(TidyError::DuplicateColumn(name.to_string()));
}
}
for &k in keys {
if seen.contains(k) {
return Err(TidyError::DuplicateColumn(k.to_string()));
}
}
}
// Resolve key column indices.
let mut key_col_indices = Vec::with_capacity(keys.len());
for &key in keys {
let idx = self
.base
.columns
.iter()
.position(|(n, _)| n == key)
.ok_or_else(|| TidyError::ColumnNotFound(key.to_string()))?;
key_col_indices.push(idx);
}
// Resolve aggregation source columns once.
let agg_col_indices: Vec<Option<usize>> = aggs
.iter()
.map(|(_, agg)| match agg {
StreamingAgg::Count => None,
StreamingAgg::Sum(c)
| StreamingAgg::Mean(c)
| StreamingAgg::Min(c)
| StreamingAgg::Max(c)
| StreamingAgg::Var(c)
| StreamingAgg::Sd(c) => self.base.columns.iter().position(|(n, _)| n == c),
})
.collect();
for (i, (_, agg)) in aggs.iter().enumerate() {
if matches!(agg, StreamingAgg::Count) {
continue;
}
if agg_col_indices[i].is_none() {
let col_name = match agg {
StreamingAgg::Sum(c)
| StreamingAgg::Mean(c)
| StreamingAgg::Min(c)
| StreamingAgg::Max(c)
| StreamingAgg::Var(c)
| StreamingAgg::Sd(c) => c.clone(),
_ => String::new(),
};
return Err(TidyError::ColumnNotFound(col_name));
}
}
// Cat-aware fast path: when every key is Categorical, key on Vec<u32>.
let cat = collect_categorical_keys(&self.base, &key_col_indices);
// BTreeMap keyed on Vec<u32> (cat) or Vec<String> (legacy).
// Both produce deterministic iteration.
use std::collections::BTreeMap;
let n_aggs = aggs.len();
let init_accs = || -> Vec<AccState> {
aggs.iter().map(|(_, a)| AccState::from_state(a)).collect()
};
// Helper inside summarise_streaming to keep AccState constructor private.
// (alias to make readable above)
fn _unused() {}
// Streaming pass.
let (cat_state, str_state) = if let Some(cat) = cat.as_ref() {
let mut state: BTreeMap<Vec<u32>, Vec<AccState>> = BTreeMap::new();
let mut key_buf: Vec<u32> = Vec::with_capacity(cat.codes.len());
for row in self.mask.iter_indices() {
key_buf.clear();
for c in &cat.codes {
key_buf.push(c[row]);
}
let entry = state
.entry(key_buf.clone())
.or_insert_with(&init_accs);
for (i, (_, agg)) in aggs.iter().enumerate() {
if let Some(col_idx) = agg_col_indices[i] {
entry[i].update(row_as_f64(&self.base.columns[col_idx].1, row));
} else {
entry[i].update(0.0); // Count ignores value.
}
}
}
(Some(state), None)
} else {
let mut state: BTreeMap<Vec<String>, Vec<AccState>> = BTreeMap::new();
for row in self.mask.iter_indices() {
let key: Vec<String> = key_col_indices
.iter()
.map(|&ci| self.base.columns[ci].1.get_display(row))
.collect();
let entry = state.entry(key).or_insert_with(&init_accs);
for (i, (_, agg)) in aggs.iter().enumerate() {
if let Some(col_idx) = agg_col_indices[i] {
entry[i].update(row_as_f64(&self.base.columns[col_idx].1, row));
} else {
entry[i].update(0.0);
}
}
}
(None, Some(state))
};
// Materialize output frame.
let n_groups = cat_state
.as_ref()
.map(|s| s.len())
.unwrap_or_else(|| str_state.as_ref().unwrap().len());
let mut result_columns: Vec<(String, Column)> = Vec::with_capacity(keys.len() + n_aggs);
// Key columns. For cat-aware: rebuild via levels[code]. For str: take strings as-is.
if let Some(state) = &cat_state {
let cat = cat.as_ref().unwrap();
for (ki, &key_col_idx) in key_col_indices.iter().enumerate() {
let mut vals: Vec<String> = Vec::with_capacity(n_groups);
for key_codes in state.keys() {
let code = key_codes[ki] as usize;
vals.push(cat.levels[ki][code].clone());
}
let key_name = self.base.columns[key_col_idx].0.clone();
// Materialize as Categorical to preserve type.
let levels: Vec<String> = cat.levels[ki].to_vec();
let codes: Vec<u32> = state.keys().map(|k| k[ki]).collect();
result_columns.push((key_name, Column::Categorical { levels, codes }));
let _ = vals;
}
} else {
let state = str_state.as_ref().unwrap();
for (ki, &key_col_idx) in key_col_indices.iter().enumerate() {
let key_name = self.base.columns[key_col_idx].0.clone();
let vals: Vec<String> = state.keys().map(|k| k[ki].clone()).collect();
result_columns.push((key_name, Column::Str(vals)));
}
}
// Aggregate columns.
for (i, (out_name, agg)) in aggs.iter().enumerate() {
let vals: Vec<f64> = if let Some(state) = &cat_state {
state.values().map(|accs| accs[i].finalize(agg)).collect()
} else {
str_state
.as_ref()
.unwrap()
.values()
.map(|accs| accs[i].finalize(agg))
.collect()
};
let col = if matches!(agg, StreamingAgg::Count) {
Column::Int(vals.into_iter().map(|x| x as i64).collect())
} else {
Column::Float(vals)
};
result_columns.push((out_name.to_string(), col));
}
let df = DataFrame::from_columns(result_columns)
.map_err(|e| TidyError::Internal(e.to_string()))?;
Ok(TidyFrame::from_df(df))
}
}
impl AccState {
fn from_state(agg: &StreamingAgg) -> Self {
Self::from_agg(agg)
}
}
// â"€â"€ NoGC safe-builtin registrations (Phase 13â€"16) â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
//
// New safe (view/metadata-only, no column buffer alloc):
// tidy_rename : builds new DataFrame with renamed cols â€" O(N×K) clone
// BUT this is a metadata rebuild, not a hot-path â€" listed as ALLOC
// → NOT @nogc safe (rebuilds base)
// tidy_relocate : updates ProjectionMap only â€" O(K) â€" SAFE
// tidy_drop_cols : updates ProjectionMap only â€" O(K) â€" SAFE
// tidy_group_by_fast : BTree lookup + GroupIndex â€" no column alloc â€" SAFE
//
// New materialising (NOT @nogc safe):
// tidy_pivot_longer : allocates new column buffers
// tidy_pivot_wider : allocates NullableFrame
// tidy_bind_rows : allocates concatenated columns
// tidy_bind_cols : allocates combined columns
// tidy_mutate_across : materialises and transforms columns
// tidy_right_join : allocates NullableFrame
// tidy_full_join : allocates NullableFrame
// tidy_inner_join_typed: allocates TidyFrame
// tidy_left_join_typed : allocates TidyFrame
// tidy_summarise_across: allocates aggregate frame
// tidy_rename : rebuilds base DataFrame (included above)
//
// Registered in cjc-mir/src/nogc_verify.rs:
// tidy_relocate, tidy_drop_cols, tidy_group_by_fast
// ============================================================================
// PHASE 17: CATEGORICAL FOUNDATIONS â€" fct_encode, fct_lump, fct_reorder,
// fct_collapse, NullableFactor
// ============================================================================
//
// Design decisions (spec-lock):
//
// [S-1] Index type: u16 (max 65,535 distinct levels). TidyError::CapacityExceeded
// on overflow. A future u32 upgrade is a flag-only change.
// [S-2] Level ordering: first-occurrence order of the STRING values in the
// visible rows at encoding time. No hash involved → deterministic.
// [S-3] Null handling: null cells use a SEPARATE validity bitmap (NullableFactor).
// Null is NOT a level. Null index slot is 0 but masked out by bitmap.
// [S-4] fct_lump tie-breaking: equal-frequency levels keep first-occurrence order;
// the "Other" bucket is appended LAST in the levels Vec.
// [S-5] fct_lump "Other" collision: if a level named "Other" already exists it is
// renamed "Other_" (iterated until unique).
// [S-6] fct_reorder: stable sort (preserve first-occurrence within ties); NaN
// values in the numeric summary column sort LAST (same rule as arrange).
// [S-7] fct_collapse: metadata-only â€" never rewrites data buffer; O(L) pass over
// levels Vec only. Re-indexes in O(N) only when compacting dead indices.
// [S-8] fct_collapse duplicate level output: if two OLD levels collapse to the
// SAME NEW name that new name appears once; index remapping is canonical.
// [S-9] fct_encode is a materialising op (allocates new u16 buffer) → NOT @nogc.
// [S-10] fct_lump is materialising (reallocates levels + rebuilds counts) → NOT @nogc.
// [S-11] fct_reorder is materialising (reorders levels Vec + rebuilds index map) → NOT @nogc.
// [S-12] fct_collapse is metadata-only → SAFE in @nogc (registered as safe builtin).
// [S-13] Encoding stability: fct_encode on an already-encoded FctColumn is a no-op
// (returns clone of self) so that double-encoding is idempotent.
// [S-14] NullableFactor: FctColumn + BitMask validity. Null rows carry index=0 in
// data buffer but bitmap bit is clear; any op that reads the data MUST check
// the bitmap first.
// â"€â"€ FctColumn â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A compact categorical column: stores u16 indices into a levels table.
///
/// Invariant: `data[i] < levels.len()` for all i where bitmap is set.
/// Null rows (in NullableFactor) may carry index 0 â€" callers must check bitmap.
#[derive(Clone, Debug)]
pub struct FctColumn {
/// Mapping from index → level string. Order = first-occurrence of each string
/// in the source column (deterministic, no hashing).
pub levels: Vec<String>,
/// One u16 per row. Value is the index into `levels`.
pub data: Vec<u16>,
}
impl FctColumn {
// â"€â"€ Constructors â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Encode a string column into a FctColumn.
///
/// Level order = first-occurrence in `strings`.
/// Returns Err if more than 65,535 distinct strings are found.
pub fn encode(strings: &[String]) -> Result<Self, TidyError> {
use std::collections::BTreeMap;
let mut levels: Vec<String> = Vec::new();
// BTreeMap for O(log L) lookup; key ordering is string-lexicographic
// (deterministic across runs â€" no hash randomness).
// First-occurrence ORDER is maintained separately by `levels` Vec.
let mut level_map: BTreeMap<String, u16> = BTreeMap::new();
let mut data: Vec<u16> = Vec::with_capacity(strings.len());
for s in strings {
let idx = if let Some(&existing) = level_map.get(s.as_str()) {
existing
} else {
let next = levels.len();
if next >= 65_535 {
return Err(TidyError::CapacityExceeded {
limit: 65_535,
got: next + 1,
});
}
let idx = next as u16;
levels.push(s.clone());
level_map.insert(s.clone(), idx);
idx
};
data.push(idx);
}
Ok(FctColumn { levels, data })
}
/// Encode a `Column::Str` from a `TidyView` column (respects mask & projection).
pub fn encode_from_view(view: &TidyView, col: &str) -> Result<Self, TidyError> {
let base_idx = view.base.columns.iter()
.position(|(n, _)| n == col)
.ok_or_else(|| TidyError::ColumnNotFound(col.to_string()))?;
// Check it is in the current projection
if !view.proj.indices().contains(&base_idx) {
return Err(TidyError::ColumnNotFound(col.to_string()));
}
let col_data = &view.base.columns[base_idx].1;
let visible: Vec<usize> = view.mask.iter_indices().collect();
let strings: Vec<String> = visible.iter()
.map(|&r| col_data.get_display(r))
.collect();
Self::encode(&strings)
}
// â"€â"€ Shape â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Returns the number of rows in this factor column.
pub fn nrows(&self) -> usize { self.data.len() }
/// Returns the number of distinct levels.
pub fn nlevels(&self) -> usize { self.levels.len() }
/// Decode row i back to its string value.
pub fn decode(&self, i: usize) -> &str {
&self.levels[self.data[i] as usize]
}
// â"€â"€ fct_lump â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Lump all but the top-`n` most frequent levels into "Other".
///
/// Tie-breaking: equal-frequency levels keep first-occurrence order in
/// the top-n selection.
///
/// Edge cases:
/// - n = 0 → all levels become "Other" (one level total)
/// - n ≥ nlevels → no lumping, returns self.clone()
/// - "Other" already present → renamed to "Other_" (iterate until unique)
pub fn fct_lump(&self, n: usize) -> Result<Self, TidyError> {
if n >= self.levels.len() {
return Ok(self.clone()); // nothing to lump
}
// Count frequencies per level (O(N))
let mut freq = vec![0usize; self.levels.len()];
for &idx in &self.data {
freq[idx as usize] += 1;
}
// Build ranked list of (level_idx, freq, first_occurrence_order).
// Sort descending by freq; ties keep ascending level_idx (= first-occurrence).
let mut ranked: Vec<(usize, usize)> = freq.iter().copied().enumerate().collect();
ranked.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
// Top-n level indices (preserve first-occurrence ordering in output)
let mut keep_set: Vec<usize> = ranked[..n].iter().map(|(i, _)| *i).collect();
keep_set.sort_unstable(); // restore first-occurrence ordering
// Determine the "Other" bucket name (avoid collision)
let mut other_name = "Other".to_string();
while keep_set.iter().any(|&ki| self.levels[ki] == other_name) {
other_name.push('_');
}
// Build new levels: keep-set levels in first-occurrence order, then "Other" last
let mut new_levels: Vec<String> = keep_set.iter().map(|&ki| self.levels[ki].clone()).collect();
let other_idx = new_levels.len() as u16;
new_levels.push(other_name);
// Build old→new index map
let mut remap = vec![other_idx; self.levels.len()];
for (new_i, &old_i) in keep_set.iter().enumerate() {
remap[old_i] = new_i as u16;
}
let new_data: Vec<u16> = self.data.iter().map(|&d| remap[d as usize]).collect();
Ok(FctColumn { levels: new_levels, data: new_data })
}
// â"€â"€ fct_reorder â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Reorder levels by a numeric summary column from the same frame.
///
/// `summary_vals[i]` is the numeric value for level i.
/// Ascending = smallest summary value first.
/// NaN sorts LAST (same rule as arrange).
/// Tie-breaking: stable sort (original level order preserved within ties).
pub fn fct_reorder(&self, summary_vals: &[f64], descending: bool) -> Result<Self, TidyError> {
if summary_vals.len() != self.levels.len() {
return Err(TidyError::LengthMismatch {
expected: self.levels.len(),
got: summary_vals.len(),
});
}
// Build (level_idx, summary_val) and sort.
// NaN always sorts LAST regardless of direction (same rule as arrange).
// Direction only affects the finite-value comparison.
let mut order: Vec<usize> = (0..self.levels.len()).collect();
order.sort_by(|&a, &b| {
let va = summary_vals[a];
let vb = summary_vals[b];
match (va.is_nan(), vb.is_nan()) {
(true, true) => std::cmp::Ordering::Equal,
(true, false) => std::cmp::Ordering::Greater, // NaN always last
(false, true) => std::cmp::Ordering::Less, // NaN always last
(false, false) => {
let cmp = va.partial_cmp(&vb).unwrap_or(std::cmp::Ordering::Equal);
if descending { cmp.reverse() } else { cmp }
}
}
});
// Build new levels in the new order
let new_levels: Vec<String> = order.iter().map(|&i| self.levels[i].clone()).collect();
// Build old→new index map
let mut remap = vec![0u16; self.levels.len()];
for (new_i, &old_i) in order.iter().enumerate() {
remap[old_i] = new_i as u16;
}
let new_data: Vec<u16> = self.data.iter().map(|&d| remap[d as usize]).collect();
Ok(FctColumn { levels: new_levels, data: new_data })
}
/// Convenience: compute per-level mean of a numeric column, then reorder.
///
/// `numeric_col` must be Column::Float or Column::Int and same length as self.
/// NaN values in the numeric column are excluded from the mean; if all rows
/// for a level are NaN the level gets summary NaN (sorts last).
pub fn fct_reorder_by_col(&self, numeric_col: &Column, descending: bool) -> Result<Self, TidyError> {
if numeric_col.len() != self.data.len() {
return Err(TidyError::LengthMismatch {
expected: self.data.len(),
got: numeric_col.len(),
});
}
let mut sums = vec![0.0f64; self.levels.len()];
let mut counts = vec![0usize; self.levels.len()];
match numeric_col {
Column::Float(v) => {
for (i, &d) in self.data.iter().enumerate() {
let val = v[i];
if !val.is_nan() {
sums[d as usize] += val;
counts[d as usize] += 1;
}
}
}
Column::Int(v) => {
for (i, &d) in self.data.iter().enumerate() {
sums[d as usize] += v[i] as f64;
counts[d as usize] += 1;
}
}
_ => return Err(TidyError::TypeMismatch {
expected: "Float or Int".to_string(),
got: numeric_col.type_name().to_string(),
}),
}
let means: Vec<f64> = sums.iter().zip(counts.iter())
.map(|(&s, &c)| if c == 0 { f64::NAN } else { s / c as f64 })
.collect();
self.fct_reorder(&means, descending)
}
// â"€â"€ fct_collapse â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Collapse multiple old levels into single new level names.
///
/// `mapping`: slice of `(old_level_name, new_level_name)`.
/// - Levels not in mapping keep their original name.
/// - Multiple old levels can map to the same new name → merged into one index.
/// - Output level order: first-occurrence of each NEW name, following the
/// original first-occurrence order of OLD levels.
/// - Data buffer is rebuilt (O(N) remap) only when indices actually change.
/// The levels Vec is rebuilt O(L) regardless.
/// - Empty mapping → returns self.clone().
///
/// Capacity: if collapsing reduces level count the result always fits in u16.
/// The collapsed result can never exceed the original level count, so
/// CapacityExceeded cannot occur from fct_collapse.
pub fn fct_collapse(&self, mapping: &[(&str, &str)]) -> Result<Self, TidyError> {
if mapping.is_empty() {
return Ok(self.clone());
}
// Build: for each old level string, what is the new name?
let new_name_for: Vec<String> = self.levels.iter().map(|old| {
if let Some((_, new)) = mapping.iter().find(|(o, _)| *o == old.as_str()) {
new.to_string()
} else {
old.clone()
}
}).collect();
// Build new levels Vec (first-occurrence of new names, following old order).
// BTreeMap for O(log L) lookup; first-occurrence ORDER is in new_levels Vec.
use std::collections::BTreeMap;
let mut new_levels: Vec<String> = Vec::new();
let mut new_name_to_idx: BTreeMap<String, u16> = BTreeMap::new();
let mut old_to_new: Vec<u16> = Vec::with_capacity(self.levels.len());
for name in &new_name_for {
let idx = if let Some(&existing) = new_name_to_idx.get(name.as_str()) {
existing
} else {
let idx = new_levels.len() as u16;
new_levels.push(name.clone());
new_name_to_idx.insert(name.clone(), idx);
idx
};
old_to_new.push(idx);
}
// Check if any index actually changed (avoid rewriting data if noop)
let changed = old_to_new.iter().enumerate().any(|(i, &new)| new != i as u16);
let new_data = if changed {
self.data.iter().map(|&d| old_to_new[d as usize]).collect()
} else {
self.data.clone()
};
Ok(FctColumn { levels: new_levels, data: new_data })
}
// â"€â"€ Materialise back to Column::Str â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// Decode all rows back into a `Column::Str`.
pub fn to_str_column(&self) -> Column {
Column::Str(self.data.iter().map(|&d| self.levels[d as usize].clone()).collect())
}
/// Gather rows by index (supports view semantics without full materialise).
pub fn gather(&self, indices: &[usize]) -> FctColumn {
FctColumn {
levels: self.levels.clone(),
data: indices.iter().map(|&i| self.data[i]).collect(),
}
}
}
// â"€â"€ TidyError extensions for Phase 17 â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl TidyError {
/// Convenience constructor for a capacity-exceeded error.
pub fn capacity_exceeded(limit: usize, got: usize) -> Self {
TidyError::CapacityExceeded { limit, got }
}
}
// â"€â"€ NullableFactor â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
/// A FctColumn with a validity bitmap. Null cells have validity=false.
/// Null is NOT a level; `data[i]` for a null row is 0 (sentinel, must not be used).
#[derive(Clone, Debug)]
pub struct NullableFactor {
pub fct: FctColumn,
pub validity: BitMask,
}
impl NullableFactor {
/// Construct from a FctColumn (all rows valid).
pub fn from_fct(fct: FctColumn) -> Self {
let n = fct.nrows();
NullableFactor { fct, validity: BitMask::all_true(n) }
}
/// Construct from a FctColumn + validity bitmap.
pub fn new(fct: FctColumn, validity: BitMask) -> Self {
NullableFactor { fct, validity }
}
/// Encode a string slice with optional null markers.
///
/// `strings[i] = None` → null row.
pub fn encode_nullable(strings: &[Option<String>]) -> Result<Self, TidyError> {
use std::collections::BTreeMap;
let mut levels: Vec<String> = Vec::new();
let mut level_map: BTreeMap<String, u16> = BTreeMap::new();
let mut data: Vec<u16> = Vec::with_capacity(strings.len());
let mut valid_flags: Vec<bool> = Vec::with_capacity(strings.len());
for opt in strings {
match opt {
None => {
data.push(0); // sentinel (ignored due to validity bit)
valid_flags.push(false);
}
Some(s) => {
let idx = if let Some(&existing) = level_map.get(s.as_str()) {
existing
} else {
let next = levels.len();
if next >= 65_535 {
return Err(TidyError::CapacityExceeded { limit: 65_535, got: next + 1 });
}
let idx = next as u16;
levels.push(s.clone());
level_map.insert(s.clone(), idx);
idx
};
data.push(idx);
valid_flags.push(true);
}
}
}
let fct = FctColumn { levels, data };
let validity = BitMask::from_bools(&valid_flags);
Ok(NullableFactor { fct, validity })
}
/// Returns the total number of rows (including nulls).
pub fn nrows(&self) -> usize { self.fct.nrows() }
/// Returns the number of distinct factor levels.
pub fn nlevels(&self) -> usize { self.fct.nlevels() }
/// Returns `true` if row `i` is null.
pub fn is_null(&self, i: usize) -> bool { !self.validity.get(i) }
/// Returns the count of non-null rows.
pub fn count_valid(&self) -> usize { self.validity.count_ones() }
/// Decode row i, or None if null.
pub fn decode(&self, i: usize) -> Option<&str> {
if self.is_null(i) { None } else { Some(self.fct.decode(i)) }
}
/// fct_lump on non-null rows only. Null rows remain null.
pub fn fct_lump(&self, n: usize) -> Result<Self, TidyError> {
let lumped = self.fct.fct_lump(n)?;
Ok(NullableFactor { fct: lumped, validity: self.validity.clone() })
}
/// fct_reorder on non-null rows. Null rows remain null.
pub fn fct_reorder(&self, summary_vals: &[f64], descending: bool) -> Result<Self, TidyError> {
let reordered = self.fct.fct_reorder(summary_vals, descending)?;
Ok(NullableFactor { fct: reordered, validity: self.validity.clone() })
}
/// fct_collapse (metadata only, @nogc safe).
pub fn fct_collapse(&self, mapping: &[(&str, &str)]) -> Result<Self, TidyError> {
let collapsed = self.fct.fct_collapse(mapping)?;
Ok(NullableFactor { fct: collapsed, validity: self.validity.clone() })
}
}
// â"€â"€ TidyView: fct_encode integration â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
impl TidyView {
/// Encode a string column in this view into an `FctColumn`.
///
/// Only visible rows (mask) in the current projection are used.
/// This is a materialising op (allocates u16 buffer) → NOT @nogc safe.
pub fn fct_encode(&self, col: &str) -> Result<FctColumn, TidyError> {
FctColumn::encode_from_view(self, col)
}
/// Compute per-level mean of a numeric column for use with fct_reorder.
///
/// Returns a Vec<f64> of length `fct.nlevels()`, one mean per level.
/// Levels with no matching rows get NaN.
pub fn fct_summary_means(
&self,
fct: &FctColumn,
numeric_col: &str,
) -> Result<Vec<f64>, TidyError> {
let base_idx = self.base.columns.iter()
.position(|(n, _)| n == numeric_col)
.ok_or_else(|| TidyError::ColumnNotFound(numeric_col.to_string()))?;
let nc = &self.base.columns[base_idx].1;
if nc.len() != fct.nrows() {
return Err(TidyError::LengthMismatch { expected: fct.nrows(), got: nc.len() });
}
// Type-check: only Float or Int supported
match nc {
Column::Float(_) | Column::Int(_) => {}
_ => return Err(TidyError::TypeMismatch {
expected: "Float or Int".to_string(),
got: nc.type_name().to_string(),
}),
}
let mut sums = vec![0.0f64; fct.levels.len()];
let mut counts = vec![0usize; fct.levels.len()];
match nc {
Column::Float(v) => {
for (i, &d) in fct.data.iter().enumerate() {
if !v[i].is_nan() {
sums[d as usize] += v[i];
counts[d as usize] += 1;
}
}
}
Column::Int(v) => {
for (i, &d) in fct.data.iter().enumerate() {
sums[d as usize] += v[i] as f64;
counts[d as usize] += 1;
}
}
_ => unreachable!(),
}
Ok(sums.iter().zip(counts.iter())
.map(|(&s, &c)| if c == 0 { f64::NAN } else { s / c as f64 })
.collect())
}
}
// ── Categorical Encoding Functions ──────────────────────────────────────────
/// Convert a string slice into a categorical encoding with sorted unique levels
/// and integer codes.
///
/// Uses `BTreeSet` for deterministic sorted level discovery.
pub fn label_encode(col: &[String]) -> (Vec<String>, Vec<u32>) {
let unique: BTreeSet<&str> = col.iter().map(|s| s.as_str()).collect();
let levels: Vec<String> = unique.into_iter().map(|s| s.to_string()).collect();
let lookup: BTreeMap<&str, u32> = levels
.iter()
.enumerate()
.map(|(i, s)| (s.as_str(), i as u32))
.collect();
let codes: Vec<u32> = col.iter().map(|s| lookup[s.as_str()]).collect();
(levels, codes)
}
/// Convert a string slice into a categorical encoding with a user-specified
/// level order.
///
/// Returns an error if any value in `col` is not found in `order`.
pub fn ordinal_encode(col: &[String], order: &[String]) -> Result<(Vec<String>, Vec<u32>), String> {
let lookup: BTreeMap<&str, u32> = order
.iter()
.enumerate()
.map(|(i, s)| (s.as_str(), i as u32))
.collect();
let mut codes = Vec::with_capacity(col.len());
for s in col {
match lookup.get(s.as_str()) {
Some(&idx) => codes.push(idx),
None => return Err(format!("value {:?} not found in specified order", s)),
}
}
Ok((order.to_vec(), codes))
}
/// One-hot encode a categorical column into multiple boolean columns.
///
/// Returns `(column_names, columns)` where each column is `Vec<bool>` and
/// each row has exactly one `true` across all columns.
pub fn one_hot_encode(levels: &[String], codes: &[u32]) -> (Vec<String>, Vec<Vec<bool>>) {
let n_levels = levels.len();
let n_rows = codes.len();
let mut columns: Vec<Vec<bool>> = vec![vec![false; n_rows]; n_levels];
for (row, &code) in codes.iter().enumerate() {
columns[code as usize][row] = true;
}
let names: Vec<String> = levels.to_vec();
(names, columns)
}
#[cfg(test)]
mod rolling_window_tests {
use super::*;
/// Helper: build a simple DataFrame with a single float column.
fn make_df(col_name: &str, vals: Vec<f64>) -> DataFrame {
DataFrame {
columns: vec![(col_name.to_string(), Column::Float(vals))],
}
}
#[test]
fn rolling_sum_basic() {
// [1,2,3,4,5] with window=3
// Expected: [1, 3, 6, 9, 12]
let df = make_df("x", vec![1.0, 2.0, 3.0, 4.0, 5.0]);
let expr = DExpr::RollingSum("x".into(), 3);
let col = eval_expr_column(&df, &expr, 5).unwrap();
match col {
Column::Float(v) => {
assert_eq!(v.len(), 5);
assert!((v[0] - 1.0).abs() < 1e-12);
assert!((v[1] - 3.0).abs() < 1e-12);
assert!((v[2] - 6.0).abs() < 1e-12);
assert!((v[3] - 9.0).abs() < 1e-12);
assert!((v[4] - 12.0).abs() < 1e-12);
}
_ => panic!("expected Float column"),
}
}
#[test]
fn rolling_mean_basic() {
// [1,2,3,4,5] with window=3
// Expected: [1/1, 3/2, 6/3, 9/3, 12/3] = [1, 1.5, 2, 3, 4]
let df = make_df("x", vec![1.0, 2.0, 3.0, 4.0, 5.0]);
let expr = DExpr::RollingMean("x".into(), 3);
let col = eval_expr_column(&df, &expr, 5).unwrap();
match col {
Column::Float(v) => {
assert_eq!(v.len(), 5);
assert!((v[0] - 1.0).abs() < 1e-12);
assert!((v[1] - 1.5).abs() < 1e-12);
assert!((v[2] - 2.0).abs() < 1e-12);
assert!((v[3] - 3.0).abs() < 1e-12);
assert!((v[4] - 4.0).abs() < 1e-12);
}
_ => panic!("expected Float column"),
}
}
#[test]
fn rolling_min_basic() {
// [5,3,4,1,2] with window=3
// Expected: [5, 3, 3, 1, 1]
let df = make_df("x", vec![5.0, 3.0, 4.0, 1.0, 2.0]);
let expr = DExpr::RollingMin("x".into(), 3);
let col = eval_expr_column(&df, &expr, 5).unwrap();
match col {
Column::Float(v) => {
assert_eq!(v.len(), 5);
assert!((v[0] - 5.0).abs() < 1e-12);
assert!((v[1] - 3.0).abs() < 1e-12);
assert!((v[2] - 3.0).abs() < 1e-12);
assert!((v[3] - 1.0).abs() < 1e-12);
assert!((v[4] - 1.0).abs() < 1e-12);
}
_ => panic!("expected Float column"),
}
}
#[test]
fn rolling_max_basic() {
// [1,5,3,2,4] with window=3
// Expected: [1, 5, 5, 5, 4]
let df = make_df("x", vec![1.0, 5.0, 3.0, 2.0, 4.0]);
let expr = DExpr::RollingMax("x".into(), 3);
let col = eval_expr_column(&df, &expr, 5).unwrap();
match col {
Column::Float(v) => {
assert_eq!(v.len(), 5);
assert!((v[0] - 1.0).abs() < 1e-12);
assert!((v[1] - 5.0).abs() < 1e-12);
assert!((v[2] - 5.0).abs() < 1e-12);
assert!((v[3] - 5.0).abs() < 1e-12);
assert!((v[4] - 4.0).abs() < 1e-12);
}
_ => panic!("expected Float column"),
}
}
#[test]
fn rolling_var_basic() {
// [2,4,6,8] with window=3
let df = make_df("x", vec![2.0, 4.0, 6.0, 8.0]);
let expr = DExpr::RollingVar("x".into(), 3);
let col = eval_expr_column(&df, &expr, 4).unwrap();
match col {
Column::Float(v) => {
assert_eq!(v.len(), 4);
// i=0: count=1, var=0
assert!((v[0] - 0.0).abs() < 1e-12);
// i=1: count=2, sample var of [2,4] = 2.0
assert!((v[1] - 2.0).abs() < 1e-10);
// i=2: count=3, sample var of [2,4,6] = 4.0
assert!((v[2] - 4.0).abs() < 1e-10);
// i=3: count=3, sample var of [4,6,8] = 4.0
assert!((v[3] - 4.0).abs() < 1e-10);
}
_ => panic!("expected Float column"),
}
}
#[test]
fn rolling_sd_basic() {
let df = make_df("x", vec![2.0, 4.0, 6.0, 8.0]);
let expr = DExpr::RollingSd("x".into(), 3);
let col = eval_expr_column(&df, &expr, 4).unwrap();
match col {
Column::Float(v) => {
assert_eq!(v.len(), 4);
assert!((v[0] - 0.0).abs() < 1e-12);
assert!((v[1] - 2.0_f64.sqrt()).abs() < 1e-10);
assert!((v[2] - 2.0).abs() < 1e-10);
assert!((v[3] - 2.0).abs() < 1e-10);
}
_ => panic!("expected Float column"),
}
}
#[test]
fn rolling_window_larger_than_data() {
let df = make_df("x", vec![1.0, 2.0, 3.0]);
let expr = DExpr::RollingSum("x".into(), 10);
let col = eval_expr_column(&df, &expr, 3).unwrap();
match col {
Column::Float(v) => {
assert_eq!(v.len(), 3);
assert!((v[0] - 1.0).abs() < 1e-12);
assert!((v[1] - 3.0).abs() < 1e-12);
assert!((v[2] - 6.0).abs() < 1e-12);
}
_ => panic!("expected Float column"),
}
}
#[test]
fn rolling_window_of_one() {
let df = make_df("x", vec![3.0, 1.0, 4.0, 1.0, 5.0]);
let expr_min = DExpr::RollingMin("x".into(), 1);
let expr_max = DExpr::RollingMax("x".into(), 1);
let col_min = eval_expr_column(&df, &expr_min, 5).unwrap();
let col_max = eval_expr_column(&df, &expr_max, 5).unwrap();
match (col_min, col_max) {
(Column::Float(mins), Column::Float(maxs)) => {
let expected = [3.0, 1.0, 4.0, 1.0, 5.0];
for i in 0..5 {
assert!((mins[i] - expected[i]).abs() < 1e-12, "min[{}]", i);
assert!((maxs[i] - expected[i]).abs() < 1e-12, "max[{}]", i);
}
}
_ => panic!("expected Float columns"),
}
}
#[test]
fn rolling_sum_with_nan() {
let df = make_df("x", vec![1.0, f64::NAN, 3.0, 4.0]);
let expr = DExpr::RollingSum("x".into(), 2);
let col = eval_expr_column(&df, &expr, 4).unwrap();
match col {
Column::Float(v) => {
assert_eq!(v.len(), 4);
assert!((v[0] - 1.0).abs() < 1e-12);
assert!(v[1].is_nan());
assert!(v[2].is_nan());
assert!(v[3].is_nan()); // NaN poisons Kahan accumulator
}
_ => panic!("expected Float column"),
}
}
#[test]
fn rolling_determinism() {
let df = make_df("x", vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]);
let expr = DExpr::RollingSum("x".into(), 4);
let mut runs: Vec<Vec<f64>> = Vec::new();
for _ in 0..3 {
let col = eval_expr_column(&df, &expr, 10).unwrap();
match col {
Column::Float(v) => runs.push(v),
_ => panic!("expected Float column"),
}
}
assert_eq!(runs[0], runs[1]);
assert_eq!(runs[1], runs[2]);
}
#[test]
fn rolling_display() {
let expr = DExpr::RollingSum("val".into(), 5);
assert_eq!(format!("{}", expr), "rolling_sum(\"val\", 5)");
let expr2 = DExpr::RollingMean("col".into(), 3);
assert_eq!(format!("{}", expr2), "rolling_mean(\"col\", 3)");
}
#[test]
fn rolling_collect_columns() {
let expr = DExpr::RollingSum("revenue".into(), 7);
let mut cols = Vec::new();
collect_expr_columns(&expr, &mut cols);
assert_eq!(cols, vec!["revenue".to_string()]);
}
#[test]
fn rolling_not_allowed_in_row_context() {
let df = make_df("x", vec![1.0, 2.0, 3.0]);
let expr = DExpr::RollingSum("x".into(), 2);
let result = eval_expr_row(&df, &expr, 0);
assert!(result.is_err());
}
// ── v3 Phase 4 unit tests ─────────────────────────────────────────
fn cat_col(levels: &[&str], codes: &[u32]) -> Column {
Column::Categorical {
levels: levels.iter().map(|s| s.to_string()).collect(),
codes: codes.to_vec(),
}
}
#[test]
fn phase4_collect_cat_keys_returns_some_when_all_categorical() {
let left = DataFrame::from_columns(vec![
("k".into(), cat_col(&["a", "b", "c"], &[0, 1, 2, 0])),
])
.unwrap();
let right = DataFrame::from_columns(vec![
("k".into(), cat_col(&["b", "a"], &[0, 1, 1])),
])
.unwrap();
let cat = collect_categorical_join_keys(&left, &[0], &right, &[0]).unwrap();
// right_to_left for the right's "k" col: right code 0 = "b" → left code 1; right code 1 = "a" → left code 0.
assert_eq!(cat.right_to_left[0], vec![Some(1u32), Some(0u32)]);
}
#[test]
fn phase4_collect_cat_keys_returns_none_on_mixed_types() {
let left = DataFrame::from_columns(vec![
("k".into(), cat_col(&["a"], &[0])),
("n".into(), Column::Int(vec![1])),
])
.unwrap();
let right = DataFrame::from_columns(vec![
("k".into(), cat_col(&["a"], &[0])),
("n".into(), Column::Int(vec![1])),
])
.unwrap();
// First col cat-aware OK, second col Int → fallback.
assert!(collect_categorical_join_keys(&left, &[0, 1], &right, &[0, 1]).is_none());
}
#[test]
fn phase4_collect_cat_keys_unknown_right_level_yields_none_in_remap() {
let left = DataFrame::from_columns(vec![
("k".into(), cat_col(&["a", "b"], &[0, 1])),
])
.unwrap();
let right = DataFrame::from_columns(vec![
("k".into(), cat_col(&["a", "z"], &[0, 1])),
])
.unwrap();
let cat = collect_categorical_join_keys(&left, &[0], &right, &[0]).unwrap();
// "a" exists on the left → Some(0). "z" does not → None.
assert_eq!(cat.right_to_left[0], vec![Some(0u32), None]);
}
#[test]
fn phase4_column_to_categorical_column_roundtrip() {
let original = cat_col(&["red", "green", "blue"], &[0, 1, 2, 1, 0]);
let cc = original.to_categorical_column().unwrap();
let restored = Column::from_categorical_column(&cc).unwrap();
match (&original, &restored) {
(
Column::Categorical { levels: l1, codes: c1 },
Column::Categorical { levels: l2, codes: c2 },
) => {
assert_eq!(l1, l2);
assert_eq!(c1, c2);
}
_ => panic!("expected Categorical"),
}
}
#[test]
fn phase4_column_to_categorical_column_none_for_non_categorical() {
assert!(Column::Int(vec![1, 2, 3]).to_categorical_column().is_none());
assert!(Column::Str(vec!["a".into()]).to_categorical_column().is_none());
assert!(Column::Float(vec![1.0]).to_categorical_column().is_none());
}
#[test]
fn phase4_column_from_categorical_column_rejects_nulls() {
// CategoricalColumn with a null cannot map to Column::Categorical
// (which has no null bitmap). Verifies the safety check.
use crate::byte_dict::CategoricalColumn;
let mut cc = CategoricalColumn::new();
cc.push(b"a").unwrap();
cc.push_null();
cc.push(b"b").unwrap();
assert!(Column::from_categorical_column(&cc).is_none());
}
#[test]
fn phase4_column_from_categorical_column_rejects_non_utf8() {
use crate::byte_dict::CategoricalColumn;
let mut cc = CategoricalColumn::new();
// 0xFF is invalid as a standalone UTF-8 byte.
cc.push(&[0xFFu8]).unwrap();
assert!(Column::from_categorical_column(&cc).is_none());
}
}
// â"€â"€ Phase 17 NoGC audit notes â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€â"€
//
// Safe (@nogc â€" metadata only, O(L) or O(N) over Rust-heap Vec only):
// fct_collapse : rewrites levels Vec + remap data Vec â€" both are Rust heap,
// no GC heap involved. SAFE.
//
// NOT safe (materialising, allocates new Rust heap buffers proportional to N or L):
// fct_encode : allocates Vec<u16> of length N + Vec<String> levels
// fct_lump : allocates new levels Vec + new data Vec
// fct_reorder : allocates new levels Vec + new data Vec
//
// Registered in cjc-mir/src/nogc_verify.rs:
// SAFE: fct_collapse
// UNSAFE: fct_encode, fct_lump, fct_reorder (intentionally absent)