use crate::error::Result;
use crate::primitives::{Matrix, Vector};
/// A table of named `f32` columns that all share the same number of rows.
///
/// Columns are stored as `(name, values)` pairs in insertion order. The
/// constructor and `add_column` reject empty names, duplicate names, and
/// columns whose length differs from the frame's row count.
#[derive(Debug, Clone)]
pub struct DataFrame {
/// Column name paired with its values, in insertion order.
columns: Vec<(String, Vector<f32>)>,
/// Row count shared by every column; taken from the first column's length in `new`.
n_rows: usize,
}
impl DataFrame {
pub fn new(columns: Vec<(String, Vector<f32>)>) -> Result<Self> {
if columns.is_empty() {
return Err("DataFrame must have at least one column".into());
}
let n_rows = columns[0].1.len();
for (name, col) in &columns {
if col.len() != n_rows {
return Err("All columns must have the same length".into());
}
if name.is_empty() {
return Err("Column names cannot be empty".into());
}
}
let mut names: Vec<&str> = columns.iter().map(|(n, _)| n.as_str()).collect();
names.sort_unstable();
for i in 1..names.len() {
if names[i] == names[i - 1] {
return Err("Duplicate column names not allowed".into());
}
}
Ok(Self { columns, n_rows })
}
#[must_use]
pub fn shape(&self) -> (usize, usize) {
(self.n_rows, self.columns.len())
}
#[must_use]
pub fn n_rows(&self) -> usize {
self.n_rows
}
#[must_use]
pub fn n_cols(&self) -> usize {
self.columns.len()
}
#[must_use]
pub fn column_names(&self) -> Vec<&str> {
self.columns.iter().map(|(n, _)| n.as_str()).collect()
}
pub fn column(&self, name: &str) -> Result<&Vector<f32>> {
self.columns
.iter()
.find(|(n, _)| n == name)
.map(|(_, v)| v)
.ok_or_else(|| "Column not found".into())
}
pub fn select(&self, names: &[&str]) -> Result<Self> {
if names.is_empty() {
return Err("Must select at least one column".into());
}
let mut selected = Vec::with_capacity(names.len());
for &name in names {
let col = self.column(name)?;
selected.push((name.to_string(), col.clone()));
}
Self::new(selected)
}
pub fn row(&self, idx: usize) -> Result<Vector<f32>> {
if idx >= self.n_rows {
return Err("Row index out of bounds".into());
}
let data: Vec<f32> = self.columns.iter().map(|(_, col)| col[idx]).collect();
Ok(Vector::from_vec(data))
}
#[must_use]
pub fn to_matrix(&self) -> Matrix<f32> {
let mut data = Vec::with_capacity(self.n_rows * self.columns.len());
for row_idx in 0..self.n_rows {
for (_, col) in &self.columns {
data.push(col[row_idx]);
}
}
Matrix::from_vec(self.n_rows, self.columns.len(), data)
.expect("Internal error: data size mismatch")
}
pub fn iter_columns(&self) -> impl Iterator<Item = (&str, &Vector<f32>)> {
self.columns.iter().map(|(n, v)| (n.as_str(), v))
}
pub fn add_column(&mut self, name: String, data: Vector<f32>) -> Result<()> {
if data.len() != self.n_rows {
return Err("Column length must match existing rows".into());
}
if self.columns.iter().any(|(n, _)| n == &name) {
return Err("Column name already exists".into());
}
if name.is_empty() {
return Err("Column name cannot be empty".into());
}
self.columns.push((name, data));
Ok(())
}
pub fn drop_column(&mut self, name: &str) -> Result<()> {
if self.columns.len() == 1 {
return Err("Cannot drop the last column".into());
}
let idx = self
.columns
.iter()
.position(|(n, _)| n == name)
.ok_or("Column not found")?;
self.columns.remove(idx);
Ok(())
}
#[must_use]
pub fn describe(&self) -> Vec<ColumnStats> {
self.columns
.iter()
.map(|(name, col)| {
let mean = col.mean();
let variance = col.variance();
let std = variance.sqrt();
let mut sorted: Vec<f32> = col.as_slice().to_vec();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let min = sorted.first().copied().unwrap_or(0.0);
let max = sorted.last().copied().unwrap_or(0.0);
let median = if sorted.is_empty() {
0.0
} else if sorted.len().is_multiple_of(2) {
f32::midpoint(sorted[sorted.len() / 2 - 1], sorted[sorted.len() / 2])
} else {
sorted[sorted.len() / 2]
};
ColumnStats {
name: name.clone(),
count: col.len(),
mean,
std,
min,
median,
max,
}
})
.collect()
}
}
/// Summary statistics for a single column, as produced by `DataFrame::describe`.
#[derive(Debug, Clone)]
pub struct ColumnStats {
/// Name of the column these statistics describe.
pub name: String,
/// Number of values in the column.
pub count: usize,
/// Value returned by the column's `mean()`.
pub mean: f32,
/// Square root of the value returned by the column's `variance()`.
pub std: f32,
/// First value of the sorted column; `0.0` when the column is empty.
pub min: f32,
/// Middle value of the sorted column (midpoint of the two middle values for
/// an even count); `0.0` when the column is empty.
pub median: f32,
/// Last value of the sorted column; `0.0` when the column is empty.
pub max: f32,
}
// Public submodules of this module; their contents live in separate files
// that are not visible from here.
pub mod pii;
pub mod evolve;
pub mod quality_filter;
// Unit tests are compiled only for test builds and are loaded from
// `data_tests.rs` rather than from the default `tests` module path.
#[cfg(test)]
#[path = "data_tests.rs"]
mod tests;