use crate::dataframe::DataFrame;
use crate::error::{Error, Result};
use crate::series::Series;
use std::collections::{HashMap, HashSet};
#[derive(Debug, Clone)]
pub struct StandardScaler {
pub means: Option<HashMap<String, f64>>,
pub stds: Option<HashMap<String, f64>>,
pub columns: Option<Vec<String>>,
}
impl StandardScaler {
pub fn new() -> Self {
StandardScaler {
means: None,
stds: None,
columns: None,
}
}
pub fn with_columns(mut self, columns: Vec<String>) -> Self {
self.columns = Some(columns);
self
}
pub fn fit(&mut self, df: &DataFrame) -> Result<()> {
let columns = match &self.columns {
Some(cols) => cols.clone(),
None => df.column_names().into_iter().collect(),
};
let mut means = HashMap::new();
let mut stds = HashMap::new();
for col_name in columns {
if !df.has_column(&col_name) {
return Err(Error::ColumnNotFound(col_name.to_string()));
}
let col = df.get_column::<f64>(&col_name)?;
if let Ok(numeric_data) = col.as_f64() {
if numeric_data.is_empty() {
continue;
}
let mean: f64 = numeric_data.iter().sum::<f64>() / numeric_data.len() as f64;
means.insert(col_name.to_string(), mean);
let variance: f64 = numeric_data
.iter()
.map(|&x| (x - mean).powi(2))
.sum::<f64>()
/ numeric_data.len() as f64;
let std_dev = variance.sqrt();
stds.insert(col_name.to_string(), std_dev);
}
}
self.means = Some(means);
self.stds = Some(stds);
Ok(())
}
pub fn transform(&self, df: &DataFrame) -> Result<DataFrame> {
if self.means.is_none() || self.stds.is_none() {
return Err(Error::InvalidValue("StandardScaler not fitted".into()));
}
let means = self
.means
.as_ref()
.ok_or_else(|| Error::InvalidOperation("Model not fitted. Call fit() first.".into()))?;
let stds = self
.stds
.as_ref()
.ok_or_else(|| Error::InvalidOperation("Model not fitted. Call fit() first.".into()))?;
let mut result = DataFrame::new();
for col_name in df.column_names() {
let col = df.get_column::<f64>(&col_name)?;
if means.contains_key(&col_name) && stds.contains_key(&col_name) {
let mean = means[&col_name];
let std_dev = stds[&col_name];
if let Ok(numeric_data) = col.as_f64() {
if std_dev > 1e-10 {
let scaled_data: Vec<f64> =
numeric_data.iter().map(|&x| (x - mean) / std_dev).collect();
result.add_column(
col_name.to_string(),
Series::new(scaled_data, Some(col_name.to_string()))?,
)?;
} else {
let scaled_data = vec![0.0; numeric_data.len()];
result.add_column(
col_name.to_string(),
Series::new(scaled_data, Some(col_name.to_string()))?,
)?;
}
} else {
let col_clone = col.clone();
result.add_column(col_name.to_string(), col_clone)?;
}
} else {
let col_clone = col.clone();
result.add_column(col_name.to_string(), col_clone)?;
}
}
Ok(result)
}
pub fn fit_transform(&mut self, df: &DataFrame) -> Result<DataFrame> {
self.fit(df)?;
self.transform(df)
}
}
#[derive(Debug, Clone)]
pub struct MinMaxScaler {
pub min_values: Option<HashMap<String, f64>>,
pub max_values: Option<HashMap<String, f64>>,
pub columns: Option<Vec<String>>,
pub feature_range: (f64, f64),
}
impl MinMaxScaler {
pub fn new() -> Self {
MinMaxScaler {
min_values: None,
max_values: None,
columns: None,
feature_range: (0.0, 1.0),
}
}
pub fn new_with_range(min: f64, max: f64) -> Self {
MinMaxScaler {
min_values: None,
max_values: None,
columns: None,
feature_range: (min, max),
}
}
pub fn with_range(mut self, min: f64, max: f64) -> Self {
self.feature_range = (min, max);
self
}
pub fn with_columns(mut self, columns: Vec<String>) -> Self {
self.columns = Some(columns);
self
}
pub fn fit(&mut self, df: &DataFrame) -> Result<()> {
let columns = match &self.columns {
Some(cols) => cols.clone(),
None => df.column_names().into_iter().collect(),
};
let mut min_values = HashMap::new();
let mut max_values = HashMap::new();
for col_name in columns {
if !df.has_column(&col_name) {
return Err(Error::ColumnNotFound(col_name.to_string()));
}
let col = df.get_column::<f64>(&col_name)?;
if let Ok(numeric_data) = col.as_f64() {
if numeric_data.is_empty() {
continue;
}
let min_val = numeric_data
.iter()
.min_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.copied()
.ok_or_else(|| {
Error::InvalidOperation("Cannot compute min of empty values".into())
})?;
let max_val = numeric_data
.iter()
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.copied()
.ok_or_else(|| {
Error::InvalidOperation("Cannot compute max of empty values".into())
})?;
min_values.insert(col_name.to_string(), min_val);
max_values.insert(col_name.to_string(), max_val);
}
}
self.min_values = Some(min_values);
self.max_values = Some(max_values);
Ok(())
}
pub fn transform(&self, df: &DataFrame) -> Result<DataFrame> {
if self.min_values.is_none() || self.max_values.is_none() {
return Err(Error::InvalidValue("MinMaxScaler not fitted".into()));
}
let min_values = self
.min_values
.as_ref()
.ok_or_else(|| Error::InvalidOperation("Model not fitted. Call fit() first.".into()))?;
let max_values = self
.max_values
.as_ref()
.ok_or_else(|| Error::InvalidOperation("Model not fitted. Call fit() first.".into()))?;
let (feature_min, feature_max) = self.feature_range;
let mut result = DataFrame::new();
for col_name in df.column_names() {
let col = df.get_column::<f64>(&col_name)?;
if min_values.contains_key(&col_name) && max_values.contains_key(&col_name) {
let min_val = min_values[&col_name];
let max_val = max_values[&col_name];
if let Ok(numeric_data) = col.as_f64() {
if (max_val - min_val).abs() > 1e-10 {
let scaled_data: Vec<f64> = numeric_data
.iter()
.map(|&x| {
let scaled = (x - min_val) / (max_val - min_val);
scaled * (feature_max - feature_min) + feature_min
})
.collect();
result.add_column(
col_name.to_string(),
Series::new(scaled_data, Some(col_name.to_string()))?,
)?;
} else {
let scaled_data = vec![feature_min; numeric_data.len()];
result.add_column(
col_name.to_string(),
Series::new(scaled_data, Some(col_name.to_string()))?,
)?;
}
} else {
let col_clone = col.clone();
result.add_column(col_name.to_string(), col_clone)?;
}
} else {
let col_clone = col.clone();
result.add_column(col_name.to_string(), col_clone)?;
}
}
Ok(result)
}
pub fn fit_transform(&mut self, df: &DataFrame) -> Result<DataFrame> {
self.fit(df)?;
self.transform(df)
}
}
/// Configuration for a one-hot encoder.
///
/// This block defines only the settings and their builder methods.
/// NOTE(review): the fitting/encoding logic is presumably implemented
/// elsewhere — confirm before relying on the field semantics below.
#[derive(Debug, Clone)]
pub struct OneHotEncoder {
    /// Category values keyed by a string; `None` after `new`. Presumably the
    /// key is the column name — verify against the code that fills it.
    pub categories: Option<HashMap<String, Vec<String>>>,
    /// Optional column selection; `None` after `new`.
    pub columns: Option<Vec<String>>,
    /// Drop-first flag; `false` after `new`.
    pub drop_first: bool,
    /// Optional prefix; `None` after `new`.
    pub prefix: Option<String>,
}

impl OneHotEncoder {
    /// Builds an encoder with nothing learned, no column selection,
    /// `drop_first` disabled and no prefix.
    pub fn new() -> Self {
        Self {
            categories: None,
            columns: None,
            drop_first: false,
            prefix: None,
        }
    }

    /// Returns the encoder with its column selection set to `columns`.
    pub fn with_columns(self, columns: Vec<String>) -> Self {
        Self {
            columns: Some(columns),
            ..self
        }
    }

    /// Returns the encoder with the `drop_first` flag set as given.
    pub fn drop_first(self, drop_first: bool) -> Self {
        Self { drop_first, ..self }
    }

    /// Returns the encoder with its prefix set to `prefix`.
    pub fn with_prefix(self, prefix: String) -> Self {
        Self {
            prefix: Some(prefix),
            ..self
        }
    }
}
/// Configuration for polynomial feature generation.
///
/// This block defines only the settings and their builder methods.
/// NOTE(review): the feature expansion itself is presumably implemented
/// elsewhere — confirm before relying on the flag semantics below.
#[derive(Debug, Clone)]
pub struct PolynomialFeatures {
    /// Degree given to `new`.
    pub degree: usize,
    /// Bias flag; `true` after `new`.
    pub include_bias: bool,
    /// Interaction-only flag; `false` after `new`.
    pub interaction_only: bool,
    /// Optional column selection; `None` after `new`.
    pub columns: Option<Vec<String>>,
}

impl PolynomialFeatures {
    /// Builds a configuration of the given `degree` with `include_bias`
    /// enabled, `interaction_only` disabled and no column selection.
    pub fn new(degree: usize) -> Self {
        Self {
            degree,
            include_bias: true,
            interaction_only: false,
            columns: None,
        }
    }

    /// Returns the configuration with the `include_bias` flag set as given.
    pub fn include_bias(self, include_bias: bool) -> Self {
        Self {
            include_bias,
            ..self
        }
    }

    /// Returns the configuration with the `interaction_only` flag set as given.
    pub fn interaction_only(self, interaction_only: bool) -> Self {
        Self {
            interaction_only,
            ..self
        }
    }

    /// Returns the configuration with its column selection set to `columns`.
    pub fn with_columns(self, columns: Vec<String>) -> Self {
        Self {
            columns: Some(columns),
            ..self
        }
    }
}
/// Configuration for binning numeric columns.
///
/// This block defines only the settings and their builder methods.
/// NOTE(review): the code that computes `bin_edges` is presumably implemented
/// elsewhere — confirm which `strategy` strings it accepts.
#[derive(Debug, Clone)]
pub struct Binner {
    /// Number of bins given to `new`.
    pub n_bins: usize,
    /// Bin edges keyed by a string; `None` after `new`. Presumably the key is
    /// the column name — verify against the code that fills it.
    pub bin_edges: Option<HashMap<String, Vec<f64>>>,
    /// Strategy name; `"uniform"` after `new`.
    pub strategy: String,
    /// Optional column selection; `None` after `new`.
    pub columns: Option<Vec<String>>,
}

impl Binner {
    /// Builds a binner with `n_bins` bins, the `"uniform"` strategy, no edges
    /// and no column selection.
    pub fn new(n_bins: usize) -> Self {
        Self {
            n_bins,
            bin_edges: None,
            strategy: String::from("uniform"),
            columns: None,
        }
    }

    /// Returns the binner with its strategy name replaced by `strategy`.
    pub fn with_strategy(self, strategy: &str) -> Self {
        Self {
            strategy: strategy.to_owned(),
            ..self
        }
    }

    /// Returns the binner with its column selection set to `columns`.
    pub fn with_columns(self, columns: Vec<String>) -> Self {
        Self {
            columns: Some(columns),
            ..self
        }
    }
}
/// How an [`Imputer`] chooses its fill value.
///
/// NOTE(review): the variant names suggest the usual statistics; the code
/// that interprets them is not part of this block.
#[derive(Debug, Clone, PartialEq)]
pub enum ImputeStrategy {
    /// Default strategy selected by `Imputer::new`.
    Mean,
    /// Median-based strategy.
    Median,
    /// Most-frequent-value strategy.
    MostFrequent,
    /// Strategy carrying an explicit `f64` constant.
    Constant(f64),
}

/// Configuration for filling in missing values.
///
/// This block defines only the settings and their builder methods.
/// NOTE(review): the code that computes `fill_values` is presumably
/// implemented elsewhere.
#[derive(Debug, Clone)]
pub struct Imputer {
    /// Selected strategy; `ImputeStrategy::Mean` after `new`.
    pub strategy: ImputeStrategy,
    /// Fill values keyed by a string; `None` after `new`. Presumably the key
    /// is the column name — verify against the code that fills it.
    pub fill_values: Option<HashMap<String, f64>>,
    /// Optional column selection; `None` after `new`.
    pub columns: Option<Vec<String>>,
}

impl Imputer {
    /// Builds an imputer using `ImputeStrategy::Mean`, with no fill values
    /// and no column selection.
    pub fn new() -> Self {
        Self {
            strategy: ImputeStrategy::Mean,
            fill_values: None,
            columns: None,
        }
    }

    /// Returns the imputer with its strategy replaced by `strategy`.
    pub fn with_strategy(self, strategy: ImputeStrategy) -> Self {
        Self { strategy, ..self }
    }

    /// Returns the imputer with its column selection set to `columns`.
    pub fn with_columns(self, columns: Vec<String>) -> Self {
        Self {
            columns: Some(columns),
            ..self
        }
    }
}
#[derive(Debug, Clone)]
pub struct FeatureSelector {
pub columns: Vec<String>,
}
impl FeatureSelector {
pub fn new(columns: Vec<String>) -> Self {
FeatureSelector { columns }
}
pub fn transform(&self, df: &DataFrame) -> Result<DataFrame> {
let mut result = DataFrame::new();
for col_name in &self.columns {
if !df.has_column(col_name) {
return Err(Error::ColumnNotFound(col_name.to_string()));
}
let col = df.get_column::<f64>(col_name)?;
let new_series = col.clone();
result.add_column(col_name.to_string(), new_series)?;
}
Ok(result)
}
}