use crate::optimized::{OptimizedDataFrame, ColumnView};
use crate::column::{Float64Column, Int64Column, StringColumn, BooleanColumn};
use crate::column::ColumnTrait; use crate::{Column}; use crate::error::{Result, Error};
use crate::ml::pipeline::Transformer;
use std::collections::HashMap;
/// Scaler that centres numeric columns on a per-column mean and divides them
/// by a per-column scale factor, both recorded during `fit`.
#[derive(Debug)]
pub struct StandardScaler {
/// Mean recorded by `fit` for each scaled column, keyed by column name.
means: HashMap<String, f64>,
/// Divisor recorded by `fit` for each scaled column, keyed by column name;
/// `transform` falls back to 1.0 when the stored value is not positive.
stds: HashMap<String, f64>,
/// Columns to scale; when empty, `fit` considers every column of the frame.
columns: Vec<String>,
}
impl StandardScaler {
pub fn new(columns: Vec<String>) -> Self {
StandardScaler {
means: HashMap::new(),
stds: HashMap::new(),
columns,
}
}
pub fn new_all_numeric() -> Self {
StandardScaler {
means: HashMap::new(),
stds: HashMap::new(),
columns: vec![],
}
}
}
impl Transformer for StandardScaler {
    /// Learns the mean and population standard deviation of each target column.
    ///
    /// Target columns are `self.columns`, or every column of `df` when that
    /// list is empty. Columns that are missing from `df`, are neither float
    /// nor integer, or have no mean are skipped. Statistics from an earlier
    /// `fit` are discarded first so the scaler reflects only `df`.
    ///
    /// The standard deviation uses divisor `n` (population form) and is
    /// computed over the values that `get` reports as present.
    fn fit(&mut self, df: &OptimizedDataFrame) -> Result<()> {
        /// Population standard deviation of `values` around `mean`;
        /// `None` when `values` yields nothing.
        fn population_std(values: impl Iterator<Item = f64>, mean: f64) -> Option<f64> {
            let (count, sum_sq) = values.fold((0usize, 0.0f64), |(count, sum_sq), value| {
                let deviation = value - mean;
                (count + 1, sum_sq + deviation * deviation)
            });
            if count == 0 {
                None
            } else {
                Some((sum_sq / count as f64).sqrt())
            }
        }

        // Re-fitting must not leave statistics for columns of a previous frame.
        self.means.clear();
        self.stds.clear();

        let target_columns: &[String] = if self.columns.is_empty() {
            df.column_names()
        } else {
            &self.columns
        };

        for col_name in target_columns {
            let col_view = match df.column(col_name) {
                Ok(view) => view,
                // Requested columns that do not exist are ignored, not an error.
                Err(_) => continue,
            };

            let stats = if let Some(float_col) = col_view.as_float64() {
                float_col.mean().map(|mean| {
                    let present =
                        (0..float_col.len()).filter_map(|i| float_col.get(i).ok().flatten());
                    (mean, population_std(present, mean).unwrap_or(0.0))
                })
            } else if let Some(int_col) = col_view.as_int64() {
                int_col.mean().map(|mean| {
                    let present = (0..int_col.len())
                        .filter_map(|i| int_col.get(i).ok().flatten())
                        .map(|value| value as f64);
                    (mean, population_std(present, mean).unwrap_or(0.0))
                })
            } else {
                None
            };

            if let Some((mean, std)) = stats {
                self.means.insert(col_name.to_string(), mean);
                self.stds.insert(col_name.to_string(), std);
            }
        }
        Ok(())
    }

    /// Returns a new frame in which every fitted column is replaced by
    /// `(value - mean) / std`, stored as a float column.
    ///
    /// Columns are emitted in the order of `df.column_names()`. Columns that
    /// were not fitted, or that are no longer float/integer, are copied
    /// unchanged. A stored deviation that is not strictly positive is treated
    /// as 1.0 to avoid dividing by zero. Values that `get` does not report as
    /// present are written as 0.0.
    fn transform(&self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        let mut result = OptimizedDataFrame::new();

        for col_name in df.column_names() {
            let col_view = match df.column(col_name) {
                Ok(view) => view,
                Err(_) => continue,
            };

            let scaled: Option<Vec<f64>> = match self.means.get(col_name) {
                Some(&mean) => {
                    let std = match self.stds.get(col_name) {
                        Some(&std) if std > 0.0 => std,
                        _ => 1.0,
                    };
                    if let Some(float_col) = col_view.as_float64() {
                        Some(
                            (0..float_col.len())
                                .map(|i| match float_col.get(i) {
                                    Ok(Some(val)) => (val - mean) / std,
                                    _ => 0.0,
                                })
                                .collect(),
                        )
                    } else if let Some(int_col) = col_view.as_int64() {
                        Some(
                            (0..int_col.len())
                                .map(|i| match int_col.get(i) {
                                    Ok(Some(val)) => ((val as f64) - mean) / std,
                                    _ => 0.0,
                                })
                                .collect(),
                        )
                    } else {
                        // Fitted name, but the column is not numeric in this frame.
                        None
                    }
                }
                None => None,
            };

            match scaled {
                Some(values) => {
                    let transformed_col = Float64Column::new(values);
                    result.add_column(col_name.clone(), Column::Float64(transformed_col))?;
                }
                None => {
                    result.add_column(col_name.clone(), col_view.column().clone())?;
                }
            }
        }
        Ok(result)
    }

    /// Fits the scaler on `df` and then transforms that same frame.
    fn fit_transform(&mut self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        self.fit(df)?;
        self.transform(df)
    }
}
/// Scaler that linearly maps numeric columns from their fitted
/// `[min, max]` interval onto `feature_range`.
#[derive(Debug)]
pub struct MinMaxScaler {
/// Minimum recorded by `fit` for each scaled column, keyed by column name.
min_values: HashMap<String, f64>,
/// Maximum recorded by `fit` for each scaled column, keyed by column name.
max_values: HashMap<String, f64>,
/// Columns to scale; when empty, `fit` considers every column of the frame.
columns: Vec<String>,
/// Output interval as `(lower, upper)` used by `transform`.
feature_range: (f64, f64),
}
impl MinMaxScaler {
    /// Creates an unfitted scaler for `columns` that maps onto `feature_range`.
    ///
    /// Passing an empty list makes `fit` consider every column of the frame.
    pub fn new(columns: Vec<String>, feature_range: (f64, f64)) -> Self {
        Self {
            columns,
            feature_range,
            min_values: HashMap::new(),
            max_values: HashMap::new(),
        }
    }

    /// Creates an unfitted scaler with no explicit column list and the
    /// output interval `(0.0, 1.0)`.
    pub fn new_all_numeric() -> Self {
        Self::new(Vec::new(), (0.0, 1.0))
    }

    /// Builder-style setter that replaces the output interval with
    /// `(min_val, max_val)` and returns the scaler.
    pub fn with_feature_range(self, min_val: f64, max_val: f64) -> Self {
        Self {
            feature_range: (min_val, max_val),
            ..self
        }
    }
}
impl Transformer for MinMaxScaler {
    /// Records the minimum and maximum of each target column.
    ///
    /// Target columns are `self.columns`, or every column of `df` when that
    /// list is empty. Columns that are missing from `df`, are neither float
    /// nor integer, or report no minimum/maximum are skipped. Bounds from an
    /// earlier `fit` are discarded first so the scaler reflects only `df`.
    fn fit(&mut self, df: &OptimizedDataFrame) -> Result<()> {
        // Re-fitting must not leave bounds for columns of a previous frame.
        self.min_values.clear();
        self.max_values.clear();

        let target_columns: &[String] = if self.columns.is_empty() {
            df.column_names()
        } else {
            &self.columns
        };

        for col_name in target_columns {
            let col_view = match df.column(col_name) {
                Ok(view) => view,
                // Requested columns that do not exist are ignored, not an error.
                Err(_) => continue,
            };

            let bounds = if let Some(float_col) = col_view.as_float64() {
                match (float_col.min(), float_col.max()) {
                    (Some(lo), Some(hi)) => Some((lo, hi)),
                    _ => None,
                }
            } else if let Some(int_col) = col_view.as_int64() {
                match (int_col.min(), int_col.max()) {
                    (Some(lo), Some(hi)) => Some((lo as f64, hi as f64)),
                    _ => None,
                }
            } else {
                None
            };

            if let Some((min_val, max_val)) = bounds {
                self.min_values.insert(col_name.to_string(), min_val);
                self.max_values.insert(col_name.to_string(), max_val);
            }
        }
        Ok(())
    }

    /// Returns a new frame in which every fitted column is rescaled to
    /// `out_min + (out_max - out_min) * (value - min) / (max - min)` and
    /// stored as a float column.
    ///
    /// Columns are emitted in the order of `df.column_names()`. Columns that
    /// were not fitted, or that are no longer float/integer, are copied
    /// unchanged. When the fitted range is smaller than `f64::EPSILON`, every
    /// present value maps to the midpoint of the output interval. Values that
    /// `get` does not report as present are written as 0.0.
    fn transform(&self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        let mut result = OptimizedDataFrame::new();
        let (out_min, out_max) = self.feature_range;
        let mid = (out_min + out_max) / 2.0;

        for col_name in df.column_names() {
            let col_view = match df.column(col_name) {
                Ok(view) => view,
                Err(_) => continue,
            };

            let fitted = (self.min_values.get(col_name), self.max_values.get(col_name));
            let scaled: Option<Vec<f64>> = match fitted {
                (Some(&min_val), Some(&max_val)) => {
                    let range_is_zero = (max_val - min_val).abs() < f64::EPSILON;
                    let rescale = |val: f64| {
                        if range_is_zero {
                            mid
                        } else {
                            out_min + (out_max - out_min) * (val - min_val) / (max_val - min_val)
                        }
                    };
                    if let Some(float_col) = col_view.as_float64() {
                        Some(
                            (0..float_col.len())
                                .map(|i| match float_col.get(i) {
                                    Ok(Some(val)) => rescale(val),
                                    _ => 0.0,
                                })
                                .collect(),
                        )
                    } else if let Some(int_col) = col_view.as_int64() {
                        Some(
                            (0..int_col.len())
                                .map(|i| match int_col.get(i) {
                                    Ok(Some(val)) => rescale(val as f64),
                                    _ => 0.0,
                                })
                                .collect(),
                        )
                    } else {
                        // Fitted name, but the column is not numeric in this frame.
                        None
                    }
                }
                _ => None,
            };

            match scaled {
                Some(values) => {
                    let transformed_col = Float64Column::new(values);
                    result.add_column(col_name.clone(), Column::Float64(transformed_col))?;
                }
                None => {
                    result.add_column(col_name.clone(), col_view.column().clone())?;
                }
            }
        }
        Ok(result)
    }

    /// Fits the scaler on `df` and then transforms that same frame.
    fn fit_transform(&mut self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        self.fit(df)?;
        self.transform(df)
    }
}
/// Configuration holder for one-hot encoding of categorical columns.
///
/// NOTE(review): the `Transformer` impl visible in this file does not read
/// any of these fields yet.
#[derive(Debug)]
pub struct OneHotEncoder {
/// Category values per column, keyed by column name; created empty by `new`.
categories: HashMap<String, Vec<String>>,
/// Names of the columns the encoder was configured with.
columns: Vec<String>,
/// Flag supplied to `new`; presumably drops the first category of each
/// column when encoding — confirm once encoding is implemented.
drop_first: bool,
}
impl OneHotEncoder {
pub fn new(columns: Vec<String>, drop_first: bool) -> Self {
OneHotEncoder {
categories: HashMap::new(),
columns,
drop_first,
}
}
}
impl Transformer for OneHotEncoder {
    /// Placeholder: learns nothing and always succeeds.
    ///
    /// The frame is deliberately unused, hence the `_df` name (matching the
    /// other placeholder transformers and avoiding an unused-variable warning).
    fn fit(&mut self, _df: &OptimizedDataFrame) -> Result<()> {
        Ok(())
    }

    /// Placeholder: returns an unmodified clone of `df`.
    fn transform(&self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        Ok(df.clone())
    }

    /// Runs `fit` followed by `transform` on the same frame.
    fn fit_transform(&mut self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        self.fit(df)?;
        self.transform(df)
    }
}
/// Configuration holder for polynomial feature generation.
///
/// NOTE(review): the `Transformer` impl visible in this file does not read
/// any of these fields yet.
#[derive(Debug)]
pub struct PolynomialFeatures {
/// Polynomial degree supplied to `new`.
degree: usize,
/// Names of the columns the generator was configured with.
columns: Vec<String>,
/// Flag supplied to `new`; presumably restricts output to interaction
/// terms — confirm once generation is implemented.
interaction_only: bool,
}
impl PolynomialFeatures {
pub fn new(columns: Vec<String>, degree: usize, interaction_only: bool) -> Self {
PolynomialFeatures {
degree,
columns,
interaction_only,
}
}
}
impl Transformer for PolynomialFeatures {
    /// Placeholder: learns nothing and always succeeds.
    fn fit(&mut self, _df: &OptimizedDataFrame) -> Result<()> {
        Ok(())
    }

    /// Placeholder: returns an unmodified clone of `df`.
    fn transform(&self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        let passthrough = df.clone();
        Ok(passthrough)
    }

    /// Runs `fit`, then `transform`, on the same frame; a `fit` error is
    /// returned as-is.
    fn fit_transform(&mut self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        match self.fit(df) {
            Ok(()) => self.transform(df),
            Err(err) => Err(err),
        }
    }
}
/// Configuration holder for binning numeric columns.
///
/// NOTE(review): the `Transformer` impl visible in this file does not read
/// any of these fields yet.
#[derive(Debug)]
pub struct Binner {
/// Numeric bin values per column, keyed by column name; created empty by
/// `new_uniform`. Presumably bin edges — confirm once binning is implemented.
bins: HashMap<String, Vec<f64>>,
/// Names of the columns the binner was configured with.
columns: Vec<String>,
}
impl Binner {
pub fn new_uniform(columns: Vec<String>, n_bins: usize) -> Self {
Binner {
bins: HashMap::new(),
columns,
}
}
}
impl Transformer for Binner {
    /// Placeholder: learns nothing and always succeeds.
    fn fit(&mut self, _df: &OptimizedDataFrame) -> Result<()> {
        Ok(())
    }

    /// Placeholder: returns an unmodified clone of `df`.
    fn transform(&self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        let passthrough = df.clone();
        Ok(passthrough)
    }

    /// Runs `fit`, then `transform`, on the same frame; a `fit` error is
    /// returned as-is.
    fn fit_transform(&mut self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        match self.fit(df) {
            Ok(()) => self.transform(df),
            Err(err) => Err(err),
        }
    }
}
/// Configuration holder for filling in missing values.
///
/// NOTE(review): the `Transformer` impl visible in this file does not read
/// any of these fields yet.
#[derive(Debug)]
pub struct Imputer {
/// Strategy supplied to `new`.
strategy: ImputeStrategy,
/// Names of the columns the imputer was configured with.
columns: Vec<String>,
}
/// Strategy selector stored by `Imputer`.
///
/// NOTE(review): no code visible in this file interprets these variants yet;
/// the descriptions below are inferred from the names — confirm when
/// imputation is implemented.
#[derive(Debug)]
pub enum ImputeStrategy {
/// Presumably: fill with the column mean.
Mean,
/// Presumably: fill with the column median.
Median,
/// Presumably: fill with the most frequent value of the column.
MostFrequent,
/// Carries an `f64`; presumably the constant used as the fill value.
Constant(f64),
}
impl Imputer {
pub fn new(columns: Vec<String>, strategy: ImputeStrategy) -> Self {
Imputer {
strategy,
columns,
}
}
}
impl Transformer for Imputer {
    /// Placeholder: learns nothing and always succeeds.
    fn fit(&mut self, _df: &OptimizedDataFrame) -> Result<()> {
        Ok(())
    }

    /// Placeholder: returns an unmodified clone of `df`.
    fn transform(&self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        let passthrough = df.clone();
        Ok(passthrough)
    }

    /// Runs `fit`, then `transform`, on the same frame; a `fit` error is
    /// returned as-is.
    fn fit_transform(&mut self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        match self.fit(df) {
            Ok(()) => self.transform(df),
            Err(err) => Err(err),
        }
    }
}
/// Configuration holder for feature selection.
///
/// NOTE(review): the `Transformer` impl visible in this file does not read
/// this field yet.
#[derive(Debug)]
pub struct FeatureSelector {
/// Selection criterion chosen by the constructor that built this value.
selector_type: SelectorType,
}
/// Selection criterion stored by `FeatureSelector`; each variant carries the
/// `f64` threshold passed to the matching constructor.
///
/// NOTE(review): no code visible in this file interprets these variants yet.
#[derive(Debug)]
pub enum SelectorType {
/// Threshold set by `FeatureSelector::variance_threshold`.
VarianceThreshold(f64),
/// Threshold set by `FeatureSelector::correlation_threshold`.
CorrelationThreshold(f64),
}
impl FeatureSelector {
    /// Creates a selector configured with `SelectorType::VarianceThreshold(threshold)`.
    pub fn variance_threshold(threshold: f64) -> Self {
        let selector_type = SelectorType::VarianceThreshold(threshold);
        Self { selector_type }
    }

    /// Creates a selector configured with `SelectorType::CorrelationThreshold(threshold)`.
    pub fn correlation_threshold(threshold: f64) -> Self {
        let selector_type = SelectorType::CorrelationThreshold(threshold);
        Self { selector_type }
    }
}
impl Transformer for FeatureSelector {
    /// Placeholder: learns nothing and always succeeds.
    fn fit(&mut self, _df: &OptimizedDataFrame) -> Result<()> {
        Ok(())
    }

    /// Placeholder: returns an unmodified clone of `df`.
    fn transform(&self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        let passthrough = df.clone();
        Ok(passthrough)
    }

    /// Runs `fit`, then `transform`, on the same frame; a `fit` error is
    /// returned as-is.
    fn fit_transform(&mut self, df: &OptimizedDataFrame) -> Result<OptimizedDataFrame> {
        match self.fit(df) {
            Ok(()) => self.transform(df),
            Err(err) => Err(err),
        }
    }
}