use crate::dataframe::DataFrame;
use crate::error::{Error, Result};
use crate::optimized::OptimizedDataFrame;
use std::collections::HashMap;
/// Container for a model's evaluation results plus timing information.
#[derive(Debug, Clone)]
pub struct ModelMetrics {
    /// Metric values keyed by metric name (e.g. "mse", "accuracy").
    pub metrics: HashMap<String, f64>,
    /// Time spent fitting the model — presumably seconds; unit not established here, confirm against callers.
    pub training_time: f64,
    /// Time spent predicting, if it was measured (same unit as `training_time`).
    pub prediction_time: Option<f64>,
}
impl ModelMetrics {
    /// Creates an empty metrics container: no metrics recorded, zero
    /// training time, and no prediction time measured yet.
    pub fn new() -> Self {
        ModelMetrics {
            metrics: HashMap::new(),
            training_time: 0.0,
            prediction_time: None,
        }
    }

    /// Inserts (or overwrites) the metric named `name` with `value`.
    pub fn add_metric(&mut self, name: &str, value: f64) {
        self.metrics.insert(name.to_string(), value);
    }

    /// Looks up a recorded metric by name; `None` if it was never added.
    pub fn get_metric(&self, name: &str) -> Option<&f64> {
        self.metrics.get(name)
    }

    /// Records the time spent training.
    pub fn set_training_time(&mut self, time: f64) {
        self.training_time = time;
    }

    /// Records the time spent predicting.
    pub fn set_prediction_time(&mut self, time: f64) {
        self.prediction_time = Some(time);
    }
}

/// `Default` mirrors [`ModelMetrics::new`] so the type works with
/// `#[derive(Default)]` consumers and struct-update syntax
/// (clippy `new_without_default`).
impl Default for ModelMetrics {
    fn default() -> Self {
        Self::new()
    }
}
/// Common evaluation interface shared by every model in this module.
pub trait ModelEvaluator {
    /// Scores the model against `test_data`, reading ground truth from the
    /// `test_target` column, and returns the collected metrics.
    fn evaluate(&self, test_data: &DataFrame, test_target: &str) -> Result<ModelMetrics>;
    /// Evaluates the model over `folds` cross-validation folds of `data`,
    /// returning one [`ModelMetrics`] per fold.
    fn cross_validate(
        &self,
        data: &DataFrame,
        target: &str,
        folds: usize,
    ) -> Result<Vec<ModelMetrics>>;
}
/// Interface for models trained against a labeled target column.
pub trait SupervisedModel: ModelEvaluator {
    /// Fits the model on `train_data`, predicting `target_column`.
    fn fit(&mut self, train_data: &DataFrame, target_column: &str) -> Result<()>;
    /// Produces one numeric prediction per row of `data`.
    fn predict(&self, data: &DataFrame) -> Result<Vec<f64>>;
    /// Per-feature importance scores keyed by feature name, when the model
    /// can provide them; `None` otherwise.
    fn feature_importances(&self) -> Option<HashMap<String, f64>>;
}
/// Interface for models trained without a target column
/// (e.g. dimensionality reduction or clustering transformers).
pub trait UnsupervisedModel: ModelEvaluator {
    /// Fits the model on `data`.
    fn fit(&mut self, data: &DataFrame) -> Result<()>;
    /// Applies the fitted transformation to `data`, producing a new frame.
    fn transform(&self, data: &DataFrame) -> Result<DataFrame>;
    /// Convenience default: fit on `data`, then immediately transform it.
    /// A failed fit short-circuits and the transform is never attempted.
    fn fit_transform(&mut self, data: &DataFrame) -> Result<DataFrame> {
        self.fit(data)?;
        self.transform(data)
    }
}
/// Configuration for k-fold cross-validation.
#[derive(Debug, Clone)]
pub struct CrossValidation {
    /// Number of folds (the `k` in k-fold).
    pub n_folds: usize,
    /// Whether rows are shuffled before being assigned to folds.
    pub shuffle: bool,
    /// Seed for a deterministic shuffle; `None` means unseeded.
    pub random_seed: Option<u64>,
}
impl Default for CrossValidation {
fn default() -> Self {
CrossValidation {
n_folds: 5,
shuffle: true,
random_seed: None,
}
}
}
/// Splits `data` into a `(train, test)` pair of row subsets.
///
/// # Arguments
/// * `test_size` — fraction of rows placed in the test set; must be strictly
///   between 0 and 1.
/// * `shuffle` — when `true`, rows are permuted before splitting; when
///   `false`, the trailing rows form the test set.
/// * `random_seed` — seed for a deterministic shuffle; `None` seeds from the
///   system clock.
///
/// # Errors
/// Returns `Error::InvalidInput` when `test_size` is outside `(0, 1)` or the
/// rounded split would leave the train or test set empty.
pub fn train_test_split(
    data: &DataFrame,
    test_size: f64,
    shuffle: bool,
    random_seed: Option<u64>,
) -> Result<(DataFrame, DataFrame)> {
    if test_size <= 0.0 || test_size >= 1.0 {
        return Err(Error::InvalidInput(
            "test_size must be between 0 and 1".into(),
        ));
    }
    let n_rows = data.nrows();
    let n_test = (n_rows as f64 * test_size).round() as usize;
    if n_test == 0 || n_test == n_rows {
        return Err(Error::InvalidInput(format!(
            "test_size {} would result in empty training or test set",
            test_size
        )));
    }
    // Bug fix: `shuffle` and `random_seed` were previously accepted but
    // silently ignored, so the "split" was always the trailing rows in
    // file order regardless of the arguments.
    let mut indices: Vec<usize> = (0..n_rows).collect();
    if shuffle {
        // Fisher-Yates permutation driven by xorshift64 (stdlib-only PRNG);
        // seeded from the clock when no explicit seed is supplied.
        let mut state = random_seed.unwrap_or_else(|| {
            use std::time::{SystemTime, UNIX_EPOCH};
            SystemTime::now()
                .duration_since(UNIX_EPOCH)
                // truncating the u128 nanos to u64 is fine for a seed
                .map(|d| d.as_nanos() as u64)
                .unwrap_or(0x9E37_79B9_7F4A_7C15)
        });
        if state == 0 {
            state = 0x9E37_79B9_7F4A_7C15; // xorshift64 requires a non-zero state
        }
        for i in (1..n_rows).rev() {
            state ^= state << 13;
            state ^= state >> 7;
            state ^= state << 17;
            indices.swap(i, (state % (i as u64 + 1)) as usize);
        }
    }
    // First `n_rows - n_test` indices train; `split_off` hands back the rest.
    let test_indices = indices.split_off(n_rows - n_test);
    let train_data = data.sample(&indices)?;
    let test_data = data.sample(&test_indices)?;
    Ok((train_data, test_data))
}
/// Splits an [`OptimizedDataFrame`] into a `(train, test)` pair of row
/// subsets.
///
/// When `random_seed` is `Some`, rows are first permuted with a
/// deterministic seeded shuffle; with `None` the split keeps row order and
/// the trailing rows form the test set (the original behavior).
///
/// # Errors
/// Returns `Error::InvalidInput` when `test_size` is outside `(0, 1)` or the
/// rounded split would leave the train or test set empty.
pub fn train_test_split_opt(
    data: &OptimizedDataFrame,
    test_size: f64,
    random_seed: Option<u64>,
) -> Result<(OptimizedDataFrame, OptimizedDataFrame)> {
    if test_size <= 0.0 || test_size >= 1.0 {
        return Err(Error::InvalidInput(
            "test_size must be between 0 and 1".into(),
        ));
    }
    let n_rows = data.row_count();
    let n_test = (n_rows as f64 * test_size).round() as usize;
    if n_test == 0 || n_test == n_rows {
        return Err(Error::InvalidInput(format!(
            "test_size {} would result in empty training or test set",
            test_size
        )));
    }
    let mut indices: Vec<usize> = (0..n_rows).collect();
    // Bug fix: `random_seed` was previously accepted but ignored. A supplied
    // seed now requests a deterministic Fisher-Yates shuffle (xorshift64);
    // `None` keeps the old sequential split.
    if let Some(seed) = random_seed {
        // xorshift64 requires a non-zero state, so map seed 0 to a constant.
        let mut state = if seed == 0 { 0x9E37_79B9_7F4A_7C15 } else { seed };
        for i in (1..n_rows).rev() {
            state ^= state << 13;
            state ^= state >> 7;
            state ^= state << 17;
            indices.swap(i, (state % (i as u64 + 1)) as usize);
        }
    }
    // First `n_rows - n_test` indices train; `split_off` hands back the rest.
    let test_indices = indices.split_off(n_rows - n_test);
    let train_data = data.sample_rows(&indices)?;
    let test_data = data.sample_rows(&test_indices)?;
    Ok((train_data, test_data))
}
// Concrete algorithm implementations live in these submodules.
pub mod ensemble;
pub mod evaluation;
pub mod linear;
pub mod neural;
pub mod selection;
pub mod tree;
// Flat re-exports so callers can name the common types directly from this
// module instead of spelling out each submodule path.
pub use ensemble::{
    GradientBoostingClassifier, GradientBoostingConfig, GradientBoostingRegressor,
    RandomForestClassifier, RandomForestConfig, RandomForestRegressor,
};
pub use evaluation::{cross_val_score, learning_curve, validation_curve};
pub use linear::{LinearRegression, LogisticRegression};
pub use neural::{
    Activation, LossFunction, MLPClassifier, MLPConfig, MLPConfigBuilder, MLPRegressor,
};
pub use selection::{GridSearchCV, HyperparameterGrid, RandomizedSearchCV};
pub use tree::{DecisionTreeClassifier, DecisionTreeConfig, DecisionTreeRegressor, SplitCriterion};