use crate::dataframe::DataFrame;
use crate::error::{Error, Result};
use crate::ml::models::{ModelEvaluator, SupervisedModel};
pub fn cross_val_score<T: SupervisedModel + Clone>(
model: &T,
data: &DataFrame,
target: &str,
folds: usize,
metric: &str,
) -> Result<Vec<f64>> {
if folds < 2 {
return Err(Error::InvalidInput(
"Number of folds must be at least 2".into(),
));
}
let metrics = model.cross_validate(data, target, folds)?;
let mut scores = Vec::with_capacity(folds);
for fold_metrics in metrics {
if let Some(score) = fold_metrics.get_metric(metric) {
scores.push(*score);
} else {
return Err(Error::InvalidInput(format!(
"Metric '{}' not found in model evaluation",
metric
)));
}
}
Ok(scores)
}
/// Computes mean train/test scores for progressively larger subsets of `data`.
///
/// For every fraction in `train_sizes` a subset is built by calling
/// `data.sample` with the indices `0..subset_n`, where `subset_n` is
/// `round(nrows * fraction)` clamped to at least `cv + 1` and at most `nrows`.
/// The subset is then split into `cv` contiguous, unshuffled folds (the last
/// fold absorbs the remainder). For each fold a fresh clone of `model` is
/// fitted on the remaining rows and evaluated on both the training rows and the
/// held-out rows.
///
/// # Returns
/// `(subset_sizes, train_scores, test_scores)` — three parallel vectors with one
/// entry per element of `train_sizes`. `subset_sizes` holds `subset_n` (the size
/// of the subset that was cross-validated, not the size of a single training
/// split).
///
/// Scoring is best-effort: a fold is skipped when it has fewer than two
/// training rows, when `fit` fails, when `evaluate` fails, or when the
/// evaluation does not contain `metric`. If no fold contributes a score (or the
/// subset has fewer rows than `cv`), the corresponding mean is reported as `0.0`.
///
/// # Errors
/// * `Error::InvalidInput` when `cv` is smaller than 2.
/// * `Error::InvalidInput` when any training size is NaN or outside `(0, 1]`.
/// * Any error returned by `DataFrame::sample` is propagated.
pub fn learning_curve<T: SupervisedModel + Clone>(
    model: &T,
    data: &DataFrame,
    target: &str,
    train_sizes: &[f64],
    metric: &str,
    cv: usize,
) -> Result<(Vec<usize>, Vec<f64>, Vec<f64>)> {
    // Arithmetic mean of the collected fold scores; 0.0 when nothing was scored.
    fn mean_or_zero(scores: &[f64]) -> f64 {
        if scores.is_empty() {
            0.0
        } else {
            scores.iter().sum::<f64>() / scores.len() as f64
        }
    }

    if cv < 2 {
        return Err(Error::InvalidInput(
            "Number of CV folds must be at least 2".into(),
        ));
    }
    // NaN must be tested explicitly: every ordered comparison with NaN is false,
    // so a plain range check would let it through.
    if train_sizes
        .iter()
        .any(|&size| size.is_nan() || size <= 0.0 || size > 1.0)
    {
        return Err(Error::InvalidInput(
            "Training sizes must be between 0 and 1".into(),
        ));
    }

    let n = data.nrows();
    let mut absolute_sizes = Vec::with_capacity(train_sizes.len());
    let mut train_scores_out = Vec::with_capacity(train_sizes.len());
    let mut test_scores_out = Vec::with_capacity(train_sizes.len());

    for &size_frac in train_sizes {
        // Keep at least `cv + 1` rows so every fold can hold a row, but never
        // more rows than the frame has. `saturating_add` avoids overflow for an
        // absurdly large `cv`.
        let subset_n = ((n as f64 * size_frac).round() as usize)
            .max(cv.saturating_add(1))
            .min(n);
        let indices: Vec<usize> = (0..subset_n).collect();
        let subset = data.sample(&indices)?;
        absolute_sizes.push(subset_n);

        let fold_size = subset_n / cv;
        if fold_size == 0 {
            // Fewer rows than folds: nothing can be cross-validated.
            train_scores_out.push(0.0);
            test_scores_out.push(0.0);
            continue;
        }

        let mut train_fold_scores: Vec<f64> = Vec::with_capacity(cv);
        let mut test_fold_scores: Vec<f64> = Vec::with_capacity(cv);

        for fold_i in 0..cv {
            let test_start = fold_i * fold_size;
            // The last fold takes whatever is left after integer division.
            let test_end = if fold_i == cv - 1 {
                subset_n
            } else {
                (fold_i + 1) * fold_size
            };

            let train_idx: Vec<usize> = (0..subset_n)
                .filter(|&i| i < test_start || i >= test_end)
                .collect();
            let test_idx: Vec<usize> = (test_start..test_end).collect();
            if train_idx.len() < 2 || test_idx.is_empty() {
                continue;
            }

            let train_data = subset.sample(&train_idx)?;
            let test_data = subset.sample(&test_idx)?;

            // Each fold gets its own untrained copy of the caller's model.
            let mut m = model.clone();
            if m.fit(&train_data, target).is_err() {
                continue;
            }
            if let Ok(tr_met) = m.evaluate(&train_data, target) {
                if let Some(&s) = tr_met.get_metric(metric) {
                    train_fold_scores.push(s);
                }
            }
            if let Ok(te_met) = m.evaluate(&test_data, target) {
                if let Some(&s) = te_met.get_metric(metric) {
                    test_fold_scores.push(s);
                }
            }
        }

        train_scores_out.push(mean_or_zero(&train_fold_scores));
        test_scores_out.push(mean_or_zero(&test_fold_scores));
    }

    Ok((absolute_sizes, train_scores_out, test_scores_out))
}
/// Computes mean train/test scores for each candidate hyper-parameter value.
///
/// `data` is split into `cv` contiguous, unshuffled folds (the last fold absorbs
/// the remainder of the integer division). For every fold and every entry of
/// `param_values`, `model_factory` is called with a clone of the parameter to
/// build a fresh model, which is fitted on the rows outside the fold and
/// evaluated on both the training rows and the held-out rows.
///
/// The train/test split of a fold does not depend on the parameter value, so
/// each split is materialised once and reused for all parameter values. As a
/// consequence `model_factory` is invoked in fold-major order (all parameters
/// for fold 0, then all parameters for fold 1, ...). Scores for each parameter
/// are still accumulated in fold order.
///
/// `_param_name` is accepted for API compatibility and is not read.
///
/// # Returns
/// `(param_values, train_scores, test_scores)` — a copy of the input parameter
/// values plus two parallel vectors of mean scores, one entry per parameter.
///
/// Scoring is best-effort: a fold is skipped when it has fewer than two
/// training rows, and a (fold, parameter) pair is skipped when `fit` fails,
/// when `evaluate` fails, or when the evaluation does not contain `metric`.
/// If nothing contributes a score for a parameter (or `data` has fewer rows
/// than `cv`), the corresponding mean is reported as `0.0`.
///
/// # Errors
/// * `Error::InvalidInput` when `cv` is smaller than 2.
/// * `Error::InvalidInput` when `param_values` is empty.
/// * Any error returned by `DataFrame::sample` is propagated.
pub fn validation_curve<T, F, P>(
    model_factory: F,
    data: &DataFrame,
    target: &str,
    _param_name: &str,
    param_values: &[P],
    metric: &str,
    cv: usize,
) -> Result<(Vec<P>, Vec<f64>, Vec<f64>)>
where
    T: SupervisedModel,
    F: Fn(P) -> T,
    P: Clone,
{
    // Arithmetic mean of the collected fold scores; 0.0 when nothing was scored.
    fn mean_or_zero(scores: &[f64]) -> f64 {
        if scores.is_empty() {
            0.0
        } else {
            scores.iter().sum::<f64>() / scores.len() as f64
        }
    }

    if cv < 2 {
        return Err(Error::InvalidInput(
            "Number of CV folds must be at least 2".into(),
        ));
    }
    if param_values.is_empty() {
        return Err(Error::InvalidInput(
            "Parameter values array cannot be empty".into(),
        ));
    }

    let n = data.nrows();
    let fold_size = n / cv;
    let n_params = param_values.len();

    if fold_size == 0 {
        // Fewer rows than folds: nothing can be cross-validated for any parameter.
        return Ok((
            param_values.to_vec(),
            vec![0.0; n_params],
            vec![0.0; n_params],
        ));
    }

    // One score bucket per parameter value; filled fold by fold.
    let mut train_fold_scores: Vec<Vec<f64>> =
        (0..n_params).map(|_| Vec::with_capacity(cv)).collect();
    let mut test_fold_scores: Vec<Vec<f64>> =
        (0..n_params).map(|_| Vec::with_capacity(cv)).collect();

    for fold_i in 0..cv {
        let test_start = fold_i * fold_size;
        // The last fold takes whatever is left after integer division.
        let test_end = if fold_i == cv - 1 {
            n
        } else {
            (fold_i + 1) * fold_size
        };

        // Training rows are everything before and after the held-out range.
        let train_idx: Vec<usize> = (0..test_start).chain(test_end..n).collect();
        let test_idx: Vec<usize> = (test_start..test_end).collect();
        if train_idx.len() < 2 || test_idx.is_empty() {
            continue;
        }

        // Built once per fold and shared by every parameter value.
        let train_data = data.sample(&train_idx)?;
        let test_data = data.sample(&test_idx)?;

        let buckets = train_fold_scores
            .iter_mut()
            .zip(test_fold_scores.iter_mut());
        for (param_val, (train_bucket, test_bucket)) in param_values.iter().zip(buckets) {
            let mut m = model_factory(param_val.clone());
            if m.fit(&train_data, target).is_err() {
                continue;
            }
            if let Ok(tr_met) = m.evaluate(&train_data, target) {
                if let Some(&s) = tr_met.get_metric(metric) {
                    train_bucket.push(s);
                }
            }
            if let Ok(te_met) = m.evaluate(&test_data, target) {
                if let Some(&s) = te_met.get_metric(metric) {
                    test_bucket.push(s);
                }
            }
        }
    }

    let train_scores_out: Vec<f64> = train_fold_scores
        .iter()
        .map(|scores| mean_or_zero(scores))
        .collect();
    let test_scores_out: Vec<f64> = test_fold_scores
        .iter()
        .map(|scores| mean_or_zero(scores))
        .collect();

    Ok((param_values.to_vec(), train_scores_out, test_scores_out))
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::dataframe::DataFrame;
    use crate::ml::models::linear::LinearRegression;
    use crate::series::Series;

    /// Builds an `n`-row frame with columns `x = 0, 1, ..., n-1` and `y = 2x + 1`.
    fn make_linear_df(n: usize) -> DataFrame {
        let x: Vec<f64> = (0..n).map(|i| i as f64).collect();
        let y: Vec<f64> = x.iter().map(|&v| 2.0 * v + 1.0).collect();
        let mut df = DataFrame::new();
        df.add_column(
            "x".to_string(),
            Series::new(x, Some("x".to_string())).expect("Series::new"),
        )
        .expect("add x");
        df.add_column(
            "y".to_string(),
            Series::new(y, Some("y".to_string())).expect("Series::new"),
        )
        .expect("add y");
        df
    }

    /// The learning curve must return one entry per requested fraction and must
    /// not be the former hard-coded placeholder scores.
    #[test]
    fn test_learning_curve_varies() {
        let df = make_linear_df(20);
        let model = LinearRegression::new();
        let train_sizes = vec![0.5, 0.8, 1.0_f64];
        let (sizes, _train_sc, test_sc) = learning_curve(&model, &df, "y", &train_sizes, "r2", 2)
            .expect("learning_curve should succeed");
        assert_eq!(
            sizes.len(),
            3,
            "must return one entry per train_size fraction"
        );
        // 20 rows at fractions 0.5 / 0.8 / 1.0 round to 10 / 16 / 20 rows.
        assert_eq!(
            sizes,
            vec![10, 16, 20],
            "subset sizes must be the rounded fractions of the row count"
        );
        assert_eq!(
            test_sc.len(),
            3,
            "test_scores length must match train_sizes"
        );
        // Only the stub value is rejected; the scores are not required to
        // differ from each other.
        let all_stub = test_sc.iter().all(|&s| (s - 0.8).abs() < 1e-12);
        assert!(
            !all_stub,
            "test_scores must not be the old hardcoded stub [0.8, 0.8, 0.8], got {:?}",
            test_sc
        );
    }

    /// The validation curve must echo the parameter values and return one
    /// train/test score per parameter.
    #[test]
    fn test_validation_curve_basic() {
        let df = make_linear_df(20);
        let param_values = vec![1_usize, 2, 3];
        let (pv_out, train_sc, test_sc) = validation_curve(
            |_p: usize| LinearRegression::new(),
            &df,
            "y",
            "dummy_param",
            &param_values,
            "r2",
            2,
        )
        .expect("validation_curve should succeed");
        assert_eq!(
            pv_out.len(),
            param_values.len(),
            "output param_values length must match input"
        );
        assert_eq!(
            pv_out, param_values,
            "output param_values must be a copy of the input"
        );
        assert_eq!(
            train_sc.len(),
            param_values.len(),
            "train_scores length must match param_values"
        );
        assert_eq!(
            test_sc.len(),
            param_values.len(),
            "test_scores length must match param_values"
        );
    }
}