use scry_learn::dataset::Dataset;
use scry_learn::metrics::{accuracy, r2_score};
fn iris_dataset() -> Dataset {
let path = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR"))
.join("tests")
.join("fixtures");
let feat_path = path.join("iris_features.csv");
let target_path = path.join("iris_target.csv");
let mut rdr = csv::Reader::from_path(&feat_path).unwrap();
let n_cols = rdr.headers().unwrap().len();
let mut rows: Vec<Vec<f64>> = Vec::new();
for result in rdr.records() {
let record = result.unwrap();
rows.push(record.iter().map(|s| s.parse::<f64>().unwrap()).collect());
}
let n_rows = rows.len();
let mut cols = vec![vec![0.0; n_rows]; n_cols];
for (i, row) in rows.iter().enumerate() {
for (j, &val) in row.iter().enumerate() {
cols[j][i] = val;
}
}
let mut rdr = csv::Reader::from_path(&target_path).unwrap();
let target: Vec<f64> = rdr
.records()
.map(|r| r.unwrap()[0].parse::<f64>().unwrap())
.collect();
let feat_names = (0..n_cols).map(|i| format!("f{i}")).collect();
Dataset::new(cols, target, feat_names, "target")
}
fn gen_regression_dataset(n: usize, seed: u64) -> Dataset {
let mut rng = fastrand::Rng::with_seed(seed);
let x1: Vec<f64> = (0..n).map(|_| rng.f64() * 10.0).collect();
let x2: Vec<f64> = (0..n).map(|_| rng.f64() * 10.0).collect();
let y: Vec<f64> = x1
.iter()
.zip(x2.iter())
.map(|(&a, &b)| 2.0 * a + 3.0 * b + 1.0 + rng.f64() * 0.5)
.collect();
Dataset::new(vec![x1, x2], y, vec!["x1".into(), "x2".into()], "y")
}
#[test]
fn multi_seed_random_forest_classifier() {
use scry_learn::tree::RandomForestClassifier;
let data = iris_dataset();
let n_seeds = 50;
let mut accs = Vec::with_capacity(n_seeds);
for seed in 0..n_seeds as u64 {
let (train, test) = scry_learn::split::train_test_split(&data, 0.2, seed);
let mut rf = RandomForestClassifier::new()
.n_estimators(50)
.max_depth(5)
.seed(seed);
rf.fit(&train).unwrap();
let preds = rf.predict(&test.feature_matrix()).unwrap();
accs.push(accuracy(&test.target, &preds));
}
let mean = accs.iter().sum::<f64>() / accs.len() as f64;
let std = (accs.iter().map(|a| (a - mean).powi(2)).sum::<f64>() / accs.len() as f64).sqrt();
eprintln!("RF Classifier 50-seed: mean={mean:.4}, std={std:.4}");
eprintln!(
" min={:.4}, max={:.4}",
accs.iter().copied().fold(f64::INFINITY, f64::min),
accs.iter().copied().fold(f64::NEG_INFINITY, f64::max)
);
assert!(
mean >= 0.90,
"RF mean accuracy should be ≥90% (got {:.1}%)",
mean * 100.0
);
assert!(
std <= 0.10,
"RF std should be ≤10% (got {:.1}%)",
std * 100.0
);
}
#[test]
fn multi_seed_random_forest_regressor() {
use scry_learn::tree::RandomForestRegressor;
let n_seeds = 50;
let mut r2s = Vec::with_capacity(n_seeds);
for seed in 0..n_seeds as u64 {
let data = gen_regression_dataset(200, seed);
let (train, test) = scry_learn::split::train_test_split(&data, 0.2, seed);
let mut rf = RandomForestRegressor::new()
.n_estimators(50)
.max_depth(5)
.seed(seed);
rf.fit(&train).unwrap();
let preds = rf.predict(&test.feature_matrix()).unwrap();
r2s.push(r2_score(&test.target, &preds));
}
let mean = r2s.iter().sum::<f64>() / r2s.len() as f64;
let std = (r2s.iter().map(|a| (a - mean).powi(2)).sum::<f64>() / r2s.len() as f64).sqrt();
eprintln!("RF Regressor 50-seed: mean R²={mean:.4}, std={std:.4}");
assert!(
mean >= 0.85,
"RF Regressor mean R² should be ≥0.85 (got {mean:.4})"
);
assert!(
std <= 0.10,
"RF Regressor std should be ≤0.10 (got {std:.4})"
);
}
#[test]
fn multi_seed_gbt_classifier() {
use scry_learn::tree::GradientBoostingClassifier;
let data = iris_dataset();
let n_seeds = 50;
let mut accs = Vec::with_capacity(n_seeds);
for seed in 0..n_seeds as u64 {
let (train, test) = scry_learn::split::train_test_split(&data, 0.2, seed);
let mut gbt = GradientBoostingClassifier::new()
.n_estimators(100)
.learning_rate(0.1)
.max_depth(3)
.seed(seed);
gbt.fit(&train).unwrap();
let preds = gbt.predict(&test.feature_matrix()).unwrap();
accs.push(accuracy(&test.target, &preds));
}
let mean = accs.iter().sum::<f64>() / accs.len() as f64;
let std = (accs.iter().map(|a| (a - mean).powi(2)).sum::<f64>() / accs.len() as f64).sqrt();
eprintln!("GBT Classifier 50-seed: mean={mean:.4}, std={std:.4}");
assert!(
mean >= 0.90,
"GBT mean accuracy should be ≥90% (got {:.1}%)",
mean * 100.0
);
assert!(
std <= 0.10,
"GBT std should be ≤10% (got {:.1}%)",
std * 100.0
);
}
#[test]
fn multi_seed_gbt_regressor() {
use scry_learn::tree::GradientBoostingRegressor;
let n_seeds = 50;
let mut r2s = Vec::with_capacity(n_seeds);
for seed in 0..n_seeds as u64 {
let data = gen_regression_dataset(200, seed);
let (train, test) = scry_learn::split::train_test_split(&data, 0.2, seed);
let mut gbt = GradientBoostingRegressor::new()
.n_estimators(100)
.learning_rate(0.1)
.max_depth(3)
.seed(seed);
gbt.fit(&train).unwrap();
let preds = gbt.predict(&test.feature_matrix()).unwrap();
r2s.push(r2_score(&test.target, &preds));
}
let mean = r2s.iter().sum::<f64>() / r2s.len() as f64;
let std = (r2s.iter().map(|a| (a - mean).powi(2)).sum::<f64>() / r2s.len() as f64).sqrt();
eprintln!("GBT Regressor 50-seed: mean R²={mean:.4}, std={std:.4}");
assert!(
mean >= 0.85,
"GBT Regressor mean R² should be ≥0.85 (got {mean:.4})"
);
assert!(
std <= 0.10,
"GBT Regressor std should be ≤0.10 (got {std:.4})"
);
}
#[test]
fn multi_seed_kmeans() {
use scry_learn::cluster::{silhouette_score, KMeans};
let data = iris_dataset();
let n_seeds = 50;
let mut scores = Vec::with_capacity(n_seeds);
for seed in 0..n_seeds as u64 {
let mut km = KMeans::new(3).seed(seed).n_init(1).max_iter(300);
km.fit(&data).unwrap();
let features = data.feature_matrix();
let s = silhouette_score(&features, km.labels());
scores.push(s);
}
let mean = scores.iter().sum::<f64>() / scores.len() as f64;
let std = (scores.iter().map(|a| (a - mean).powi(2)).sum::<f64>() / scores.len() as f64).sqrt();
eprintln!("KMeans 50-seed: mean silhouette={mean:.4}, std={std:.4}");
assert!(
mean >= 0.40,
"KMeans mean silhouette should be ≥0.40 (got {mean:.4})"
);
assert!(std <= 0.15, "KMeans std should be ≤0.15 (got {std:.4})");
}
#[test]
fn scaling_dt_accuracy_vs_dataset_size() {
use scry_learn::tree::DecisionTreeClassifier;
let data = iris_dataset();
let sizes = [30, 60, 90, 120];
let mut prev_acc = 0.0;
let mut prev_size = 0;
for &n in &sizes {
let subset = Dataset::new(
data.features.iter().map(|col| col[..n].to_vec()).collect(),
data.target[..n].to_vec(),
data.feature_names.clone(),
&data.target_name,
);
let (train, test) = scry_learn::split::train_test_split(&subset, 0.3, 42);
let mut dt = DecisionTreeClassifier::new().max_depth(5);
dt.fit(&train).unwrap();
let preds = dt.predict(&test.feature_matrix()).unwrap();
let acc = accuracy(&test.target, &preds);
eprintln!("DT accuracy with {n} samples: {:.1}%", acc * 100.0);
if n == sizes[sizes.len() - 1] && prev_size == sizes[0] {
}
prev_acc = acc;
prev_size = n;
}
assert!(
prev_acc >= 0.70,
"DT with 120 Iris samples should achieve ≥70% (got {:.1}%)",
prev_acc * 100.0
);
}
#[test]
fn regularization_lasso_sparsity_increases_with_alpha() {
use scry_learn::linear::LassoRegression;
let data = gen_regression_dataset(300, 42);
let count_zeros =
|coefs: &[f64], thresh: f64| -> usize { coefs.iter().filter(|c| c.abs() < thresh).count() };
let mut lasso_low = LassoRegression::new().alpha(0.001).max_iter(5000);
lasso_low.fit(&data).unwrap();
let zeros_low = count_zeros(lasso_low.coefficients(), 0.01);
let mut lasso_high = LassoRegression::new().alpha(10.0).max_iter(5000);
lasso_high.fit(&data).unwrap();
let zeros_high = count_zeros(lasso_high.coefficients(), 0.01);
eprintln!("Lasso α=0.001 near-zero coeffs: {zeros_low}");
eprintln!("Lasso α=10.0 near-zero coeffs: {zeros_high}");
assert!(
zeros_high >= zeros_low,
"Higher alpha should produce ≥ as many near-zero coefficients: high={zeros_high} vs low={zeros_low}"
);
}
#[test]
fn overfitting_rf_train_vs_test() {
use scry_learn::tree::RandomForestClassifier;
let data = iris_dataset();
let (train, test) = scry_learn::split::train_test_split(&data, 0.3, 42);
let mut rf = RandomForestClassifier::new().n_estimators(100).seed(42);
rf.fit(&train).unwrap();
let train_preds = rf.predict(&train.feature_matrix()).unwrap();
let test_preds = rf.predict(&test.feature_matrix()).unwrap();
let train_acc = accuracy(&train.target, &train_preds);
let test_acc = accuracy(&test.target, &test_preds);
eprintln!(
"RF train acc: {:.1}%, test acc: {:.1}%",
train_acc * 100.0,
test_acc * 100.0
);
assert!(
train_acc >= test_acc - 0.05,
"Train accuracy should be ≥ test accuracy (minus small tolerance): train={:.1}% vs test={:.1}%",
train_acc * 100.0, test_acc * 100.0
);
assert!(
train_acc >= 0.95,
"Unrestricted RF should achieve ≥95% train accuracy (got {:.1}%)",
train_acc * 100.0
);
}
#[test]
fn determinism_rf_same_seed() {
use scry_learn::tree::RandomForestClassifier;
let data = iris_dataset();
let (train, test) = scry_learn::split::train_test_split(&data, 0.2, 42);
let mut rf1 = RandomForestClassifier::new()
.n_estimators(50)
.max_depth(5)
.seed(123);
rf1.fit(&train).unwrap();
let preds1 = rf1.predict(&test.feature_matrix()).unwrap();
let mut rf2 = RandomForestClassifier::new()
.n_estimators(50)
.max_depth(5)
.seed(123);
rf2.fit(&train).unwrap();
let preds2 = rf2.predict(&test.feature_matrix()).unwrap();
assert_eq!(preds1.len(), preds2.len());
for (i, (p1, p2)) in preds1.iter().zip(preds2.iter()).enumerate() {
assert!(
(p1 - p2).abs() < 1e-10,
"RF predictions diverged at sample {i}: {p1} vs {p2}"
);
}
eprintln!("RF determinism: identical predictions across 2 runs with same seed ✓");
}
#[test]
fn determinism_gbt_same_seed() {
use scry_learn::tree::GradientBoostingClassifier;
let data = iris_dataset();
let (train, test) = scry_learn::split::train_test_split(&data, 0.2, 42);
let mut gbt1 = GradientBoostingClassifier::new()
.n_estimators(50)
.learning_rate(0.1)
.max_depth(3)
.seed(123);
gbt1.fit(&train).unwrap();
let preds1 = gbt1.predict(&test.feature_matrix()).unwrap();
let mut gbt2 = GradientBoostingClassifier::new()
.n_estimators(50)
.learning_rate(0.1)
.max_depth(3)
.seed(123);
gbt2.fit(&train).unwrap();
let preds2 = gbt2.predict(&test.feature_matrix()).unwrap();
for (i, (p1, p2)) in preds1.iter().zip(preds2.iter()).enumerate() {
assert!(
(p1 - p2).abs() < 1e-10,
"GBT predictions diverged at sample {i}: {p1} vs {p2}"
);
}
eprintln!("GBT determinism: identical predictions across 2 runs with same seed ✓");
}
#[test]
fn determinism_kmeans_same_seed() {
use scry_learn::cluster::KMeans;
let data = iris_dataset();
let mut km1 = KMeans::new(3).seed(42).n_init(1).max_iter(300);
km1.fit(&data).unwrap();
let labels1: Vec<usize> = km1.labels().to_vec();
let mut km2 = KMeans::new(3).seed(42).n_init(1).max_iter(300);
km2.fit(&data).unwrap();
let labels2: Vec<usize> = km2.labels().to_vec();
assert_eq!(
labels1, labels2,
"KMeans with same seed should produce identical labels"
);
eprintln!("KMeans determinism: identical labels across 2 runs with same seed ✓");
}
#[test]
fn convergence_logreg_solvers_agree() {
use scry_learn::linear::LogisticRegression;
use scry_learn::preprocess::{StandardScaler, Transformer};
let data = iris_dataset();
let mut ds = data;
let mut scaler = StandardScaler::new();
scaler.fit(&ds).unwrap();
scaler.transform(&mut ds).unwrap();
let mut lr_lbfgs = LogisticRegression::new()
.alpha(0.0)
.max_iter(500)
.solver(scry_learn::linear::Solver::Lbfgs);
lr_lbfgs.fit(&ds).unwrap();
let preds_lbfgs = lr_lbfgs.predict(&ds.feature_matrix()).unwrap();
let acc_lbfgs = accuracy(&ds.target, &preds_lbfgs);
let mut lr_gd = LogisticRegression::new()
.alpha(0.0)
.max_iter(2000)
.learning_rate(0.05)
.solver(scry_learn::linear::Solver::GradientDescent);
lr_gd.fit(&ds).unwrap();
let preds_gd = lr_gd.predict(&ds.feature_matrix()).unwrap();
let acc_gd = accuracy(&ds.target, &preds_gd);
eprintln!("LogReg L-BFGS accuracy: {:.1}%", acc_lbfgs * 100.0);
eprintln!("LogReg GD accuracy: {:.1}%", acc_gd * 100.0);
assert!(
acc_lbfgs >= 0.90,
"L-BFGS should achieve ≥90% (got {:.1}%)",
acc_lbfgs * 100.0
);
assert!(
acc_gd >= 0.85,
"GD should achieve ≥85% (got {:.1}%)",
acc_gd * 100.0
);
let diff = (acc_lbfgs - acc_gd).abs();
assert!(
diff <= 0.10,
"L-BFGS and GD should converge to similar accuracy: L-BFGS={:.1}%, GD={:.1}% (gap={:.1}%)",
acc_lbfgs * 100.0,
acc_gd * 100.0,
diff * 100.0
);
}
#[test]
fn convergence_gbt_error_decreases_with_estimators() {
use scry_learn::tree::GradientBoostingRegressor;
let data = gen_regression_dataset(200, 42);
let estimator_counts = [5, 20, 50, 100];
let mut prev_mse = f64::INFINITY;
for &n_est in &estimator_counts {
let mut gbt = GradientBoostingRegressor::new()
.n_estimators(n_est)
.learning_rate(0.1)
.max_depth(3)
.seed(42);
gbt.fit(&data).unwrap();
let preds = gbt.predict(&data.feature_matrix()).unwrap();
let mse: f64 = preds
.iter()
.zip(data.target.iter())
.map(|(p, t)| (p - t).powi(2))
.sum::<f64>()
/ data.target.len() as f64;
eprintln!("GBT n_estimators={n_est}: MSE={mse:.4}");
assert!(
mse <= prev_mse + 0.1,
"GBT training error should decrease with more estimators: {n_est} est → MSE={mse:.4}, prev={prev_mse:.4}"
);
prev_mse = mse;
}
assert!(
prev_mse < 1.0,
"GBT with 100 estimators should achieve MSE < 1.0 (got {prev_mse:.4})"
);
}