use crate::dataset::core::Dataset;
use crate::error::Result;
use scirs2_core::ndarray::Array;
/// Builds a tiny, hard-coded subset of the Iris dataset intended for tests.
///
/// The dataset holds six samples with four measurements each, two samples per
/// class label (`0.0`, `1.0`, `2.0`), and carries feature names, target names
/// and a description.
///
/// # Errors
///
/// Returns `SklearsError::Other` if the measurements cannot be arranged into a
/// `(6, 4)` array.
pub fn load_iris() -> Result<Dataset> {
    // Row-major measurements: one row per sample, one column per feature.
    let measurements = vec![
        5.1, 3.5, 1.4, 0.2, //
        4.9, 3.0, 1.4, 0.2, //
        7.0, 3.2, 4.7, 1.4, //
        6.4, 3.2, 4.5, 1.5, //
        6.3, 3.3, 6.0, 2.5, //
        5.8, 2.7, 5.1, 1.9, //
    ];
    let labels = vec![0.0, 0.0, 1.0, 1.0, 2.0, 2.0];

    let data = Array::from_shape_vec((6, 4), measurements)
        .map_err(|e| crate::error::SklearsError::Other(e.to_string()))?;
    let target = Array::from_vec(labels);

    let feature_names: Vec<String> = [
        "sepal_length",
        "sepal_width",
        "petal_length",
        "petal_width",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();
    let target_names: Vec<String> = ["setosa", "versicolor", "virginica"]
        .iter()
        .map(|name| name.to_string())
        .collect();

    let dataset = Dataset::new(data, target)
        .with_feature_names(feature_names)
        .with_target_names(target_names)
        .with_description("Iris dataset (subset for testing)".to_string());
    Ok(dataset)
}
/// Generates a synthetic linear-regression dataset.
///
/// Feature values are produced with the Box–Muller transform from uniform
/// draws, one coefficient per feature is computed as `uniform * 20.0 - 10.0`,
/// and each target is the dot product of a sample with the coefficients plus a
/// Box–Muller draw scaled by `noise`.
///
/// # Arguments
///
/// * `n_samples` - number of rows in the feature matrix and entries in the target.
/// * `n_features` - number of columns in the feature matrix.
/// * `noise` - multiplier applied to the Gaussian term added to each target.
///
/// # Errors
///
/// Returns `SklearsError::Other` if the generated values cannot be arranged
/// into an `(n_samples, n_features)` array.
pub fn make_regression(n_samples: usize, n_features: usize, noise: f64) -> Result<Dataset> {
    use scirs2_core::random::thread_rng;

    // Box–Muller transform: maps two uniform draws to two normal deviates.
    // `u1` comes from the half-open range `0.0..1.0` and may be exactly 0.0,
    // which would make `ln` return -inf; `1.0 - u1` lies in (0, 1] and keeps
    // the result finite without changing the distribution.
    fn box_muller(u1: f64, u2: f64) -> (f64, f64) {
        let radius = (-2.0f64 * (1.0 - u1).ln()).sqrt();
        let angle = 2.0f64 * std::f64::consts::PI * u2;
        (radius * angle.cos(), radius * angle.sin())
    }

    let mut rng = thread_rng();
    let total = n_samples * n_features;

    // Each Box–Muller step yields two values, so half as many steps (rounded
    // up) are needed; the second value of the final step is dropped when
    // `total` is odd.
    let mut x_data = Vec::with_capacity(total);
    for _ in 0..total.div_ceil(2) {
        let u1: f64 = rng.gen_range(0.0..1.0);
        let u2: f64 = rng.gen_range(0.0..1.0);
        let (z0, z1) = box_muller(u1, u2);
        x_data.push(z0);
        if x_data.len() < total {
            x_data.push(z1);
        }
    }
    let x = Array::from_shape_vec((n_samples, n_features), x_data)
        .map_err(|e| crate::error::SklearsError::Other(e.to_string()))?;

    // One linear coefficient per feature.
    let mut coef: Vec<f64> = Vec::with_capacity(n_features);
    for _ in 0..n_features {
        coef.push(rng.gen_range(0.0..1.0) * 20.0 - 10.0);
    }

    let mut y_data = Vec::with_capacity(n_samples);
    for i in 0..n_samples {
        let mut y_i = 0.0;
        for j in 0..n_features {
            y_i += x[[i, j]] * coef[j];
        }
        let u1: f64 = rng.gen_range(0.0..1.0);
        let u2: f64 = rng.gen_range(0.0..1.0);
        // Only the cosine branch is used for the noise term.
        let (z, _) = box_muller(u1, u2);
        y_i += noise * z;
        y_data.push(y_i);
    }
    let y = Array::from_vec(y_data);

    Ok(Dataset::new(x, y).with_description(format!(
        "Synthetic regression dataset with {n_samples} samples and {n_features} features"
    )))
}
pub fn make_blobs(
n_samples: usize,
n_features: usize,
centers: usize,
cluster_std: f64,
) -> Result<Dataset> {
use scirs2_core::random::thread_rng;
let mut rng = thread_rng();
let samples_per_center = n_samples / centers;
let mut center_coords: Vec<f64> = Vec::with_capacity(centers * n_features);
for _ in 0..centers * n_features {
center_coords.push(rng.gen_range(0.0..1.0) * 20.0 - 10.0); }
let mut x_data = Vec::with_capacity(n_samples * n_features);
let mut y_data = Vec::with_capacity(n_samples);
for center_idx in 0..centers {
for _ in 0..samples_per_center {
for feature_idx in 0..n_features {
let center_value = center_coords[center_idx * n_features + feature_idx];
let u1: f64 = rng.gen_range(0.0..1.0);
let u2: f64 = rng.gen_range(0.0..1.0);
let normal_val = cluster_std
* (-2.0f64 * u1.ln()).sqrt()
* (2.0f64 * std::f64::consts::PI * u2).cos();
x_data.push(center_value + normal_val);
}
y_data.push(center_idx as f64);
}
}
let remaining = n_samples - (samples_per_center * centers);
for _ in 0..remaining {
let center_idx = centers - 1;
for feature_idx in 0..n_features {
let center_value = center_coords[center_idx * n_features + feature_idx];
let u1: f64 = rng.gen_range(0.0..1.0);
let u2: f64 = rng.gen_range(0.0..1.0);
let normal_val = cluster_std
* (-2.0f64 * u1.ln()).sqrt()
* (2.0f64 * std::f64::consts::PI * u2).cos();
x_data.push(center_value + normal_val);
}
y_data.push(center_idx as f64);
}
let x = Array::from_shape_vec((n_samples, n_features), x_data)
.map_err(|e| crate::error::SklearsError::Other(e.to_string()))?;
let y = Array::from_vec(y_data);
Ok(Dataset::new(x, y).with_description(format!(
"Synthetic blob dataset with {n_samples} samples, {n_features} features, and {centers} centers"
)))
}
/// Generates a synthetic two-class dataset.
///
/// Delegates to [`make_blobs`] with two centers and a cluster standard
/// deviation of `1.0`, then adds `separation` to every feature of the first
/// `n_samples / 2` rows and replaces the description.
///
/// # Errors
///
/// Propagates any error returned by [`make_blobs`].
pub fn make_classification(
    n_samples: usize,
    n_features: usize,
    separation: f64,
) -> Result<Dataset> {
    let mut dataset = make_blobs(n_samples, n_features, 2, 1.0)?;

    // Offset the first half of the rows by `separation` in every feature.
    let shifted_rows = n_samples / 2;
    for row in 0..shifted_rows {
        for col in 0..n_features {
            dataset.data[[row, col]] += separation;
        }
    }

    let description = format!(
        "Synthetic binary classification dataset with {n_samples} samples and {n_features} features"
    );
    Ok(dataset.with_description(description))
}
#[allow(non_snake_case)]
#[cfg(test)]
mod tests {
    use super::*;

    /// The bundled iris subset has 6 samples, 4 features and 3 target names.
    #[test]
    fn test_load_iris() {
        let dataset = load_iris().expect("expected valid value");

        assert_eq!(dataset.data.dim(), (6, 4));
        assert_eq!(dataset.target.len(), 6);
        assert_eq!(dataset.feature_names.len(), 4);

        let class_names = dataset
            .target_names
            .as_ref()
            .expect("value should be present");
        assert_eq!(class_names.len(), 3);

        assert!(dataset.description.contains("Iris"));
    }

    /// Shapes and description reflect the requested sample and feature counts.
    #[test]
    fn test_make_regression() {
        let generated = make_regression(50, 3, 0.1).expect("expected valid value");

        assert_eq!(generated.data.dim(), (50, 3));
        assert_eq!(generated.target.len(), 50);

        let text = &generated.description;
        assert!(text.contains("regression"));
        assert!(text.contains("50"));
        assert!(text.contains("3"));
    }

    /// Three centers yield three distinct target labels.
    #[test]
    fn test_make_blobs() {
        let generated = make_blobs(60, 2, 3, 1.0).expect("expected valid value");

        assert_eq!(generated.data.dim(), (60, 2));
        assert_eq!(generated.target.len(), 60);
        assert!(generated.description.contains("blob"));

        let mut labels = generated.target.iter().cloned().collect::<Vec<_>>();
        labels.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap_or(std::cmp::Ordering::Equal));
        labels.dedup();
        assert_eq!(labels.len(), 3);
    }

    /// A binary classification dataset has at most two distinct labels.
    #[test]
    fn test_make_classification() {
        let generated = make_classification(100, 2, 3.0).expect("expected valid value");

        assert_eq!(generated.data.dim(), (100, 2));
        assert_eq!(generated.target.len(), 100);
        assert!(generated.description.contains("classification"));

        let mut labels: Vec<_> = generated.target.iter().cloned().collect();
        labels.sort_by(|lhs, rhs| lhs.partial_cmp(rhs).unwrap_or(std::cmp::Ordering::Equal));
        // Collapse labels that are equal up to a small tolerance.
        labels.dedup_by(|lhs, rhs| (*lhs - *rhs).abs() < 1e-10);
        assert!(labels.len() <= 2);
    }

    /// Changing the noise level does not change shapes or the description.
    #[test]
    fn test_regression_noise_effect() {
        let low_noise = make_regression(50, 2, 0.0).expect("expected valid value");
        let high_noise = make_regression(50, 2, 1.0).expect("expected valid value");
        assert_eq!(low_noise.data.dim(), high_noise.data.dim());
        assert_eq!(low_noise.target.len(), high_noise.target.len());

        let zero_noise = make_regression(10, 1, 0.0).expect("expected valid value");
        let some_noise = make_regression(10, 1, 0.5).expect("expected valid value");
        for generated in [&zero_noise, &some_noise] {
            assert_eq!(generated.data.dim().0, 10);
        }
        for generated in [&zero_noise, &some_noise] {
            assert!(generated.description.contains("regression"));
        }
    }
}