use super::DatasetResult;
use crate::error::{DatasetsError, Result};
use scirs2_core::ndarray::{Array1, Array2};
use scirs2_core::random::prelude::*;
use scirs2_core::random::rand_distributions::Distribution;
/// Seed handed to the RNG so that every call to `load` yields the same data.
const BOSTON_SEED: u64 = 5_060_013;

/// Number of rows (samples) generated.
const N_SAMPLES: usize = 506;

/// Number of feature columns per sample.
const N_FEATURES: usize = 13;

/// Per-column centre of the sampling distribution, in column order
/// (CRIM, ZN, INDUS, CHAS, NOX, RM, AGE, DIS, RAD, TAX, PTRATIO, B, LSTAT).
const FEATURE_MEANS: [f64; N_FEATURES] = [
    3.6135, 11.364, 11.137, 0.0693, 0.5547, 6.2846, 68.575, 3.7950, 9.5494, 408.24, 18.456,
    356.67, 12.653,
];

/// Per-column standard deviation used to scale the standard-normal draws.
const FEATURE_STDS: [f64; N_FEATURES] = [
    8.6016, 23.322, 6.8604, 0.2539, 0.1159, 0.7026, 28.149, 2.1057, 8.7073, 168.54, 2.1649,
    91.295, 7.1411,
];

/// Per-column lower bound; sampled values are clamped to be no smaller.
const FEATURE_MINS: [f64; N_FEATURES] = [
    0.006, 0.0, 0.46, 0.0, 0.385, 3.56, 2.9, 1.13, 1.0, 187.0, 12.6, 0.32, 1.73,
];

/// Per-column upper bound; sampled values are clamped to be no larger.
const FEATURE_MAXS: [f64; N_FEATURES] = [
    88.98, 100.0, 27.74, 1.0, 0.871, 8.78, 100.0, 12.13, 24.0, 711.0, 22.0, 396.9, 37.97,
];

/// Linear coefficients applied to the feature columns when synthesising the target.
const TARGET_COEFS: [f64; N_FEATURES] = [
    -0.108, 0.046, 0.021, 2.687, -17.77, 3.810, 0.001, -1.476, 0.306, -0.012, -0.953, 0.009,
    -0.525,
];

/// Constant term of the linear target model.
const TARGET_INTERCEPT: f64 = 36.46;

/// Mean the generated target is shifted towards after sampling.
const TARGET_MEAN: f64 = 22.53;
/// Generates the synthetic, reproducible stand-in for the Boston Housing dataset.
///
/// Every sample draws one shared standard-normal factor plus one independent
/// standard-normal draw per feature; the two are mixed so each standardized
/// feature still has unit variance while the columns are mildly correlated.
/// Continuous columns are rescaled with `FEATURE_MEANS` / `FEATURE_STDS` and
/// clamped to `[FEATURE_MINS, FEATURE_MAXS]`. The binary CHAS column is set to
/// `1.0` with probability `FEATURE_MEANS[3]`.
///
/// The target is a linear model over the features (`TARGET_COEFS`,
/// `TARGET_INTERCEPT`) plus Gaussian noise, clamped to `[5, 50]`, then shifted
/// towards `TARGET_MEAN` and clamped again.
///
/// The RNG is seeded with `BOSTON_SEED`, so repeated calls return identical data.
///
/// # Errors
///
/// Returns [`DatasetsError::ComputationError`] if the standard normal
/// distribution cannot be constructed or if the sample buffer cannot be
/// reshaped into an `N_SAMPLES x N_FEATURES` array.
pub(super) fn load() -> Result<DatasetResult> {
    /// Column index of CHAS, the only binary feature.
    const CHAS_INDEX: usize = 3;
    /// Loading of every standardized feature on the per-sample shared factor.
    const SHARED_WEIGHT: f64 = 0.3;
    /// Standard deviation of the Gaussian noise added to the target.
    const TARGET_NOISE_STD: f64 = 4.5;
    /// Inclusive bounds the target is clamped to.
    const TARGET_MIN: f64 = 5.0;
    const TARGET_MAX: f64 = 50.0;

    let mut rng = StdRng::seed_from_u64(BOSTON_SEED);
    let normal_01 = scirs2_core::random::Normal::new(0.0, 1.0)
        .map_err(|e| DatasetsError::ComputationError(format!("Normal dist error: {e}")))?;

    // Chosen so that SHARED_WEIGHT^2 + indep_weight^2 == 1, i.e. the mixed
    // draw below is itself standard normal.
    let indep_weight = (1.0 - SHARED_WEIGHT * SHARED_WEIGHT).sqrt();

    // CHAS is an indicator whose mean is its probability of being 1. Comparing
    // against the column mean would fire for half of all samples, so compare
    // the standardized draw against the matching upper-tail normal quantile.
    let chas_cutoff = upper_tail_normal_quantile(FEATURE_MEANS[CHAS_INDEX]);

    let mut data_vec: Vec<f64> = Vec::with_capacity(N_SAMPLES * N_FEATURES);
    let mut target_vec: Vec<f64> = Vec::with_capacity(N_SAMPLES);

    for _ in 0..N_SAMPLES {
        let mut features = [0.0f64; N_FEATURES];

        // Draw order (shared factor, one draw per feature, target noise) is
        // part of the reproducibility contract; do not reorder.
        let z: f64 = normal_01.sample(&mut rng);
        for (feat, slot) in features.iter_mut().enumerate() {
            let indep: f64 = normal_01.sample(&mut rng);
            let standardized = SHARED_WEIGHT * z + indep_weight * indep;

            *slot = if feat == CHAS_INDEX {
                if standardized > chas_cutoff {
                    1.0
                } else {
                    0.0
                }
            } else {
                let raw = FEATURE_MEANS[feat] + FEATURE_STDS[feat] * standardized;
                raw.clamp(FEATURE_MINS[feat], FEATURE_MAXS[feat])
            };
        }
        data_vec.extend_from_slice(&features);

        let linear = TARGET_COEFS
            .iter()
            .zip(&features)
            .fold(TARGET_INTERCEPT, |acc, (coef, x)| acc + coef * x);
        let noise: f64 = normal_01.sample(&mut rng) * TARGET_NOISE_STD;
        target_vec.push((linear + noise).clamp(TARGET_MIN, TARGET_MAX));
    }

    // Re-centre the target on TARGET_MEAN. The second clamp keeps shifted
    // values in range, so the final mean can differ slightly from TARGET_MEAN.
    let current_mean: f64 = target_vec.iter().sum::<f64>() / N_SAMPLES as f64;
    let shift = TARGET_MEAN - current_mean;
    for v in &mut target_vec {
        *v = (*v + shift).clamp(TARGET_MIN, TARGET_MAX);
    }

    let data = Array2::from_shape_vec((N_SAMPLES, N_FEATURES), data_vec)
        .map_err(|e| DatasetsError::ComputationError(format!("Boston data shape error: {e}")))?;
    let target = Array1::from_vec(target_vec);

    Ok(DatasetResult {
        data,
        target,
        feature_names: vec![
            "CRIM".into(),
            "ZN".into(),
            "INDUS".into(),
            "CHAS".into(),
            "NOX".into(),
            "RM".into(),
            "AGE".into(),
            "DIS".into(),
            "RAD".into(),
            "TAX".into(),
            "PTRATIO".into(),
            "B".into(),
            "LSTAT".into(),
        ],
        target_names: vec!["MEDV".into()],
        description: "Boston Housing Dataset (Harrison & Rubinfeld, 1978)\n\n\
            **DEPRECATED**: This dataset has ethical concerns regarding the B variable.\n\
            Consider using California Housing instead.\n\n\
            506 samples, 13 features. Target: median home value in $1000s.\n\
            Features: crime rate, zoning, industry, Charles River proximity,\n\
            NOx concentration, rooms, age, distance, highway access, tax rate,\n\
            pupil-teacher ratio, demographic index, lower status percentage."
            .into(),
    })
}

/// Approximates the standard-normal value `x` for which `P(Z > x) = tail_prob`.
///
/// Uses the rational approximation 26.2.23 from Abramowitz & Stegun, whose
/// absolute error is below `4.5e-4`. `tail_prob` must lie strictly between
/// 0 and 1; at the endpoints the result is NaN.
fn upper_tail_normal_quantile(tail_prob: f64) -> f64 {
    const C: [f64; 3] = [2.515_517, 0.802_853, 0.010_328];
    const D: [f64; 3] = [1.432_788, 0.189_269, 0.001_308];

    debug_assert!(tail_prob > 0.0 && tail_prob < 1.0);

    // The approximation is stated for probabilities in (0, 0.5]; the other
    // half follows from the symmetry of the normal distribution.
    let p = tail_prob.min(1.0 - tail_prob);
    let t = (-2.0 * p.ln()).sqrt();
    let numerator = C[0] + t * (C[1] + t * C[2]);
    let denominator = 1.0 + t * (D[0] + t * (D[1] + t * D[2]));
    let x = t - numerator / denominator;

    if tail_prob <= 0.5 {
        x
    } else {
        -x
    }
}