1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
//! # Diabetes Data
//!
//! | Number of Instances | Number of Attributes | Missing Values? | Associated Tasks: |
//! |-|-|-|-|
//! | 442 | 10 | No | Regression |
//!
//! [Diabetes Data](https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html) was collected by Bradley Efron, Trevor Hastie, Iain Johnstone and Robert Tibshirani for the "Least Angle Regression" paper.
//! Predictive variables have been mean centered and scaled to unit variance.
//! The dataset has the following attributes:
//!
//! | Predictor | Data Type | Target? |
//! |-|-|-|
//! | Age | Numerical | No |
//! | Sex | Numerical | No |
//! | Body mass index (BMI) | Numerical | No |
//! | Average blood pressure (BP) | Numerical | No |
//! | Six blood serum measurements (S1 - S6) | Numerical | No |
//! | A quantitative measure of disease progression one year after baseline | Numerical | Yes |
//!
//! ## References:
//! * ["Least Angle Regression", Efron B., Hastie T., Johnstone I., Tibshirani R., 2004, Annals of Statistics (with discussion), 407-499](http://statweb.stanford.edu/~tibs/ftp/lars.pdf)
use crate::dataset::deserialize_data;
use crate::dataset::Dataset;

/// Loads the Diabetes dataset embedded in the crate at compile time.
///
/// The feature matrix, target vector and their dimensions are obtained by
/// deserializing the bundled `diabetes.xy` file; the feature names, target
/// name and description are attached here.
///
/// # Panics
///
/// Panics if the embedded `diabetes.xy` bytes cannot be deserialized.
pub fn load_dataset() -> Dataset<f32, f32> {
    // The file is baked into the binary, so a failure here means the bundled
    // data is broken rather than anything the caller could recover from.
    let (data, target, num_samples, num_features) =
        deserialize_data(std::include_bytes!("diabetes.xy"))
            .unwrap_or_else(|why| panic!("Can't deserialize diabetes.xy. {}", why));

    // Column labels, in the same order as the columns of `data`.
    let feature_names = [
        "Age", "Sex", "BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6",
    ]
    .iter()
    .map(|name| name.to_string())
    .collect();

    let target_names = vec![String::from("Disease progression")];
    let description =
        String::from("Diabetes Data: https://www4.stat.ncsu.edu/~boos/var.select/diabetes.html");

    Dataset {
        data,
        target,
        num_samples,
        num_features,
        feature_names,
        target_names,
        description,
    }
}

#[cfg(test)]
mod tests {

    // `serialize_data` comes from the parent `dataset` module.
    use super::super::*;
    use super::*;

    /// Maintenance helper, not a real test: re-serializes the loaded dataset
    /// to `diabetes.xy`. Ignored by default so a normal test run never writes
    /// to disk.
    #[test]
    #[ignore]
    fn refresh_diabetes_dataset() {
        // run this test to generate diabetes.xy file.
        let dataset = load_dataset();
        assert!(serialize_data(&dataset, "diabetes.xy").is_ok());
    }

    /// Checks that the embedded data has the documented shape
    /// (442 samples x 10 features) and that the buffers are consistent with it.
    #[test]
    fn diabetes_dataset() {
        let dataset = load_dataset();
        assert_eq!(
            dataset.data.len(),
            dataset.num_features * dataset.num_samples
        );
        assert_eq!(dataset.target.len(), dataset.num_samples);
        assert_eq!(dataset.num_features, 10);
        assert_eq!(dataset.num_samples, 442);
    }
}