mlinrust/dataset/
utils.rs

1use crate::ndarray::NdArray;
2use crate::ndarray::utils::{std, mean, min, max};
3
4use super::{Dataset, TaskLabelType};
5
/// Strategy for replacing a missing value during imputation.
///
/// * `Mean`: use the mean of the values present at the same position across rows
/// * `Zero`: use `0.0`
/// * `Value(v)`: use the given value `v`
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ImputeType {
    /// Fill with the mean of the non-missing values.
    Mean,
    /// Fill with `0.0`.
    Zero,
    /// Fill with the wrapped constant.
    Value(f32),
}
15
16/// impute missing values in the dataset
17/// * res: should be the results of the dataset preprocessing
18/// * filled: ImputeType {Mean, Zero, Specific Value f32}
19pub fn impute_missing_values(res: Vec<Vec<Result<f32, Box<dyn std::error::Error>>>>, filled: ImputeType) -> Vec<Vec<f32>> {
20    match filled {
21        ImputeType::Mean => {
22            let counter = res.iter().fold(vec![(0.0f32, 0.0f32); res[0].len()], |mut fold, item| {
23                item.iter().enumerate().for_each(|(i, item)| {
24                    if let Ok(v) = item {
25                        fold[i].0 += 1.0;
26                        fold[i].1 += v;
27                    }
28                });
29                fold
30            });
31            let counter: Vec<f32> = counter.into_iter().map(|(num, sum)| sum / num).collect();
32            res.into_iter().map(|item| {
33                item.into_iter().enumerate().map(|(i, e)| {
34                    if let Ok(v) = e {
35                        v
36                    } else {
37                        counter[i]
38                    }
39                }).collect::<Vec<f32>>()
40            }).collect::<Vec<Vec<f32>>>()
41        },
42        ImputeType::Zero => {
43            res.into_iter().map(|item| {
44                item.into_iter().map(|e| {
45                    if let Ok(v) = e {
46                        v
47                    } else {
48                        0.0
49                    }
50                }).collect::<Vec<f32>>()
51            }).collect::<Vec<Vec<f32>>>()
52        },
53        ImputeType::Value(value) => {
54            res.into_iter().map(|item| {
55                item.into_iter().map(|e| {
56                    if let Ok(v) = e {
57                        v
58                    } else {
59                        value
60                    }
61                }).collect::<Vec<f32>>()
62            }).collect::<Vec<Vec<f32>>>()
63        },
64    }
65}
66
/// Scaler used for data normalization.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScalerType {
    /// Subtract the mean, then divide by the standard deviation.
    Standard,
    /// Subtract the minimum, then divide by `max - min`.
    MinMax,
}
72
73pub fn normalize_dataset<T: TaskLabelType + Copy>(dataset: &mut Dataset<T>, scaler: ScalerType) {
74    let feature_len = dataset.feature_len();
75    let data = std::mem::replace(&mut dataset.features, vec![]);
76    let data = NdArray::new(data); // [num, feature_len]
77    let data = match scaler {
78        ScalerType::MinMax => {
79            let min = min(&data, 0);
80            let numerator = &data - &min;
81
82            let mut denominator = -min + max(&data, 0);
83            denominator.data_as_mut_vector().iter_mut().for_each(|i| *i = 1.0 / f32::max(1e-6, *i));
84
85            numerator.point_multiply(&denominator).destroy().1
86        },
87        ScalerType::Standard => {
88            let mu = mean(&data, 0);
89            // the dataset should have infinite data idealy
90            let mut sigma = std(&data, 0, false);
91            sigma.data_as_mut_vector().iter_mut().for_each(|i| *i = 1.0 / f32::max(1e-6, *i));
92            (data - mu).point_multiply(&sigma).destroy().1            
93        },
94    };
95    let data: Vec<Vec<f32>> = data.chunks_exact(feature_len).map(|v| v.into()).collect();
96    dataset.features = data;
97}
98
#[cfg(test)]
mod test {
    use crate::{ndarray::{NdArray, utils::{mean, std}}, utils::RandGenerator, dataset};

    use super::normalize_dataset;

    /// Normalizes a small random dataset with both scalers and checks the properties
    /// that must hold regardless of the random values drawn.
    #[test]
    fn test_normalize_dataset() {
        let mut rng = RandGenerator::new(0);
        let data = (0..4).map(|_| (0..2).map(|_| rng.gen_f32()).collect()).collect();
        let mut dataset = dataset::Dataset::new(data, vec![rng.gen_f32(); 4], None);

        println!("{:?}", dataset.features);

        normalize_dataset(&mut dataset, super::ScalerType::MinMax);
        println!("min_max{:?}", dataset.features);

        // The shape must survive normalization.
        assert_eq!(dataset.features.len(), 4);
        for row in &dataset.features {
            assert_eq!(row.len(), 2);
            for &v in row {
                // Min-max scaling maps every value into [0, 1], up to rounding.
                assert!((-1e-5..=1.0 + 1e-5).contains(&v), "min-max value out of range: {}", v);
            }
        }

        normalize_dataset(&mut dataset, super::ScalerType::Standard);
        let mut a = NdArray::new(dataset.features.clone());
        a.reshape(vec![dataset.len(), dataset.feature_len()]);
        println!("std{:?}\nmean:{}, std:{}", dataset.features, mean(&a, 0), std(&a, 0, false));

        assert_eq!(dataset.features.len(), 4);
        for col in 0..2 {
            // Subtracting the column mean leaves a column whose mean is zero.
            let col_mean = dataset.features.iter().map(|row| row[col]).sum::<f32>() / 4.0;
            assert!(col_mean.abs() < 1e-4, "column {} mean after standardization: {}", col, col_mean);
        }
    }

    /// Smoke test: adding a `[1, 2]` array to a `[4, 2]` array must not panic.
    #[test]
    fn test() {
        let a = NdArray::random(vec![4, 2], None);
        let b = NdArray::random(vec![1, 2], Some(2));
        println!("a can broadcast b {}", a + b);
    }
}