mlinrust/dataset/
utils.rs1use crate::ndarray::NdArray;
2use crate::ndarray::utils::{std, mean, min, max};
3
4use super::{Dataset, TaskLabelType};
5
/// Strategy used by `impute_missing_values` to replace cells that failed to parse.
#[derive(Debug, Clone, Copy, PartialEq)]
pub enum ImputeType {
    /// Replace a missing cell with the mean of the valid cells in the same column.
    Mean,
    /// Replace a missing cell with `0.0`.
    Zero,
    /// Replace a missing cell with the given constant.
    Value(f32),
}
15
16pub fn impute_missing_values(res: Vec<Vec<Result<f32, Box<dyn std::error::Error>>>>, filled: ImputeType) -> Vec<Vec<f32>> {
20 match filled {
21 ImputeType::Mean => {
22 let counter = res.iter().fold(vec![(0.0f32, 0.0f32); res[0].len()], |mut fold, item| {
23 item.iter().enumerate().for_each(|(i, item)| {
24 if let Ok(v) = item {
25 fold[i].0 += 1.0;
26 fold[i].1 += v;
27 }
28 });
29 fold
30 });
31 let counter: Vec<f32> = counter.into_iter().map(|(num, sum)| sum / num).collect();
32 res.into_iter().map(|item| {
33 item.into_iter().enumerate().map(|(i, e)| {
34 if let Ok(v) = e {
35 v
36 } else {
37 counter[i]
38 }
39 }).collect::<Vec<f32>>()
40 }).collect::<Vec<Vec<f32>>>()
41 },
42 ImputeType::Zero => {
43 res.into_iter().map(|item| {
44 item.into_iter().map(|e| {
45 if let Ok(v) = e {
46 v
47 } else {
48 0.0
49 }
50 }).collect::<Vec<f32>>()
51 }).collect::<Vec<Vec<f32>>>()
52 },
53 ImputeType::Value(value) => {
54 res.into_iter().map(|item| {
55 item.into_iter().map(|e| {
56 if let Ok(v) = e {
57 v
58 } else {
59 value
60 }
61 }).collect::<Vec<f32>>()
62 }).collect::<Vec<Vec<f32>>>()
63 },
64 }
65}
66
/// Feature-scaling strategy applied by `normalize_dataset`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ScalerType {
    /// Subtract the mean and divide by the standard deviation.
    Standard,
    /// Subtract the minimum and divide by the range (`max - min`).
    MinMax,
}
72
73pub fn normalize_dataset<T: TaskLabelType + Copy>(dataset: &mut Dataset<T>, scaler: ScalerType) {
74 let feature_len = dataset.feature_len();
75 let data = std::mem::replace(&mut dataset.features, vec![]);
76 let data = NdArray::new(data); let data = match scaler {
78 ScalerType::MinMax => {
79 let min = min(&data, 0);
80 let numerator = &data - &min;
81
82 let mut denominator = -min + max(&data, 0);
83 denominator.data_as_mut_vector().iter_mut().for_each(|i| *i = 1.0 / f32::max(1e-6, *i));
84
85 numerator.point_multiply(&denominator).destroy().1
86 },
87 ScalerType::Standard => {
88 let mu = mean(&data, 0);
89 let mut sigma = std(&data, 0, false);
91 sigma.data_as_mut_vector().iter_mut().for_each(|i| *i = 1.0 / f32::max(1e-6, *i));
92 (data - mu).point_multiply(&sigma).destroy().1
93 },
94 };
95 let data: Vec<Vec<f32>> = data.chunks_exact(feature_len).map(|v| v.into()).collect();
96 dataset.features = data;
97}
98
#[cfg(test)]
mod test {
    use crate::{ndarray::{NdArray, utils::{mean, std}}, utils::RandGenerator, dataset};

    use super::normalize_dataset;

    /// Runs both scalers on a small random dataset and checks properties that must hold
    /// after each one, instead of only printing the result.
    #[test]
    fn test_normalize_dataset() {
        let mut rng = RandGenerator::new(0);
        let data = (0..4).map(|_| (0..2).map(|_| rng.gen_f32()).collect()).collect();
        let mut dataset = dataset::Dataset::new(data, vec![rng.gen_f32(); 4], None);

        println!("{:?}", dataset.features);

        normalize_dataset(&mut dataset, super::ScalerType::MinMax);
        println!("min_max{:?}", dataset.features);

        // Normalization must keep the 4 x 2 layout the dataset was built with.
        assert_eq!(dataset.features.len(), 4);
        assert!(dataset.features.iter().all(|row| row.len() == 2));
        // Min-max output lies in [0, 1]; the slack absorbs f32 rounding of the reciprocal.
        assert!(
            dataset.features.iter().flatten().all(|&v| v >= -1e-5 && v <= 1.0 + 1e-5),
            "min-max scaled values left [0, 1]: {:?}",
            dataset.features
        );

        normalize_dataset(&mut dataset, super::ScalerType::Standard);
        let mut a = NdArray::new(dataset.features.clone());
        a.reshape(vec![dataset.len(), dataset.feature_len()]);
        println!("std{:?}\nmean:{}, std:{}", dataset.features, mean(&a, 0), std(&a, 0, false));

        assert_eq!(dataset.features.len(), 4);
        assert!(dataset.features.iter().all(|row| row.len() == 2));
        assert!(dataset.features.iter().flatten().all(|v| v.is_finite()));
        // Centering makes every reduced slice sum to zero, so the grand total is zero too.
        let total: f32 = dataset.features.iter().flatten().sum();
        assert!(total.abs() < 1e-3, "standardized values are not centered: sum = {}", total);
    }

    /// Smoke test: adding a `[1, 2]` array to a `[4, 2]` array must not panic.
    #[test]
    fn test() {
        let a = NdArray::random(vec![4, 2], None);
        let b = NdArray::random(vec![1, 2], Some(2));
        println!("a can broadcast b {}", a + b);
    }
}