1use ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
7use num_traits::{Float, NumCast};
8use scirs2_core::parallel_ops::*;
9
10use crate::error::{Result, TransformError};
11
/// Strategy used by [`SimpleImputer`] (and as the initial fill for
/// [`IterativeImputer`]) to replace missing values.
#[derive(Debug, Clone, PartialEq)]
pub enum ImputeStrategy {
    /// Replace missing values with the per-feature mean of observed values.
    Mean,
    /// Replace missing values with the per-feature median of observed values.
    Median,
    /// Replace missing values with the most frequent observed value per feature.
    MostFrequent,
    /// Replace missing values with the given constant.
    Constant(f64),
}
24
/// Imputer that fills missing entries with a simple per-feature statistic.
pub struct SimpleImputer {
    // How the per-feature fill value is computed.
    strategy: ImputeStrategy,
    // Value treated as "missing" (a NaN marker matches any NaN).
    missingvalues: f64,
    // Per-feature fill values learned by `fit`; `None` until fitted.
    statistics_: Option<Array1<f64>>,
}
37
38impl SimpleImputer {
39 pub fn new(strategy: ImputeStrategy, missingvalues: f64) -> Self {
48 SimpleImputer {
49 strategy,
50 missingvalues,
51 statistics_: None,
52 }
53 }
54
55 #[allow(dead_code)]
63 pub fn with_strategy(strategy: ImputeStrategy) -> Self {
64 Self::new(strategy, f64::NAN)
65 }
66
67 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
75 where
76 S: Data,
77 S::Elem: Float + NumCast,
78 {
79 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
80
81 let n_samples = x_f64.shape()[0];
82 let n_features = x_f64.shape()[1];
83
84 if n_samples == 0 || n_features == 0 {
85 return Err(TransformError::InvalidInput("Empty input data".to_string()));
86 }
87
88 let mut statistics = Array1::zeros(n_features);
89
90 for j in 0..n_features {
91 let feature_data: Vec<f64> = x_f64
93 .column(j)
94 .iter()
95 .filter(|&&val| !self.is_missing(val))
96 .copied()
97 .collect();
98
99 if feature_data.is_empty() {
100 return Err(TransformError::InvalidInput(format!(
101 "All values are missing in feature {j}"
102 )));
103 }
104
105 statistics[j] = match &self.strategy {
106 ImputeStrategy::Mean => {
107 feature_data.iter().sum::<f64>() / feature_data.len() as f64
108 }
109 ImputeStrategy::Median => {
110 let mut sorted_data = feature_data.clone();
111 sorted_data
112 .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
113 let n = sorted_data.len();
114 if n % 2 == 0 {
115 (sorted_data[n / 2 - 1] + sorted_data[n / 2]) / 2.0
116 } else {
117 sorted_data[n / 2]
118 }
119 }
120 ImputeStrategy::MostFrequent => {
121 let mut counts = std::collections::HashMap::new();
124 for &val in &feature_data {
125 *counts.entry(val.to_bits()).or_insert(0) += 1;
126 }
127
128 let most_frequent_bits = counts
129 .into_iter()
130 .max_by_key(|(_, count)| *count)
131 .map(|(bits_, _)| bits_)
132 .unwrap_or(0);
133
134 f64::from_bits(most_frequent_bits)
135 }
136 ImputeStrategy::Constant(value) => *value,
137 };
138 }
139
140 self.statistics_ = Some(statistics);
141 Ok(())
142 }
143
144 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
152 where
153 S: Data,
154 S::Elem: Float + NumCast,
155 {
156 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
157
158 let n_samples = x_f64.shape()[0];
159 let n_features = x_f64.shape()[1];
160
161 if self.statistics_.is_none() {
162 return Err(TransformError::TransformationError(
163 "SimpleImputer has not been fitted".to_string(),
164 ));
165 }
166
167 let statistics = self.statistics_.as_ref().unwrap();
168
169 if n_features != statistics.len() {
170 return Err(TransformError::InvalidInput(format!(
171 "x has {} features, but SimpleImputer was fitted with {} features",
172 n_features,
173 statistics.len()
174 )));
175 }
176
177 let mut transformed = Array2::zeros((n_samples, n_features));
178
179 for i in 0..n_samples {
180 for j in 0..n_features {
181 let value = x_f64[[i, j]];
182 if self.is_missing(value) {
183 transformed[[i, j]] = statistics[j];
184 } else {
185 transformed[[i, j]] = value;
186 }
187 }
188 }
189
190 Ok(transformed)
191 }
192
193 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
201 where
202 S: Data,
203 S::Elem: Float + NumCast,
204 {
205 self.fit(x)?;
206 self.transform(x)
207 }
208
209 #[allow(dead_code)]
214 pub fn statistics(&self) -> Option<&Array1<f64>> {
215 self.statistics_.as_ref()
216 }
217
218 fn is_missing(&self, value: f64) -> bool {
226 if self.missingvalues.is_nan() {
227 value.is_nan()
228 } else {
229 (value - self.missingvalues).abs() < f64::EPSILON
230 }
231 }
232}
233
/// Transformer that produces a binary mask marking where values were missing.
pub struct MissingIndicator {
    // Value treated as "missing" (a NaN marker matches any NaN).
    missingvalues: f64,
    // Indices of the features that contained missing values at fit time;
    // `None` until fitted.
    features_: Option<Vec<usize>>,
}
244
245impl MissingIndicator {
246 pub fn new(missingvalues: f64) -> Self {
254 MissingIndicator {
255 missingvalues,
256 features_: None,
257 }
258 }
259
260 pub fn with_nan() -> Self {
262 Self::new(f64::NAN)
263 }
264
265 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
273 where
274 S: Data,
275 S::Elem: Float + NumCast,
276 {
277 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
278
279 let n_features = x_f64.shape()[1];
280 let mut features_with_missing = Vec::new();
281
282 for j in 0..n_features {
283 let has_missing = x_f64.column(j).iter().any(|&val| self.is_missing(val));
284 if has_missing {
285 features_with_missing.push(j);
286 }
287 }
288
289 self.features_ = Some(features_with_missing);
290 Ok(())
291 }
292
293 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
301 where
302 S: Data,
303 S::Elem: Float + NumCast,
304 {
305 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
306
307 let n_samples = x_f64.shape()[0];
308
309 if self.features_.is_none() {
310 return Err(TransformError::TransformationError(
311 "MissingIndicator has not been fitted".to_string(),
312 ));
313 }
314
315 let features_with_missing = self.features_.as_ref().unwrap();
316 let n_output_features = features_with_missing.len();
317
318 let mut indicators = Array2::zeros((n_samples, n_output_features));
319
320 for i in 0..n_samples {
321 for (out_j, &orig_j) in features_with_missing.iter().enumerate() {
322 if self.is_missing(x_f64[[i, orig_j]]) {
323 indicators[[i, out_j]] = 1.0;
324 }
325 }
326 }
327
328 Ok(indicators)
329 }
330
331 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
339 where
340 S: Data,
341 S::Elem: Float + NumCast,
342 {
343 self.fit(x)?;
344 self.transform(x)
345 }
346
347 pub fn features(&self) -> Option<&Vec<usize>> {
352 self.features_.as_ref()
353 }
354
355 fn is_missing(&self, value: f64) -> bool {
363 if self.missingvalues.is_nan() {
364 value.is_nan()
365 } else {
366 (value - self.missingvalues).abs() < f64::EPSILON
367 }
368 }
369}
370
/// Distance metric used by [`KNNImputer`] to compare samples.
#[derive(Debug, Clone, PartialEq)]
pub enum DistanceMetric {
    /// Euclidean (L2) distance.
    Euclidean,
    /// Manhattan (L1) distance.
    Manhattan,
}
379
/// How neighbor contributions are weighted during KNN imputation.
#[derive(Debug, Clone, PartialEq)]
pub enum WeightingScheme {
    /// Every neighbor contributes equally.
    Uniform,
    /// Closer neighbors contribute more (inverse-distance weighting).
    Distance,
}
388
/// K-nearest-neighbors imputer: fills each missing entry from the values of
/// the same feature in the nearest training samples.
pub struct KNNImputer {
    // Number of neighbors consulted per imputation.
    _nneighbors: usize,
    // Distance metric for the neighbor search.
    metric: DistanceMetric,
    // How neighbor contributions are weighted.
    weights: WeightingScheme,
    // Value treated as "missing" (a NaN marker matches any NaN).
    missingvalues: f64,
    // Training data stored by `fit`; `None` until fitted.
    x_train_: Option<Array2<f64>>,
}
406
407impl KNNImputer {
408 pub fn new(
419 _nneighbors: usize,
420 metric: DistanceMetric,
421 weights: WeightingScheme,
422 missingvalues: f64,
423 ) -> Self {
424 KNNImputer {
425 _nneighbors,
426 metric,
427 weights,
428 missingvalues,
429 x_train_: None,
430 }
431 }
432
433 pub fn with_defaults() -> Self {
437 Self::new(
438 5,
439 DistanceMetric::Euclidean,
440 WeightingScheme::Uniform,
441 f64::NAN,
442 )
443 }
444
445 pub fn with_n_neighbors(_nneighbors: usize) -> Self {
447 Self::new(
448 _nneighbors,
449 DistanceMetric::Euclidean,
450 WeightingScheme::Uniform,
451 f64::NAN,
452 )
453 }
454
455 pub fn with_distance_weighting(_nneighbors: usize) -> Self {
457 Self::new(
458 _nneighbors,
459 DistanceMetric::Euclidean,
460 WeightingScheme::Distance,
461 f64::NAN,
462 )
463 }
464
465 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
473 where
474 S: Data,
475 S::Elem: Float + NumCast,
476 {
477 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
478
479 let n_samples = x_f64.shape()[0];
481 if n_samples < self._nneighbors {
482 return Err(TransformError::InvalidInput(format!(
483 "Number of samples ({}) must be >= _nneighbors ({})",
484 n_samples, self._nneighbors
485 )));
486 }
487
488 self.x_train_ = Some(x_f64);
490 Ok(())
491 }
492
493 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
501 where
502 S: Data,
503 S::Elem: Float + NumCast,
504 {
505 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
506
507 if self.x_train_.is_none() {
508 return Err(TransformError::TransformationError(
509 "KNNImputer must be fitted before transform".to_string(),
510 ));
511 }
512
513 let x_train = self.x_train_.as_ref().unwrap();
514 let (n_samples, n_features) = x_f64.dim();
515
516 if n_features != x_train.shape()[1] {
517 return Err(TransformError::InvalidInput(format!(
518 "Number of features in transform data ({}) doesn't match training data ({})",
519 n_features,
520 x_train.shape()[1]
521 )));
522 }
523
524 let mut result = x_f64.clone();
525
526 for i in 0..n_samples {
528 let sample = x_f64.row(i);
529
530 let missing_features: Vec<usize> = (0..n_features)
532 .filter(|&j| self.is_missing(sample[j]))
533 .collect();
534
535 if missing_features.is_empty() {
536 continue; }
538
539 let neighbors =
541 self.find_nearest_neighbors_excluding(&sample.to_owned(), x_train, i)?;
542
543 for &feature_idx in &missing_features {
545 let imputed_value = self.impute_feature(feature_idx, &neighbors, x_train)?;
546 result[[i, feature_idx]] = imputed_value;
547 }
548 }
549
550 Ok(result)
551 }
552
553 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
561 where
562 S: Data,
563 S::Elem: Float + NumCast,
564 {
565 self.fit(x)?;
566 self.transform(x)
567 }
568
569 fn find_nearest_neighbors_excluding(
571 &self,
572 sample: &Array1<f64>,
573 x_train: &Array2<f64>,
574 exclude_idx: usize,
575 ) -> Result<Vec<usize>> {
576 let n_train_samples = x_train.shape()[0];
577
578 let distances: Vec<(usize, f64)> = (0..n_train_samples)
580 .into_par_iter()
581 .filter(|&i| i != exclude_idx)
582 .map(|i| {
583 let train_sample = x_train.row(i);
584 let distance = self.compute_distance(sample, &train_sample.to_owned());
585 (i, distance)
586 })
587 .collect();
588
589 let mut sorted_distances = distances;
591 sorted_distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
592
593 let neighbors: Vec<usize> = sorted_distances
594 .into_iter()
595 .take(self._nneighbors)
596 .map(|(idx_, _)| idx_)
597 .collect();
598
599 Ok(neighbors)
600 }
601
602 fn compute_distance(&self, sample1: &Array1<f64>, sample2: &Array1<f64>) -> f64 {
604 let n_features = sample1.len();
605 let mut distance = 0.0;
606 let mut valid_features = 0;
607
608 for i in 0..n_features {
609 let val1 = sample1[i];
610 let val2 = sample2[i];
611
612 if self.is_missing(val1) || self.is_missing(val2) {
614 continue;
615 }
616
617 valid_features += 1;
618 let diff = val1 - val2;
619
620 match self.metric {
621 DistanceMetric::Euclidean => {
622 distance += diff * diff;
623 }
624 DistanceMetric::Manhattan => {
625 distance += diff.abs();
626 }
627 }
628 }
629
630 if valid_features == 0 {
632 return f64::INFINITY;
633 }
634
635 distance /= valid_features as f64;
637
638 match self.metric {
639 DistanceMetric::Euclidean => distance.sqrt(),
640 DistanceMetric::Manhattan => distance,
641 }
642 }
643
644 fn impute_feature(
646 &self,
647 feature_idx: usize,
648 neighbors: &[usize],
649 x_train: &Array2<f64>,
650 ) -> Result<f64> {
651 let mut values = Vec::new();
652 let mut weights = Vec::new();
653
654 for &neighbor_idx in neighbors {
656 let neighbor_value = x_train[[neighbor_idx, feature_idx]];
657
658 if !self.is_missing(neighbor_value) {
659 values.push(neighbor_value);
660
661 let weight = match self.weights {
663 WeightingScheme::Uniform => 1.0,
664 WeightingScheme::Distance => {
665 1.0 }
669 };
670 weights.push(weight);
671 }
672 }
673
674 if values.is_empty() {
675 return Err(TransformError::TransformationError(format!(
676 "No valid neighbors found for feature {feature_idx} imputation"
677 )));
678 }
679
680 let total_weight: f64 = weights.iter().sum();
682 if total_weight == 0.0 {
683 return Err(TransformError::TransformationError(
684 "Total weight is zero for imputation".to_string(),
685 ));
686 }
687
688 let weighted_sum: f64 = values
689 .iter()
690 .zip(weights.iter())
691 .map(|(&val, &weight)| val * weight)
692 .sum();
693
694 Ok(weighted_sum / total_weight)
695 }
696
697 fn is_missing(&self, value: f64) -> bool {
699 if self.missingvalues.is_nan() {
700 value.is_nan()
701 } else {
702 (value - self.missingvalues).abs() < f64::EPSILON
703 }
704 }
705
706 pub fn _nneighbors(&self) -> usize {
708 self._nneighbors
709 }
710
711 pub fn metric(&self) -> &DistanceMetric {
713 &self.metric
714 }
715
716 pub fn weights(&self) -> &WeightingScheme {
718 &self.weights
719 }
720}
721
/// Minimal ridge regressor used internally by [`IterativeImputer`] to predict
/// one feature from the others.
#[derive(Debug, Clone)]
struct SimpleRegressor {
    // Fitted coefficients (intercept first when `includeintercept`); `None`
    // until fitted.
    coefficients: Option<Array1<f64>>,
    // Whether to prepend an intercept column to the design matrix.
    includeintercept: bool,
    // Ridge regularization strength added to the normal-equation diagonal.
    alpha: f64,
}
735
736impl SimpleRegressor {
737 fn new(includeintercept: bool, alpha: f64) -> Self {
739 Self {
740 coefficients: None,
741 includeintercept,
742 alpha,
743 }
744 }
745
746 fn fit(&mut self, x: &Array2<f64>, y: &Array1<f64>) -> Result<()> {
748 let (n_samples, n_features) = x.dim();
749
750 if n_samples != y.len() {
751 return Err(TransformError::InvalidInput(
752 "X and y must have the same number of samples".to_string(),
753 ));
754 }
755
756 let x_design = if self.includeintercept {
758 let mut x_with_intercept = Array2::ones((n_samples, n_features + 1));
759 x_with_intercept.slice_mut(ndarray::s![.., 1..]).assign(x);
760 x_with_intercept
761 } else {
762 x.to_owned()
763 };
764
765 let xtx = x_design.t().dot(&x_design);
767 let xty = x_design.t().dot(y);
768
769 let mut regularized_xtx = xtx;
771 let n_coeffs = regularized_xtx.shape()[0];
772 for i in 0..n_coeffs {
773 regularized_xtx[[i, i]] += self.alpha;
774 }
775
776 self.coefficients = Some(self.solve_linear_system(®ularized_xtx, &xty)?);
778
779 Ok(())
780 }
781
782 fn predict(&self, x: &Array2<f64>) -> Result<Array1<f64>> {
784 let coeffs = self.coefficients.as_ref().ok_or_else(|| {
785 TransformError::TransformationError(
786 "Regressor must be fitted before prediction".to_string(),
787 )
788 })?;
789
790 let x_design = if self.includeintercept {
791 let (n_samples, n_features) = x.dim();
792 let mut x_with_intercept = Array2::ones((n_samples, n_features + 1));
793 x_with_intercept.slice_mut(ndarray::s![.., 1..]).assign(x);
794 x_with_intercept
795 } else {
796 x.to_owned()
797 };
798
799 Ok(x_design.dot(coeffs))
800 }
801
802 fn solve_linear_system(&self, a: &Array2<f64>, b: &Array1<f64>) -> Result<Array1<f64>> {
804 let n = a.shape()[0];
805 let mut aug_matrix = Array2::zeros((n, n + 1));
806
807 aug_matrix.slice_mut(ndarray::s![.., ..n]).assign(a);
809 aug_matrix.slice_mut(ndarray::s![.., n]).assign(b);
810
811 for i in 0..n {
813 let mut max_row = i;
815 for k in (i + 1)..n {
816 if aug_matrix[[k, i]].abs() > aug_matrix[[max_row, i]].abs() {
817 max_row = k;
818 }
819 }
820
821 if max_row != i {
823 for j in 0..=n {
824 let temp = aug_matrix[[i, j]];
825 aug_matrix[[i, j]] = aug_matrix[[max_row, j]];
826 aug_matrix[[max_row, j]] = temp;
827 }
828 }
829
830 if aug_matrix[[i, i]].abs() < 1e-12 {
832 return Err(TransformError::TransformationError(
833 "Singular matrix in regression".to_string(),
834 ));
835 }
836
837 let pivot = aug_matrix[[i, i]];
839 for j in i..=n {
840 aug_matrix[[i, j]] /= pivot;
841 }
842
843 for k in 0..n {
845 if k != i {
846 let factor = aug_matrix[[k, i]];
847 for j in i..=n {
848 aug_matrix[[k, j]] -= factor * aug_matrix[[i, j]];
849 }
850 }
851 }
852 }
853
854 let mut solution = Array1::zeros(n);
856 for i in 0..n {
857 solution[i] = aug_matrix[[i, n]];
858 }
859
860 Ok(solution)
861 }
862}
863
/// Iterative (MICE-style) imputer: repeatedly regresses each feature with
/// missing values on the remaining features until convergence.
pub struct IterativeImputer {
    // Maximum number of imputation rounds.
    max_iter: usize,
    // Stop when the largest per-value change in a round falls below this.
    tolerance: f64,
    // Strategy for the initial fill before iterating.
    initial_strategy: ImputeStrategy,
    // Stored seed; currently not consumed by the deterministic solver.
    random_seed: Option<u64>,
    // Value treated as "missing" (a NaN marker matches any NaN).
    missingvalues: f64,
    // Ridge regularization strength for the per-feature regressors.
    alpha: f64,
    // Minimum total change required to keep iterating.
    min_improvement: f64,

    // Training data stored by `fit`.
    x_train_: Option<Array2<f64>>,
    // Indices of features that contained missing values at fit time.
    missing_features_: Option<Vec<usize>>,
    // Per-feature initial fill values computed at fit time.
    initial_values_: Option<Array1<f64>>,
    // Whether `fit` has completed.
    is_fitted_: bool,
}
900
901impl IterativeImputer {
902 pub fn new(
914 max_iter: usize,
915 tolerance: f64,
916 initial_strategy: ImputeStrategy,
917 missingvalues: f64,
918 alpha: f64,
919 ) -> Self {
920 IterativeImputer {
921 max_iter,
922 tolerance,
923 initial_strategy,
924 random_seed: None,
925 missingvalues,
926 alpha,
927 min_improvement: 1e-6,
928 x_train_: None,
929 missing_features_: None,
930 initial_values_: None,
931 is_fitted_: false,
932 }
933 }
934
935 pub fn with_defaults() -> Self {
940 Self::new(10, 1e-3, ImputeStrategy::Mean, f64::NAN, 1e-6)
941 }
942
943 pub fn with_max_iter(_maxiter: usize) -> Self {
945 Self::new(_maxiter, 1e-3, ImputeStrategy::Mean, f64::NAN, 1e-6)
946 }
947
948 pub fn with_random_seed(mut self, seed: u64) -> Self {
950 self.random_seed = Some(seed);
951 self
952 }
953
954 pub fn with_alpha(mut self, alpha: f64) -> Self {
956 self.alpha = alpha;
957 self
958 }
959
960 pub fn with_min_improvement(mut self, minimprovement: f64) -> Self {
962 self.min_improvement = minimprovement;
963 self
964 }
965
966 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
974 where
975 S: Data,
976 S::Elem: Float + NumCast,
977 {
978 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
979 let (n_samples, n_features) = x_f64.dim();
980
981 if n_samples == 0 || n_features == 0 {
982 return Err(TransformError::InvalidInput("Empty input data".to_string()));
983 }
984
985 let missing_features: Vec<usize> = (0..n_features)
987 .filter(|&j| x_f64.column(j).iter().any(|&val| self.is_missing(val)))
988 .collect();
989
990 if missing_features.is_empty() {
991 self.x_train_ = Some(x_f64);
993 self.missing_features_ = Some(Vec::new());
994 self.initial_values_ = Some(Array1::zeros(0));
995 self.is_fitted_ = true;
996 return Ok(());
997 }
998
999 let mut initial_values = Array1::zeros(n_features);
1001 for &feature_idx in &missing_features {
1002 let feature_data: Vec<f64> = x_f64
1003 .column(feature_idx)
1004 .iter()
1005 .filter(|&&val| !self.is_missing(val))
1006 .copied()
1007 .collect();
1008
1009 if feature_data.is_empty() {
1010 return Err(TransformError::InvalidInput(format!(
1011 "All values are missing in feature {feature_idx}"
1012 )));
1013 }
1014
1015 initial_values[feature_idx] = match &self.initial_strategy {
1016 ImputeStrategy::Mean => {
1017 feature_data.iter().sum::<f64>() / feature_data.len() as f64
1018 }
1019 ImputeStrategy::Median => {
1020 let mut sorted_data = feature_data;
1021 sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
1022 let len = sorted_data.len();
1023 if len % 2 == 0 {
1024 (sorted_data[len / 2 - 1] + sorted_data[len / 2]) / 2.0
1025 } else {
1026 sorted_data[len / 2]
1027 }
1028 }
1029 ImputeStrategy::MostFrequent => {
1030 feature_data.iter().sum::<f64>() / feature_data.len() as f64
1032 }
1033 ImputeStrategy::Constant(value) => *value,
1034 };
1035 }
1036
1037 self.x_train_ = Some(x_f64);
1038 self.missing_features_ = Some(missing_features);
1039 self.initial_values_ = Some(initial_values);
1040 self.is_fitted_ = true;
1041
1042 Ok(())
1043 }
1044
1045 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1053 where
1054 S: Data,
1055 S::Elem: Float + NumCast,
1056 {
1057 if !self.is_fitted_ {
1058 return Err(TransformError::TransformationError(
1059 "IterativeImputer must be fitted before transform".to_string(),
1060 ));
1061 }
1062
1063 let x_f64 = x.mapv(|x| num_traits::cast::<S::Elem, f64>(x).unwrap_or(0.0));
1064 let missing_features = self.missing_features_.as_ref().unwrap();
1065
1066 if missing_features.is_empty() {
1067 return Ok(x_f64);
1069 }
1070
1071 let initial_values = self.initial_values_.as_ref().unwrap();
1072 let (n_samples, n_features) = x_f64.dim();
1073
1074 let mut imputed_data = x_f64.clone();
1076 self.apply_initial_imputation(&mut imputed_data, initial_values)?;
1077
1078 for iteration in 0..self.max_iter {
1080 let mut max_change = 0.0;
1081 let old_imputed_data = imputed_data.clone();
1082
1083 for &feature_idx in missing_features {
1085 let missing_mask: Vec<bool> = (0..n_samples)
1087 .map(|i| self.is_missing(x_f64[[i, feature_idx]]))
1088 .collect();
1089
1090 if !missing_mask.iter().any(|&x| x) {
1091 continue; }
1093
1094 let predictor_indices: Vec<usize> =
1096 (0..n_features).filter(|&i| i != feature_idx).collect();
1097
1098 let (train_x, train_y) = self.prepare_training_data(
1100 &imputed_data,
1101 feature_idx,
1102 &predictor_indices,
1103 &missing_mask,
1104 )?;
1105
1106 if train_x.is_empty() {
1107 continue; }
1109
1110 let mut regressor = SimpleRegressor::new(true, self.alpha);
1112 regressor.fit(&train_x, &train_y)?;
1113
1114 let test_x =
1116 self.prepare_test_data(&imputed_data, &predictor_indices, &missing_mask)?;
1117
1118 if !test_x.is_empty() {
1119 let predictions = regressor.predict(&test_x)?;
1120
1121 let mut pred_idx = 0;
1123 for i in 0..n_samples {
1124 if missing_mask[i] {
1125 let old_value = imputed_data[[i, feature_idx]];
1126 let new_value = predictions[pred_idx];
1127 imputed_data[[i, feature_idx]] = new_value;
1128
1129 let change = (new_value - old_value).abs();
1130 max_change = max_change.max(change);
1131 pred_idx += 1;
1132 }
1133 }
1134 }
1135 }
1136
1137 if max_change < self.tolerance {
1139 break;
1140 }
1141
1142 if iteration > 0 {
1144 let total_change = self.compute_total_change(&old_imputed_data, &imputed_data);
1145 if total_change < self.min_improvement {
1146 break;
1147 }
1148 }
1149 }
1150
1151 Ok(imputed_data)
1152 }
1153
1154 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1162 where
1163 S: Data,
1164 S::Elem: Float + NumCast,
1165 {
1166 self.fit(x)?;
1167 self.transform(x)
1168 }
1169
1170 fn apply_initial_imputation(
1172 &self,
1173 data: &mut Array2<f64>,
1174 initial_values: &Array1<f64>,
1175 ) -> Result<()> {
1176 let (n_samples, n_features) = data.dim();
1177
1178 for i in 0..n_samples {
1179 for j in 0..n_features {
1180 if self.is_missing(data[[i, j]]) {
1181 data[[i, j]] = initial_values[j];
1182 }
1183 }
1184 }
1185
1186 Ok(())
1187 }
1188
1189 fn prepare_training_data(
1191 &self,
1192 data: &Array2<f64>,
1193 target_feature: usize,
1194 predictor_indices: &[usize],
1195 missing_mask: &[bool],
1196 ) -> Result<(Array2<f64>, Array1<f64>)> {
1197 let n_samples = data.shape()[0];
1198 let n_predictors = predictor_indices.len();
1199
1200 let non_missing_count = missing_mask.iter().filter(|&&x| !x).count();
1202
1203 if non_missing_count == 0 {
1204 return Ok((Array2::zeros((0, n_predictors)), Array1::zeros(0)));
1205 }
1206
1207 let mut train_x = Array2::zeros((non_missing_count, n_predictors));
1208 let mut train_y = Array1::zeros(non_missing_count);
1209
1210 let mut train_idx = 0;
1211 for i in 0..n_samples {
1212 if !missing_mask[i] {
1213 for (pred_j, &orig_j) in predictor_indices.iter().enumerate() {
1215 train_x[[train_idx, pred_j]] = data[[i, orig_j]];
1216 }
1217 train_y[train_idx] = data[[i, target_feature]];
1219 train_idx += 1;
1220 }
1221 }
1222
1223 Ok((train_x, train_y))
1224 }
1225
1226 fn prepare_test_data(
1228 &self,
1229 data: &Array2<f64>,
1230 predictor_indices: &[usize],
1231 missing_mask: &[bool],
1232 ) -> Result<Array2<f64>> {
1233 let n_samples = data.shape()[0];
1234 let n_predictors = predictor_indices.len();
1235
1236 let missing_count = missing_mask.iter().filter(|&&x| x).count();
1238
1239 if missing_count == 0 {
1240 return Ok(Array2::zeros((0, n_predictors)));
1241 }
1242
1243 let mut test_x = Array2::zeros((missing_count, n_predictors));
1244
1245 let mut test_idx = 0;
1246 for i in 0..n_samples {
1247 if missing_mask[i] {
1248 for (pred_j, &orig_j) in predictor_indices.iter().enumerate() {
1250 test_x[[test_idx, pred_j]] = data[[i, orig_j]];
1251 }
1252 test_idx += 1;
1253 }
1254 }
1255
1256 Ok(test_x)
1257 }
1258
1259 fn compute_total_change(&self, old_data: &Array2<f64>, newdata: &Array2<f64>) -> f64 {
1261 let diff = newdata - old_data;
1262 diff.iter().map(|&x| x * x).sum::<f64>().sqrt()
1263 }
1264
1265 fn is_missing(&self, value: f64) -> bool {
1267 if self.missingvalues.is_nan() {
1268 value.is_nan()
1269 } else {
1270 (value - self.missingvalues).abs() < f64::EPSILON
1271 }
1272 }
1273}
1274
1275#[cfg(test)]
1276mod tests {
1277 use super::*;
1278 use approx::assert_abs_diff_eq;
1279 use ndarray::Array;
1280
1281 #[test]
1282 fn test_simple_imputer_mean() {
1283 let data = Array::from_shape_vec(
1285 (4, 3),
1286 vec![
1287 1.0,
1288 2.0,
1289 3.0,
1290 f64::NAN,
1291 5.0,
1292 6.0,
1293 7.0,
1294 f64::NAN,
1295 9.0,
1296 10.0,
1297 11.0,
1298 f64::NAN,
1299 ],
1300 )
1301 .unwrap();
1302
1303 let mut imputer = SimpleImputer::with_strategy(ImputeStrategy::Mean);
1304 let transformed = imputer.fit_transform(&data).unwrap();
1305
1306 assert_eq!(transformed.shape(), &[4, 3]);
1308
1309 assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10);
1315 assert_abs_diff_eq!(transformed[[1, 0]], 6.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[2, 0]], 7.0, epsilon = 1e-10);
1317 assert_abs_diff_eq!(transformed[[3, 0]], 10.0, epsilon = 1e-10);
1318
1319 assert_abs_diff_eq!(transformed[[0, 1]], 2.0, epsilon = 1e-10);
1320 assert_abs_diff_eq!(transformed[[1, 1]], 5.0, epsilon = 1e-10);
1321 assert_abs_diff_eq!(transformed[[2, 1]], 6.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[3, 1]], 11.0, epsilon = 1e-10);
1323
1324 assert_abs_diff_eq!(transformed[[0, 2]], 3.0, epsilon = 1e-10);
1325 assert_abs_diff_eq!(transformed[[1, 2]], 6.0, epsilon = 1e-10);
1326 assert_abs_diff_eq!(transformed[[2, 2]], 9.0, epsilon = 1e-10);
1327 assert_abs_diff_eq!(transformed[[3, 2]], 6.0, epsilon = 1e-10); }
1329
1330 #[test]
1331 fn test_simple_imputer_median() {
1332 let data = Array::from_shape_vec(
1334 (5, 2),
1335 vec![
1336 1.0,
1337 10.0,
1338 f64::NAN,
1339 20.0,
1340 3.0,
1341 f64::NAN,
1342 4.0,
1343 40.0,
1344 5.0,
1345 50.0,
1346 ],
1347 )
1348 .unwrap();
1349
1350 let mut imputer = SimpleImputer::with_strategy(ImputeStrategy::Median);
1351 let transformed = imputer.fit_transform(&data).unwrap();
1352
1353 assert_eq!(transformed.shape(), &[5, 2]);
1355
1356 assert_abs_diff_eq!(transformed[[1, 0]], 3.5, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[2, 1]], 30.0, epsilon = 1e-10); }
1362
1363 #[test]
1364 fn test_simple_imputer_constant() {
1365 let data =
1367 Array::from_shape_vec((3, 2), vec![1.0, f64::NAN, f64::NAN, 3.0, 4.0, 5.0]).unwrap();
1368
1369 let mut imputer = SimpleImputer::with_strategy(ImputeStrategy::Constant(99.0));
1370 let transformed = imputer.fit_transform(&data).unwrap();
1371
1372 assert_abs_diff_eq!(transformed[[0, 1]], 99.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[1, 0]], 99.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10);
1378 assert_abs_diff_eq!(transformed[[1, 1]], 3.0, epsilon = 1e-10);
1379 assert_abs_diff_eq!(transformed[[2, 0]], 4.0, epsilon = 1e-10);
1380 assert_abs_diff_eq!(transformed[[2, 1]], 5.0, epsilon = 1e-10);
1381 }
1382
1383 #[test]
1384 fn test_missing_indicator() {
1385 let data = Array::from_shape_vec(
1387 (3, 4),
1388 vec![
1389 1.0,
1390 f64::NAN,
1391 3.0,
1392 4.0,
1393 f64::NAN,
1394 6.0,
1395 f64::NAN,
1396 8.0,
1397 9.0,
1398 10.0,
1399 11.0,
1400 f64::NAN,
1401 ],
1402 )
1403 .unwrap();
1404
1405 let mut indicator = MissingIndicator::with_nan();
1406 let indicators = indicator.fit_transform(&data).unwrap();
1407
1408 assert_eq!(indicators.shape(), &[3, 4]);
1410
1411 assert_abs_diff_eq!(indicators[[0, 0]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[0, 1]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[0, 2]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[0, 3]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[1, 0]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[1, 1]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[1, 2]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[1, 3]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[2, 0]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[2, 1]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[2, 2]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[2, 3]], 1.0, epsilon = 1e-10); }
1427
1428 #[test]
1429 fn test_imputer_errors() {
1430 let data = Array::from_shape_vec((2, 2), vec![f64::NAN, 1.0, f64::NAN, 2.0]).unwrap();
1432
1433 let mut imputer = SimpleImputer::with_strategy(ImputeStrategy::Mean);
1434 assert!(imputer.fit(&data).is_err());
1435 }
1436
1437 #[test]
1438 fn test_knn_imputer_basic() {
1439 let data = Array::from_shape_vec(
1446 (4, 3),
1447 vec![
1448 1.0,
1449 2.0,
1450 3.0,
1451 4.0,
1452 f64::NAN,
1453 6.0,
1454 7.0,
1455 8.0,
1456 f64::NAN,
1457 10.0,
1458 11.0,
1459 12.0,
1460 ],
1461 )
1462 .unwrap();
1463
1464 let mut imputer = KNNImputer::with_n_neighbors(2);
1465 let transformed = imputer.fit_transform(&data).unwrap();
1466
1467 assert_eq!(transformed.shape(), &[4, 3]);
1469
1470 assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10);
1472 assert_abs_diff_eq!(transformed[[0, 1]], 2.0, epsilon = 1e-10);
1473 assert_abs_diff_eq!(transformed[[0, 2]], 3.0, epsilon = 1e-10);
1474 assert_abs_diff_eq!(transformed[[3, 0]], 10.0, epsilon = 1e-10);
1475 assert_abs_diff_eq!(transformed[[3, 1]], 11.0, epsilon = 1e-10);
1476 assert_abs_diff_eq!(transformed[[3, 2]], 12.0, epsilon = 1e-10);
1477
1478 assert!(!transformed[[1, 1]].is_nan()); assert!(!transformed[[2, 2]].is_nan()); }
1482
1483 #[test]
1484 fn test_knn_imputer_simple_case() {
1485 let data = Array::from_shape_vec((3, 2), vec![1.0, 1.0, f64::NAN, 2.0, 3.0, 3.0]).unwrap();
1487
1488 let mut imputer = KNNImputer::with_n_neighbors(2);
1489 let transformed = imputer.fit_transform(&data).unwrap();
1490
1491 assert_abs_diff_eq!(transformed[[1, 0]], 2.0, epsilon = 1e-1);
1495 }
1496
1497 #[test]
1498 fn test_knn_imputer_manhattan_distance() {
1499 let data =
1500 Array::from_shape_vec((4, 2), vec![0.0, 0.0, 1.0, f64::NAN, 2.0, 2.0, 10.0, 10.0])
1501 .unwrap();
1502
1503 let mut imputer = KNNImputer::new(
1504 2,
1505 DistanceMetric::Manhattan,
1506 WeightingScheme::Uniform,
1507 f64::NAN,
1508 );
1509 let transformed = imputer.fit_transform(&data).unwrap();
1510
1511 assert!(!transformed[[1, 1]].is_nan());
1514 assert!(transformed[[1, 1]] < 5.0); }
1517
1518 #[test]
1519 fn test_knn_imputer_validation_errors() {
1520 let small_data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
1522 let mut imputer = KNNImputer::with_n_neighbors(5); assert!(imputer.fit(&small_data).is_err());
1524
1525 let data =
1527 Array::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
1528 let unfitted_imputer = KNNImputer::with_n_neighbors(2);
1529 assert!(unfitted_imputer.transform(&data).is_err());
1530 }
1531
1532 #[test]
1533 fn test_knn_imputer_no_missing_values() {
1534 let data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1536
1537 let mut imputer = KNNImputer::with_n_neighbors(2);
1538 let transformed = imputer.fit_transform(&data).unwrap();
1539
1540 assert_eq!(transformed, data);
1542 }
1543
1544 #[test]
1545 fn test_knn_imputer_accessors() {
1546 let imputer = KNNImputer::new(
1547 3,
1548 DistanceMetric::Manhattan,
1549 WeightingScheme::Distance,
1550 -999.0,
1551 );
1552
1553 assert_eq!(imputer._nneighbors(), 3);
1554 assert_eq!(imputer.metric(), &DistanceMetric::Manhattan);
1555 assert_eq!(imputer.weights(), &WeightingScheme::Distance);
1556 }
1557
1558 #[test]
1559 fn test_knn_imputer_multiple_missing_features() {
1560 let data = Array::from_shape_vec(
1562 (4, 3),
1563 vec![
1564 1.0,
1565 2.0,
1566 3.0,
1567 f64::NAN,
1568 f64::NAN,
1569 6.0,
1570 7.0,
1571 8.0,
1572 9.0,
1573 10.0,
1574 11.0,
1575 12.0,
1576 ],
1577 )
1578 .unwrap();
1579
1580 let mut imputer = KNNImputer::with_n_neighbors(2);
1581 let transformed = imputer.fit_transform(&data).unwrap();
1582
1583 assert!(!transformed[[1, 0]].is_nan());
1585 assert!(!transformed[[1, 1]].is_nan());
1586 assert_abs_diff_eq!(transformed[[1, 2]], 6.0, epsilon = 1e-10);
1588 }
1589
1590 #[test]
1591 fn test_iterative_imputer_basic() {
1592 let data = Array::from_shape_vec(
1597 (4, 2),
1598 vec![1.0, 2.0, 2.0, 4.0, 3.0, f64::NAN, f64::NAN, 8.0],
1599 )
1600 .unwrap();
1601
1602 let mut imputer = IterativeImputer::with_max_iter(5);
1603 let transformed = imputer.fit_transform(&data).unwrap();
1604
1605 assert!(!transformed[[2, 1]].is_nan()); assert!(!transformed[[3, 0]].is_nan()); assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10);
1611 assert_abs_diff_eq!(transformed[[0, 1]], 2.0, epsilon = 1e-10);
1612 assert_abs_diff_eq!(transformed[[1, 0]], 2.0, epsilon = 1e-10);
1613 assert_abs_diff_eq!(transformed[[1, 1]], 4.0, epsilon = 1e-10);
1614 assert_abs_diff_eq!(transformed[[2, 0]], 3.0, epsilon = 1e-10);
1615 assert_abs_diff_eq!(transformed[[3, 1]], 8.0, epsilon = 1e-10);
1616
1617 let imputed_f1_row2 = transformed[[2, 1]];
1620 let expected_f1_row2 = 2.0 * transformed[[2, 0]]; assert!((imputed_f1_row2 - expected_f1_row2).abs() < 1.0); let imputed_f0_row3 = transformed[[3, 0]];
1624 let expected_f0_row3 = transformed[[3, 1]] / 2.0; assert!((imputed_f0_row3 - expected_f0_row3).abs() < 1.0); }
1627
1628 #[test]
1629 fn test_iterative_imputer_no_missing_values() {
1630 let data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1632
1633 let mut imputer = IterativeImputer::with_defaults();
1634 let transformed = imputer.fit_transform(&data).unwrap();
1635
1636 for i in 0..3 {
1638 for j in 0..2 {
1639 assert_abs_diff_eq!(transformed[[i, j]], data[[i, j]], epsilon = 1e-10);
1640 }
1641 }
1642 }
1643
1644 #[test]
1645 fn test_iterative_imputer_convergence() {
1646 let data = Array::from_shape_vec(
1648 (5, 3),
1649 vec![
1650 1.0,
1651 2.0,
1652 3.0,
1653 2.0,
1654 f64::NAN,
1655 6.0,
1656 3.0,
1657 6.0,
1658 f64::NAN,
1659 4.0,
1660 8.0,
1661 12.0,
1662 f64::NAN,
1663 10.0,
1664 15.0,
1665 ],
1666 )
1667 .unwrap();
1668
1669 let mut imputer = IterativeImputer::new(
1670 20, 1e-4, ImputeStrategy::Mean,
1673 f64::NAN,
1674 1e-6, );
1676
1677 let transformed = imputer.fit_transform(&data).unwrap();
1678
1679 for i in 0..5 {
1681 for j in 0..3 {
1682 assert!(!transformed[[i, j]].is_nan());
1683 }
1684 }
1685 }
1686
1687 #[test]
1688 fn test_iterative_imputer_different_strategies() {
1689 let data = Array::from_shape_vec(
1690 (4, 2),
1691 vec![1.0, f64::NAN, 2.0, 4.0, 3.0, 6.0, f64::NAN, 8.0],
1692 )
1693 .unwrap();
1694
1695 let mut imputer_median =
1697 IterativeImputer::new(5, 1e-3, ImputeStrategy::Median, f64::NAN, 1e-6);
1698 let transformed_median = imputer_median.fit_transform(&data).unwrap();
1699 assert!(!transformed_median[[0, 1]].is_nan());
1700 assert!(!transformed_median[[3, 0]].is_nan());
1701
1702 let mut imputer_constant =
1704 IterativeImputer::new(5, 1e-3, ImputeStrategy::Constant(999.0), f64::NAN, 1e-6);
1705 let transformed_constant = imputer_constant.fit_transform(&data).unwrap();
1706 assert!(!transformed_constant[[0, 1]].is_nan());
1707 assert!(!transformed_constant[[3, 0]].is_nan());
1708 }
1709
1710 #[test]
1711 fn test_iterative_imputer_builder_methods() {
1712 let imputer = IterativeImputer::with_defaults()
1713 .with_random_seed(42)
1714 .with_alpha(1e-3)
1715 .with_min_improvement(1e-5);
1716
1717 assert_eq!(imputer.random_seed, Some(42));
1718 assert_abs_diff_eq!(imputer.alpha, 1e-3, epsilon = 1e-10);
1719 assert_abs_diff_eq!(imputer.min_improvement, 1e-5, epsilon = 1e-10);
1720 }
1721
1722 #[test]
1723 fn test_iterative_imputer_errors() {
1724 let imputer = IterativeImputer::with_defaults();
1726 let test_data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
1727 assert!(imputer.transform(&test_data).is_err());
1728
1729 let bad_data =
1731 Array::from_shape_vec((3, 2), vec![f64::NAN, 1.0, f64::NAN, 2.0, f64::NAN, 3.0])
1732 .unwrap();
1733 let mut imputer = IterativeImputer::with_defaults();
1734 assert!(imputer.fit(&bad_data).is_err());
1735 }
1736
1737 #[test]
1738 fn test_simple_regressor() {
1739 let x = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0]).unwrap();
1741 let y = Array::from_vec(vec![5.0, 8.0, 11.0]); let mut regressor = SimpleRegressor::new(true, 1e-6);
1744 regressor.fit(&x, &y).unwrap();
1745
1746 let test_x = Array::from_shape_vec((2, 2), vec![4.0, 5.0, 5.0, 6.0]).unwrap();
1747 let predictions = regressor.predict(&test_x).unwrap();
1748
1749 assert_eq!(predictions.len(), 2);
1751 assert!(!predictions[0].is_nan());
1752 assert!(!predictions[1].is_nan());
1753 }
1754}