1use scirs2_core::ndarray::{Array1, Array2, ArrayBase, Data, Ix2};
7use scirs2_core::numeric::{Float, NumCast};
8use scirs2_core::parallel_ops::*;
9
10use crate::error::{Result, TransformError};
11
/// Rule used to derive the replacement value for missing entries.
///
/// The variants carry no behaviour themselves; the imputers in this module
/// match on them when computing per-feature fill values.
#[derive(Debug, Clone, PartialEq)]
pub enum ImputeStrategy {
    /// Fill with the arithmetic mean of the observed values.
    Mean,
    /// Fill with the median of the observed values.
    Median,
    /// Fill with the value that occurs most often among the observed values.
    MostFrequent,
    /// Fill with the wrapped constant.
    Constant(f64),
}
24
25pub struct SimpleImputer {
30 strategy: ImputeStrategy,
32 missingvalues: f64,
34 statistics_: Option<Array1<f64>>,
36}
37
38impl SimpleImputer {
39 pub fn new(strategy: ImputeStrategy, missingvalues: f64) -> Self {
48 SimpleImputer {
49 strategy,
50 missingvalues,
51 statistics_: None,
52 }
53 }
54
55 #[allow(dead_code)]
63 pub fn with_strategy(strategy: ImputeStrategy) -> Self {
64 Self::new(strategy, f64::NAN)
65 }
66
67 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
75 where
76 S: Data,
77 S::Elem: Float + NumCast,
78 {
79 let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
80
81 let n_samples = x_f64.shape()[0];
82 let n_features = x_f64.shape()[1];
83
84 if n_samples == 0 || n_features == 0 {
85 return Err(TransformError::InvalidInput("Empty input data".to_string()));
86 }
87
88 let mut statistics = Array1::zeros(n_features);
89
90 for j in 0..n_features {
91 let feature_data: Vec<f64> = x_f64
93 .column(j)
94 .iter()
95 .filter(|&&val| !self.is_missing(val))
96 .copied()
97 .collect();
98
99 if feature_data.is_empty() {
100 return Err(TransformError::InvalidInput(format!(
101 "All values are missing in feature {j}"
102 )));
103 }
104
105 statistics[j] = match &self.strategy {
106 ImputeStrategy::Mean => {
107 feature_data.iter().sum::<f64>() / feature_data.len() as f64
108 }
109 ImputeStrategy::Median => {
110 let mut sorted_data = feature_data.clone();
111 sorted_data
112 .sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
113 let n = sorted_data.len();
114 if n.is_multiple_of(2) {
115 (sorted_data[n / 2 - 1] + sorted_data[n / 2]) / 2.0
116 } else {
117 sorted_data[n / 2]
118 }
119 }
120 ImputeStrategy::MostFrequent => {
121 let mut counts = std::collections::HashMap::new();
124 for &val in &feature_data {
125 *counts.entry(val.to_bits()).or_insert(0) += 1;
126 }
127
128 let most_frequent_bits = counts
129 .into_iter()
130 .max_by_key(|(_, count)| *count)
131 .map(|(bits_, _)| bits_)
132 .unwrap_or(0);
133
134 f64::from_bits(most_frequent_bits)
135 }
136 ImputeStrategy::Constant(value) => *value,
137 };
138 }
139
140 self.statistics_ = Some(statistics);
141 Ok(())
142 }
143
144 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
152 where
153 S: Data,
154 S::Elem: Float + NumCast,
155 {
156 let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
157
158 let n_samples = x_f64.shape()[0];
159 let n_features = x_f64.shape()[1];
160
161 if self.statistics_.is_none() {
162 return Err(TransformError::TransformationError(
163 "SimpleImputer has not been fitted".to_string(),
164 ));
165 }
166
167 let statistics = self.statistics_.as_ref().unwrap();
168
169 if n_features != statistics.len() {
170 return Err(TransformError::InvalidInput(format!(
171 "x has {} features, but SimpleImputer was fitted with {} features",
172 n_features,
173 statistics.len()
174 )));
175 }
176
177 let mut transformed = Array2::zeros((n_samples, n_features));
178
179 for i in 0..n_samples {
180 for j in 0..n_features {
181 let value = x_f64[[i, j]];
182 if self.is_missing(value) {
183 transformed[[i, j]] = statistics[j];
184 } else {
185 transformed[[i, j]] = value;
186 }
187 }
188 }
189
190 Ok(transformed)
191 }
192
193 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
201 where
202 S: Data,
203 S::Elem: Float + NumCast,
204 {
205 self.fit(x)?;
206 self.transform(x)
207 }
208
209 #[allow(dead_code)]
214 pub fn statistics(&self) -> Option<&Array1<f64>> {
215 self.statistics_.as_ref()
216 }
217
218 fn is_missing(&self, value: f64) -> bool {
226 if self.missingvalues.is_nan() {
227 value.is_nan()
228 } else {
229 (value - self.missingvalues).abs() < f64::EPSILON
230 }
231 }
232}
233
/// Transformer that produces a 0/1 matrix flagging missing entries.
///
/// `Debug` and `Clone` are derived so the indicator can be logged and copied.
#[derive(Debug, Clone)]
pub struct MissingIndicator {
    /// Marker identifying a missing entry (`NaN` means "NaN entries are missing").
    missingvalues: f64,
    /// Indices of the columns that contained missing entries during `fit`;
    /// `None` until `fit` has been called.
    features_: Option<Vec<usize>>,
}
244
245impl MissingIndicator {
246 pub fn new(missingvalues: f64) -> Self {
254 MissingIndicator {
255 missingvalues,
256 features_: None,
257 }
258 }
259
260 pub fn with_nan() -> Self {
262 Self::new(f64::NAN)
263 }
264
265 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
273 where
274 S: Data,
275 S::Elem: Float + NumCast,
276 {
277 let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
278
279 let n_features = x_f64.shape()[1];
280 let mut features_with_missing = Vec::new();
281
282 for j in 0..n_features {
283 let has_missing = x_f64.column(j).iter().any(|&val| self.is_missing(val));
284 if has_missing {
285 features_with_missing.push(j);
286 }
287 }
288
289 self.features_ = Some(features_with_missing);
290 Ok(())
291 }
292
293 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
301 where
302 S: Data,
303 S::Elem: Float + NumCast,
304 {
305 let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
306
307 let n_samples = x_f64.shape()[0];
308
309 if self.features_.is_none() {
310 return Err(TransformError::TransformationError(
311 "MissingIndicator has not been fitted".to_string(),
312 ));
313 }
314
315 let features_with_missing = self.features_.as_ref().unwrap();
316 let n_output_features = features_with_missing.len();
317
318 let mut indicators = Array2::zeros((n_samples, n_output_features));
319
320 for i in 0..n_samples {
321 for (out_j, &orig_j) in features_with_missing.iter().enumerate() {
322 if self.is_missing(x_f64[[i, orig_j]]) {
323 indicators[[i, out_j]] = 1.0;
324 }
325 }
326 }
327
328 Ok(indicators)
329 }
330
331 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
339 where
340 S: Data,
341 S::Elem: Float + NumCast,
342 {
343 self.fit(x)?;
344 self.transform(x)
345 }
346
347 pub fn features(&self) -> Option<&Vec<usize>> {
352 self.features_.as_ref()
353 }
354
355 fn is_missing(&self, value: f64) -> bool {
363 if self.missingvalues.is_nan() {
364 value.is_nan()
365 } else {
366 (value - self.missingvalues).abs() < f64::EPSILON
367 }
368 }
369}
370
/// Distance function used to compare two samples.
///
/// The enum is fieldless, so it is cheap to copy and can be used as a hash key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DistanceMetric {
    /// Distance based on squared coordinate differences.
    Euclidean,
    /// Distance based on absolute coordinate differences.
    Manhattan,
}
379
/// How the values of the selected neighbours are combined.
///
/// The enum is fieldless, so it is cheap to copy and can be used as a hash key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum WeightingScheme {
    /// Every neighbour contributes with the same weight.
    Uniform,
    /// Neighbours are meant to contribute according to their distance; the
    /// exact weighting is defined by the imputer that consumes this value.
    Distance,
}
388
389pub struct KNNImputer {
395 _nneighbors: usize,
397 metric: DistanceMetric,
399 weights: WeightingScheme,
401 missingvalues: f64,
403 x_train_: Option<Array2<f64>>,
405}
406
407impl KNNImputer {
408 pub fn new(
419 _nneighbors: usize,
420 metric: DistanceMetric,
421 weights: WeightingScheme,
422 missingvalues: f64,
423 ) -> Self {
424 KNNImputer {
425 _nneighbors,
426 metric,
427 weights,
428 missingvalues,
429 x_train_: None,
430 }
431 }
432
433 pub fn with_defaults() -> Self {
437 Self::new(
438 5,
439 DistanceMetric::Euclidean,
440 WeightingScheme::Uniform,
441 f64::NAN,
442 )
443 }
444
445 pub fn with_n_neighbors(_nneighbors: usize) -> Self {
447 Self::new(
448 _nneighbors,
449 DistanceMetric::Euclidean,
450 WeightingScheme::Uniform,
451 f64::NAN,
452 )
453 }
454
455 pub fn with_distance_weighting(_nneighbors: usize) -> Self {
457 Self::new(
458 _nneighbors,
459 DistanceMetric::Euclidean,
460 WeightingScheme::Distance,
461 f64::NAN,
462 )
463 }
464
465 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
473 where
474 S: Data,
475 S::Elem: Float + NumCast,
476 {
477 let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
478
479 let n_samples = x_f64.shape()[0];
481 if n_samples < self._nneighbors {
482 return Err(TransformError::InvalidInput(format!(
483 "Number of samples ({}) must be >= _nneighbors ({})",
484 n_samples, self._nneighbors
485 )));
486 }
487
488 self.x_train_ = Some(x_f64);
490 Ok(())
491 }
492
493 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
501 where
502 S: Data,
503 S::Elem: Float + NumCast,
504 {
505 let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
506
507 if self.x_train_.is_none() {
508 return Err(TransformError::TransformationError(
509 "KNNImputer must be fitted before transform".to_string(),
510 ));
511 }
512
513 let x_train = self.x_train_.as_ref().unwrap();
514 let (n_samples, n_features) = x_f64.dim();
515
516 if n_features != x_train.shape()[1] {
517 return Err(TransformError::InvalidInput(format!(
518 "Number of features in transform data ({}) doesn't match training data ({})",
519 n_features,
520 x_train.shape()[1]
521 )));
522 }
523
524 let mut result = x_f64.clone();
525
526 for i in 0..n_samples {
528 let sample = x_f64.row(i);
529
530 let missing_features: Vec<usize> = (0..n_features)
532 .filter(|&j| self.is_missing(sample[j]))
533 .collect();
534
535 if missing_features.is_empty() {
536 continue; }
538
539 let neighbors =
541 self.find_nearest_neighbors_excluding(&sample.to_owned(), x_train, i)?;
542
543 for &feature_idx in &missing_features {
545 let imputed_value = self.impute_feature(feature_idx, &neighbors, x_train)?;
546 result[[i, feature_idx]] = imputed_value;
547 }
548 }
549
550 Ok(result)
551 }
552
553 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
561 where
562 S: Data,
563 S::Elem: Float + NumCast,
564 {
565 self.fit(x)?;
566 self.transform(x)
567 }
568
569 fn find_nearest_neighbors_excluding(
571 &self,
572 sample: &Array1<f64>,
573 x_train: &Array2<f64>,
574 exclude_idx: usize,
575 ) -> Result<Vec<usize>> {
576 let n_train_samples = x_train.shape()[0];
577
578 let distances: Vec<(usize, f64)> = (0..n_train_samples)
580 .into_par_iter()
581 .filter(|&i| i != exclude_idx)
582 .map(|i| {
583 let train_sample = x_train.row(i);
584 let distance = self.compute_distance(sample, &train_sample.to_owned());
585 (i, distance)
586 })
587 .collect();
588
589 let mut sorted_distances = distances;
591 sorted_distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
592
593 let neighbors: Vec<usize> = sorted_distances
594 .into_iter()
595 .take(self._nneighbors)
596 .map(|(idx_, _)| idx_)
597 .collect();
598
599 Ok(neighbors)
600 }
601
602 fn compute_distance(&self, sample1: &Array1<f64>, sample2: &Array1<f64>) -> f64 {
604 let n_features = sample1.len();
605 let mut distance = 0.0;
606 let mut valid_features = 0;
607
608 for i in 0..n_features {
609 let val1 = sample1[i];
610 let val2 = sample2[i];
611
612 if self.is_missing(val1) || self.is_missing(val2) {
614 continue;
615 }
616
617 valid_features += 1;
618 let diff = val1 - val2;
619
620 match self.metric {
621 DistanceMetric::Euclidean => {
622 distance += diff * diff;
623 }
624 DistanceMetric::Manhattan => {
625 distance += diff.abs();
626 }
627 }
628 }
629
630 if valid_features == 0 {
632 return f64::INFINITY;
633 }
634
635 distance /= valid_features as f64;
637
638 match self.metric {
639 DistanceMetric::Euclidean => distance.sqrt(),
640 DistanceMetric::Manhattan => distance,
641 }
642 }
643
644 fn impute_feature(
646 &self,
647 feature_idx: usize,
648 neighbors: &[usize],
649 x_train: &Array2<f64>,
650 ) -> Result<f64> {
651 let mut values = Vec::new();
652 let mut weights = Vec::new();
653
654 for &neighbor_idx in neighbors {
656 let neighbor_value = x_train[[neighbor_idx, feature_idx]];
657
658 if !self.is_missing(neighbor_value) {
659 values.push(neighbor_value);
660
661 let weight = match self.weights {
663 WeightingScheme::Uniform => 1.0,
664 WeightingScheme::Distance => {
665 1.0 }
669 };
670 weights.push(weight);
671 }
672 }
673
674 if values.is_empty() {
675 return Err(TransformError::TransformationError(format!(
676 "No valid neighbors found for feature {feature_idx} imputation"
677 )));
678 }
679
680 let total_weight: f64 = weights.iter().sum();
682 if total_weight == 0.0 {
683 return Err(TransformError::TransformationError(
684 "Total weight is zero for imputation".to_string(),
685 ));
686 }
687
688 let weighted_sum: f64 = values
689 .iter()
690 .zip(weights.iter())
691 .map(|(&val, &weight)| val * weight)
692 .sum();
693
694 Ok(weighted_sum / total_weight)
695 }
696
697 fn is_missing(&self, value: f64) -> bool {
699 if self.missingvalues.is_nan() {
700 value.is_nan()
701 } else {
702 (value - self.missingvalues).abs() < f64::EPSILON
703 }
704 }
705
706 pub fn _nneighbors(&self) -> usize {
708 self._nneighbors
709 }
710
711 pub fn metric(&self) -> &DistanceMetric {
713 &self.metric
714 }
715
716 pub fn weights(&self) -> &WeightingScheme {
718 &self.weights
719 }
720}
721
722#[derive(Debug, Clone)]
727struct SimpleRegressor {
728 coefficients: Option<Array1<f64>>,
730 includeintercept: bool,
732 alpha: f64,
734}
735
736impl SimpleRegressor {
737 fn new(includeintercept: bool, alpha: f64) -> Self {
739 Self {
740 coefficients: None,
741 includeintercept,
742 alpha,
743 }
744 }
745
746 fn fit(&mut self, x: &Array2<f64>, y: &Array1<f64>) -> Result<()> {
748 let (n_samples, n_features) = x.dim();
749
750 if n_samples != y.len() {
751 return Err(TransformError::InvalidInput(
752 "X and y must have the same number of samples".to_string(),
753 ));
754 }
755
756 let x_design = if self.includeintercept {
758 let mut x_with_intercept = Array2::ones((n_samples, n_features + 1));
759 x_with_intercept
760 .slice_mut(scirs2_core::ndarray::s![.., 1..])
761 .assign(x);
762 x_with_intercept
763 } else {
764 x.to_owned()
765 };
766
767 let xtx = x_design.t().dot(&x_design);
769 let xty = x_design.t().dot(y);
770
771 let mut regularized_xtx = xtx;
773 let n_coeffs = regularized_xtx.shape()[0];
774 for i in 0..n_coeffs {
775 regularized_xtx[[i, i]] += self.alpha;
776 }
777
778 self.coefficients = Some(self.solve_linear_system(®ularized_xtx, &xty)?);
780
781 Ok(())
782 }
783
784 fn predict(&self, x: &Array2<f64>) -> Result<Array1<f64>> {
786 let coeffs = self.coefficients.as_ref().ok_or_else(|| {
787 TransformError::TransformationError(
788 "Regressor must be fitted before prediction".to_string(),
789 )
790 })?;
791
792 let x_design = if self.includeintercept {
793 let (n_samples, n_features) = x.dim();
794 let mut x_with_intercept = Array2::ones((n_samples, n_features + 1));
795 x_with_intercept
796 .slice_mut(scirs2_core::ndarray::s![.., 1..])
797 .assign(x);
798 x_with_intercept
799 } else {
800 x.to_owned()
801 };
802
803 Ok(x_design.dot(coeffs))
804 }
805
806 fn solve_linear_system(&self, a: &Array2<f64>, b: &Array1<f64>) -> Result<Array1<f64>> {
808 let n = a.shape()[0];
809 let mut aug_matrix = Array2::zeros((n, n + 1));
810
811 aug_matrix
813 .slice_mut(scirs2_core::ndarray::s![.., ..n])
814 .assign(a);
815 aug_matrix
816 .slice_mut(scirs2_core::ndarray::s![.., n])
817 .assign(b);
818
819 for i in 0..n {
821 let mut max_row = i;
823 for k in (i + 1)..n {
824 if aug_matrix[[k, i]].abs() > aug_matrix[[max_row, i]].abs() {
825 max_row = k;
826 }
827 }
828
829 if max_row != i {
831 for j in 0..=n {
832 let temp = aug_matrix[[i, j]];
833 aug_matrix[[i, j]] = aug_matrix[[max_row, j]];
834 aug_matrix[[max_row, j]] = temp;
835 }
836 }
837
838 if aug_matrix[[i, i]].abs() < 1e-12 {
840 return Err(TransformError::TransformationError(
841 "Singular matrix in regression".to_string(),
842 ));
843 }
844
845 let pivot = aug_matrix[[i, i]];
847 for j in i..=n {
848 aug_matrix[[i, j]] /= pivot;
849 }
850
851 for k in 0..n {
853 if k != i {
854 let factor = aug_matrix[[k, i]];
855 for j in i..=n {
856 aug_matrix[[k, j]] -= factor * aug_matrix[[i, j]];
857 }
858 }
859 }
860 }
861
862 let mut solution = Array1::zeros(n);
864 for i in 0..n {
865 solution[i] = aug_matrix[[i, n]];
866 }
867
868 Ok(solution)
869 }
870}
871
872pub struct IterativeImputer {
883 max_iter: usize,
885 tolerance: f64,
887 initial_strategy: ImputeStrategy,
889 random_seed: Option<u64>,
891 missingvalues: f64,
893 alpha: f64,
895 min_improvement: f64,
897
898 x_train_: Option<Array2<f64>>,
901 missing_features_: Option<Vec<usize>>,
903 initial_values_: Option<Array1<f64>>,
905 is_fitted_: bool,
907}
908
909impl IterativeImputer {
910 pub fn new(
922 max_iter: usize,
923 tolerance: f64,
924 initial_strategy: ImputeStrategy,
925 missingvalues: f64,
926 alpha: f64,
927 ) -> Self {
928 IterativeImputer {
929 max_iter,
930 tolerance,
931 initial_strategy,
932 random_seed: None,
933 missingvalues,
934 alpha,
935 min_improvement: 1e-6,
936 x_train_: None,
937 missing_features_: None,
938 initial_values_: None,
939 is_fitted_: false,
940 }
941 }
942
943 pub fn with_defaults() -> Self {
948 Self::new(10, 1e-3, ImputeStrategy::Mean, f64::NAN, 1e-6)
949 }
950
951 pub fn with_max_iter(_maxiter: usize) -> Self {
953 Self::new(_maxiter, 1e-3, ImputeStrategy::Mean, f64::NAN, 1e-6)
954 }
955
956 pub fn with_random_seed(mut self, seed: u64) -> Self {
958 self.random_seed = Some(seed);
959 self
960 }
961
962 pub fn with_alpha(mut self, alpha: f64) -> Self {
964 self.alpha = alpha;
965 self
966 }
967
968 pub fn with_min_improvement(mut self, minimprovement: f64) -> Self {
970 self.min_improvement = minimprovement;
971 self
972 }
973
974 pub fn fit<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<()>
982 where
983 S: Data,
984 S::Elem: Float + NumCast,
985 {
986 let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
987 let (n_samples, n_features) = x_f64.dim();
988
989 if n_samples == 0 || n_features == 0 {
990 return Err(TransformError::InvalidInput("Empty input data".to_string()));
991 }
992
993 let missing_features: Vec<usize> = (0..n_features)
995 .filter(|&j| x_f64.column(j).iter().any(|&val| self.is_missing(val)))
996 .collect();
997
998 if missing_features.is_empty() {
999 self.x_train_ = Some(x_f64);
1001 self.missing_features_ = Some(Vec::new());
1002 self.initial_values_ = Some(Array1::zeros(0));
1003 self.is_fitted_ = true;
1004 return Ok(());
1005 }
1006
1007 let mut initial_values = Array1::zeros(n_features);
1009 for &feature_idx in &missing_features {
1010 let feature_data: Vec<f64> = x_f64
1011 .column(feature_idx)
1012 .iter()
1013 .filter(|&&val| !self.is_missing(val))
1014 .copied()
1015 .collect();
1016
1017 if feature_data.is_empty() {
1018 return Err(TransformError::InvalidInput(format!(
1019 "All values are missing in feature {feature_idx}"
1020 )));
1021 }
1022
1023 initial_values[feature_idx] = match &self.initial_strategy {
1024 ImputeStrategy::Mean => {
1025 feature_data.iter().sum::<f64>() / feature_data.len() as f64
1026 }
1027 ImputeStrategy::Median => {
1028 let mut sorted_data = feature_data;
1029 sorted_data.sort_by(|a, b| a.partial_cmp(b).unwrap());
1030 let len = sorted_data.len();
1031 if len.is_multiple_of(2) {
1032 (sorted_data[len / 2 - 1] + sorted_data[len / 2]) / 2.0
1033 } else {
1034 sorted_data[len / 2]
1035 }
1036 }
1037 ImputeStrategy::MostFrequent => {
1038 feature_data.iter().sum::<f64>() / feature_data.len() as f64
1040 }
1041 ImputeStrategy::Constant(value) => *value,
1042 };
1043 }
1044
1045 self.x_train_ = Some(x_f64);
1046 self.missing_features_ = Some(missing_features);
1047 self.initial_values_ = Some(initial_values);
1048 self.is_fitted_ = true;
1049
1050 Ok(())
1051 }
1052
1053 pub fn transform<S>(&self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1061 where
1062 S: Data,
1063 S::Elem: Float + NumCast,
1064 {
1065 if !self.is_fitted_ {
1066 return Err(TransformError::TransformationError(
1067 "IterativeImputer must be fitted before transform".to_string(),
1068 ));
1069 }
1070
1071 let x_f64 = x.mapv(|x| NumCast::from(x).unwrap_or(0.0));
1072 let missing_features = self.missing_features_.as_ref().unwrap();
1073
1074 if missing_features.is_empty() {
1075 return Ok(x_f64);
1077 }
1078
1079 let initial_values = self.initial_values_.as_ref().unwrap();
1080 let (n_samples, n_features) = x_f64.dim();
1081
1082 let mut imputed_data = x_f64.clone();
1084 self.apply_initial_imputation(&mut imputed_data, initial_values)?;
1085
1086 for iteration in 0..self.max_iter {
1088 let mut max_change = 0.0;
1089 let old_imputed_data = imputed_data.clone();
1090
1091 for &feature_idx in missing_features {
1093 let missing_mask: Vec<bool> = (0..n_samples)
1095 .map(|i| self.is_missing(x_f64[[i, feature_idx]]))
1096 .collect();
1097
1098 if !missing_mask.iter().any(|&x| x) {
1099 continue; }
1101
1102 let predictor_indices: Vec<usize> =
1104 (0..n_features).filter(|&i| i != feature_idx).collect();
1105
1106 let (train_x, train_y) = self.prepare_training_data(
1108 &imputed_data,
1109 feature_idx,
1110 &predictor_indices,
1111 &missing_mask,
1112 )?;
1113
1114 if train_x.is_empty() {
1115 continue; }
1117
1118 let mut regressor = SimpleRegressor::new(true, self.alpha);
1120 regressor.fit(&train_x, &train_y)?;
1121
1122 let test_x =
1124 self.prepare_test_data(&imputed_data, &predictor_indices, &missing_mask)?;
1125
1126 if !test_x.is_empty() {
1127 let predictions = regressor.predict(&test_x)?;
1128
1129 let mut pred_idx = 0;
1131 for i in 0..n_samples {
1132 if missing_mask[i] {
1133 let old_value = imputed_data[[i, feature_idx]];
1134 let new_value = predictions[pred_idx];
1135 imputed_data[[i, feature_idx]] = new_value;
1136
1137 let change = (new_value - old_value).abs();
1138 max_change = max_change.max(change);
1139 pred_idx += 1;
1140 }
1141 }
1142 }
1143 }
1144
1145 if max_change < self.tolerance {
1147 break;
1148 }
1149
1150 if iteration > 0 {
1152 let total_change = self.compute_total_change(&old_imputed_data, &imputed_data);
1153 if total_change < self.min_improvement {
1154 break;
1155 }
1156 }
1157 }
1158
1159 Ok(imputed_data)
1160 }
1161
1162 pub fn fit_transform<S>(&mut self, x: &ArrayBase<S, Ix2>) -> Result<Array2<f64>>
1170 where
1171 S: Data,
1172 S::Elem: Float + NumCast,
1173 {
1174 self.fit(x)?;
1175 self.transform(x)
1176 }
1177
1178 fn apply_initial_imputation(
1180 &self,
1181 data: &mut Array2<f64>,
1182 initial_values: &Array1<f64>,
1183 ) -> Result<()> {
1184 let (n_samples, n_features) = data.dim();
1185
1186 for i in 0..n_samples {
1187 for j in 0..n_features {
1188 if self.is_missing(data[[i, j]]) {
1189 data[[i, j]] = initial_values[j];
1190 }
1191 }
1192 }
1193
1194 Ok(())
1195 }
1196
1197 fn prepare_training_data(
1199 &self,
1200 data: &Array2<f64>,
1201 target_feature: usize,
1202 predictor_indices: &[usize],
1203 missing_mask: &[bool],
1204 ) -> Result<(Array2<f64>, Array1<f64>)> {
1205 let n_samples = data.shape()[0];
1206 let n_predictors = predictor_indices.len();
1207
1208 let non_missing_count = missing_mask.iter().filter(|&&x| !x).count();
1210
1211 if non_missing_count == 0 {
1212 return Ok((Array2::zeros((0, n_predictors)), Array1::zeros(0)));
1213 }
1214
1215 let mut train_x = Array2::zeros((non_missing_count, n_predictors));
1216 let mut train_y = Array1::zeros(non_missing_count);
1217
1218 let mut train_idx = 0;
1219 for i in 0..n_samples {
1220 if !missing_mask[i] {
1221 for (pred_j, &orig_j) in predictor_indices.iter().enumerate() {
1223 train_x[[train_idx, pred_j]] = data[[i, orig_j]];
1224 }
1225 train_y[train_idx] = data[[i, target_feature]];
1227 train_idx += 1;
1228 }
1229 }
1230
1231 Ok((train_x, train_y))
1232 }
1233
1234 fn prepare_test_data(
1236 &self,
1237 data: &Array2<f64>,
1238 predictor_indices: &[usize],
1239 missing_mask: &[bool],
1240 ) -> Result<Array2<f64>> {
1241 let n_samples = data.shape()[0];
1242 let n_predictors = predictor_indices.len();
1243
1244 let missing_count = missing_mask.iter().filter(|&&x| x).count();
1246
1247 if missing_count == 0 {
1248 return Ok(Array2::zeros((0, n_predictors)));
1249 }
1250
1251 let mut test_x = Array2::zeros((missing_count, n_predictors));
1252
1253 let mut test_idx = 0;
1254 for i in 0..n_samples {
1255 if missing_mask[i] {
1256 for (pred_j, &orig_j) in predictor_indices.iter().enumerate() {
1258 test_x[[test_idx, pred_j]] = data[[i, orig_j]];
1259 }
1260 test_idx += 1;
1261 }
1262 }
1263
1264 Ok(test_x)
1265 }
1266
1267 fn compute_total_change(&self, old_data: &Array2<f64>, newdata: &Array2<f64>) -> f64 {
1269 let diff = newdata - old_data;
1270 diff.iter().map(|&x| x * x).sum::<f64>().sqrt()
1271 }
1272
1273 fn is_missing(&self, value: f64) -> bool {
1275 if self.missingvalues.is_nan() {
1276 value.is_nan()
1277 } else {
1278 (value - self.missingvalues).abs() < f64::EPSILON
1279 }
1280 }
1281}
1282
1283#[cfg(test)]
1284mod tests {
1285 use super::*;
1286 use approx::assert_abs_diff_eq;
1287 use scirs2_core::ndarray::Array;
1288
1289 #[test]
1290 fn test_simple_imputer_mean() {
1291 let data = Array::from_shape_vec(
1293 (4, 3),
1294 vec![
1295 1.0,
1296 2.0,
1297 3.0,
1298 f64::NAN,
1299 5.0,
1300 6.0,
1301 7.0,
1302 f64::NAN,
1303 9.0,
1304 10.0,
1305 11.0,
1306 f64::NAN,
1307 ],
1308 )
1309 .unwrap();
1310
1311 let mut imputer = SimpleImputer::with_strategy(ImputeStrategy::Mean);
1312 let transformed = imputer.fit_transform(&data).unwrap();
1313
1314 assert_eq!(transformed.shape(), &[4, 3]);
1316
1317 assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10);
1323 assert_abs_diff_eq!(transformed[[1, 0]], 6.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[2, 0]], 7.0, epsilon = 1e-10);
1325 assert_abs_diff_eq!(transformed[[3, 0]], 10.0, epsilon = 1e-10);
1326
1327 assert_abs_diff_eq!(transformed[[0, 1]], 2.0, epsilon = 1e-10);
1328 assert_abs_diff_eq!(transformed[[1, 1]], 5.0, epsilon = 1e-10);
1329 assert_abs_diff_eq!(transformed[[2, 1]], 6.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[3, 1]], 11.0, epsilon = 1e-10);
1331
1332 assert_abs_diff_eq!(transformed[[0, 2]], 3.0, epsilon = 1e-10);
1333 assert_abs_diff_eq!(transformed[[1, 2]], 6.0, epsilon = 1e-10);
1334 assert_abs_diff_eq!(transformed[[2, 2]], 9.0, epsilon = 1e-10);
1335 assert_abs_diff_eq!(transformed[[3, 2]], 6.0, epsilon = 1e-10); }
1337
1338 #[test]
1339 fn test_simple_imputer_median() {
1340 let data = Array::from_shape_vec(
1342 (5, 2),
1343 vec![
1344 1.0,
1345 10.0,
1346 f64::NAN,
1347 20.0,
1348 3.0,
1349 f64::NAN,
1350 4.0,
1351 40.0,
1352 5.0,
1353 50.0,
1354 ],
1355 )
1356 .unwrap();
1357
1358 let mut imputer = SimpleImputer::with_strategy(ImputeStrategy::Median);
1359 let transformed = imputer.fit_transform(&data).unwrap();
1360
1361 assert_eq!(transformed.shape(), &[5, 2]);
1363
1364 assert_abs_diff_eq!(transformed[[1, 0]], 3.5, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[2, 1]], 30.0, epsilon = 1e-10); }
1370
1371 #[test]
1372 fn test_simple_imputer_constant() {
1373 let data =
1375 Array::from_shape_vec((3, 2), vec![1.0, f64::NAN, f64::NAN, 3.0, 4.0, 5.0]).unwrap();
1376
1377 let mut imputer = SimpleImputer::with_strategy(ImputeStrategy::Constant(99.0));
1378 let transformed = imputer.fit_transform(&data).unwrap();
1379
1380 assert_abs_diff_eq!(transformed[[0, 1]], 99.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[1, 0]], 99.0, epsilon = 1e-10); assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10);
1386 assert_abs_diff_eq!(transformed[[1, 1]], 3.0, epsilon = 1e-10);
1387 assert_abs_diff_eq!(transformed[[2, 0]], 4.0, epsilon = 1e-10);
1388 assert_abs_diff_eq!(transformed[[2, 1]], 5.0, epsilon = 1e-10);
1389 }
1390
1391 #[test]
1392 fn test_missing_indicator() {
1393 let data = Array::from_shape_vec(
1395 (3, 4),
1396 vec![
1397 1.0,
1398 f64::NAN,
1399 3.0,
1400 4.0,
1401 f64::NAN,
1402 6.0,
1403 f64::NAN,
1404 8.0,
1405 9.0,
1406 10.0,
1407 11.0,
1408 f64::NAN,
1409 ],
1410 )
1411 .unwrap();
1412
1413 let mut indicator = MissingIndicator::with_nan();
1414 let indicators = indicator.fit_transform(&data).unwrap();
1415
1416 assert_eq!(indicators.shape(), &[3, 4]);
1418
1419 assert_abs_diff_eq!(indicators[[0, 0]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[0, 1]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[0, 2]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[0, 3]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[1, 0]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[1, 1]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[1, 2]], 1.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[1, 3]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[2, 0]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[2, 1]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[2, 2]], 0.0, epsilon = 1e-10); assert_abs_diff_eq!(indicators[[2, 3]], 1.0, epsilon = 1e-10); }
1435
1436 #[test]
1437 fn test_imputer_errors() {
1438 let data = Array::from_shape_vec((2, 2), vec![f64::NAN, 1.0, f64::NAN, 2.0]).unwrap();
1440
1441 let mut imputer = SimpleImputer::with_strategy(ImputeStrategy::Mean);
1442 assert!(imputer.fit(&data).is_err());
1443 }
1444
1445 #[test]
1446 fn test_knn_imputer_basic() {
1447 let data = Array::from_shape_vec(
1454 (4, 3),
1455 vec![
1456 1.0,
1457 2.0,
1458 3.0,
1459 4.0,
1460 f64::NAN,
1461 6.0,
1462 7.0,
1463 8.0,
1464 f64::NAN,
1465 10.0,
1466 11.0,
1467 12.0,
1468 ],
1469 )
1470 .unwrap();
1471
1472 let mut imputer = KNNImputer::with_n_neighbors(2);
1473 let transformed = imputer.fit_transform(&data).unwrap();
1474
1475 assert_eq!(transformed.shape(), &[4, 3]);
1477
1478 assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10);
1480 assert_abs_diff_eq!(transformed[[0, 1]], 2.0, epsilon = 1e-10);
1481 assert_abs_diff_eq!(transformed[[0, 2]], 3.0, epsilon = 1e-10);
1482 assert_abs_diff_eq!(transformed[[3, 0]], 10.0, epsilon = 1e-10);
1483 assert_abs_diff_eq!(transformed[[3, 1]], 11.0, epsilon = 1e-10);
1484 assert_abs_diff_eq!(transformed[[3, 2]], 12.0, epsilon = 1e-10);
1485
1486 assert!(!transformed[[1, 1]].is_nan()); assert!(!transformed[[2, 2]].is_nan()); }
1490
1491 #[test]
1492 fn test_knn_imputer_simple_case() {
1493 let data = Array::from_shape_vec((3, 2), vec![1.0, 1.0, f64::NAN, 2.0, 3.0, 3.0]).unwrap();
1495
1496 let mut imputer = KNNImputer::with_n_neighbors(2);
1497 let transformed = imputer.fit_transform(&data).unwrap();
1498
1499 assert_abs_diff_eq!(transformed[[1, 0]], 2.0, epsilon = 1e-1);
1503 }
1504
1505 #[test]
1506 fn test_knn_imputer_manhattan_distance() {
1507 let data =
1508 Array::from_shape_vec((4, 2), vec![0.0, 0.0, 1.0, f64::NAN, 2.0, 2.0, 10.0, 10.0])
1509 .unwrap();
1510
1511 let mut imputer = KNNImputer::new(
1512 2,
1513 DistanceMetric::Manhattan,
1514 WeightingScheme::Uniform,
1515 f64::NAN,
1516 );
1517 let transformed = imputer.fit_transform(&data).unwrap();
1518
1519 assert!(!transformed[[1, 1]].is_nan());
1522 assert!(transformed[[1, 1]] < 5.0); }
1525
1526 #[test]
1527 fn test_knn_imputer_validation_errors() {
1528 let small_data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
1530 let mut imputer = KNNImputer::with_n_neighbors(5); assert!(imputer.fit(&small_data).is_err());
1532
1533 let data =
1535 Array::from_shape_vec((4, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]).unwrap();
1536 let unfitted_imputer = KNNImputer::with_n_neighbors(2);
1537 assert!(unfitted_imputer.transform(&data).is_err());
1538 }
1539
1540 #[test]
1541 fn test_knn_imputer_no_missing_values() {
1542 let data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1544
1545 let mut imputer = KNNImputer::with_n_neighbors(2);
1546 let transformed = imputer.fit_transform(&data).unwrap();
1547
1548 assert_eq!(transformed, data);
1550 }
1551
1552 #[test]
1553 fn test_knn_imputer_accessors() {
1554 let imputer = KNNImputer::new(
1555 3,
1556 DistanceMetric::Manhattan,
1557 WeightingScheme::Distance,
1558 -999.0,
1559 );
1560
1561 assert_eq!(imputer._nneighbors(), 3);
1562 assert_eq!(imputer.metric(), &DistanceMetric::Manhattan);
1563 assert_eq!(imputer.weights(), &WeightingScheme::Distance);
1564 }
1565
1566 #[test]
1567 fn test_knn_imputer_multiple_missing_features() {
1568 let data = Array::from_shape_vec(
1570 (4, 3),
1571 vec![
1572 1.0,
1573 2.0,
1574 3.0,
1575 f64::NAN,
1576 f64::NAN,
1577 6.0,
1578 7.0,
1579 8.0,
1580 9.0,
1581 10.0,
1582 11.0,
1583 12.0,
1584 ],
1585 )
1586 .unwrap();
1587
1588 let mut imputer = KNNImputer::with_n_neighbors(2);
1589 let transformed = imputer.fit_transform(&data).unwrap();
1590
1591 assert!(!transformed[[1, 0]].is_nan());
1593 assert!(!transformed[[1, 1]].is_nan());
1594 assert_abs_diff_eq!(transformed[[1, 2]], 6.0, epsilon = 1e-10);
1596 }
1597
1598 #[test]
1599 fn test_iterative_imputer_basic() {
1600 let data = Array::from_shape_vec(
1605 (4, 2),
1606 vec![1.0, 2.0, 2.0, 4.0, 3.0, f64::NAN, f64::NAN, 8.0],
1607 )
1608 .unwrap();
1609
1610 let mut imputer = IterativeImputer::with_max_iter(5);
1611 let transformed = imputer.fit_transform(&data).unwrap();
1612
1613 assert!(!transformed[[2, 1]].is_nan()); assert!(!transformed[[3, 0]].is_nan()); assert_abs_diff_eq!(transformed[[0, 0]], 1.0, epsilon = 1e-10);
1619 assert_abs_diff_eq!(transformed[[0, 1]], 2.0, epsilon = 1e-10);
1620 assert_abs_diff_eq!(transformed[[1, 0]], 2.0, epsilon = 1e-10);
1621 assert_abs_diff_eq!(transformed[[1, 1]], 4.0, epsilon = 1e-10);
1622 assert_abs_diff_eq!(transformed[[2, 0]], 3.0, epsilon = 1e-10);
1623 assert_abs_diff_eq!(transformed[[3, 1]], 8.0, epsilon = 1e-10);
1624
1625 let imputed_f1_row2 = transformed[[2, 1]];
1628 let expected_f1_row2 = 2.0 * transformed[[2, 0]]; assert!((imputed_f1_row2 - expected_f1_row2).abs() < 1.0); let imputed_f0_row3 = transformed[[3, 0]];
1632 let expected_f0_row3 = transformed[[3, 1]] / 2.0; assert!((imputed_f0_row3 - expected_f0_row3).abs() < 1.0); }
1635
1636 #[test]
1637 fn test_iterative_imputer_no_missing_values() {
1638 let data = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 3.0, 4.0, 5.0, 6.0]).unwrap();
1640
1641 let mut imputer = IterativeImputer::with_defaults();
1642 let transformed = imputer.fit_transform(&data).unwrap();
1643
1644 for i in 0..3 {
1646 for j in 0..2 {
1647 assert_abs_diff_eq!(transformed[[i, j]], data[[i, j]], epsilon = 1e-10);
1648 }
1649 }
1650 }
1651
1652 #[test]
1653 fn test_iterative_imputer_convergence() {
1654 let data = Array::from_shape_vec(
1656 (5, 3),
1657 vec![
1658 1.0,
1659 2.0,
1660 3.0,
1661 2.0,
1662 f64::NAN,
1663 6.0,
1664 3.0,
1665 6.0,
1666 f64::NAN,
1667 4.0,
1668 8.0,
1669 12.0,
1670 f64::NAN,
1671 10.0,
1672 15.0,
1673 ],
1674 )
1675 .unwrap();
1676
1677 let mut imputer = IterativeImputer::new(
1678 20, 1e-4, ImputeStrategy::Mean,
1681 f64::NAN,
1682 1e-6, );
1684
1685 let transformed = imputer.fit_transform(&data).unwrap();
1686
1687 for i in 0..5 {
1689 for j in 0..3 {
1690 assert!(!transformed[[i, j]].is_nan());
1691 }
1692 }
1693 }
1694
1695 #[test]
1696 fn test_iterative_imputer_different_strategies() {
1697 let data = Array::from_shape_vec(
1698 (4, 2),
1699 vec![1.0, f64::NAN, 2.0, 4.0, 3.0, 6.0, f64::NAN, 8.0],
1700 )
1701 .unwrap();
1702
1703 let mut imputer_median =
1705 IterativeImputer::new(5, 1e-3, ImputeStrategy::Median, f64::NAN, 1e-6);
1706 let transformed_median = imputer_median.fit_transform(&data).unwrap();
1707 assert!(!transformed_median[[0, 1]].is_nan());
1708 assert!(!transformed_median[[3, 0]].is_nan());
1709
1710 let mut imputer_constant =
1712 IterativeImputer::new(5, 1e-3, ImputeStrategy::Constant(999.0), f64::NAN, 1e-6);
1713 let transformed_constant = imputer_constant.fit_transform(&data).unwrap();
1714 assert!(!transformed_constant[[0, 1]].is_nan());
1715 assert!(!transformed_constant[[3, 0]].is_nan());
1716 }
1717
1718 #[test]
1719 fn test_iterative_imputer_builder_methods() {
1720 let imputer = IterativeImputer::with_defaults()
1721 .with_random_seed(42)
1722 .with_alpha(1e-3)
1723 .with_min_improvement(1e-5);
1724
1725 assert_eq!(imputer.random_seed, Some(42));
1726 assert_abs_diff_eq!(imputer.alpha, 1e-3, epsilon = 1e-10);
1727 assert_abs_diff_eq!(imputer.min_improvement, 1e-5, epsilon = 1e-10);
1728 }
1729
1730 #[test]
1731 fn test_iterative_imputer_errors() {
1732 let imputer = IterativeImputer::with_defaults();
1734 let test_data = Array::from_shape_vec((2, 2), vec![1.0, 2.0, 3.0, 4.0]).unwrap();
1735 assert!(imputer.transform(&test_data).is_err());
1736
1737 let bad_data =
1739 Array::from_shape_vec((3, 2), vec![f64::NAN, 1.0, f64::NAN, 2.0, f64::NAN, 3.0])
1740 .unwrap();
1741 let mut imputer = IterativeImputer::with_defaults();
1742 assert!(imputer.fit(&bad_data).is_err());
1743 }
1744
1745 #[test]
1746 fn test_simple_regressor() {
1747 let x = Array::from_shape_vec((3, 2), vec![1.0, 2.0, 2.0, 3.0, 3.0, 4.0]).unwrap();
1749 let y = Array::from_vec(vec![5.0, 8.0, 11.0]); let mut regressor = SimpleRegressor::new(true, 1e-6);
1752 regressor.fit(&x, &y).unwrap();
1753
1754 let test_x = Array::from_shape_vec((2, 2), vec![4.0, 5.0, 5.0, 6.0]).unwrap();
1755 let predictions = regressor.predict(&test_x).unwrap();
1756
1757 assert_eq!(predictions.len(), 2);
1759 assert!(!predictions[0].is_nan());
1760 assert!(!predictions[1].is_nan());
1761 }
1762}